In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression


# 데이터 확인

In [2]:
# Load data/train.csv (path is relative to the notebook's working directory)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='data/train.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [3]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

Unnamed: 0                             0
id                                     0
Gender                                 0
Customer Type                          0
Age                                    0
Type of Travel                         0
Class                                  0
Flight Distance                        0
Inflight wifi service                  0
Departure/Arrival time convenient      0
Ease of Online booking                 0
Gate location                          0
Food and drink                         0
Online boarding                        0
Seat comfort                           0
Inflight entertainment                 0
On-board service                       0
Leg room service                       0
Baggage handling                       0
Checkin service                        0
Inflight service                       0
Cleanliness                            0
Departure Delay in Minutes             0
Arrival Delay in Minutes             310
satisfaction    

In [4]:
# Summary statistics from describe(), transposed so that each feature is one row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Unnamed: 0,103904.0,51951.5,29994.645522,0.0,25975.75,51951.5,77927.25,103903.0
id,103904.0,64924.210502,37463.812252,1.0,32533.75,64856.5,97368.25,129880.0
Age,103904.0,39.379706,15.114964,7.0,27.0,40.0,51.0,85.0
Flight Distance,103904.0,1189.448375,997.147281,31.0,414.0,843.0,1743.0,4983.0
Inflight wifi service,103904.0,2.729683,1.327829,0.0,2.0,3.0,4.0,5.0
Departure/Arrival time convenient,103904.0,3.060296,1.525075,0.0,2.0,3.0,4.0,5.0
Ease of Online booking,103904.0,2.756901,1.398929,0.0,2.0,3.0,4.0,5.0
Gate location,103904.0,2.976883,1.277621,0.0,2.0,3.0,4.0,5.0
Food and drink,103904.0,3.202129,1.329533,0.0,2.0,3.0,4.0,5.0
Online boarding,103904.0,3.250375,1.349509,0.0,2.0,3.0,4.0,5.0


# 결측치 처리: 회귀 대체 (Regression Imputation)

In [5]:
# 1. Check the correlation between the two delay columns
#    (evidence for imputing one from the other).
delay_corr_matrix = df[['Departure Delay in Minutes', 'Arrival Delay in Minutes']].corr()
# Look the coefficient up by label instead of by position in the 2x2 matrix.
corr = delay_corr_matrix.loc['Departure Delay in Minutes', 'Arrival Delay in Minutes']
print(f"Daparture Delay과 Arrival Delay의 상관계수: {corr:.4f} ")


Daparture Delay과 Arrival Delay의 상관계수: 0.9655 


In [6]:

# 2. Split the rows by whether the arrival delay is known.
arrival_is_missing = df['Arrival Delay in Minutes'].isna()
train_set = df.loc[~arrival_is_missing]    # rows that have the target value
missing_set = df.loc[arrival_is_missing]   # rows whose target must be filled in

# Single predictor (selected with a list so it stays a 2-D DataFrame) and the target.
X_train = train_set.loc[:, ['Departure Delay in Minutes']]
y_train = train_set.loc[:, 'Arrival Delay in Minutes']
X_missing = missing_set.loc[:, ['Departure Delay in Minutes']]


In [7]:

# 3. Fit a linear regression of arrival delay on departure delay.
#    fit() returns the estimator itself, so construction and fitting can be chained.
model = LinearRegression().fit(X_train, y_train)
model


0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [8]:
# 4. Predict the arrival delay for the rows where it is missing.
predicted_delays = model.predict(X=X_missing)


In [9]:
# Replace negative predictions with 0 (a delay duration cannot be negative).
negative_mask = predicted_delays < 0
predicted_delays = np.where(negative_mask, 0, predicted_delays)

# Write the predictions back into the rows that were missing the arrival delay.
df.loc[missing_set.index, 'Arrival Delay in Minutes'] = predicted_delays

# Report the remaining null counts and the summary statistics, separated by a rule.
print("결측치 개수", df.isnull().sum(), '-' * 50, df.describe(), sep='\n')


결측치 개수
Unnamed: 0                           0
id                                   0
Gender                               0
Customer Type                        0
Age                                  0
Type of Travel                       0
Class                                0
Flight Distance                      0
Inflight wifi service                0
Departure/Arrival time convenient    0
Ease of Online booking               0
Gate location                        0
Food and drink                       0
Online boarding                      0
Seat comfort                         0
Inflight entertainment               0
On-board service                     0
Leg room service                     0
Baggage handling                     0
Checkin service                      0
Inflight service                     0
Cleanliness                          0
Departure Delay in Minutes           0
Arrival Delay in Minutes             0
satisfaction                         0
dtype: int64
-----

# 피처 엔지니어링 (Feature Engineering)

In [10]:

# (1) Total delay: departure delay plus arrival delay.
df['Total_Delay'] = df['Departure Delay in Minutes'].add(df['Arrival Delay in Minutes'])

# (2) Service groupings: each score is the plain average of its member ratings.
df['Hard_Service_Score'] = (
    df['Seat comfort']
    .add(df['Food and drink'])
    .add(df['Leg room service'])
    .add(df['Inflight entertainment'])
    .div(4)
)

df['Soft_Service_Score'] = (
    df['On-board service']
    .add(df['Checkin service'])
    .add(df['Baggage handling'])
    .add(df['Gate location'])
    .div(4)
)

df['Tech_Score'] = (
    df['Inflight wifi service']
    .add(df['Ease of Online booking'])
    .add(df['Online boarding'])
    .div(3)
)

# (3) Define the customer persona
def define_persona(row):
    """Map one record to a customer-persona label.

    Parameters
    ----------
    row : mapping-like
        Must support ``row['Type of Travel']``; ``row['Class']`` is read only
        when the travel type is ``'Business travel'``.

    Returns
    -------
    str
        ``'Vacationer'`` for personal travel, ``'VIP_Business'`` for business
        travel in Business class, ``'Standard_Traveler'`` for everything else.
    """
    travel_type = row['Type of Travel']

    # Leisure trips: the excited holiday-maker.
    if travel_type == 'Personal Travel':
        return 'Vacationer'

    # Business trips in Business class: high-spending, demanding customers.
    if travel_type == 'Business travel' and row['Class'] == 'Business':
        return 'VIP_Business'

    # Everything else: ordinary business trips / general travel.
    return 'Standard_Traveler'

# Label every row with its persona. define_persona reads only 'Type of Travel'
# and 'Class', so the row-wise apply is run over just those two columns: with
# axis=1 pandas builds one Series per row, and building it from two columns
# instead of every column gives the same labels with less per-row work.
df['Customer_Persona'] = df[['Type of Travel', 'Class']].apply(define_persona, axis=1)

# Show the column list and dtypes, now including the engineered features.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 30 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Unnamed: 0                         103904 non-null  int64  
 1   id                                 103904 non-null  int64  
 2   Gender                             103904 non-null  object 
 3   Customer Type                      103904 non-null  object 
 4   Age                                103904 non-null  int64  
 5   Type of Travel                     103904 non-null  object 
 6   Class                              103904 non-null  object 
 7   Flight Distance                    103904 non-null  int64  
 8   Inflight wifi service              103904 non-null  int64  
 9   Departure/Arrival time convenient  103904 non-null  int64  
 10  Ease of Online booking             103904 non-null  int64  
 11  Gate location                      1039

In [11]:
# Encode the target as a binary integer for modelling / correlation analysis:
# 'satisfied' -> 1, any other value -> 0.
# A vectorized comparison replaces the per-element Python lambda; the explicit
# int64 cast keeps the integer dtype the same on every platform.
df['satisfaction_score'] = (df['satisfaction'] == 'satisfied').astype('int64')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 31 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Unnamed: 0                         103904 non-null  int64  
 1   id                                 103904 non-null  int64  
 2   Gender                             103904 non-null  object 
 3   Customer Type                      103904 non-null  object 
 4   Age                                103904 non-null  int64  
 5   Type of Travel                     103904 non-null  object 
 6   Class                              103904 non-null  object 
 7   Flight Distance                    103904 non-null  int64  
 8   Inflight wifi service              103904 non-null  int64  
 9   Departure/Arrival time convenient  103904 non-null  int64  
 10  Ease of Online booking             103904 non-null  int64  
 11  Gate location                      1039

In [14]:
# Last cell of A's Jupyter notebook: persist the processed frame.
# CSV can lose dtype information (dates, numeric types, ...), so pickle or parquet is recommended.
df.to_pickle(path="train_processed.pkl")