In [53]:
import pandas as pd
import numpy as np
import random
import os
import gc

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [54]:
def seed_everything(seed):
    """Seed the Python and NumPy global random number generators.

    Args:
        seed: Seed value passed to ``random.seed`` and ``np.random.seed``;
            its string form is also stored in the ``PYTHONHASHSEED``
            environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


seed_everything(42)  # fix the seed for the whole notebook

# csv to parquet
메모리에 효율적인 데이터 유형을 사용하여 용량을 줄이고 빠른 작업이 가능합니다

In [55]:
def csv_to_parquet(csv_path, save_name):
    """Convert a CSV file into a Parquet file in the current directory.

    Args:
        csv_path: Path of the CSV file to read.
        save_name: Base name of the output file; the data is written to
            ``./<save_name>.parquet``.

    Prints ``<save_name> Done.`` once the file has been written.
    """
    frame = pd.read_csv(csv_path)
    parquet_path = f'./{save_name}.parquet'
    frame.to_parquet(parquet_path)
    # Drop the only reference and collect immediately so the frame can be
    # reclaimed before the next (possibly large) conversion starts.
    del frame
    gc.collect()
    print(save_name, 'Done.')

In [56]:
# Convert both CSV files; each call prints "<name> Done." when finished.
for csv_path, save_name in (('./train.csv', 'train'), ('./test.csv', 'test')):
    csv_to_parquet(csv_path, save_name)

train Done.
test Done.


In [57]:
# Load the Parquet copies of the train/test data.
train, test = (pd.read_parquet(path) for path in ('./train.parquet', './test.parquet'))
# The first CSV column of the sample submission becomes its index.
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)

## Data Pre-Processing

In [58]:
# Impute missing values in the columns listed below (the Delay label is not in
# the list) with the most frequent value observed in the training data.
NaN_col = ['Origin_State','Destination_State','Airline','Estimated_Departure_Time', 'Estimated_Arrival_Time','Carrier_Code(IATA)','Carrier_ID(DOT)']

# Modes are computed on train only; the same fill values are then applied to
# test so that both frames are imputed consistently.
fill_values = {col: train[col].mode()[0] for col in NaN_col}
for col, mode in fill_values.items():
    train[col] = train[col].fillna(mode)
    if col in test.columns:
        test[col] = test[col].fillna(mode)
print('Done.')

Done.


In [59]:
# Convert the qualitative (categorical) columns to integer codes.
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']

for col in qual_col:
    # The encoder is fitted on the training data only.
    le = LabelEncoder().fit(train[col])
    train[col] = le.transform(train[col])

    # Labels that occur only in test are appended after the fitted classes so
    # that transform() does not reject them; they therefore receive codes
    # larger than any code seen during training. A single set difference
    # replaces the per-label membership scan and the repeated re-allocation
    # of classes_, and yields the unseen labels in the same (sorted) order.
    # NOTE(review): classes_ is no longer sorted after the append - confirm
    # the installed scikit-learn's transform() does not rely on sorted
    # classes_ for these columns.
    unseen = np.setdiff1d(np.unique(test[col]), le.classes_)
    if unseen.size:
        le.classes_ = np.concatenate([le.classes_, unseen])
    test[col] = le.transform(test[col])
print('Done.')

Done.


In [60]:
# Shape of the training frame before rows with missing values are removed.
print("Original Size: ", train.shape)

Original Size:  (1000000, 19)


In [61]:
# Drop every training row that still has a missing value in any column.
# NOTE(review): presumably most of these are rows without a Delay label, since
# Delay is not among the imputed columns - confirm. `train` is rebound here, so
# the unfiltered frame can only be recovered by reloading the parquet file.
train = train.dropna()

In [62]:
# Shape of the training frame after the dropna() step.
print("Size After Dropping Missing Values: ", train.shape)

Size After Dropping Missing Values:  (255001, 19)


In [73]:
# Preview the first ten remaining training rows.
# NOTE(review): the saved output of this cell already contains a Delay_num
# column, which is only created in a later cell - the cell was executed out of
# order and will show different columns after Restart & Run All.
train.head(10)

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,Delay_num
5,TRAIN_000005,4,13,1545.0,1900.0,0,0,119,11618,4,93,11278,47,199.0,21,8,20452.0,3435,Not_Delayed,0
6,TRAIN_000006,1,20,1742.0,1903.0,0,0,119,11618,28,47,10721,19,200.0,26,8,19393.0,3495,Not_Delayed,0
8,TRAIN_000008,6,13,1420.0,1550.0,0,0,59,10821,4,74,11057,31,361.0,23,10,19393.0,4083,Not_Delayed,0
10,TRAIN_000010,8,13,1730.0,1844.0,0,0,93,11278,47,277,14122,36,204.0,21,0,19393.0,241,Delayed,1
12,TRAIN_000012,1,12,1015.0,1145.0,0,0,72,11042,33,94,11292,5,1201.0,23,10,19393.0,5171,Not_Delayed,0
13,TRAIN_000013,9,19,615.0,706.0,0,0,215,13158,42,94,11292,5,563.0,22,8,20304.0,579,Not_Delayed,0
19,TRAIN_000019,7,14,1907.0,2145.0,0,0,30,10529,6,22,10397,8,859.0,9,3,19790.0,6147,Not_Delayed,0
28,TRAIN_000028,8,7,930.0,1900.0,0,0,309,14679,4,193,12889,26,258.0,24,7,20416.0,3233,Not_Delayed,0
32,TRAIN_000032,6,20,750.0,1014.0,0,0,256,13930,11,202,12945,15,323.0,23,8,20366.0,327,Not_Delayed,0
34,TRAIN_000034,11,21,610.0,748.0,0,0,156,12156,4,331,14869,45,402.0,22,3,19393.0,1386,Not_Delayed,0


In [63]:
# Map every submission column name to its position in the submission file.
column_number = {column: i for i, column in enumerate(sample_submission.columns)}
    
def to_number(x, dic):
    """Return the value stored under key ``x`` in the mapping ``dic``.

    Args:
        x: Key to look up.
        dic: Mapping (or any subscriptable object) to index with ``x``.

    Returns:
        ``dic[x]``. A failed lookup propagates the container's own error
        (``KeyError`` for a dict).
    """
    number = dic[x]
    return number

# Encode each Delay label as the position of the matching submission column.
train.loc[:, 'Delay_num'] = train['Delay'].apply(to_number, args=(column_number,))
print('Done.')

Done.


In [64]:
# Features: every training column except the identifier and both label forms.
x_train = train.drop(['ID', 'Delay', 'Delay_num'], axis=1)
# Target: the integer-encoded label.
y_train = train['Delay_num']
# Only the identifier is removed from the test frame.
x_test = test.drop('ID', axis=1)

In [15]:
# Hold out 20% of the labelled rows as a validation set (fixed random_state).
# NOTE(review): x_train / y_train are overwritten with their own 80% subset, so
# re-running this cell without re-running the previous one splits the already
# reduced data again.
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [17]:
# Min-max scale every feature column. The scaler is fitted on the training
# split only and then reused for the validation and test data.
scaler = MinMaxScaler().fit(x_train)
x_train, x_val, x_test = (scaler.transform(part) for part in (x_train, x_val, x_test))

## Create GBDT models

In [18]:
# One instance per boosting library, each built with its constructor defaults;
# these are the base estimators handed to the grid searches.
xgb, lgbm, catboost = XGBClassifier(), LGBMClassifier(), CatBoostClassifier()

## Initializing Hyper-param search space

In [19]:
# Hyper-parameter search space (2 x 3 x 3 = 18 combinations); the same grid is
# passed to every grid search in this notebook.
param_grid = dict(
    learning_rate=[0.01, 0.1],
    max_depth=[3, 5, 7],
    n_estimators=[100, 200, 300],
)

In [20]:
# 5-fold stratified cross-validation, shuffled with a fixed seed. This single
# splitter object is passed to all three grid searches.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

### XGB

In [21]:
# Exhaustive search over param_grid for XGBoost, scored by negative log loss
# on the shared cross-validation splitter.
xgb_grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='neg_log_loss',
    cv=cv,
).fit(x_train, y_train)
print("Done.")

In [30]:
# Score the best XGBoost estimator found by the search on the validation split.
xgb_best = xgb_grid_search.best_estimator_
y_pred_xgb = xgb_best.predict_proba(x_val)
logloss = log_loss(y_true=y_val, y_pred=y_pred_xgb)
print("[XGB] LogLoss on validation set:", logloss)

[XGB] LogLoss on validation set: 0.442642709934066


### LightGBM

In [23]:
# Exhaustive search over param_grid for LightGBM, scored by negative log loss
# on the shared cross-validation splitter.
lgbm_grid_search = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    scoring='neg_log_loss',
    cv=cv,
).fit(x_train, y_train)
print("Done.")

Done.


In [28]:
# Score the best LightGBM estimator found by the search on the validation split.
lgbm_best = lgbm_grid_search.best_estimator_
y_pred_lgbm = lgbm_best.predict_proba(x_val)
logloss = log_loss(y_true=y_val, y_pred=y_pred_lgbm)
print("[LightGBM] LogLoss on validation set:", logloss)

[LightGBM] LogLoss on validation set: 0.4427622975518929


### CatBoost

In [None]:
# Exhaustive search over param_grid for CatBoost, scored by negative log loss
# on the shared cross-validation splitter.
catboost_grid_search = GridSearchCV(
    estimator=catboost,
    param_grid=param_grid,
    scoring='neg_log_loss',
    cv=cv,
).fit(x_train, y_train)
print("Done.")

In [27]:
# Score the best CatBoost estimator found by the search on the validation split.
catboost_best = catboost_grid_search.best_estimator_
y_pred_cb = catboost_best.predict_proba(x_val)
logloss = log_loss(y_true=y_val, y_pred=y_pred_cb)
print("[CatBoost] LogLoss on validation set:", logloss)

[CatBoost] LogLoss on validation set: 0.4434896851694139


## Inference

In [66]:
# Predict class probabilities for the test set with each tuned model. The
# y_pred_* names, which previously held validation predictions, are reused.
y_pred_xgb, y_pred_lgbm, y_pred_cb = (
    model.predict_proba(x_test)
    for model in (xgb_best, lgbm_best, catboost_best)
)
print("Done.")

Done.


## Ensemble of XGB, LGBM, CatBoost

In [67]:
# Equal-weight average of the three models' predicted class probabilities.
ensemble_pred = (y_pred_cb+y_pred_lgbm+y_pred_xgb)/3

## Submit

In [70]:
# Wrap the averaged probabilities in a frame that reuses the sample
# submission's row index and column names.
submission = pd.DataFrame(
    ensemble_pred,
    index=sample_submission.index,
    columns=sample_submission.columns,
)

In [71]:
# Write the submission file; index=True keeps the row index (taken from the
# sample submission) as the first CSV column.
submission.to_csv('ensemble_submission.csv', index=True)