In [1]:
import pandas as pd
import numpy as np
import os, gc, time, random

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [2]:
def seed_everything(seed):
    """Seed the random sources this notebook controls directly.

    Exports ``PYTHONHASHSEED`` into ``os.environ`` and seeds both Python's
    ``random`` module and NumPy's global generator with the same value.

    Args:
        seed: Integer seed applied to every source.
    """
    # NOTE(review): presumably only affects child processes; the running
    # interpreter's hash seed is fixed at startup - confirm if it matters.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed

# csv to parquet
메모리에 효율적인 데이터 유형을 사용하여 용량을 줄이고 빠른 작업이 가능합니다

In [3]:
def csv_to_parquet(csv_path, save_name):
    """Read a CSV file and write it to ``./<save_name>.parquet``.

    Args:
        csv_path: Path of the CSV file to read.
        save_name: Base name (no extension) of the parquet file, written
            relative to the current working directory.
    """
    frame = pd.read_csv(csv_path)
    target_path = f'./{save_name}.parquet'
    frame.to_parquet(target_path)

    # Drop the only reference to the frame and collect right away so the
    # memory is released before the next file is converted.
    del frame
    gc.collect()

    print(save_name, 'Done.')

In [4]:
# Convert both CSV files; later cells read the parquet copies instead.
for csv_path, save_name in (('./train.csv', 'train'), ('./test.csv', 'test')):
    csv_to_parquet(csv_path, save_name)

train Done.
test Done.


In [5]:
# Load the parquet copies of the train/test data.
train, test = (pd.read_parquet(path) for path in ('./train.parquet', './test.parquet'))

# Submission template: the first CSV column becomes the index, and its
# remaining columns are reused later to name the predicted columns.
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)

## Data Pre-Processing

In [6]:
# Replace missing values in the listed feature columns with the most frequent
# value observed in the training data. The label (Delay) is not in the list.
NaN_col = ['Origin_State','Destination_State','Airline','Estimated_Departure_Time', 'Estimated_Arrival_Time','Carrier_Code(IATA)','Carrier_ID(DOT)']

# Modes are computed from train only and then applied to both frames, so no
# statistic is derived from the test set.
train_modes = {col: train[col].mode()[0] for col in NaN_col}
for col, mode in train_modes.items():
    train[col] = train[col].fillna(mode)

    if col in test.columns:
        test[col] = test[col].fillna(mode)
print('Done.')

Done.


In [7]:
# Convert the qualitative (categorical) features to integer codes.
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']

for col in qual_col:
    # Fit on train only, so the codes are defined by the training labels.
    le = LabelEncoder().fit(train[col])
    train[col] = le.transform(train[col])

    # Labels that occur only in test are appended after the fitted classes so
    # transform() does not raise on them. A set gives constant-time membership
    # checks, and the class array is extended once instead of once per label.
    # NOTE(review): the appended labels break the sorted order of classes_;
    # confirm the installed scikit-learn's transform() does not rely on it.
    known_labels = set(le.classes_)
    unseen_labels = [label for label in np.unique(test[col]) if label not in known_labels]
    if unseen_labels:
        le.classes_ = np.append(le.classes_, unseen_labels)
    test[col] = le.transform(test[col])
print('Done.')

Done.


In [8]:
# Shape of the training frame before any rows are dropped.
original_shape = train.shape
print("Original Size: ", original_shape)

Original Size:  (1000000, 19)


In [9]:
# Remove every row that still has at least one missing value
# (dropna's defaults spelled out: row-wise, any NaN).
train = train.dropna(axis=0, how='any')

In [10]:
# Shape of the training frame after the rows with missing values were removed.
remaining_shape = train.shape
print("Size After Dropping Missing Values: ", remaining_shape)

Size After Dropping Missing Values:  (255001, 19)


In [11]:
# Preview the first three remaining rows.
train.iloc[:3]

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
5,TRAIN_000005,4,13,1545.0,1900.0,0,0,119,11618,4,93,11278,47,199.0,21,8,20452.0,3435,Not_Delayed
6,TRAIN_000006,1,20,1742.0,1903.0,0,0,119,11618,28,47,10721,19,200.0,26,8,19393.0,3495,Not_Delayed
8,TRAIN_000008,6,13,1420.0,1550.0,0,0,59,10821,4,74,11057,31,361.0,23,10,19393.0,4083,Not_Delayed


In [12]:
# Class balance of the label column.
delay_labels = train["Delay"]
delay_labels.value_counts()

Not_Delayed    210001
Delayed         45000
Name: Delay, dtype: int64

In [13]:
# Map each label name to its column position in the submission template, so
# the integer classes line up with the submission column order.
column_number = {column: i for i, column in enumerate(sample_submission.columns)}


def to_number(x, dic):
    """Return the value stored under key ``x`` in ``dic``."""
    return dic[x]


# Integer-encode the label; ``args`` forwards the lookup table to to_number
# for every element of the column.
train.loc[:, 'Delay_num'] = train['Delay'].apply(to_number, args=(column_number,))
print('Done.')

Done.


In [14]:
# Target: the integer-encoded label.
y_train = train['Delay_num']

# Features: everything except the row ID and the two label columns.
non_feature_cols = ['ID', 'Delay', 'Delay_num']
x_train = train.drop(columns=non_feature_cols)

# Only the ID column is removed from the test frame.
x_test = test.drop(columns=['ID'])

## Balancing the data

In [15]:
from imblearn.under_sampling import RandomUnderSampler

# Balance the classes by random undersampling with a fixed seed.
rus = RandomUnderSampler(random_state=42)
balanced_features, balanced_labels = rus.fit_resample(x_train, y_train)
x_train, y_train = balanced_features, balanced_labels

# Per-class row counts after resampling.
np.bincount(y_train)

array([45000, 45000])

## Train-Validation Split

In [16]:
# Hold out 20% of the balanced training data as a validation set.
# The split is seeded for repeatability and is not stratified.
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    random_state=42,
    test_size=0.2,
)

In [17]:
# Min-max scale all features. The scaler is fitted on the training split only
# and then applied unchanged to the train, validation and test features.
scaler = MinMaxScaler().fit(x_train)
x_train, x_val, x_test = (
    scaler.transform(features) for features in (x_train, x_val, x_test)
)

## Create GBDT models

In [18]:
# Base estimators used as templates by the grid searches below.
# XGBoost and LightGBM use their defaults; CatBoost is created with verbose=False.
xgb, lgbm = XGBClassifier(), LGBMClassifier()
catboost = CatBoostClassifier(verbose=False)

## Initializing Hyper-param search space

In [24]:
# 3-fold stratified, shuffled and seeded splitter shared by every grid search.
cv = StratifiedKFold(
    n_splits=3,
    random_state=42,
    shuffle=True,
)

### Tuning XGB

In [34]:
# XGBoost grid: only the number of trees varies.
param_grid = {
    'learning_rate': [0.065],
    'max_depth': [5],
    'n_estimators': [250, 300],
}

# Exhaustive search scored by negative log-loss on the shared CV splitter.
xgb_grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='neg_log_loss',
    cv=cv,
)
xgb_grid_search.fit(x_train, y_train)
print("Done.")

***********************
Done.
***********************


In [31]:
# Score the best XGBoost estimator from the grid search on the validation split.
xgb_best = xgb_grid_search.best_estimator_
y_pred_xgb = xgb_best.predict_proba(x_val)

logloss = log_loss(y_true=y_val, y_pred=y_pred_xgb)
print("[XGB] LogLoss on validation set:", logloss)

# Winning hyper-parameter combination.
print(xgb_grid_search.best_params_)

[XGB] LogLoss on validation set: 0.6527280361561312
{'learning_rate': 0.065, 'max_depth': 5, 'n_estimators': 300}


### Tuning LightGBM

In [40]:
# LightGBM grid: 3 learning rates x 2 depths x 2 tree counts.
param_grid = {
    'learning_rate': [0.025, 0.03, 0.035],
    'max_depth': [6, 7],
    'n_estimators': [400, 450],
}

# Exhaustive search scored by negative log-loss on the shared CV splitter.
lgbm_grid_search = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    scoring='neg_log_loss',
    cv=cv,
)
lgbm_grid_search.fit(x_train, y_train)
print("Done.")

Done.


In [39]:
# Score the best LightGBM estimator from the grid search on the validation split.
lgbm_best = lgbm_grid_search.best_estimator_
y_pred_lgbm = lgbm_best.predict_proba(x_val)

logloss = log_loss(y_true=y_val, y_pred=y_pred_lgbm)
print("[LightGBM] LogLoss on validation set:", logloss)

# Winning hyper-parameter combination.
print(lgbm_grid_search.best_params_)

[LightGBM] LogLoss on validation set: 0.6530588494205606
{'learning_rate': 0.033, 'max_depth': 6, 'n_estimators': 450}


### Tuning CatBoost

In [46]:
# CatBoost grid: a single combination, so the search only cross-validates it.
param_grid = {
    'learning_rate': [0.065],
    'max_depth': [6],
    'n_estimators': [450],
}

# Scored by negative log-loss on the shared CV splitter.
catboost_grid_search = GridSearchCV(
    estimator=catboost,
    param_grid=param_grid,
    scoring='neg_log_loss',
    cv=cv,
)
catboost_grid_search.fit(x_train, y_train)
print("Done.")

Done.


In [47]:
# Score the best CatBoost estimator from the grid search on the validation split.
catboost_best = catboost_grid_search.best_estimator_
y_pred_cb = catboost_best.predict_proba(x_val)
logloss = log_loss(y_val, y_pred_cb)
print("[CatBoost] LogLoss on validation set:", logloss)
# Report the winning grid point, as the XGB and LightGBM evaluation cells do.
print(catboost_grid_search.best_params_)

[CatBoost] LogLoss on validation set: 0.6538445748604201
{'learning_rate': 0.066, 'max_depth': 6, 'n_estimators': 450}


## Train

In [19]:
# Fold the validation split back into the training data for the final fit.
# NOTE(review): not idempotent - re-running this cell appends x_val / y_val
# again, so run it exactly once per kernel session.
x_train = np.concatenate((x_train, x_val), axis=0)
y_train = np.concatenate((y_train, y_val), axis=0)

In [20]:
# Settings shared by both XGBoost ensemble members; they differ only in
# the number of trees.
xgb_params = {
    'learning_rate' : 0.065,
    'objective' : 'binary:logistic',
    'eval_metric' : 'logloss',
    'max_depth' : 5
}
xgb1 = XGBClassifier(**xgb_params, n_estimators=250)
xgb2 = XGBClassifier(**xgb_params, n_estimators=300)

########################################

# Settings shared by both LightGBM ensemble members; they differ in depth and
# number of trees.
# NOTE(review): 0.033 is not one of the learning rates in the LightGBM grid
# searched earlier (0.025, 0.03, 0.035) - confirm the intended value.
lgbm_params = {
    'learning_rate' : 0.033,
    'objective' : 'binary',
    'metric' : 'binary_logloss'
}
lgbm1 = LGBMClassifier(**lgbm_params, n_estimators=400, max_depth=7)
lgbm2 = LGBMClassifier(**lgbm_params, n_estimators=450, max_depth=6)

########################################

# Single CatBoost ensemble member.
# NOTE(review): 0.066 differs from the only learning rate in the CatBoost grid
# searched earlier (0.065) - confirm the intended value.
cb_params = {
    'loss_function' : 'Logloss',
    'learning_rate': 0.066, 
    'max_depth': 6, 
    'n_estimators': 450
}
cb = CatBoostClassifier(**cb_params, verbose=False)


# Fit every ensemble member on the same data and report how long each took.
all_models = [xgb1, xgb2, lgbm1, lgbm2, cb]
for i, model in enumerate(all_models):
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    start_time = time.perf_counter()
    print("==>> TRAINING model", i+1)

    model.fit(x_train, y_train)

    duration = time.perf_counter() - start_time
    print(f"=== Time took : ({duration:.2f}) seconds ===")
print("Done.")

==>> TRAINING model 1
=== Time took : (6.32) seconds ===
==>> TRAINING model 2
=== Time took : (7.20) seconds ===
==>> TRAINING model 3
=== Time took : (1.04) seconds ===
==>> TRAINING model 4
=== Time took : (1.15) seconds ===
==>> TRAINING model 5
=== Time took : (3.51) seconds ===


## Inference

In [21]:
# Class-probability predictions on the scaled test features, one array per
# fitted ensemble member (same order as the names on the left).
y_pred_xgb1, y_pred_xgb2, y_pred_lgbm1, y_pred_lgbm2, y_pred_cb = (
    fitted.predict_proba(x_test) for fitted in (xgb1, xgb2, lgbm1, lgbm2, cb)
)
print("Done.")

Done.


## Ensemble of XGB, LGBM, CatBoost

In [22]:
# Unweighted average of the five models' predicted probabilities.
# The arrays are added in the same left-to-right order before dividing.
summed_pred = y_pred_xgb1 + y_pred_xgb2 + y_pred_lgbm1 + y_pred_lgbm2 + y_pred_cb
ensemble_pred = summed_pred / 5

## Submit

In [23]:
# Wrap the averaged probabilities in a frame that reuses the submission
# template's row index and column names.
submission = pd.DataFrame(
    ensemble_pred,
    index=sample_submission.index,
    columns=sample_submission.columns,
)

In [24]:
# Write the submission file, keeping the index as the first CSV column.
submission_path = '2_ensemble_5models_submission.csv'
submission.to_csv(submission_path, index=True)