Improve the model to beat **"A2 baseline (10 credits)"** - **0.75914** LB score.

 ### It's crucial to come up with some good features.

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
!pip install catboost

In [0]:
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [0]:
from catboost import CatBoostClassifier

**Read the data**

In [0]:
PATH_TO_DATA = Path('/content/gdrive/My Drive/mlcourse')

In [0]:
train_df = pd.read_csv(PATH_TO_DATA / 'flight_delays_train.csv')

In [0]:
train_df['hour'] = [ int(train_df.DepTime[i]/100) for i in train_df.index]

In [0]:
train_df = train_df.drop(train_df[train_df['DepTime'] >= 2400].index)

In [0]:
train_df['hour'] = train_df.hour.astype('category')
train_df = train_df.drop('DepTime', axis=1)

In [0]:
train_df['Month'] = train_df['Month'].apply(lambda x: x[2:])
train_df['DayofMonth'] = train_df['DayofMonth'].apply(lambda x: x[2:])
train_df['DayOfWeek'] = train_df['DayOfWeek'].apply(lambda x: x[2:])

In [0]:
year_list = [2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011]

In [0]:
train_df['year'] = [0 for i in train_df.index]

In [0]:
from datetime import datetime

def f(mth, dom, dow):
  for y in year_list:
    x = str(y) + str('-') + mth + str('-') + dom
    if datetime.strptime(x, '%Y-%m-%d').weekday() == int(dow) - 1:
      return y

In [0]:
train_df['year'] = [f(train_df['Month'][i], train_df['DayofMonth'][i], train_df['DayOfWeek'][i]) for i in train_df.index]

In [0]:
train_df['time'] = [datetime.strptime(str(str(train_df.year[i]) + str('-') + str(train_df.Month[i]) + str('-') + str(train_df.DayofMonth[i]) + str(' ') + str(train_df.hour[i])), '%Y-%m-%d %H') for i in train_df.index]

In [0]:
train_df = train_df.drop('DayofMonth', axis=1)

In [0]:
train_df['hour'] = train_df.hour.astype('object')

In [0]:
train_df['year'] = train_df.year.astype('object')

In [0]:
train_df.head()

Unnamed: 0,Month,DayOfWeek,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,hour,year,time
0,8,7,AA,ATL,DFW,732,N,19,2016,2016-08-21 19:00:00
1,4,3,US,PIT,MCO,834,N,15,2016,2016-04-20 15:00:00
2,9,5,XE,RDU,CLE,416,N,14,2016,2016-09-02 14:00:00
3,11,6,OO,DEN,MEM,872,N,10,2017,2017-11-25 10:00:00
4,10,6,WN,MDW,OMA,423,Y,18,2017,2017-10-07 18:00:00


**Remember indexes of categorical features (to be passed to CatBoost)**

In [0]:
categ_feat_idx = np.where(train_df.drop(['dep_delayed_15min'], axis=1).dtypes == 'object')[0]
categ_feat_idx

array([0, 1, 2, 3, 4, 6, 7])

**Allocate a hold-out set (a.k.a. a validation set) to validate the model**

In [0]:
X_train = train_df.drop('dep_delayed_15min', axis=1)
y_train = train_df['dep_delayed_15min'].map({'Y': 1, 'N': 0})

In [0]:
X_train = train_df.drop(train_df[train_df['year'] == '2011'].index).drop('dep_delayed_15min', axis=1)
y_train = train_df.drop(train_df[train_df['year'] == '2011'].index)['dep_delayed_15min'].map({'Y': 1, 'N': 0})

In [0]:
X_train_part, X_valid, y_train_part, y_valid = train_test_split(X_train, y_train, 
                                                                test_size=0.2, 
                                                                random_state=17)

In [0]:
X_train_part.head()

Unnamed: 0,Month,DayOfWeek,UniqueCarrier,Origin,Dest,Distance,hour,year,time
28156,11,5,WN,BWI,PVD,328,17,2016,2016-11-25 17:00:00
36655,6,5,US,MSY,DCA,969,10,2016,2016-06-17 10:00:00
65238,1,1,UA,ORD,BDL,783,22,2017,2017-01-02 22:00:00
78169,12,7,WN,TPA,JAX,180,17,2017,2017-12-31 17:00:00
29030,11,6,YV,IAD,BDL,326,12,2017,2017-11-18 12:00:00


**Train Catboost with default arguments, passing only the indexes of categorical features.**

In [0]:
ctb = CatBoostClassifier(random_seed=17, silent=True)

In [0]:
%%time
ctb.fit(X_train_part, y_train_part,
        cat_features=categ_feat_idx,
        use_best_model=True);

You should provide test set for use best model. use_best_model parameter has been switched to false value.


CPU times: user 3min 21s, sys: 9.84 s, total: 3min 31s
Wall time: 1min 47s


<catboost.core.CatBoostClassifier at 0x7ff116aec4a8>

In [0]:
ctb_valid_pred = ctb.predict_proba(X_valid)[:, 1]

In [0]:
roc_auc_score(y_valid, ctb_valid_pred)

In [0]:
%%time
ctb.fit(X_train, y_train,
        cat_features=categ_feat_idx,
        use_best_model=True);

In [0]:
test_df = pd.read_csv(PATH_TO_DATA / 'flight_delays_test.csv')

test_df['hour'] = [ int(test_df.DepTime[i]/100) for i in test_df.index]
test_df['hour'][test_df[test_df.DepTime >= 2400].index] = 0
test_df['hour'] = test_df.hour.astype('category')
test_df = test_df.drop('DepTime', axis=1)

test_df['Month'] = test_df['Month'].apply(lambda x: x[2:])
test_df['DayofMonth'] = test_df['DayofMonth'].apply(lambda x: x[2:])
test_df['DayOfWeek'] = test_df['DayOfWeek'].apply(lambda x: x[2:])

year_list = [2018, 2017, 2016, 2015, 2014, 2013, 2012, 2011]

test_df['year'] = [0 for i in test_df.index]

from datetime import datetime

def f(mth, dom, dow):
  for y in year_list:
    x = str(y) + str('-') + mth + str('-') + dom
    if datetime.strptime(x, '%Y-%m-%d').weekday() == int(dow) - 1:
      return y

test_df['year'] = [f(test_df['Month'][i], test_df['DayofMonth'][i], test_df['DayOfWeek'][i]) for i in test_df.index]

test_df['time'] = [datetime.strptime(str(str(test_df.year[i]) + str('-') + str(test_df.Month[i]) + str('-') + str(test_df.DayofMonth[i]) + str(' ') + str(test_df.hour[i])), '%Y-%m-%d %H') for i in test_df.index]

test_df = test_df.drop('DayofMonth', axis=1)

test_df['hour'] = test_df.hour.astype('object')

test_df['year'] = test_df.year.astype('object')

In [0]:
test_df.head()

Unnamed: 0,Month,DayOfWeek,UniqueCarrier,Origin,Dest,Distance,hour,year,time
0,7,3,YV,MRY,PHX,598,6,2018,2018-07-25 06:00:00
1,4,2,WN,LAS,HOU,1235,7,2018,2018-04-17 07:00:00
2,12,7,MQ,GSP,ORD,577,6,2018,2018-12-02 06:00:00
3,3,7,WN,BWI,MHT,377,16,2018,2018-03-25 16:00:00
4,6,3,UA,ORD,STL,258,15,2018,2018-06-06 15:00:00


In [0]:
ctb_test_pred = ctb.predict_proba(test_df)[:, 1]

In [0]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    sample_sub = pd.read_csv(PATH_TO_DATA / 'sample_submission.csv', 
                             index_col='id')
    sample_sub['dep_delayed_15min'] = ctb_test_pred
    sample_sub.to_csv('ctb_pred_2019-09-24_v0.csv')