In [1]:
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
%matplotlib inline

**Read the data**

In [2]:
# Directory holding the competition CSV files; every later read goes through it.
PATH_TO_DATA = Path('../input/flight-delays-fall-2018/')

# Raw train / test tables, loaded with pandas' default CSV parsing.
train_df = pd.read_csv(PATH_TO_DATA.joinpath('flight_delays_train.csv'))
test_df = pd.read_csv(PATH_TO_DATA.joinpath('flight_delays_test.csv'))

In [3]:
# Preview the first rows of the raw training frame (display only, nothing is modified).
train_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


Adding features

In [4]:
# Feature engineering. The same transformation is applied to the train and the
# test frame through `add_features`, so the two can never drift apart.

# Edges (in hours) of the time-of-day buckets. The last edge is 26 rather than
# 24 -- presumably because DepTime can exceed 2400 in this dataset (TODO confirm).
DAYTIME_BINS = [0, 6, 12, 18, 24, 26]

# Number of equal-width buckets over log(Distance).
N_DISTANCE_BINS = 20

# Calendar month number -> season label.
MONTH_TO_SEASON = {
    12: 'winter', 1: 'winter', 2: 'winter',
    3: 'spring', 4: 'spring', 5: 'spring',
    6: 'summer', 7: 'summer', 8: 'summer',
    9: 'autumn', 10: 'autumn', 11: 'autumn',
}


def _strip_c_prefix(column):
    """Remove a leading 'c-' from the string values of ``column``.

    A column that is already numeric is returned unchanged, which makes it
    safe to run this cell more than once on the same frame.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    return column.str.replace(r'^c-', '', regex=True)


def fit_log_distance_bins(distance, n_bins=N_DISTANCE_BINS):
    """Return the edges of ``n_bins`` equal-width buckets over log(distance).

    The outermost edges are widened to -inf / +inf so that distances outside
    the range seen here still fall into the first / last bucket instead of
    turning into NaN when the edges are reused on another frame.
    """
    _, edges = pd.cut(np.log(distance), n_bins, labels=False, retbins=True)
    edges = np.array(edges, dtype=float)
    edges[0], edges[-1] = -np.inf, np.inf
    return edges


def add_features(df, distance_log_bins):
    """Return a copy of ``df`` with cleaned date columns and engineered features.

    Parameters
    ----------
    df : DataFrame with the columns Month, DayofMonth, DayOfWeek, DepTime
        and Distance.
    distance_log_bins : bucket edges produced by ``fit_log_distance_bins``;
        pass the SAME edges for train and test so that a bucket index means
        the same distance range in both frames.

    New columns are appended in this order (later cells address categorical
    columns by position): DepTime_hour, DepTime_minute, daytime,
    Distance_group_log, s_hour, c_hour, season.
    """
    out = df.copy()

    # Remove the 'c-' prefix. Month / DayofMonth become integers, DayOfWeek
    # stays a string so that it is picked up as a categorical feature.
    out['Month'] = _strip_c_prefix(out['Month']).astype('int')
    out['DayofMonth'] = _strip_c_prefix(out['DayofMonth']).astype('int')
    out['DayOfWeek'] = _strip_c_prefix(out['DayOfWeek']).astype('object')

    # Hour and minute from DepTime (HHMM). The hour is stored as object so it
    # is treated as categorical; the raw DepTime column is kept as well.
    hour = out['DepTime'] // 100
    out['DepTime_hour'] = hour.astype('object')
    out['DepTime_minute'] = (out['DepTime'] % 100).astype('int')

    # Coarse time-of-day bucket, derived from the integer hour.
    out['daytime'] = pd.cut(
        hour, bins=DAYTIME_BINS, include_lowest=True, labels=False
    ).astype('object')

    # Distance bucket on a log scale, using the shared edges.
    out['Distance_group_log'] = pd.cut(
        np.log(out['Distance']), bins=distance_log_bins, labels=False
    ).astype('object')

    # Cyclic (sine / cosine) encoding of the departure hour on a 24-hour circle.
    out['s_hour'] = np.sin(2 * np.pi * hour / 24)
    out['c_hour'] = np.cos(2 * np.pi * hour / 24)

    # Season derived from the month number.
    out['season'] = out['Month'].map(MONTH_TO_SEASON).astype('object')

    return out


# Bucket edges are learned on the training distances only and reused for test.
distance_log_bins = fit_log_distance_bins(train_df['Distance'])
train_df = add_features(train_df, distance_log_bins)
test_df = add_features(test_df, distance_log_bins)

# Experiments that were tried and left disabled: dropping the raw DepTime
# column, linear-scale Distance buckets, and a hand-made month_group mapping.

In [5]:
# flight
#pd.crosstab(train_df.flight, train_df.dep_delayed_15min).apply(lambda r: r/r.sum(), axis=1)
# too many categorical variables: let's try to remove flight

In [6]:
# minute
#plt.plot(pd.crosstab(train_df.DepTime_minute, train_df.dep_delayed_15min).apply(lambda r: r/r.sum(), axis=1)['Y'],'o')

In [7]:
# by month
#plt.plot(pd.crosstab(train_df.Month, train_df.dep_delayed_15min).apply(lambda r: r/r.sum(), axis=1)['Y'],'o')
# within season differences are large, so let's leave it as it is

In [8]:
# by day of week
#plt.plot(pd.crosstab(train_df.DayOfWeek, train_df.dep_delayed_15min).apply(lambda r: r/r.sum(), axis=1)['Y'],'o')

In [9]:
# by hour
#plt.plot(pd.crosstab(train_df.DepTime_hour, train_df.dep_delayed_15min).apply(lambda r: r/r.sum(), axis=1)['Y'].values, 'o')
# leave categorical for now, later can be turned into numeric + 2 binary features

In [10]:
# distance
#pd.crosstab(pd.cut(np.log(train_df['Distance']), 10), train_df['dep_delayed_15min']).apply(lambda r: r/r.sum(), axis=1)['Y'].plot.bar()
#pd.crosstab(pd.cut(train_df['Distance'], 10), train_df['dep_delayed_15min']).apply(lambda r: r/r.sum(), axis=1)['Y'].plot.bar()
# should be cut into some amount of pieces

In [11]:
# minutes
#pd.crosstab(pd.cut(train_df['DepTime_minute'], 12), train_df['dep_delayed_15min']).apply(lambda r: r/r.sum(), axis=1)['Y'].plot.bar()

In [12]:
# day of month
#pd.crosstab(pd.cut(train_df.DayofMonth, 31), train_df.dep_delayed_15min).apply(lambda r: r/r.sum(), axis=1)['Y'].plot.bar()
# looks like a weekly cycle
# probably should be ignored

In [13]:
# Show the first rows of the training frame again, now including the engineered columns.
train_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,DepTime_hour,DepTime_minute,daytime,Distance_group_log,s_hour,c_hour,season
0,8,21,7,1934,AA,ATL,DFW,732,N,19,34,3,12,-0.965926,0.258819,summer
1,4,20,3,1548,US,PIT,MCO,834,N,15,48,2,13,-0.707107,-0.7071068,spring
2,9,2,5,1422,XE,RDU,CLE,416,N,14,22,2,10,-0.5,-0.8660254,autumn
3,11,25,6,1015,OO,DEN,MEM,872,N,10,15,1,13,0.5,-0.8660254,autumn
4,10,7,6,1828,WN,MDW,OMA,423,Y,18,28,2,10,-1.0,-1.83697e-16,autumn


In [14]:
# Same preview for the test frame; it has no dep_delayed_15min target column.
test_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,DepTime_hour,DepTime_minute,daytime,Distance_group_log,s_hour,c_hour,season
0,7,25,3,615,YV,MRY,PHX,598,6,15,0,11,1.0,6.123234000000001e-17,summer
1,4,17,2,739,WN,LAS,HOU,1235,7,39,1,14,0.965926,-0.258819,spring
2,12,2,7,651,MQ,GSP,ORD,577,6,51,0,11,1.0,6.123234000000001e-17,winter
3,3,25,7,1614,WN,BWI,MHT,377,16,14,2,9,-0.866025,-0.5,spring
4,6,6,3,1505,UA,ORD,STL,258,15,5,2,8,-0.707107,-0.7071068,summer


**Remember indexes of categorical features (to be passed to CatBoost)**

In [15]:
# Positions (within the feature columns, i.e. with the target removed) of all
# object-dtype columns. CatBoost later receives these positions as `cat_features`.
categ_feat_idx = np.flatnonzero(
    train_df.drop(columns='dep_delayed_15min').dtypes == 'object'
)
categ_feat_idx

array([ 2,  4,  5,  6,  8, 10, 11, 14])

**Allocate a hold-out set (a.k.a. a validation set) to validate the model**

In [16]:
# Plain NumPy matrices for the model. Columns are addressed by position from
# here on, so the column order of train (minus target) and test must match.
X_train = train_df.drop(columns=['dep_delayed_15min']).to_numpy()
# Binary target: 'Y' (delayed) -> 1, 'N' -> 0.
y_train = train_df['dep_delayed_15min'].map({'N': 0, 'Y': 1}).to_numpy()
X_test = test_df.to_numpy()

In [17]:
# Hold out 30% of the training rows for validation; the fixed seed keeps the
# split reproducible between runs.
X_train_part, X_valid, y_train_part, y_valid = train_test_split(
    X_train,
    y_train,
    random_state=17,
    test_size=0.3,
)

**Train Catboost with default arguments, passing only the indexes of categorical features.**

In [18]:
# CatBoost classifier with library defaults; only the random seed is fixed and
# training output is silenced.
ctb = CatBoostClassifier(silent=True, random_seed=17)

In [19]:
#%%time
#ctb.fit(X_train_part, y_train_part,
#        cat_features=categ_feat_idx);

In [20]:
#roc_auc_score(y_train_part, ctb.predict_proba(X_train_part)[:, 1])

In [21]:
#ctb_valid_pred = ctb.predict_proba(X_valid)[:, 1]
#roc_auc_score(y_valid, ctb_valid_pred)

**Train on the whole train set, make prediction on the test set. We got ~0.734 in the competition - "Catboost starter" baseline**

In [22]:
%%time
ctb.fit(X_train, y_train,
        cat_features=categ_feat_idx);

CPU times: user 7min 30s, sys: 36.3 s, total: 8min 6s
Wall time: 2min 9s


<catboost.core.CatBoostClassifier at 0x7f89595403c8>

In [23]:
# Take column 1 of predict_proba for every test row; with the 0/1 target
# encoding used above this is presumably the probability of class 1 (delayed)
# -- verify against ctb.classes_.
ctb_test_pred = ctb.predict_proba(X_test)[:, 1]

In [24]:
# Build the submission from the sample file: keep its `id` index, set the
# prediction column and write the result to ctb_pred.csv. Warnings raised
# inside the block are silenced.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    sample_sub = (
        pd.read_csv(PATH_TO_DATA / 'sample_submission.csv', index_col='id')
        .assign(dep_delayed_15min=ctb_test_pred)
    )
    sample_sub.to_csv('ctb_pred.csv')

In [25]:
# Sanity check: print the first lines of the written submission file using the
# shell's `head` command (needs `head` to be available on the PATH).
!head ctb_pred.csv

id,dep_delayed_15min
0,0.01365414181371734
1,0.040933672715797245
2,0.022497972386854363
3,0.4666993228083243
4,0.34639855436336603
5,0.060126583173261826
6,0.10897148844200467
7,0.41625154785543955
8,0.19961130230490054
