In [0]:
# Colab-only step: mount Google Drive at /content/gdrive so the data files
# under PATH_TO_DATA can be read from the notebook.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

In [0]:
!pip install catboost



In [0]:
# Directory on the mounted Google Drive from which the CSV files are read.
PATH_TO_DATA = Path('/content/gdrive/My Drive/mlcourse')

In [0]:
# Training set; it still contains the target column 'dep_delayed_15min' here.
train_df = pd.read_csv(PATH_TO_DATA / 'flight_delays_train.csv')

In [0]:
# Test set that is scored at the end of the notebook for the submission file.
test_df = pd.read_csv(PATH_TO_DATA / 'flight_delays_test.csv')

In [0]:
# Remember how many rows belong to the training set so the stacked frame
# can be split back into train/test after feature engineering.
train_split = len(train_df)

# Binary target: 'Y' -> 1, 'N' -> 0; the raw column is then removed from the features.
y = train_df['dep_delayed_15min'].map({'Y': 1, 'N': 0})
train_df = train_df.drop(columns='dep_delayed_15min')

# Stack train on top of test so both go through the same feature pipeline.
# The original row labels are kept, so labels repeat across the two parts.
full_df = pd.concat([train_df, test_df])

In [0]:
def mk_ftrs(data):
    """Build the model features from the raw flight columns.

    Reads the columns ``Month``, ``DayofMonth``, ``DayOfWeek`` (strings whose
    first two characters are stripped), ``DepTime`` (numeric, HHMM),
    ``UniqueCarrier``, ``Origin`` and ``Dest`` (strings).

    Columns rewritten: ``Month`` (int), ``DayofMonth`` and ``DayOfWeek``
    (object, prefix removed).

    Columns appended, in this order: ``hour``, ``daytime``, ``minute``,
    ``UC-ORIGIN``, ``UC-DEST``, ``DPTM-ORIGIN``, ``DPTM-DEST``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame with the columns listed above. It is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformed and appended columns.
    """
    # Work on a copy: mutating the caller's frame made the function
    # non-idempotent (a second call on the same frame failed on ``x[2:]``
    # because ``Month`` had already been converted to int).
    data = data.copy()

    # Calendar fields carry a two-character prefix; drop it with the
    # vectorised string accessor. Only Month becomes numeric, the other two
    # stay object so they are picked up as categorical later.
    data['Month'] = data['Month'].str[2:].astype('int')
    data['DayofMonth'] = data['DayofMonth'].str[2:].astype('object')
    data['DayOfWeek'] = data['DayOfWeek'].str[2:].astype('object')

    # DepTime is HHMM; hours 24 and 25 are wrapped to 0 and 1.
    hour = (data['DepTime'] // 100).replace({24: 0, 25: 1}).astype('int')
    data['hour'] = hour  # inserted first to keep the original column order

    # Coarse part-of-day bucket. factorize() numbers the buckets in order of
    # first appearance, so the codes are arbitrary labels, not an ordering.
    data['daytime'] = pd.factorize(
        pd.cut(hour, bins=[0, 6, 12, 18, 23], include_lowest=True)
    )[0]
    data['daytime'] = data['daytime'].astype('object')
    data['minute'] = (data['DepTime'] % 100).astype('object')

    # String form of the hour, needed for the concatenated features below.
    hour_str = hour.astype('str')

    # Pairwise interaction features: carrier x airport and hour x airport.
    data['UC-ORIGIN'] = data['UniqueCarrier'] + '-' + data['Origin']
    data['UC-DEST'] = data['UniqueCarrier'] + '-' + data['Dest']
    data['DPTM-ORIGIN'] = hour_str + '-' + data['Origin']
    data['DPTM-DEST'] = hour_str + '-' + data['Dest']

    # Final hour column holds the string values with object dtype.
    data['hour'] = hour_str.astype('object')

    return data
# Engineer features on the stacked frame, then cut it back into the
# training rows (first `train_split` rows) and the test rows (the rest).
full_df = mk_ftrs(full_df)
train_df = full_df.iloc[:train_split]
test_df = full_df.iloc[train_split:]

In [0]:
# Preview the first rows of the training features after mk_ftrs.
train_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,hour,daytime,minute,UC-ORIGIN,UC-DEST,DPTM-ORIGIN,DPTM-DEST
0,8,21,7,1934,AA,ATL,DFW,732,19,0,34,AA-ATL,AA-DFW,19-ATL,19-DFW
1,4,20,3,1548,US,PIT,MCO,834,15,1,48,US-PIT,US-MCO,15-PIT,15-MCO
2,9,2,5,1422,XE,RDU,CLE,416,14,1,22,XE-RDU,XE-CLE,14-RDU,14-CLE
3,11,25,6,1015,OO,DEN,MEM,872,10,2,15,OO-DEN,OO-MEM,10-DEN,10-MEM
4,10,7,6,1828,WN,MDW,OMA,423,18,1,28,WN-MDW,WN-OMA,18-MDW,18-OMA


In [0]:
# Preview the first rows of the test features after mk_ftrs.
test_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,hour,daytime,minute,UC-ORIGIN,UC-DEST,DPTM-ORIGIN,DPTM-DEST
0,7,25,3,615,YV,MRY,PHX,598,6,3,15,YV-MRY,YV-PHX,6-MRY,6-PHX
1,4,17,2,739,WN,LAS,HOU,1235,7,2,39,WN-LAS,WN-HOU,7-LAS,7-HOU
2,12,2,7,651,MQ,GSP,ORD,577,6,3,51,MQ-GSP,MQ-ORD,6-GSP,6-ORD
3,3,25,7,1614,WN,BWI,MHT,377,16,1,14,WN-BWI,WN-MHT,16-BWI,16-MHT
4,6,6,3,1505,UA,ORD,STL,258,15,1,5,UA-ORD,UA-STL,15-ORD,15-STL


In [0]:
# Positional indices of every object-dtype column; these are passed to
# CatBoost as `cat_features`.
categ_feat_idx = np.flatnonzero(train_df.dtypes == 'object')

In [0]:
# Show which column positions were selected as categorical.
categ_feat_idx

array([ 1,  2,  4,  5,  6,  8,  9, 10, 11, 12, 13, 14])

In [0]:
# Conventional names for the feature matrix and target (same objects, no copy).
X_train, y_train = train_df, y

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the training rows for validation; the seed is fixed so the
# split is reproducible.
X_train_part, X_valid, y_train_part, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    random_state=17,
)

In [0]:
from catboost import CatBoostClassifier

# Default CatBoost classifier with a fixed seed and logging silenced.
# task_type='GPU' requests GPU training (presumably needs a GPU runtime).
ctb = CatBoostClassifier(
    random_seed=17,
    silent=True,
    task_type='GPU',
)

In [0]:
# Fit on the training part only, so the held-out rows can be used for validation.
# The trailing semicolon suppresses the cell's output.
ctb.fit(
    X_train_part,
    y_train_part,
    cat_features=categ_feat_idx,
);

In [0]:
# X_train is the same object as train_df, so this shows the same rows as the
# earlier train_df.head() preview.
X_train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,hour,daytime,minute,UC-ORIGIN,UC-DEST,DPTM-ORIGIN,DPTM-DEST
0,8,21,7,1934,AA,ATL,DFW,732,19,0,34,AA-ATL,AA-DFW,19-ATL,19-DFW
1,4,20,3,1548,US,PIT,MCO,834,15,1,48,US-PIT,US-MCO,15-PIT,15-MCO
2,9,2,5,1422,XE,RDU,CLE,416,14,1,22,XE-RDU,XE-CLE,14-RDU,14-CLE
3,11,25,6,1015,OO,DEN,MEM,872,10,2,15,OO-DEN,OO-MEM,10-DEN,10-MEM
4,10,7,6,1828,WN,MDW,OMA,423,18,1,28,WN-MDW,WN-OMA,18-MDW,18-OMA


In [0]:
# Feature-importance table for the fitted model, most important first.
# Row labels are the column positions in X_train.
feat = pd.DataFrame({
    'feature': X_train.columns,
    'importance': ctb.get_feature_importance(),
}).sort_values('importance', ascending=False)
print(feat)

          feature  importance
8            hour   12.570803
3         DepTime    9.950864
13    DPTM-ORIGIN    8.489394
10         minute    7.783909
11      UC-ORIGIN    7.390430
12        UC-DEST    7.067567
14      DPTM-DEST    6.450791
4   UniqueCarrier    5.976314
0           Month    5.825271
1      DayofMonth    4.958495
2       DayOfWeek    4.835552
6            Dest    4.744720
9         daytime    4.734246
5          Origin    4.612642
7        Distance    4.609003


In [0]:
# Column 1 of predict_proba for the training part and for the hold-out part.
ctb_train_pred, ctb_valid_pred = (
    ctb.predict_proba(part)[:, 1] for part in (X_train_part, X_valid)
)

# ROC AUC on the rows the model was fitted on (printed) ...
print(roc_auc_score(y_train_part, ctb_train_pred))

# ... and on the held-out rows (shown as the cell result).
roc_auc_score(y_valid, ctb_valid_pred)

0.9689571491955236


0.7765192714012683

In [0]:
# Refit on the full training set (training part + hold-out part) before scoring the test set.
ctb.fit(X_train, y, cat_features=categ_feat_idx);

In [0]:
# Column 1 of predict_proba for every test row; this is what gets written to
# the submission as 'dep_delayed_15min'.
ctb_test_pred = ctb.predict_proba(test_df)[:, 1]

In [0]:
# Load the submission template (indexed by 'id'), fill in the predicted
# probabilities, and write the result to the current working directory.
sample_sub = pd.read_csv(
    PATH_TO_DATA / 'sample_submission.csv',
    index_col='id',
).assign(dep_delayed_15min=ctb_test_pred)
sample_sub.to_csv('ctb_pred_2019-10-04_v5.csv')