In [1]:
import pandas as pd, numpy as np, time
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn import metrics
from collections import Counter
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
data = pd.read_csv("flights_short.csv")
print(data.shape)
data.head()

(150000, 11)


Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,DESTINATION_AIRPORT,ORIGIN_AIRPORT,AIR_TIME,DEPARTURE_TIME,DISTANCE,ARRIVAL_DELAY
0,1,28,3,14,102,717,608,102.0,713.0,634,0
1,8,11,2,3,152,748,690,134.0,111.0,1028,1
2,2,4,3,4,1184,597,740,111.0,1734.0,931,0
3,3,27,5,14,170,770,609,173.0,1807.0,1436,0
4,8,1,6,14,4321,772,544,63.0,2151.0,481,1


In [3]:
data = data[["MONTH","DAY","DAY_OF_WEEK","AIRLINE","FLIGHT_NUMBER","DESTINATION_AIRPORT",
                 "ORIGIN_AIRPORT","AIR_TIME", "DEPARTURE_TIME","DISTANCE","ARRIVAL_DELAY"]]

In [4]:
data.dropna(inplace=True)

In [5]:
data.shape

(150000, 11)

In [6]:
cols = ["AIRLINE", "FLIGHT_NUMBER", "DESTINATION_AIRPORT", "ORIGIN_AIRPORT"]
for item in cols:
    data[item] = data[item].astype("category").cat.codes + 1

In [7]:
train, test, y_train, y_test = train_test_split(data.drop(["ARRIVAL_DELAY"], axis=1), 
                                                data["ARRIVAL_DELAY"],
                                                random_state=42, test_size=0.25)

In [8]:
print(train.shape)
print(y_train.shape)
print(test.shape)
print(y_test.shape)

(112500, 10)
(112500,)
(37500, 10)
(37500,)


In [9]:
def auc(model, train, test): 
    return (metrics.roc_auc_score(y_train, model.predict_proba(train)[:,1]),
            metrics.roc_auc_score(y_test, model.predict_proba(test)[:,1]))

In [10]:
train=pd.read_csv('X_train.csv')
test=pd.read_csv('X_test.csv')
y_train=np.array(pd.read_csv('y_train.csv'))
y_test=np.array(pd.read_csv('y_test.csv'))

### XGBoost

In [12]:
model = xgb.XGBClassifier()
model.fit(train, np.array(y_train))

auc(model, train, test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(0.7037043613821493, 0.6957299448706683)

In [15]:
parameters = {
    #default
    'objective': 'reg:logistic',
    'learning_rate': 0.1,
    'silent': 1,
    "nthread": 4,
    "random_seed": 1,
    "eval_metric": 'auc',
}

splits = 4
folds = splits
num_rounds = 1000

xgb_train = xgb.DMatrix(train, np.array(y_train), feature_names=train.columns)
results = xgb.cv(parameters, xgb_train, num_boost_round=num_rounds,early_stopping_rounds=10, nfold=folds, verbose_eval=0)
results.iloc[-1]

train-auc-mean    0.829813
train-auc-std     0.001796
test-auc-mean     0.730977
test-auc-std      0.001332
Name: 269, dtype: float64

In [27]:
from sklearn.model_selection import GridSearchCV

num_rounds = len(results)-1
model = xgb.XGBClassifier(n_estimators=num_rounds)

add_params = {'max_depth':[3, 6, 8, 10],
              'max_leaves':[10, 20, 30],
              'subsample':[0.3, 0.5, 0.9],э.
              'reg_lambda':[0.5, 1, 5, 10]
             }
new_params = {**{k:[v] for k,v in parameters.items()},**add_params}
grid = GridSearchCV(model, new_params, n_jobs=-1,verbose=2)
grid.fit(X=train,y=np.array(y_train))


Fitting 3 folds for each of 144 candidates, totalling 432 fits


[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed: 42.4min
[Parallel(n_jobs=-1)]: Done 432 out of 432 | elapsed: 58.2min finished
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


'raise'

In [29]:
grid.best_score_

0.8030844444444445

In [30]:
grid.best_params_

{'eval_metric': 'auc',
 'learning_rate': 0.1,
 'max_depth': 8,
 'max_leaves': 10,
 'nthread': 4,
 'objective': 'reg:logistic',
 'random_seed': 1,
 'reg_lambda': 10,
 'silent': 1,
 'subsample': 0.9}

In [12]:
model = xgb.XGBClassifier(n_estimators=num_rounds,**{'eval_metric': 'auc',
 'learning_rate': 0.1,
 'max_depth': 8,
 'max_leaves': 10,
 'nthread': 4,
 'objective': 'reg:logistic',
 'random_seed': 1,
 'reg_lambda': 10,
 'silent': 1,
 'subsample': 0.9})
model.fit(train, np.array(y_train))
auc(model, train, test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(0.8728775431482623, 0.7382394647335706)

### LightGBM

In [13]:
model2 = lgb.LGBMClassifier()
model2.fit(train, y_train)

auc(model2, train, test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(0.7556001645011479, 0.7227598890343905)

### Catboost

In [28]:
clf = cb.CatBoostClassifier()
clf.fit(train,y_train.flatten())

auc(clf, train, test)

0:	learn: 0.6803917	total: 37ms	remaining: 36.9s
1:	learn: 0.6686141	total: 72.3ms	remaining: 36.1s
2:	learn: 0.6576013	total: 108ms	remaining: 36s
3:	learn: 0.6472956	total: 143ms	remaining: 35.5s
4:	learn: 0.6377463	total: 176ms	remaining: 35s
5:	learn: 0.6287079	total: 209ms	remaining: 34.6s
6:	learn: 0.6202455	total: 244ms	remaining: 34.6s
7:	learn: 0.6123902	total: 279ms	remaining: 34.6s
8:	learn: 0.6050260	total: 311ms	remaining: 34.2s
9:	learn: 0.5981517	total: 344ms	remaining: 34s
10:	learn: 0.5917253	total: 376ms	remaining: 33.8s
11:	learn: 0.5858169	total: 409ms	remaining: 33.7s
12:	learn: 0.5802532	total: 441ms	remaining: 33.5s
13:	learn: 0.5749350	total: 477ms	remaining: 33.6s
14:	learn: 0.5699427	total: 510ms	remaining: 33.5s
15:	learn: 0.5652564	total: 543ms	remaining: 33.4s
16:	learn: 0.5608092	total: 575ms	remaining: 33.3s
17:	learn: 0.5567759	total: 608ms	remaining: 33.2s
18:	learn: 0.5529384	total: 641ms	remaining: 33.1s
19:	learn: 0.5493327	total: 673ms	remaining: 33

165:	learn: 0.4804652	total: 5.53s	remaining: 27.8s
166:	learn: 0.4804137	total: 5.57s	remaining: 27.8s
167:	learn: 0.4803470	total: 5.61s	remaining: 27.8s
168:	learn: 0.4802834	total: 5.64s	remaining: 27.7s
169:	learn: 0.4801707	total: 5.67s	remaining: 27.7s
170:	learn: 0.4800715	total: 5.7s	remaining: 27.7s
171:	learn: 0.4799989	total: 5.74s	remaining: 27.6s
172:	learn: 0.4799481	total: 5.77s	remaining: 27.6s
173:	learn: 0.4798927	total: 5.8s	remaining: 27.6s
174:	learn: 0.4797870	total: 5.84s	remaining: 27.5s
175:	learn: 0.4797266	total: 5.87s	remaining: 27.5s
176:	learn: 0.4796580	total: 5.9s	remaining: 27.4s
177:	learn: 0.4795803	total: 5.93s	remaining: 27.4s
178:	learn: 0.4795217	total: 5.97s	remaining: 27.4s
179:	learn: 0.4794621	total: 6s	remaining: 27.3s
180:	learn: 0.4794043	total: 6.04s	remaining: 27.3s
181:	learn: 0.4793590	total: 6.07s	remaining: 27.3s
182:	learn: 0.4792055	total: 6.1s	remaining: 27.2s
183:	learn: 0.4791529	total: 6.13s	remaining: 27.2s
184:	learn: 0.47905

324:	learn: 0.4701279	total: 10.8s	remaining: 22.5s
325:	learn: 0.4700673	total: 10.8s	remaining: 22.4s
326:	learn: 0.4699972	total: 10.9s	remaining: 22.4s
327:	learn: 0.4699526	total: 10.9s	remaining: 22.4s
328:	learn: 0.4699124	total: 10.9s	remaining: 22.3s
329:	learn: 0.4698804	total: 11s	remaining: 22.3s
330:	learn: 0.4698461	total: 11s	remaining: 22.3s
331:	learn: 0.4698024	total: 11s	remaining: 22.2s
332:	learn: 0.4697441	total: 11.1s	remaining: 22.2s
333:	learn: 0.4696858	total: 11.1s	remaining: 22.2s
334:	learn: 0.4696596	total: 11.1s	remaining: 22.1s
335:	learn: 0.4696223	total: 11.2s	remaining: 22.1s
336:	learn: 0.4695835	total: 11.2s	remaining: 22s
337:	learn: 0.4695502	total: 11.2s	remaining: 22s
338:	learn: 0.4694928	total: 11.3s	remaining: 22s
339:	learn: 0.4694598	total: 11.3s	remaining: 21.9s
340:	learn: 0.4694094	total: 11.3s	remaining: 21.9s
341:	learn: 0.4693634	total: 11.4s	remaining: 21.9s
342:	learn: 0.4692914	total: 11.4s	remaining: 21.8s
343:	learn: 0.4692524	to

484:	learn: 0.4634231	total: 16.1s	remaining: 17.1s
485:	learn: 0.4633929	total: 16.1s	remaining: 17.1s
486:	learn: 0.4633699	total: 16.2s	remaining: 17s
487:	learn: 0.4633488	total: 16.2s	remaining: 17s
488:	learn: 0.4633139	total: 16.2s	remaining: 17s
489:	learn: 0.4632710	total: 16.3s	remaining: 16.9s
490:	learn: 0.4632311	total: 16.3s	remaining: 16.9s
491:	learn: 0.4631867	total: 16.3s	remaining: 16.9s
492:	learn: 0.4631418	total: 16.4s	remaining: 16.8s
493:	learn: 0.4631000	total: 16.4s	remaining: 16.8s
494:	learn: 0.4630544	total: 16.4s	remaining: 16.8s
495:	learn: 0.4630259	total: 16.5s	remaining: 16.7s
496:	learn: 0.4629672	total: 16.5s	remaining: 16.7s
497:	learn: 0.4629330	total: 16.5s	remaining: 16.7s
498:	learn: 0.4628844	total: 16.6s	remaining: 16.6s
499:	learn: 0.4628424	total: 16.6s	remaining: 16.6s
500:	learn: 0.4627952	total: 16.6s	remaining: 16.6s
501:	learn: 0.4627668	total: 16.7s	remaining: 16.5s
502:	learn: 0.4627307	total: 16.7s	remaining: 16.5s
503:	learn: 0.4626

648:	learn: 0.4579645	total: 21.6s	remaining: 11.7s
649:	learn: 0.4579424	total: 21.6s	remaining: 11.6s
650:	learn: 0.4579043	total: 21.6s	remaining: 11.6s
651:	learn: 0.4578667	total: 21.7s	remaining: 11.6s
652:	learn: 0.4578290	total: 21.7s	remaining: 11.5s
653:	learn: 0.4577961	total: 21.7s	remaining: 11.5s
654:	learn: 0.4577576	total: 21.8s	remaining: 11.5s
655:	learn: 0.4577210	total: 21.8s	remaining: 11.4s
656:	learn: 0.4576849	total: 21.8s	remaining: 11.4s
657:	learn: 0.4576542	total: 21.9s	remaining: 11.4s
658:	learn: 0.4576265	total: 21.9s	remaining: 11.3s
659:	learn: 0.4575900	total: 21.9s	remaining: 11.3s
660:	learn: 0.4575661	total: 22s	remaining: 11.3s
661:	learn: 0.4575386	total: 22s	remaining: 11.2s
662:	learn: 0.4574993	total: 22s	remaining: 11.2s
663:	learn: 0.4574717	total: 22.1s	remaining: 11.2s
664:	learn: 0.4574389	total: 22.1s	remaining: 11.1s
665:	learn: 0.4574078	total: 22.1s	remaining: 11.1s
666:	learn: 0.4573885	total: 22.2s	remaining: 11.1s
667:	learn: 0.4573

812:	learn: 0.4533910	total: 27s	remaining: 6.21s
813:	learn: 0.4533773	total: 27.1s	remaining: 6.18s
814:	learn: 0.4533509	total: 27.1s	remaining: 6.15s
815:	learn: 0.4533265	total: 27.1s	remaining: 6.12s
816:	learn: 0.4533120	total: 27.2s	remaining: 6.08s
817:	learn: 0.4532888	total: 27.2s	remaining: 6.05s
818:	learn: 0.4532659	total: 27.2s	remaining: 6.02s
819:	learn: 0.4532420	total: 27.3s	remaining: 5.98s
820:	learn: 0.4532165	total: 27.3s	remaining: 5.95s
821:	learn: 0.4531794	total: 27.3s	remaining: 5.92s
822:	learn: 0.4531596	total: 27.4s	remaining: 5.88s
823:	learn: 0.4531399	total: 27.4s	remaining: 5.85s
824:	learn: 0.4531135	total: 27.4s	remaining: 5.82s
825:	learn: 0.4530854	total: 27.5s	remaining: 5.78s
826:	learn: 0.4530535	total: 27.5s	remaining: 5.75s
827:	learn: 0.4530380	total: 27.5s	remaining: 5.72s
828:	learn: 0.4530156	total: 27.6s	remaining: 5.68s
829:	learn: 0.4529909	total: 27.6s	remaining: 5.65s
830:	learn: 0.4529693	total: 27.6s	remaining: 5.62s
831:	learn: 0.

976:	learn: 0.4495765	total: 32.5s	remaining: 765ms
977:	learn: 0.4495439	total: 32.5s	remaining: 731ms
978:	learn: 0.4495277	total: 32.5s	remaining: 698ms
979:	learn: 0.4495100	total: 32.6s	remaining: 665ms
980:	learn: 0.4494964	total: 32.6s	remaining: 632ms
981:	learn: 0.4494777	total: 32.6s	remaining: 598ms
982:	learn: 0.4494569	total: 32.7s	remaining: 565ms
983:	learn: 0.4494376	total: 32.7s	remaining: 532ms
984:	learn: 0.4494218	total: 32.7s	remaining: 499ms
985:	learn: 0.4494105	total: 32.8s	remaining: 465ms
986:	learn: 0.4493923	total: 32.8s	remaining: 432ms
987:	learn: 0.4493692	total: 32.8s	remaining: 399ms
988:	learn: 0.4493515	total: 32.9s	remaining: 366ms
989:	learn: 0.4493271	total: 32.9s	remaining: 332ms
990:	learn: 0.4493027	total: 32.9s	remaining: 299ms
991:	learn: 0.4492721	total: 33s	remaining: 266ms
992:	learn: 0.4492394	total: 33s	remaining: 233ms
993:	learn: 0.4492165	total: 33s	remaining: 199ms
994:	learn: 0.4491970	total: 33.1s	remaining: 166ms
995:	learn: 0.4491

(0.7534675161036537, 0.7234002534194366)