In [21]:
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [54]:
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')

In [55]:
train_data.shape, test_data.shape

((1000000, 19), (1000000, 18))

In [56]:
y = train_data['Delay']

In [57]:
y.value_counts()

Not_Delayed    210001
Delayed         45000
Name: Delay, dtype: int64

In [58]:
train_data.head()

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
0,TRAIN_000000,4,15,,,0,0,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,Southwest Airlines Co.,WN,19393.0,N7858A,
1,TRAIN_000001,8,15,740.0,1024.0,0,0,ORD,13930,Illinois,SLC,14869,Utah,1250.0,SkyWest Airlines Inc.,UA,20304.0,N125SY,
2,TRAIN_000002,9,6,1610.0,1805.0,0,0,CLT,11057,North Carolina,LGA,12953,New York,544.0,American Airlines Inc.,AA,19805.0,N103US,
3,TRAIN_000003,7,10,905.0,1735.0,0,0,LAX,12892,California,EWR,11618,New Jersey,2454.0,United Air Lines Inc.,UA,,N595UA,
4,TRAIN_000004,1,11,900.0,1019.0,0,0,SFO,14771,California,ACV,10157,California,250.0,SkyWest Airlines Inc.,UA,20304.0,N161SY,


In [59]:
train_data.isnull().sum()

ID                               0
Month                            0
Day_of_Month                     0
Estimated_Departure_Time    109019
Estimated_Arrival_Time      109040
Cancelled                        0
Diverted                         0
Origin_Airport                   0
Origin_Airport_ID                0
Origin_State                109015
Destination_Airport              0
Destination_Airport_ID           0
Destination_State           109079
Distance                         0
Airline                     108920
Carrier_Code(IATA)          108990
Carrier_ID(DOT)             108997
Tail_Number                      0
Delay                       744999
dtype: int64

In [64]:
nan_column = ['Estimated_Departure_Time','Estimated_Arrival_Time',
              'Origin_State','Destination_State','Airline',
              'Carrier_Code(IATA)','Carrier_ID(DOT)']

In [65]:
train_data[nan_column].head()

Unnamed: 0,Estimated_Departure_Time,Estimated_Arrival_Time,Origin_State,Destination_State,Airline,Carrier_Code(IATA),Carrier_ID(DOT)
0,,,Oklahoma,Texas,Southwest Airlines Co.,WN,19393.0
1,740.0,1024.0,Illinois,Utah,SkyWest Airlines Inc.,UA,20304.0
2,1610.0,1805.0,North Carolina,New York,American Airlines Inc.,AA,19805.0
3,905.0,1735.0,California,New Jersey,United Air Lines Inc.,UA,
4,900.0,1019.0,California,California,SkyWest Airlines Inc.,UA,20304.0


In [66]:
train_data[nan_column].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 7 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Estimated_Departure_Time  890981 non-null  float64
 1   Estimated_Arrival_Time    890960 non-null  float64
 2   Origin_State              890985 non-null  object 
 3   Destination_State         890921 non-null  object 
 4   Airline                   891080 non-null  object 
 5   Carrier_Code(IATA)        891010 non-null  object 
 6   Carrier_ID(DOT)           891003 non-null  float64
dtypes: float64(3), object(4)
memory usage: 53.4+ MB


In [63]:
ddf1 = train_data[['Estimated_Departure_Time','Estimated_Arrival_Time','Carrier_ID(DOT)']]

In [None]:
ddf2 = train_data[['Origin_State','Destination_State','Airline','Carrier_Code(IATA)']]

In [69]:
df_numeric = pd.DataFrame(numeric_data)

In [71]:
df_numeric.head()

Unnamed: 0,0,1,2
0,1348.404964,1495.075996,19393.0
1,740.0,1024.0,20304.0
2,1610.0,1805.0,19805.0
3,905.0,1735.0,20005.130671
4,900.0,1019.0,20304.0


In [74]:
train_data.shape, df_numeric.shape

((1000000, 19), (1000000, 3))

In [76]:
train_data = pd.concat([train_data,df_numeric],axis=1)
train_data.shape

(1000000, 22)

In [99]:
train_data = train_data.drop(['Estimated_Departure_Time','Estimated_Arrival_Time','Carrier_ID(DOT)'],axis=1)

In [105]:
train_data.rename(columns={0:'Estimated_Departure_Time',1:'Estimated_Arrival_Time',2:'Carrier_ID(DOT)'},inplace=True)

In [106]:
train_data.to_csv('train_v2.csv',encoding='utf-8')

In [107]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

In [108]:
def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')

In [109]:
csv_to_parquet('./train_v2.csv', 'train')
csv_to_parquet('./test.csv', 'test')

train Done.
test Done.


In [110]:
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)

In [111]:
#레이블(Delay)을 제외한 결측값이 존재하는 변수들을 학습 데이터의 최빈값으로 대체합니다
NaN_col = ['Origin_State','Destination_State','Airline','Estimated_Departure_Time', 'Estimated_Arrival_Time','Carrier_Code(IATA)','Carrier_ID(DOT)']

for col in NaN_col:
    mode = train[col].mode()[0]
    train[col] = train[col].fillna(mode)
    
    if col in test.columns:
        test[col] = test[col].fillna(mode)
print('Done.')

Done.


In [112]:
#질적 변수들을 수치화합니다
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']

for i in qual_col:
    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])

    for label in np.unique(test[i]):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test[i]=le.transform(test[i])
print('Done.')

Done.


In [113]:
#레이블이 없는 데이터들을 제거합니다
train = train.dropna()

In [114]:
column_number = {}
for i, column in enumerate(sample_submission.columns):
    column_number[column] = i
    
def to_number(x, dic):
    return dic[x]

train.loc[:, 'Delay_num'] = train['Delay'].apply(lambda x: to_number(x, column_number))
print('Done.')

Done.


In [126]:
del train['Unnamed: 0']

In [127]:
from pycaret.classification import *
s = setup(train, target = 'Delay', session_id = 123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Delay
2,Target type,Binary
3,Target mapping,"Delayed: 0, Not_Delayed: 1"
4,Original data shape,"(255001, 20)"
5,Transformed data shape,"(255001, 20)"
6,Transformed train set shape,"(178500, 20)"
7,Transformed test set shape,"(76501, 20)"
8,Numeric features,18
9,Categorical features,1


In [128]:
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.496
dt,Decision Tree Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.478
ridge,Ridge Classifier,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.444
rf,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.594
ada,Ada Boost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.488
gbc,Gradient Boosting Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.863
et,Extra Trees Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.931
xgboost,Extreme Gradient Boosting,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.118
lightgbm,Light Gradient Boosting Machine,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.621
catboost,CatBoost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.349


In [115]:
train_x = train.drop(columns=['ID', 'Delay', 'Delay_num'])
train_y = train['Delay_num']
test_x = test.drop(columns=['ID'])

In [129]:
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(random_state=42)
clf.fit(train_x, train_y)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=42)

In [120]:
clf = RandomForestClassifier()
clf.fit(train_x, train_y)

RandomForestClassifier()

In [130]:
y_pred = clf.predict_proba(test_x)

In [135]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(random_state=42)
model.fit(train_x, train_y)

Learning rate set to 0.109781
0:	learn: 0.6391503	total: 72.1ms	remaining: 1m 12s
1:	learn: 0.5968810	total: 84.8ms	remaining: 42.3s
2:	learn: 0.5639063	total: 97.5ms	remaining: 32.4s
3:	learn: 0.5390625	total: 111ms	remaining: 27.6s
4:	learn: 0.5201496	total: 124ms	remaining: 24.6s
5:	learn: 0.5049479	total: 138ms	remaining: 22.9s
6:	learn: 0.4932997	total: 157ms	remaining: 22.2s
7:	learn: 0.4843012	total: 178ms	remaining: 22.1s
8:	learn: 0.4780003	total: 192ms	remaining: 21.2s
9:	learn: 0.4725494	total: 206ms	remaining: 20.3s
10:	learn: 0.4679452	total: 220ms	remaining: 19.8s
11:	learn: 0.4643468	total: 235ms	remaining: 19.3s
12:	learn: 0.4612777	total: 247ms	remaining: 18.8s
13:	learn: 0.4590112	total: 260ms	remaining: 18.3s
14:	learn: 0.4574586	total: 272ms	remaining: 17.8s
15:	learn: 0.4559802	total: 286ms	remaining: 17.6s
16:	learn: 0.4549037	total: 301ms	remaining: 17.4s
17:	learn: 0.4539327	total: 315ms	remaining: 17.2s
18:	learn: 0.4529398	total: 327ms	remaining: 16.9s
19:	lea

165:	learn: 0.4371585	total: 2.5s	remaining: 12.6s
166:	learn: 0.4371110	total: 2.52s	remaining: 12.6s
167:	learn: 0.4370713	total: 2.54s	remaining: 12.6s
168:	learn: 0.4370132	total: 2.55s	remaining: 12.5s
169:	learn: 0.4369766	total: 2.56s	remaining: 12.5s
170:	learn: 0.4369212	total: 2.58s	remaining: 12.5s
171:	learn: 0.4368821	total: 2.6s	remaining: 12.5s
172:	learn: 0.4368072	total: 2.61s	remaining: 12.5s
173:	learn: 0.4367626	total: 2.62s	remaining: 12.5s
174:	learn: 0.4367068	total: 2.63s	remaining: 12.4s
175:	learn: 0.4366532	total: 2.65s	remaining: 12.4s
176:	learn: 0.4366034	total: 2.67s	remaining: 12.4s
177:	learn: 0.4365521	total: 2.68s	remaining: 12.4s
178:	learn: 0.4364931	total: 2.69s	remaining: 12.4s
179:	learn: 0.4364359	total: 2.71s	remaining: 12.3s
180:	learn: 0.4363959	total: 2.72s	remaining: 12.3s
181:	learn: 0.4363193	total: 2.74s	remaining: 12.3s
182:	learn: 0.4362642	total: 2.75s	remaining: 12.3s
183:	learn: 0.4362158	total: 2.77s	remaining: 12.3s
184:	learn: 0.

329:	learn: 0.4302744	total: 5s	remaining: 10.2s
330:	learn: 0.4302376	total: 5.01s	remaining: 10.1s
331:	learn: 0.4301958	total: 5.03s	remaining: 10.1s
332:	learn: 0.4301631	total: 5.04s	remaining: 10.1s
333:	learn: 0.4301238	total: 5.06s	remaining: 10.1s
334:	learn: 0.4300891	total: 5.07s	remaining: 10.1s
335:	learn: 0.4300528	total: 5.08s	remaining: 10s
336:	learn: 0.4300038	total: 5.1s	remaining: 10s
337:	learn: 0.4299613	total: 5.11s	remaining: 10s
338:	learn: 0.4299330	total: 5.13s	remaining: 10s
339:	learn: 0.4299038	total: 5.14s	remaining: 9.98s
340:	learn: 0.4298636	total: 5.16s	remaining: 9.97s
341:	learn: 0.4298173	total: 5.17s	remaining: 9.95s
342:	learn: 0.4297924	total: 5.19s	remaining: 9.94s
343:	learn: 0.4297511	total: 5.2s	remaining: 9.92s
344:	learn: 0.4297154	total: 5.22s	remaining: 9.91s
345:	learn: 0.4296744	total: 5.23s	remaining: 9.89s
346:	learn: 0.4296412	total: 5.25s	remaining: 9.88s
347:	learn: 0.4296089	total: 5.26s	remaining: 9.86s
348:	learn: 0.4295818	tot

495:	learn: 0.4247421	total: 7.48s	remaining: 7.6s
496:	learn: 0.4247107	total: 7.5s	remaining: 7.59s
497:	learn: 0.4246782	total: 7.51s	remaining: 7.57s
498:	learn: 0.4246490	total: 7.53s	remaining: 7.56s
499:	learn: 0.4246243	total: 7.54s	remaining: 7.54s
500:	learn: 0.4245960	total: 7.55s	remaining: 7.52s
501:	learn: 0.4245583	total: 7.57s	remaining: 7.51s
502:	learn: 0.4245323	total: 7.59s	remaining: 7.5s
503:	learn: 0.4245013	total: 7.6s	remaining: 7.48s
504:	learn: 0.4244838	total: 7.62s	remaining: 7.47s
505:	learn: 0.4244587	total: 7.63s	remaining: 7.45s
506:	learn: 0.4244347	total: 7.65s	remaining: 7.44s
507:	learn: 0.4244176	total: 7.66s	remaining: 7.42s
508:	learn: 0.4243846	total: 7.68s	remaining: 7.41s
509:	learn: 0.4243609	total: 7.69s	remaining: 7.39s
510:	learn: 0.4243339	total: 7.71s	remaining: 7.38s
511:	learn: 0.4242988	total: 7.73s	remaining: 7.36s
512:	learn: 0.4242772	total: 7.74s	remaining: 7.35s
513:	learn: 0.4242390	total: 7.76s	remaining: 7.34s
514:	learn: 0.42

659:	learn: 0.4200278	total: 9.98s	remaining: 5.14s
660:	learn: 0.4200001	total: 10s	remaining: 5.13s
661:	learn: 0.4199812	total: 10s	remaining: 5.12s
662:	learn: 0.4199523	total: 10s	remaining: 5.1s
663:	learn: 0.4199139	total: 10.1s	remaining: 5.09s
664:	learn: 0.4198899	total: 10.1s	remaining: 5.07s
665:	learn: 0.4198599	total: 10.1s	remaining: 5.06s
666:	learn: 0.4198232	total: 10.1s	remaining: 5.04s
667:	learn: 0.4197917	total: 10.1s	remaining: 5.03s
668:	learn: 0.4197693	total: 10.1s	remaining: 5.01s
669:	learn: 0.4197443	total: 10.1s	remaining: 5s
670:	learn: 0.4197130	total: 10.2s	remaining: 4.98s
671:	learn: 0.4196895	total: 10.2s	remaining: 4.97s
672:	learn: 0.4196621	total: 10.2s	remaining: 4.95s
673:	learn: 0.4196343	total: 10.2s	remaining: 4.94s
674:	learn: 0.4195994	total: 10.2s	remaining: 4.92s
675:	learn: 0.4195797	total: 10.2s	remaining: 4.91s
676:	learn: 0.4195600	total: 10.3s	remaining: 4.89s
677:	learn: 0.4195339	total: 10.3s	remaining: 4.88s
678:	learn: 0.4195160	

830:	learn: 0.4154621	total: 12.7s	remaining: 2.58s
831:	learn: 0.4154384	total: 12.7s	remaining: 2.56s
832:	learn: 0.4154163	total: 12.7s	remaining: 2.55s
833:	learn: 0.4153908	total: 12.7s	remaining: 2.53s
834:	learn: 0.4153584	total: 12.7s	remaining: 2.52s
835:	learn: 0.4153297	total: 12.8s	remaining: 2.5s
836:	learn: 0.4152913	total: 12.8s	remaining: 2.49s
837:	learn: 0.4152670	total: 12.8s	remaining: 2.47s
838:	learn: 0.4152399	total: 12.8s	remaining: 2.46s
839:	learn: 0.4152095	total: 12.8s	remaining: 2.44s
840:	learn: 0.4151857	total: 12.8s	remaining: 2.42s
841:	learn: 0.4151549	total: 12.8s	remaining: 2.41s
842:	learn: 0.4151218	total: 12.9s	remaining: 2.39s
843:	learn: 0.4151051	total: 12.9s	remaining: 2.38s
844:	learn: 0.4150839	total: 12.9s	remaining: 2.36s
845:	learn: 0.4150625	total: 12.9s	remaining: 2.35s
846:	learn: 0.4150357	total: 12.9s	remaining: 2.33s
847:	learn: 0.4150145	total: 12.9s	remaining: 2.32s
848:	learn: 0.4149956	total: 12.9s	remaining: 2.3s
849:	learn: 0.

<catboost.core.CatBoostClassifier at 0x3217f6d60>

In [136]:
y_pred = model.predict_proba(test_x)

In [137]:
import time

t = time.strftime('%m-%d_%H:%M')

In [138]:
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)

In [139]:
submission.to_csv(f'{t}baseline_submission.csv', index=True)

In [141]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [144]:
train_data = TabularDataset(train)

In [147]:
label = 'Delay_num'
predictor = TabularPredictor(label=label, eval_metric='log_loss').fit(train_data)

No path specified. Models will be saved in: "AutogluonModels/ag-20230405_155514/"
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230405_155514/"
AutoGluon Version:  0.7.0
Python Version:     3.8.16
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 22.3.0: Mon Jan 30 20:38:37 PST 2023; root:xnu-8792.81.3~2/RELEASE_ARM64_T6000
Train Data Rows:    255001
Train Data Columns: 19
Label Column: Delay_num
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as 

In [155]:
predictor.leaderboard(train_data, silent=True)

Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBMXT,-1.033899e-07,-1.033885e-07,0.199132,0.003367,1.220456,0.199132,0.003367,1.220456,1,True,3
1,LightGBM,-1.033899e-07,-1.033885e-07,0.280857,0.006831,1.513759,0.280857,0.006831,1.513759,1,True,4
2,LightGBMLarge,-1.033899e-07,-1.033885e-07,1.516017,0.021683,13.11253,1.516017,0.021683,13.11253,1,True,13
3,WeightedEnsemble_L2,-1.033899e-07,-1.033886e-07,0.217104,0.003638,1.456501,0.017972,0.000271,0.236045,2,True,14
4,NeuralNetFastAI,-1.0339e-07,-1.033885e-07,0.731263,0.010638,56.247708,0.731263,0.010638,56.247708,1,True,10
5,NeuralNetTorch,-1.049947e-07,-1.056243e-07,0.278504,0.008057,99.509869,0.278504,0.008057,99.509869,1,True,12
6,XGBoost,-7.742596e-06,-7.752639e-06,0.145744,0.003208,0.851043,0.145744,0.003208,0.851043,1,True,11
7,CatBoost,-7.953383e-05,-8.008309e-05,0.036798,0.001353,5.561977,0.036798,0.001353,5.561977,1,True,7
8,ExtraTreesEntr,-0.0005023495,-0.001394424,0.480909,0.040692,2.067749,0.480909,0.040692,2.067749,1,True,9
9,ExtraTreesGini,-0.0005123133,-0.001415188,0.460815,0.041298,2.082321,0.460815,0.041298,2.082321,1,True,8


In [None]:
test_data = TabularDataset(test)

y_pred = predictor.predict_proba(test)
y_pred.head()

In [156]:
train_data.head()

Unnamed: 0,ID,Month,Day_of_Month,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Tail_Number,Delay,Estimated_Departure_Time,Estimated_Arrival_Time,Carrier_ID(DOT),Delay_num
5,TRAIN_000005,4,13,0,0,119,11618,4,93,11278,47,199.0,21,8,3435,Not_Delayed,1545.0,1653.253741,20452.0,0
6,TRAIN_000006,1,20,0,0,119,11618,28,47,10721,19,200.0,26,8,3495,Not_Delayed,1742.0,1903.0,19994.866726,0
8,TRAIN_000008,6,13,0,0,59,10821,4,74,11057,31,361.0,23,10,4083,Not_Delayed,1420.0,1550.0,19393.0,0
10,TRAIN_000010,8,13,0,0,93,11278,47,277,14122,36,204.0,21,0,241,Delayed,1730.0,1844.0,19994.602602,1
12,TRAIN_000012,1,12,0,0,72,11042,33,94,11292,5,1201.0,23,10,5171,Not_Delayed,1015.0,1145.0,19999.333487,0


In [171]:
train_data = TabularDataset(train)
test_data = TabularDataset(test)

train_data.drop(['ID','Delay'],axis = 1, inplace = True)
test_data.drop('ID',axis = 1, inplace = True)


label = 'Delay_num'
eval_metric = 'log_loss'

In [172]:
predictor = TabularPredictor(
    label=label, problem_type='binary', eval_metric=eval_metric
).fit(train_data, 
      presets='best_quality', 
      num_stack_levels=3)

No path specified. Models will be saved in: "AutogluonModels/ag-20230406_012526/"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=3, num_bag_folds=8, num_bag_sets=1
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230406_012526/"
AutoGluon Version:  0.7.0
Python Version:     3.8.16
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 22.3.0: Mon Jan 30 20:38:37 PST 2023; root:xnu-8792.81.3~2/RELEASE_ARM64_T6000
Train Data Rows:    255001
Train Data Columns: 17
Label Column: Delay_num
Preprocessing data ...
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Avai

[1000]	valid_set's binary_logloss: 0.438469
[2000]	valid_set's binary_logloss: 0.438063
[1000]	valid_set's binary_logloss: 0.44129
[2000]	valid_set's binary_logloss: 0.441189
[1000]	valid_set's binary_logloss: 0.440939
[2000]	valid_set's binary_logloss: 0.440713
[1000]	valid_set's binary_logloss: 0.437788
[1000]	valid_set's binary_logloss: 0.441174
[1000]	valid_set's binary_logloss: 0.441925
[2000]	valid_set's binary_logloss: 0.441852
[1000]	valid_set's binary_logloss: 0.439507
[2000]	valid_set's binary_logloss: 0.439342
[1000]	valid_set's binary_logloss: 0.441187


	-0.44	 = Validation score   (-log_loss)
	117.0s	 = Training   runtime
	2.73s	 = Validation runtime
Fitting model: LightGBM_BAG_L1 ...
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy


[1000]	valid_set's binary_logloss: 0.436161
[1000]	valid_set's binary_logloss: 0.439209
[1000]	valid_set's binary_logloss: 0.436212
[1000]	valid_set's binary_logloss: 0.440426
[1000]	valid_set's binary_logloss: 0.438978


	-0.4384	 = Validation score   (-log_loss)
	62.14s	 = Training   runtime
	1.23s	 = Validation runtime
Fitting model: RandomForestGini_BAG_L1 ...
	-0.446	 = Validation score   (-log_loss)
	18.96s	 = Training   runtime
	5.91s	 = Validation runtime
Fitting model: RandomForestEntr_BAG_L1 ...
	-0.4457	 = Validation score   (-log_loss)
	22.85s	 = Training   runtime
	6.23s	 = Validation runtime
Fitting model: CatBoost_BAG_L1 ...
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy
	-0.4379	 = Validation score   (-log_loss)
	288.41s	 = Training   runtime
	0.15s	 = Validation runtime
Fitting model: ExtraTreesGini_BAG_L1 ...
	-0.4462	 = Validation score   (-log_loss)
	5.57s	 = Training   runtime
	4.89s	 = Validation runtime
Fitting model: ExtraTreesEntr_BAG_L1 ...
	-0.4462	 = Validation score   (-log_loss)
	5.35s	 = Training   runtime
	4.94s	 = Validation runtime
Fitting model: NeuralNetFastAI_BAG_L1 ...
	Fitting 8 child models (S1F1 - S1F8) | Fitting with Sequ

Fitting model: NeuralNetFastAI_BAG_L4 ...
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy
No improvement since epoch 7: early stopping
	-0.4361	 = Validation score   (-log_loss)
	546.57s	 = Training   runtime
	0.74s	 = Validation runtime
Fitting model: XGBoost_BAG_L4 ...
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy
	-0.4367	 = Validation score   (-log_loss)
	9.49s	 = Training   runtime
	0.23s	 = Validation runtime
Fitting model: NeuralNetTorch_BAG_L4 ...
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy
	-0.4369	 = Validation score   (-log_loss)
	306.33s	 = Training   runtime
	0.72s	 = Validation runtime
Fitting model: LightGBMLarge_BAG_L4 ...
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy
	-0.4367	 = Validation score   (-log_loss)
	28.95s	 = Training   runtime
	0.33s	 = Validation runtime
Fitting model: WeightedEnsemble_L5 ..

In [173]:
print(predictor.leaderboard(silent = True))

                      model  score_val  pred_time_val     fit_time  \
0       WeightedEnsemble_L3  -0.436003      60.924778  2655.895751   
1       WeightedEnsemble_L4  -0.436012      74.152080  3379.831018   
2       WeightedEnsemble_L5  -0.436102     102.619222  4535.549572   
3    NeuralNetFastAI_BAG_L3  -0.436111      62.561428  3307.133356   
4    NeuralNetFastAI_BAG_L4  -0.436139      89.818747  4411.778093   
5           LightGBM_BAG_L3  -0.436272      61.986840  2772.817350   
6           CatBoost_BAG_L3  -0.436297      61.875266  2786.532449   
7           CatBoost_BAG_L4  -0.436354      89.131819  3888.978910   
8           LightGBM_BAG_L4  -0.436357      89.222719  3875.193574   
9         LightGBMXT_BAG_L3  -0.436364      62.059725  2775.288636   
10           XGBoost_BAG_L3  -0.436405      62.040076  2772.338113   
11        LightGBMXT_BAG_L4  -0.436420      89.278399  3876.273589   
12     LightGBMLarge_BAG_L3  -0.436475      62.195739  2794.986122   
13     LightGBMLarge

In [174]:
predictor.feature_importance(train_data)

These features in provided data are not utilized by the predictor and will be ignored: ['Cancelled', 'Diverted']
Computing feature importance via permutation shuffling for 15 features using 5000 rows with 5 shuffle sets...
	383.84s	= Expected runtime (76.77s per shuffle set)
	163.44s	= Actual runtime (Completed 5 of 5 shuffle sets)


Unnamed: 0,importance,stddev,p_value,n,p99_high,p99_low
Estimated_Departure_Time,0.047643,0.001721,2.040417e-07,5,0.051187,0.044099
Estimated_Arrival_Time,0.046219,0.001636,1.878504e-07,5,0.049587,0.042852
Month,0.040131,0.003078,4.121939e-06,5,0.046469,0.033793
Day_of_Month,0.034826,0.002173,1.809226e-06,5,0.0393,0.030352
Carrier_ID(DOT),0.032356,0.000687,2.443687e-08,5,0.033772,0.030941
Tail_Number,0.032296,0.0012,2.284246e-07,5,0.034768,0.029825
Distance,0.031067,0.00119,2.580216e-07,5,0.033518,0.028617
Origin_Airport,0.026676,0.000786,9.048831e-08,5,0.028295,0.025057
Destination_Airport,0.026564,0.000737,7.087214e-08,5,0.028081,0.025047
Destination_Airport_ID,0.024232,0.000419,1.077189e-08,5,0.025096,0.023368


In [175]:
test_data.head()

Unnamed: 0,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number
0,12,16,1156.0,1900.0,0,0,169,12266,42,310,14683,42,191.0,26,8,19393.0,4387
1,9,12,1500.0,1715.0,0,0,119,11618,28,22,10397,4,746.0,9,3,19790.0,1936
2,3,6,1600.0,1915.0,0,0,256,13930,11,204,12953,30,733.0,26,8,19977.0,2147
3,5,18,1920.0,2045.0,0,0,248,13796,4,195,12892,4,337.0,23,10,19393.0,5486
4,7,7,1915.0,2152.0,0,0,127,11697,7,195,12892,4,2343.0,18,2,20409.0,5965


In [176]:
model_to_use = predictor.get_model_best()

In [177]:
model_to_use

'WeightedEnsemble_L3'

In [178]:
model_pred = predictor.predict_proba(test_data, model=model_to_use)

In [179]:
t = time.strftime('%m-%d_%H:%M')

In [180]:
submission = pd.DataFrame(data=model_pred, columns=sample_submission.columns, index=sample_submission.index)

In [181]:
submission.to_csv(f'{t}baseline_submission.csv', index=True)