<a href="https://colab.research.google.com/github/robynmundle/predicting_flight_delays/blob/main/Final_Model_Delay_Range_RFC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Copy of current Feature Engineering to save CPU strength**

Import Packages

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from sklearn import preprocessing
import time
from datetime import datetime, date, time
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings('ignore')
import copy

Completed Functions

In [2]:
# CRS_ELAPSED_TIME --> HAUL_LENGTH
def haul(df, col):
    '''Determine if flight length is SHORT, MEDIUM or LONG based on expected elapsed flight time. 
            Input: 
            (0) df containing flight information, 
            (1) column containing the elapsed flight time in minutes   
            Output:   'haul_length' column determining haul length category per row in df'''
    length=[]
    for i in df[col]:
        if i < (3*60): # up to 3 hours
            length.append(0) # 0 = SHORT HAUL
        elif (i >= (3*60)) and (i < (6*60)): # 3-6 hours
            length.append(1) # 1 = MEDIUM HAUL
        elif i >= (6*60):# 6+ hours
            length.append(2) # 2 = LONG HAUL
    df['haul_length'] = length
# example of implementation: haul(flight10k, 'crs_elapsed_time')

# CRS_DEP_TIME (hhmm) --> CRS_DEP_TIME (hh) -- to be used within time_day function
def gethour(df,col):
    '''Convert hhmm to hh (24-hr) hour-only output
            Input: 
            (0) df containing flight information, 
            (1) column containing the hhmm time                  
            Output:   rewrite on input column in rounded hh format'''
    values = []
    for i in df[col]:
        mins = (i % 100) / 60 
        hour = i // 100
        hh = round(hour+mins)
        values.append(hh)
    df[col] = values
# example of implementation: gethour(flight10k, 'crs_dep_time')

# CRS_DEP/ARR_TIME (hhmm) --> hot encoded categorical time of day 'morning, aft...' 
def time_day(df, col):
    ''' Input:
            (0) df containing flight information
            (1) corresponding column of time of flight (i.e. departure or arrival) (format hhmm)
        Output:   rewrite of time column into categorical MORNING, AFTERNOON, EVENING, or OVERNIGHT'''
    gethour(df, col)
    timeday = []
    for i in df[col]:
        if (i>=23) or (i<5):
            timeday.append(0) # 0 = OVERNIGHT
        elif (i>=5) and (i<12):
            timeday.append(1) # 1 = MORNING
        elif (i>=12) and (i<18):
            timeday.append(2) # 2 = AFTERNOON
        elif (i>=18) and (i<23):
            timeday.append(3) # 3 = EVENING
    return timeday
# example of implementation: time_day(flight10k, 'crs_dep_time')

CSVs of Pre-Evaluated Features (Historical)

In [3]:
airline_rating = pd.read_csv('../data/airline_delay_rating.csv', index_col=0)
origin_traffic = pd.read_csv('../data/origin_traffic_rating.csv', index_col=0)
origin_delay = pd.read_csv('../data/origin_delay_rating.csv', index_col=0)
dest_traffic = pd.read_csv('../data/dest_traffic_rating.csv', index_col=0)
delay_dep_h = pd.read_csv('../data/crs_dep_time_delay_rating.csv', index_col=0)
delay_arr_h = pd.read_csv('../data/crs_arr_time_delay_rating.csv', index_col=0)
weather_df = pd.read_csv('../data/weather_df_monthlymean_bins.csv', index_col=0)

Open CSV of Flight Training Information to Model

In [4]:
# This is for the dataset you want to investigate
flights = pd.read_csv('../data/flights250K.csv', index_col=0)
flights.head(1)
flights.shape

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,dep_time,dep_delay,taxi_out,wheels_off,wheels_on,taxi_in,crs_arr_time,arr_time,arr_delay,cancelled,cancellation_code,diverted,dup,crs_elapsed_time,actual_elapsed_time,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,2018-12-26,DL,DL_CODESHARE,DL,4598,OO,N641CA,4598,13851,OKC,"Oklahoma City, OK",14869,SLC,"Salt Lake City, UT",839,830.0,-9.0,15.0,845.0,945.0,25.0,1028,1010.0,-18.0,0.0,,0.0,N,169.0,160.0,120.0,1.0,866.0,,,,,,,,,


(250000, 42)

Build df based on columns we will use in transformation - Data Cleaning and Feature Implementation

See option A or B in first rows to build df based on training or test dataset (for copy pasta later)

In [10]:
# A - if this is a training dataset, we need arr_delay as our target variable so use this first block of code
model_df = flights[flights['cancelled'] == 0][['arr_delay','fl_date','op_unique_carrier','origin','dest','crs_dep_time','crs_arr_time','crs_elapsed_time','distance']]
# B - if this is a testing dataset, we will not have arr_delay and cannot include it
#model_df = flights[['tail_num','op_carrier_fl_num','fl_date','op_unique_carrier','origin','dest','crs_dep_time','crs_arr_time','crs_elapsed_time','distance']]
model_df.shape
# first regression will be simple-- is the flight going to be delayed or not?
if 'arr_delay' in model_df:
    model_df.dropna(subset=['arr_delay'], inplace=True)
    delay_bin = []
    for i in model_df['arr_delay']:
        if i <= 5:
            delay_bin.append(0) # no delay (within 5 minutes)
        elif (i > 5) and (i <= 10):
            delay_bin.append(1) # expect a 5 to 10 minute delay
        elif (i > 10) and (i <= 20):
            delay_bin.append(2) # expect a 10 to 20 minute delay
        elif (i >= 20) and  (i <= 45):
            delay_bin.append(3) # expect a 20 to 45 minute delay
        elif (i > 45):
            delay_bin.append(4) # expect a 45+ minute delay
        
    model_df['delay_range'] = delay_bin
    model_df.drop(columns='arr_delay', inplace=True)

# convert date to datetime in order to grab the month
model_df['fl_date'] = pd.to_datetime(model_df['fl_date'])
#model_df['year'] = model_df['fl_date'].dt.year # decided I do not want year
model_df['month'] = model_df['fl_date'].dt.month
model_df['day'] = model_df['fl_date'].dt.day
model_df['weekday'] = model_df['fl_date'].dt.dayofweek
model_df.drop(columns='fl_date', inplace=True) # this won't be needed after we got month

# join weather columns by origin and destination per each monthly average
model_df = model_df.merge(weather_df, left_on=['month','origin'], right_on=['month','airport'], how='left')
model_df.rename(columns={'mean_precip_monthly':'origin_precip_monthly','mean_snow_monthly':'origin_snow_monthly','mean_wind_monthly':'origin_wind_monthly','mean_cloud_monthly':'origin_cloud_monthly'}, inplace=True)
model_df.drop(columns='airport', inplace=True)
model_df = model_df.merge(weather_df, left_on=['month','dest'], right_on=['month','airport'], how='left')
model_df.rename(columns={'mean_precip_monthly':'dest_precip_monthly','mean_snow_monthly':'dest_snow_monthly','mean_wind_monthly':'dest_wind_monthly','mean_cloud_monthly':'dest_cloud_monthly'}, inplace=True)
model_df.drop(columns='airport', inplace=True)
model_df = model_df.fillna(0)

# set delay rating based on expected performance of the airline
model_df = model_df.merge(airline_rating, left_on='op_unique_carrier', right_on='airline', how='left')
model_df.drop(columns=['airline'],inplace=True) 

# obtain haul length of the flight using haul function defined above
haul(model_df, 'crs_elapsed_time')
model_df.drop(columns=['crs_elapsed_time'],inplace=True)

# new column of categorical time of day information using time_day function defined above as well as expected delays relating to the time of day departure
model_df['dep_timeday'] = time_day(model_df, 'crs_dep_time')
model_df['arr_timeday'] = time_day(model_df, 'crs_arr_time')
model_df = model_df.merge(delay_dep_h, left_on='crs_dep_time', right_on='crs_dep_time', how='left')
model_df = model_df.merge(delay_arr_h, left_on='crs_arr_time', right_on='crs_arr_time', how='left')
model_df.drop(columns=['crs_dep_time','crs_arr_time'],inplace=True)

# classify the expected traffic of the origin and departure airports
model_df = model_df.merge(origin_traffic, left_on='origin', right_on='origin', how='left')
model_df = model_df.merge(dest_traffic, left_on='dest', right_on='dest', how='left')
model_df['busy_origin'].fillna(value=model_df['busy_origin'].mean(), inplace=True)
model_df['busy_dest'].fillna(value=model_df['busy_dest'].mean(), inplace=True)
model_df = model_df.merge(origin_delay, left_on='origin', right_on='origin', how='left')
model_df.drop(columns=['origin','dest'],inplace=True)

# currently hashed out the dropping of the raw features to test out improved correlations - to keep cat feats we need to encode
# label encode values for identification of the flight later
le = preprocessing.LabelEncoder()
model_df['op_unique_carrier'] = le.fit_transform(model_df['op_unique_carrier'].values)
#model_df['origin'] = le.fit_transform(model_df['origin'].values)
#model_df['dest'] = le.fit_transform(model_df['dest'].values)

# have a look at the dataset
model_df.head(10)
model_df.shape

(245714, 9)

Unnamed: 0,op_unique_carrier,distance,delay_range,month,day,weekday,origin_precip_monthly,origin_snow_monthly,origin_wind_monthly,origin_cloud_monthly,dest_precip_monthly,dest_snow_monthly,dest_wind_monthly,dest_cloud_monthly,airline_delay,haul_length,dep_timeday,arr_timeday,delay_dep_h,delay_arr_h,busy_origin,busy_dest,origin_delay
0,19,866.0,0,12,26,2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0,1,0,1,1,0,0,3,3,1
1,24,342.0,2,4,18,3,1.0,0.0,0.0,3.0,0.0,0.0,2.0,1.0,2,0,2,3,3,2,3,3,1
2,24,1024.0,0,10,30,1,0.0,1.0,3.0,1.0,1.0,0.0,1.0,2.0,2,1,1,1,0,0,4,4,2
3,2,331.0,0,3,9,5,0.0,0.0,2.0,3.0,0.0,0.0,1.0,2.0,2,0,1,1,0,0,3,4,2
4,19,284.0,0,3,29,4,0.0,0.0,0.0,0.0,0.0,2.0,3.0,2.0,1,0,1,1,0,1,1,3,1
5,19,649.0,0,12,21,5,0.0,0.0,2.0,1.0,0.0,1.0,3.0,1.0,1,0,3,3,3,3,3,4,0
6,22,758.0,0,12,1,5,1.0,0.0,0.0,3.0,0.0,0.0,1.0,1.0,2,0,2,3,3,3,3,3,3
7,22,853.0,0,2,8,3,1.0,0.0,3.0,2.0,0.0,1.0,3.0,1.0,2,0,2,2,1,1,3,4,2
8,8,944.0,0,1,31,2,0.0,0.0,1.0,1.0,0.0,1.0,3.0,2.0,1,0,2,3,2,2,3,3,3
9,19,588.0,0,3,1,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,0,2,2,1,1,3,3,2


(245034, 23)

Import More Packages

In [6]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import r2_score
from sklearn import metrics

import seaborn as sns; sns.set(style='darkgrid', context='talk')
import matplotlib.pyplot as plt
import pickle

Data Scaling

In [11]:
X = model_df.drop(columns=['delay_range'])
y = model_df['delay_range']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = RobustScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

RobustScaler()

HERE WE ARE RUNNING ONLY FOR DELAY RANGE WITHOUT THE INFORMATION OF THE LOG REG

GridSearch takes too long for this late in the project. I would probably get better results if I did it but I cannot keep waiting around for the end result

In [12]:
# Classifier Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
import collections
from sklearn.model_selection import cross_val_score

classifiers = {
    "Logisitic Regression": LogisticRegression(),
    "K-Nearest Neighbour": KNeighborsClassifier(),
    "Support Vector Classifier": SVC(),
    "DecisionTree Classifier": DecisionTreeClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    'XGBoost': XGBClassifier()
}

In [None]:
for key, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    training_score = cross_val_score(classifier, X_train, y_train, cv=5)
    print(key, "has a training accuracy score of", round(training_score.mean(), 2) * 100, "%")
    y_pred = classifier.predict(X_test)
    print("Accuracy \t{:.3f}".format(metrics.accuracy_score(y_test, y_pred)))
    print('Recall \t\t{:.3f}'.format(metrics.recall_score(y_test, y_pred, average='weighted')))
    print('Precision \t{:.3f}'.format(metrics.precision_score(y_test, y_pred, average='weighted')))
    print('F1 Score \t{:.3f}'.format(metrics.f1_score(y_test, y_pred, average='weighted')))
    #print('AUC Score \t{:.3f} '.format(metrics.roc_auc_score(y_test, y_rfc_proba, multi_class='ovo')))

LogisticRegression()

Logisitic Regression has a training accuracy score of 73.0 %
Accuracy 	0.729
Recall 		0.729
Precision 	0.531
F1 Score 	0.614


KNeighborsClassifier()

K-Nearest Neighbour has a training accuracy score of 70.0 %
Accuracy 	0.709
Recall 		0.709
Precision 	0.569
F1 Score 	0.617


In [9]:
classifiers = {
    "Gaussian Naive Bayes": GaussianNB(),
    'XGBoost': XGBClassifier()
}
for key, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    training_score = cross_val_score(classifier, X_train, y_train, cv=5)
    print(key, "has a training accuracy score of", round(training_score.mean(), 2) * 100, "%")
    y_pred = classifier.predict(X_test)
    print("Accuracy \t{:.3f}".format(metrics.accuracy_score(y_test, y_pred)))
    print('Recall \t\t{:.3f}'.format(metrics.recall_score(y_test, y_pred, average='weighted')))
    print('Precision \t{:.3f}'.format(metrics.precision_score(y_test, y_pred, average='weighted')))
    print('F1 Score \t{:.3f}'.format(metrics.f1_score(y_test, y_pred, average='weighted')))


GaussianNB()

Gaussian Naive Bayes has a training accuracy score of 63.0 %
Accuracy 	0.627
Recall 		0.627
Precision 	0.564
F1 Score 	0.589


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

XGBoost has a training accuracy score of 73.0 %
Accuracy 	0.729
Recall 		0.729
Precision 	0.575
F1 Score 	0.617


In [21]:
%%time
rfc = RandomForestClassifier(max_depth=15,n_estimators=750, min_samples_split=5, random_state=0)
rfc.fit(X_train, y_train)

CPU times: user 3min 49s, sys: 1.52 s, total: 3min 51s
Wall time: 3min 50s


In [22]:
rfc.get_params

<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=750,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)>

In [23]:
y_rfc = rfc.predict(X_test)
y_rfc_proba = rfc.predict_proba(X_test)
print("Parameters: ",rfc.score)
print("Accuracy \t{:.3f}".format(metrics.accuracy_score(y_test, y_rfc)))
print('Recall \t\t{:.3f}'.format(metrics.recall_score(y_test, y_rfc, average='weighted')))
print('Precision \t{:.3f}'.format(metrics.precision_score(y_test, y_rfc, average='weighted')))
print('F1 Score \t{:.3f}'.format(metrics.f1_score(y_test, y_rfc, average='weighted')))
print('AUC Score \t{:.3f} '.format(metrics.roc_auc_score(y_test, y_rfc_proba, multi_class='ovo')))

Parameters:  <bound method ClassifierMixin.score of RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=750,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)>
Accuracy 	0.729
Recall 		0.729
Precision 	0.582
F1 Score 	0.615
AUC Score 	0.568 


In [24]:
filename = 'model3c_randforest_delayrangeonly.sav'
pickle.dump(rfc, open(filename, 'wb'))

If Re-Running: Load from PICKLE + change forest in following cells to re_forest

In [86]:
# load from pickle
re_forest = pickle.load(open(filename, 'rb'))
result = re_forest.score(X_test, y_test)
print('Re-Loaded from Pickle: ', result)

Re-Loaded from Pickle:  0.7288142510253637


# Evaluation

Alike how we predicted the whole of X in order to proceed into the second model, we need to go back and now train the entire model on the historical flights information and predict on the flight_test information.

In [31]:
flighttest = pd.read_csv('data/flighttest.csv', index_col=0)
flighttest.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,origin_city_name,dest_airport_id,dest,dest_city_name,crs_dep_time,crs_arr_time,dup,crs_elapsed_time,flights,distance
0,2020-01-01,WN,WN,WN,5888,WN,N951WN,5888,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1810,1945,N,95,1,363
1,2020-01-01,WN,WN,WN,6276,WN,N467WN,6276,13891,ONT,"Ontario, CA",14771,SFO,"San Francisco, CA",1150,1320,N,90,1,363
2,2020-01-01,WN,WN,WN,4598,WN,N7885A,4598,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",2020,2130,N,70,1,333
3,2020-01-01,WN,WN,WN,4761,WN,N551WN,4761,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",1340,1455,N,75,1,333
4,2020-01-01,WN,WN,WN,5162,WN,N968WN,5162,13891,ONT,"Ontario, CA",14831,SJC,"San Jose, CA",915,1035,N,80,1,333


Data Cleaning

In [33]:
# A - if this is a training dataset, we need arr_delay as our target variable so use this first block of code
#model_df = flights[flights['cancelled'] == 0][['arr_delay','fl_date','op_unique_carrier','origin','dest','crs_dep_time','crs_arr_time','crs_elapsed_time','distance']]
# B - if this is a testing dataset, we will not have arr_delay and cannot include it
model_ft = flighttest[['fl_date','op_unique_carrier','origin','dest','crs_dep_time','crs_arr_time','crs_elapsed_time','distance']]
#model_df.shape

# convert date to datetime in order to grab the month
model_ft['fl_date'] = pd.to_datetime(model_ft['fl_date'])
#model_df['year'] = model_df['fl_date'].dt.year # decided I do not want year
model_ft['month'] = model_ft['fl_date'].dt.month
model_ft['day'] = model_ft['fl_date'].dt.day
model_ft['weekday'] = model_ft['fl_date'].dt.dayofweek
model_ft.drop(columns='fl_date', inplace=True) # this won't be needed after we got month

# join weather columns by origin and destination per each monthly average
model_ft = model_ft.merge(weather_df, left_on=['month','origin'], right_on=['month','airport'], how='left')
model_ft.rename(columns={'mean_precip_monthly':'origin_precip_monthly','mean_snow_monthly':'origin_snow_monthly','mean_wind_monthly':'origin_wind_monthly','mean_cloud_monthly':'origin_cloud_monthly'}, inplace=True)
model_ft.drop(columns='airport', inplace=True)
model_ft = model_ft.merge(weather_df, left_on=['month','dest'], right_on=['month','airport'], how='left')
model_ft.rename(columns={'mean_precip_monthly':'dest_precip_monthly','mean_snow_monthly':'dest_snow_monthly','mean_wind_monthly':'dest_wind_monthly','mean_cloud_monthly':'dest_cloud_monthly'}, inplace=True)
model_ft.drop(columns='airport', inplace=True)
model_ft = model_ft.fillna(0)

# set delay rating based on expected performance of the airline
model_ft = model_ft.merge(airline_rating, left_on='op_unique_carrier', right_on='airline', how='left')
model_ft.drop(columns=['airline'],inplace=True) 

# obtain haul length of the flight using haul function defined above
haul(model_ft, 'crs_elapsed_time')
#model_df.drop(columns=['crs_elapsed_time'],inplace=True)

# new column of categorical time of day information using time_day function defined above as well as expected delays relating to the time of day departure
model_ft['dep_timeday'] = time_day(model_ft, 'crs_dep_time')
model_ft['arr_timeday'] = time_day(model_ft, 'crs_arr_time')
model_ft = model_ft.merge(delay_dep_h, left_on='crs_dep_time', right_on='crs_dep_time', how='left')
model_ft = model_ft.merge(delay_arr_h, left_on='crs_arr_time', right_on='crs_arr_time', how='left')
#model_df.drop(columns=['crs_dep_time','crs_arr_time'],inplace=True)

# classify the expected traffic of the origin and departure airports
model_ft = model_ft.merge(origin_traffic, left_on='origin', right_on='origin', how='left')
model_ft = model_ft.merge(dest_traffic, left_on='dest', right_on='dest', how='left')
model_ft['busy_origin'].fillna(value=model_ft['busy_origin'].mean(), inplace=True)
model_ft['busy_dest'].fillna(value=model_ft['busy_dest'].mean(), inplace=True)
model_ft = model_ft.merge(origin_delay, left_on='origin', right_on='origin', how='left')
#model_df.drop(columns=['origin','dest'],inplace=True)

# currently hashed out the dropping of the raw features to test out improved correlations - to keep cat feats we need to encode
# label encode values for identification of the flight later
le = preprocessing.LabelEncoder()
model_ft['op_unique_carrier'] = le.fit_transform(model_ft['op_unique_carrier'].values)
model_ft['origin'] = le.fit_transform(model_ft['origin'].values)
model_ft['dest'] = le.fit_transform(model_ft['dest'].values)

# have a look at the dataset
model_ft.head(10)
model_ft.shape

Unnamed: 0,op_unique_carrier,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,month,day,weekday,origin_precip_monthly,origin_snow_monthly,origin_wind_monthly,origin_cloud_monthly,dest_precip_monthly,dest_snow_monthly,dest_wind_monthly,dest_cloud_monthly,airline_delay,haul_length,dep_timeday,arr_timeday,delay_dep_h,delay_arr_h,busy_origin,busy_dest,origin_delay
0,21,246,313,18,20,95,363,1,1,2,0.0,0.0,2.0,2.0,0.0,0.0,2.0,2.0,2,0,3,3,3,3,3.0,4.0,0.0
1,21,246,313,12,13,90,363,1,1,2,0.0,0.0,2.0,2.0,0.0,0.0,2.0,2.0,2,0,2,2,1,1,3.0,4.0,0.0
2,21,246,320,20,22,70,333,1,1,2,0.0,0.0,2.0,2.0,0.0,0.0,2.0,1.0,2,0,3,3,3,3,3.0,3.0,0.0
3,21,246,320,14,15,75,333,1,1,2,0.0,0.0,2.0,2.0,0.0,0.0,2.0,1.0,2,0,2,2,1,1,3.0,3.0,0.0
4,21,246,320,9,11,80,333,1,1,2,0.0,0.0,2.0,2.0,0.0,0.0,2.0,1.0,2,0,1,1,0,0,3.0,3.0,0.0
5,21,246,320,6,7,75,333,1,1,2,0.0,0.0,2.0,2.0,0.0,0.0,2.0,1.0,2,0,1,1,0,0,3.0,3.0,0.0
6,21,246,320,16,18,80,333,1,1,2,0.0,0.0,2.0,2.0,0.0,0.0,2.0,1.0,2,0,2,3,2,2,3.0,3.0,0.0
7,21,246,325,15,16,85,390,1,1,2,0.0,0.0,2.0,2.0,0.0,0.0,0.0,1.0,2,0,2,2,1,1,3.0,3.0,0.0
8,21,246,325,12,14,85,390,1,1,2,0.0,0.0,2.0,2.0,0.0,0.0,0.0,1.0,2,0,2,2,1,1,3.0,3.0,0.0
9,21,246,325,8,9,80,390,1,1,2,0.0,0.0,2.0,2.0,0.0,0.0,0.0,1.0,2,0,1,1,0,0,3.0,3.0,0.0


(660556, 27)

Train on entire training dataset (no more X_train X_test split)

In [39]:
model_ft = model_ft.fillna(8) # there were 88 origin_delay spots np.nan so I filled them with 0 for no delay from the origin as they're likely an uncommon origin airport

In [34]:
%%time
rfc = RandomForestClassifier(max_depth=15,n_estimators=1000, min_samples_split=5, random_state=0)
rfc.fit(X, y)

CPU times: user 6min 29s, sys: 2.49 s, total: 6min 32s
Wall time: 6min 31s


In [50]:
y_pred = rfc.predict(X)
y_proba = rfc.predict_proba(X)
print("Parameters: ",rfc.score)
print("Accuracy \t{:.3f}".format(metrics.accuracy_score(y, y_pred)))
print('Recall \t\t{:.3f}'.format(metrics.recall_score(y, y_pred, average='weighted')))
print('Precision \t{:.3f}'.format(metrics.precision_score(y, y_pred, average='weighted')))
print('F1 Score \t{:.3f}'.format(metrics.f1_score(y, y_pred, average='weighted')))
print('AUC Score \t{:.3f} '.format(metrics.roc_auc_score(y, y_proba, multi_class='ovo')))

Parameters:  <bound method ClassifierMixin.score of RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=15, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=5,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)>
Accuracy 	0.727
Recall 		0.727
Precision 	0.797
F1 Score 	0.614
AUC Score 	0.896 


In [35]:
filename = 'finalmodel_fulltrainfit.sav'
pickle.dump(rfc, open(filename, 'wb'))

Data Scale Flight_Test Model for prediction

In [40]:
X_ft = model_ft

scaler = RobustScaler()
scaler.fit(X_ft)
X_ft = scaler.transform(X_ft)

RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True)

It's the final predict time...

In [41]:
y_ft = rfc.predict(X_ft)

In [42]:
model_ft['delay_pred'] = y_ft
# 0 = <5 min delay = no delay
# 1 = 5 - 10 min delay = slight delay
# 2 = 10 - 20 min delay = moderate delay
# 3 = 20 - 45 min delay = delay
# 4 = 45+ min delay = long delay

model_ft.head()
model_ft.shape

Unnamed: 0,op_unique_carrier,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,month,day,weekday,origin_precip_monthly,origin_snow_monthly,origin_wind_monthly,origin_cloud_monthly,dest_precip_monthly,dest_snow_monthly,dest_wind_monthly,dest_cloud_monthly,airline_delay,haul_length,dep_timeday,arr_timeday,delay_dep_h,delay_arr_h,busy_origin,busy_dest,origin_delay,delay_pred
0,21,246,313,18,20,95,363,1,1,2,0.0,0.0,2.0,2.0,0.0,0.0,2.0,2.0,2,0,3,3,3,3,3.0,4.0,0.0,0
1,21,246,313,12,13,90,363,1,1,2,0.0,0.0,2.0,2.0,0.0,0.0,2.0,2.0,2,0,2,2,1,1,3.0,4.0,0.0,0
2,21,246,320,20,22,70,333,1,1,2,0.0,0.0,2.0,2.0,0.0,0.0,2.0,1.0,2,0,3,3,3,3,3.0,3.0,0.0,0
3,21,246,320,14,15,75,333,1,1,2,0.0,0.0,2.0,2.0,0.0,0.0,2.0,1.0,2,0,2,2,1,1,3.0,3.0,0.0,0
4,21,246,320,9,11,80,333,1,1,2,0.0,0.0,2.0,2.0,0.0,0.0,2.0,1.0,2,0,1,1,0,0,3.0,3.0,0.0,0


(660556, 28)

In [43]:
delay_range = {
    0:'<5 min - no delay',
    1:'5 - 10 min delay',
    2:'10 - 20 min delay',
    3:'20 - 45 min delay',
    4:'45+ min delay'}

model_ft['delay_pred_range'] = model_ft['delay_pred'].map(lambda x: delay_range[x])

model_ft.head()
model_ft.shape

Unnamed: 0,op_unique_carrier,origin,dest,crs_dep_time,crs_arr_time,crs_elapsed_time,distance,month,day,weekday,origin_precip_monthly,origin_snow_monthly,origin_wind_monthly,origin_cloud_monthly,dest_precip_monthly,dest_snow_monthly,dest_wind_monthly,dest_cloud_monthly,airline_delay,haul_length,dep_timeday,arr_timeday,delay_dep_h,delay_arr_h,busy_origin,busy_dest,origin_delay,delay_pred,delay_pred_range
0,21,246,313,18,20,95,363,1,1,2,0.0,0.0,2.0,2.0,0.0,0.0,2.0,2.0,2,0,3,3,3,3,3.0,4.0,0.0,0,<5 min - no delay
1,21,246,313,12,13,90,363,1,1,2,0.0,0.0,2.0,2.0,0.0,0.0,2.0,2.0,2,0,2,2,1,1,3.0,4.0,0.0,0,<5 min - no delay
2,21,246,320,20,22,70,333,1,1,2,0.0,0.0,2.0,2.0,0.0,0.0,2.0,1.0,2,0,3,3,3,3,3.0,3.0,0.0,0,<5 min - no delay
3,21,246,320,14,15,75,333,1,1,2,0.0,0.0,2.0,2.0,0.0,0.0,2.0,1.0,2,0,2,2,1,1,3.0,3.0,0.0,0,<5 min - no delay
4,21,246,320,9,11,80,333,1,1,2,0.0,0.0,2.0,2.0,0.0,0.0,2.0,1.0,2,0,1,1,0,0,3.0,3.0,0.0,0,<5 min - no delay


(660556, 29)

In [44]:
model_ft.to_csv('flight_test_prediction_RM.csv', index=False)

In [53]:
model_ft['delay_pred'].value_counts()
print('\n\n')
model_ft['delay_pred_range'].value_counts()

0    660556
Name: delay_pred, dtype: int64






<5 min - no delay    660556
Name: delay_pred_range, dtype: int64

Aaaaand despite good fit on the training set, my model completely predicted NO DELAY for all of the flight_test dataset. Love that for us.