In [176]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import tree
import graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb

pd.options.display.float_format = '{:,.2f}'.format
pd.set_option('display.max_columns', None)

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

In [177]:
flight_data = pd.read_csv('./dataset/merged_data/latter_flight_data.csv')
flight_data.head()
flight_data.shape

Unnamed: 0,DAY_OF_WEEK,MKT_UNIQUE_CARRIER,OP_UNIQUE_CARRIER,ORIGIN,ARR_DELAY,SCH_DEP_TIME,SCH_ARR_TIME,ORGIN_WTH_temp,ORGIN_WTH_precip,ORGIN_WTH_precipprob,ORGIN_WTH_snow,ORGIN_WTH_windspeed,ORGIN_WTH_winddir,ORGIN_WTH_cloudcover,ORGIN_WTH_visibility,ORGIN_WTH_severerisk,DEST_WTH_temp,DEST_WTH_precip,DEST_WTH_precipprob,DEST_WTH_snow,DEST_WTH_windspeed,DEST_WTH_winddir,DEST_WTH_cloudcover,DEST_WTH_visibility,DEST_WTH_severerisk,FORMER_FLIGHT_STATUS
0,6,WN,WN,MCO,-26.0,2022-01-01 10:30:00,2022-01-01 13:20:00,74.0,0.0,0,0.0,6.9,200.0,2.9,9.9,3.0,48.0,0.0,0,0.0,3.6,8.0,100.0,9.8,3.0,
1,6,UA,OO,ORD,-25.0,2022-01-01 10:40:00,2022-01-01 13:32:00,36.1,0.0,0,0.02,21.0,20.0,100.0,9.9,3.0,47.9,0.0,0,0.0,0.4,358.0,100.0,9.7,3.0,
2,6,B6,B6,MCO,22.0,2022-01-01 13:13:00,2022-01-01 15:56:00,83.0,0.0,0,0.0,9.9,199.0,4.3,9.9,3.0,47.7,0.0,0,0.0,7.9,311.0,100.0,7.8,3.0,early
3,6,B6,B6,JFK,36.0,2022-01-01 21:45:00,2022-01-01 22:59:00,52.8,0.15,100,0.0,8.1,40.0,100.0,2.2,3.0,37.9,0.02,100,0.0,6.1,303.0,100.0,6.8,3.0,
4,7,B6,B6,JFK,-12.0,2022-01-02 08:29:00,2022-01-02 09:50:00,52.1,0.0,0,0.0,0.0,0.0,100.0,5.9,3.0,25.0,0.0,0,0.01,13.8,303.0,100.0,1.2,3.0,


(6773, 26)

In [178]:
# Drop NaN for latter flight data
if True:
    flight_data.dropna(subset=['FORMER_FLIGHT_STATUS'], inplace=True)

In [179]:
X = flight_data.drop(columns=['ARR_DELAY'])
y = flight_data['ARR_DELAY']

In [180]:
potential_categorical_vars = {}
# For categorical variables if we are considering variables which were coded to integers, but are actually catagorical.
if True:
    for col in flight_data.columns:
        unique_col_vals = flight_data[col].unique()
        if(len(unique_col_vals) < 15):
            potential_categorical_vars[col] = unique_col_vals
potential_categorical_vars

{'DAY_OF_WEEK': array([6, 7, 1, 2, 3, 4, 5]),
 'MKT_UNIQUE_CARRIER': array(['B6', 'UA', 'DL', 'AA', 'WN'], dtype=object),
 'OP_UNIQUE_CARRIER': array(['B6', 'OO', '9E', 'MQ', 'G7', 'PT', 'YX', 'UA', 'WN', 'ZW'],
       dtype=object),
 'ORIGIN': array(['MCO', 'ORD', 'JFK'], dtype=object),
 'ORGIN_WTH_precipprob': array([  0, 100]),
 'ORGIN_WTH_severerisk': array([  3.,  10.,  30.,   5.,  15.,  60., 100.,   8.,  50.,  19.,  25.,
         75.,  38.]),
 'DEST_WTH_precipprob': array([  0, 100]),
 'DEST_WTH_severerisk': array([  3.,  10.,  30.,  60.,   5.,  75., 100.,   8.]),
 'FORMER_FLIGHT_STATUS': array(['early', 'late', 'on-time'], dtype=object)}

In [181]:
# Handle Categorical Variables
categorical_vars = ['DAY_OF_WEEK', 'MKT_UNIQUE_CARRIER',
                    'OP_UNIQUE_CARRIER', 'ORIGIN',
                    'ORGIN_WTH_precipprob', 'ORGIN_WTH_severerisk',
                    'DEST_WTH_precipprob', 'DEST_WTH_severerisk',
                    'FORMER_FLIGHT_STATUS', 'MONTH']

# categorical_vars = potential_categorical_vars.keys()

In [182]:
def preprocess(flight_data: pd.DataFrame):

    # Dealing with date and time
    flight_data['SCH_ARR_TIME'] = pd.to_datetime(flight_data['SCH_ARR_TIME'])
    flight_data['SCH_DEP_TIME'] = pd.to_datetime(flight_data['SCH_DEP_TIME'])

    flight_data['MONTH'] = flight_data['SCH_ARR_TIME'].dt.month
    flight_data['DAY'] = flight_data['SCH_ARR_TIME'].dt.day
    flight_data['DEP_MINUTES'] = flight_data['SCH_DEP_TIME'].dt.hour * 60 + flight_data['SCH_DEP_TIME'].dt.minute
    flight_data['ARR_MINUTES'] = flight_data['SCH_ARR_TIME'].dt.hour * 60 + flight_data['SCH_ARR_TIME'].dt.minute

    flight_data.drop(columns=['SCH_DEP_TIME', 'SCH_ARR_TIME'], inplace=True)

    # Dropping unwanted columns
    cols = [
        'ORGIN_WTH_temp', 'DEST_WTH_temp',
        'DEST_WTH_severerisk', 'ORGIN_WTH_severerisk',
        'DEST_WTH_precipprob', 'ORGIN_WTH_precipprob'
        ]
    flight_data.drop(columns=cols, inplace=True)
    
    cat_col = list(set(flight_data.columns).intersection(categorical_vars))
    flight_data = pd.get_dummies(flight_data, columns = list(cat_col), drop_first = False)

    return flight_data
    

In [183]:
X = preprocess(X)

In [184]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 35)

X_train.head()
X_test.head()
y_train.head()
y_test.head()


Unnamed: 0,ORGIN_WTH_precip,ORGIN_WTH_snow,ORGIN_WTH_windspeed,ORGIN_WTH_winddir,ORGIN_WTH_cloudcover,ORGIN_WTH_visibility,DEST_WTH_precip,DEST_WTH_snow,DEST_WTH_windspeed,DEST_WTH_winddir,DEST_WTH_cloudcover,DEST_WTH_visibility,DAY,DEP_MINUTES,ARR_MINUTES,FORMER_FLIGHT_STATUS_early,FORMER_FLIGHT_STATUS_late,FORMER_FLIGHT_STATUS_on-time,OP_UNIQUE_CARRIER_9E,OP_UNIQUE_CARRIER_B6,OP_UNIQUE_CARRIER_G7,OP_UNIQUE_CARRIER_MQ,OP_UNIQUE_CARRIER_OO,OP_UNIQUE_CARRIER_PT,OP_UNIQUE_CARRIER_UA,OP_UNIQUE_CARRIER_WN,OP_UNIQUE_CARRIER_YX,OP_UNIQUE_CARRIER_ZW,MONTH_1,MONTH_2,MONTH_3,MONTH_4,MONTH_5,MONTH_6,MONTH_7,MONTH_8,MONTH_9,MONTH_10,MONTH_11,MONTH_12,ORIGIN_JFK,ORIGIN_MCO,ORIGIN_ORD,DAY_OF_WEEK_1,DAY_OF_WEEK_2,DAY_OF_WEEK_3,DAY_OF_WEEK_4,DAY_OF_WEEK_5,DAY_OF_WEEK_6,DAY_OF_WEEK_7,MKT_UNIQUE_CARRIER_AA,MKT_UNIQUE_CARRIER_B6,MKT_UNIQUE_CARRIER_DL,MKT_UNIQUE_CARRIER_UA,MKT_UNIQUE_CARRIER_WN
1460,0.0,0.0,13.9,290.0,47.3,9.9,0.0,0.0,20.5,280.0,89.6,9.9,9,910,1081,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False
3395,0.0,0.0,1.6,27.0,23.8,9.9,0.0,0.0,4.6,70.0,48.3,9.9,8,980,1068,True,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,False
4434,0.0,0.0,16.5,295.0,90.3,9.9,0.0,0.0,0.5,355.0,86.7,9.9,24,827,996,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,True,False,False,False,False
5136,0.0,0.0,8.4,140.0,78.4,9.9,0.0,0.0,0.0,1.0,26.9,9.9,13,1116,1292,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False
5504,0.0,0.0,9.6,252.0,24.2,7.8,0.0,0.0,13.6,140.0,100.0,8.8,24,495,667,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False


Unnamed: 0,ORGIN_WTH_precip,ORGIN_WTH_snow,ORGIN_WTH_windspeed,ORGIN_WTH_winddir,ORGIN_WTH_cloudcover,ORGIN_WTH_visibility,DEST_WTH_precip,DEST_WTH_snow,DEST_WTH_windspeed,DEST_WTH_winddir,DEST_WTH_cloudcover,DEST_WTH_visibility,DAY,DEP_MINUTES,ARR_MINUTES,FORMER_FLIGHT_STATUS_early,FORMER_FLIGHT_STATUS_late,FORMER_FLIGHT_STATUS_on-time,OP_UNIQUE_CARRIER_9E,OP_UNIQUE_CARRIER_B6,OP_UNIQUE_CARRIER_G7,OP_UNIQUE_CARRIER_MQ,OP_UNIQUE_CARRIER_OO,OP_UNIQUE_CARRIER_PT,OP_UNIQUE_CARRIER_UA,OP_UNIQUE_CARRIER_WN,OP_UNIQUE_CARRIER_YX,OP_UNIQUE_CARRIER_ZW,MONTH_1,MONTH_2,MONTH_3,MONTH_4,MONTH_5,MONTH_6,MONTH_7,MONTH_8,MONTH_9,MONTH_10,MONTH_11,MONTH_12,ORIGIN_JFK,ORIGIN_MCO,ORIGIN_ORD,DAY_OF_WEEK_1,DAY_OF_WEEK_2,DAY_OF_WEEK_3,DAY_OF_WEEK_4,DAY_OF_WEEK_5,DAY_OF_WEEK_6,DAY_OF_WEEK_7,MKT_UNIQUE_CARRIER_AA,MKT_UNIQUE_CARRIER_B6,MKT_UNIQUE_CARRIER_DL,MKT_UNIQUE_CARRIER_UA,MKT_UNIQUE_CARRIER_WN
158,0.0,0.08,9.2,315.0,100.0,2.6,0.0,0.0,9.3,280.0,88.8,9.8,20,750,825,False,False,True,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False
1891,0.0,0.0,6.9,353.0,76.3,9.9,0.0,0.0,12.7,319.0,48.8,9.9,12,835,1010,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,True,False
73,0.0,0.0,13.6,304.0,0.0,9.9,0.0,0.04,19.7,240.0,28.7,9.6,10,760,931,False,True,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,True,False,False,False,False
496,0.0,0.0,6.0,292.0,0.8,9.9,0.0,0.05,0.2,354.0,50.6,9.9,2,750,825,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,True,False,False
3408,0.0,0.0,8.7,191.0,88.1,9.9,0.0,0.0,15.8,251.0,100.0,5.0,9,1104,1277,False,True,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,True,False


1460   -17.00
3395    -9.00
4434    -1.00
5136    17.00
5504    20.00
Name: ARR_DELAY, dtype: float64

158      9.00
1891   -11.00
73     -13.00
496    -14.00
3408   -16.00
Name: ARR_DELAY, dtype: float64

In [185]:
def sMAPE_metric(actual_values, predicted_values):
    actual_predicted_absolute_sum = np.abs(actual_values) + np.abs(predicted_values)
    actual_predicted_absolute_diff = np.abs(actual_values - predicted_values)
    sMAPE = np.mean(actual_predicted_absolute_diff / actual_predicted_absolute_sum)
    return sMAPE

In [186]:
gb = GradientBoostingRegressor(random_state=50, min_samples_leaf = 2, min_samples_split = 6, max_depth = 5)

gb = gb.fit(X_train, y_train) 
test_output_gb = pd.DataFrame(gb.predict(X_test), index = X_test.index, columns = ['pred_ARR_DELAY'])
test_output_gb = test_output_gb.merge(y_test, left_index = True, right_index = True)
test_output_gb.head()
mean_absolute_error_gb = abs(test_output_gb['pred_ARR_DELAY'] - test_output_gb['ARR_DELAY']).mean()
print(f"GradientBoostingRegressor mean absolute error is: {mean_absolute_error_gb:.2f}")
print(f"GradientBoostingRegressor error ratio: {abs(test_output_gb['pred_ARR_DELAY'] - test_output_gb['ARR_DELAY']).mean()/test_output_gb['ARR_DELAY'].mean():.2f}")
test_output_gb['ARR_DELAY'].mean()

Unnamed: 0,pred_ARR_DELAY,ARR_DELAY
158,25.05,9.0
1891,2.49,-11.0
73,4.28,-13.0
496,11.3,-14.0
3408,8.11,-16.0


GradientBoostingRegressor mean absolute error is: 24.16
GradientBoostingRegressor error ratio: 3.24


7.450897571277719

In [187]:
print(f"sMAPE for GradientBoostingRegressor model: {sMAPE_metric(test_output_gb['pred_ARR_DELAY'], test_output_gb['ARR_DELAY']) * 100:.2f}")

sMAPE for GradientBoostingRegressor model: 72.28


## Trying classification

In [188]:
def categorize_delay(delay):
    if delay < -7:
        return 0 #early
    elif delay > 6:
        return 2 #late
    else:
        return 1 #ontime

In [189]:
y = flight_data['ARR_DELAY'].apply(categorize_delay)
y.value_counts()

ARR_DELAY
0    1957
2    1437
1    1338
Name: count, dtype: int64

##### Trying SMOT

In [190]:
import imblearn
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

if False:
    over = SMOTE()
    under = RandomUnderSampler()
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)

    x_smote, y_smote = pipeline.fit_resample(X, y)

    counter = Counter(y)
    for k,v in counter.items():
        per = v / len(y) * 100

    plt.bar(counter.keys(), counter.values())
    plt.xlabel("Class value distribution before sampling")
    plt.show()

    counter = Counter(y_smote)
    for k,v in counter.items():
        per = v / len(y) * 100

    plt.bar(counter.keys(), counter.values())
    plt.xlabel("Class value distribution before sampling")
    plt.show()

    X_train, X_test, y_train, y_test = train_test_split(x_smote, y_smote, stratify=y_smote, test_size=0.20, random_state = 21)
else:
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.20, random_state = 21)



In [192]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'min_child_weight': [1, 2, 3],
    'learning_rate': [0.1, 0.01, 0.005],
    'max_depth': [1, 2, 3, 4]
}

xgb_classifier = XGBClassifier(
    n_estimators=600,
    reg_lambda=0.007
)

grid_search = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=param_grid,
    scoring='accuracy',  # Consider other metrics such as 'f1_macro' for imbalanced classes
    cv=7,
    verbose=1
)


grid_search.fit(X, y)


print("Best parameters for the XGBClassifier model:", grid_search.best_params_)
best_score = grid_search.best_score_ * 100
print(f"Best Accuracy: {best_score:.4f}%")


for i in range(len(grid_search.cv_results_['mean_test_score'])):
    params = grid_search.cv_results_['params'][i]
    mean_test_score = grid_search.cv_results_['mean_test_score'][i]
    std_deviation_score = grid_search.cv_results_['std_test_score'][i]
    print(f"Parameters selected: {params}, Mean Accuracy: {mean_test_score*100:.4f}%, Standard Deviation: {std_deviation_score*100:.4f}%")


Fitting 7 folds for each of 36 candidates, totalling 252 fits


Best parameters for the XGBClassifier model: {'learning_rate': 0.01, 'max_depth': 1, 'min_child_weight': 2}
Best Accuracy: 43.4911%
Parameters selected: {'learning_rate': 0.1, 'max_depth': 1, 'min_child_weight': 1}, Mean Accuracy: 38.5883%, Standard Deviation: 4.7820%
Parameters selected: {'learning_rate': 0.1, 'max_depth': 1, 'min_child_weight': 2}, Mean Accuracy: 39.4336%, Standard Deviation: 4.1944%
Parameters selected: {'learning_rate': 0.1, 'max_depth': 1, 'min_child_weight': 3}, Mean Accuracy: 39.3491%, Standard Deviation: 4.0674%
Parameters selected: {'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 1}, Mean Accuracy: 36.4751%, Standard Deviation: 4.8103%
Parameters selected: {'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 2}, Mean Accuracy: 36.7075%, Standard Deviation: 4.5143%
Parameters selected: {'learning_rate': 0.1, 'max_depth': 2, 'min_child_weight': 3}, Mean Accuracy: 36.9822%, Standard Deviation: 3.8217%
Parameters selected: {'learning_rate': 0.1, 'm

In [None]:
def lineplot(learning_rate_range, train, test, title):
    plt.plot(learning_rate_range, test, c='m', label='Test')
    plt.plot(learning_rate_range, train, c='orange', label='Train')
    plt.xlabel('Learning rate')
    plt.xticks(learning_rate_range)
    plt.ylabel('Accuracy score')
    plt.ylim(0.5, 1)
    plt.legend(prop={'size': 12}, loc=3)
    plt.title(title, size=14)

#### xgBoost

In [None]:
if False:
    # XGBoost (different learning rate)
    learning_rate_range = np.arange(0.01, 1, 0.05)
    test = [] 
    train = []
    for lr in learning_rate_range:
        xgb_classifier = xgb.XGBClassifier(eta = lr)
        _ =xgb_classifier.fit(X_train, y_train)
        train.append(xgb_classifier.score(X_train, y_train))
        test.append(xgb_classifier.score(X_test, y_test))
    # Line plot
    fig = plt.figure(figsize=(10, 7))
    lineplot(learning_rate_range, train, test, title='Accuracy score vs. Learning rate of XGBoost')
    plt.show()

    # Resolve overfitting 
    # new learning rate range
    learning_rate_range = np.arange(0.01, 0.5, 0.05)
    fig = plt.figure(figsize=(19, 17))
    idx = 1
    # grid search for min_child_weight
    for weight in np.arange(0, 4.5, 0.5):
        train = []
        test = []
        for lr in learning_rate_range:
            xgb_classifier = xgb.XGBClassifier(eta = lr, reg_lambda=1, min_child_weight=weight)
            _ = xgb_classifier.fit(X_train, y_train)
            train.append(xgb_classifier.score(X_train, y_train))
            test.append(xgb_classifier.score(X_test, y_test))
        fig.add_subplot(3, 3, idx)
        idx += 1
        title = "Min child weight:" + str(weight)
        lineplot(learning_rate_range, train ,test, title)
    plt.show()

In [None]:
xg_model = xgb.XGBClassifier(eta = '0.08', max_depth=5, min_child_weight=1, reg_lambda=0.007)
xg_model = xg_model.fit(X_train, y_train) 
xg_model.score(X_train, y_train) 

# gb.feature_importances_
feat_imp = pd.Series(xg_model.feature_importances_, X_train.columns.values).sort_values(ascending=False)

feat_imp_table = pd.DataFrame(feat_imp)
feat_imp_table = feat_imp_table.reset_index()
feat_imp_table.columns = ['Features', 'Values']
feat_imp.head(20)

test_output = pd.DataFrame(xg_model.predict(X_test), index = X_test.index, columns = ['pred_Y'])
test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()
print('Fraction of correct classification ')
xg_model.score(X_test, y_test) 
test_output['pred_Y'].value_counts()

0.7101717305151916

MKT_UNIQUE_CARRIER_DL        0.07
OP_UNIQUE_CARRIER_B6         0.05
ORGIN_WTH_snow               0.03
OP_UNIQUE_CARRIER_UA         0.03
OP_UNIQUE_CARRIER_MQ         0.03
OP_UNIQUE_CARRIER_OO         0.03
MONTH_5                      0.03
FORMER_FLIGHT_STATUS_early   0.02
MONTH_1                      0.02
ORIGIN_ORD                   0.02
ARR_MINUTES                  0.02
MKT_UNIQUE_CARRIER_AA        0.02
ORGIN_WTH_visibility         0.02
MONTH_8                      0.02
MONTH_11                     0.02
DAY_OF_WEEK_3                0.02
MONTH_7                      0.02
ORGIN_WTH_precip             0.02
OP_UNIQUE_CARRIER_WN         0.02
DEP_MINUTES                  0.02
dtype: float32

Unnamed: 0,pred_Y,ARR_DELAY
2237,0,0
2429,0,2
2706,1,2
2526,0,1
2438,0,2


Fraction of correct classification 


0.4941921858500528

pred_Y
0    579
2    226
1    142
Name: count, dtype: int64

#### GradientBoosting

In [None]:
gb = GradientBoostingClassifier(random_state=50, min_samples_split = 2, min_samples_leaf = 6, max_depth = 7, n_estimators = 200)
gb = gb.fit(X_train, y_train) 
gb.score(X_train, y_train) 

# gb.feature_importances_
feat_imp = pd.Series(gb.feature_importances_, X_train.columns.values).sort_values(ascending=False)

feat_imp_table = pd.DataFrame(feat_imp)
feat_imp_table = feat_imp_table.reset_index()
feat_imp_table.columns = ['Features', 'Values']
feat_imp.head(20)

test_output = pd.DataFrame(gb.predict(X_test), index = X_test.index, columns = ['pred_Y'])
test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()
print('Fraction of correct classification ')
gb.score(X_test, y_test) 
test_output['pred_Y'].value_counts()

0.9994715984147953

ORGIN_WTH_winddir            0.10
ORGIN_WTH_windspeed          0.10
DEST_WTH_winddir             0.09
DEST_WTH_windspeed           0.09
ARR_MINUTES                  0.07
DAY                          0.07
DEP_MINUTES                  0.07
ORGIN_WTH_cloudcover         0.07
DEST_WTH_cloudcover          0.06
DEST_WTH_visibility          0.03
ORGIN_WTH_visibility         0.02
DEST_WTH_precip              0.02
ORGIN_WTH_precip             0.01
FORMER_FLIGHT_STATUS_early   0.01
MKT_UNIQUE_CARRIER_DL        0.01
OP_UNIQUE_CARRIER_UA         0.01
OP_UNIQUE_CARRIER_B6         0.01
DAY_OF_WEEK_1                0.01
DAY_OF_WEEK_5                0.01
ORGIN_WTH_snow               0.01
dtype: float64

Unnamed: 0,pred_Y,ARR_DELAY
2237,1,0
2429,0,2
2706,2,2
2526,0,1
2438,0,2


Fraction of correct classification 


0.4699049630411827

pred_Y
0    487
2    278
1    182
Name: count, dtype: int64

##### Random Forest

In [None]:
rf = RandomForestClassifier(max_features='sqrt', min_samples_leaf=4)
rf = rf.fit(X_train, y_train)
rf.score(X_train, y_train)

feat_imp = pd.Series(rf.feature_importances_, X_train.columns.values).sort_values(ascending=False)
feat_imp.head(20)

test_output = pd.DataFrame(rf.predict(X_test), index = X_test.index, columns = ['pred_Y'])

test_output.head()
test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()
print('Fraction of correct classification ')
rf.score(X_test, y_test) 
test_output['pred_Y'].value_counts()

0.8826948480845442

ORGIN_WTH_windspeed            0.08
DEST_WTH_winddir               0.08
ORGIN_WTH_winddir              0.08
ARR_MINUTES                    0.08
DEP_MINUTES                    0.07
DEST_WTH_windspeed             0.07
ORGIN_WTH_cloudcover           0.07
DAY                            0.07
DEST_WTH_cloudcover            0.06
DEST_WTH_visibility            0.03
ORGIN_WTH_visibility           0.03
FORMER_FLIGHT_STATUS_early     0.02
DEST_WTH_precip                0.01
ORGIN_WTH_precip               0.01
OP_UNIQUE_CARRIER_B6           0.01
MKT_UNIQUE_CARRIER_DL          0.01
FORMER_FLIGHT_STATUS_late      0.01
MKT_UNIQUE_CARRIER_B6          0.01
FORMER_FLIGHT_STATUS_on-time   0.01
DAY_OF_WEEK_1                  0.01
dtype: float64

Unnamed: 0,pred_Y
2237,0
2429,0
2706,0
2526,0
2438,0


Unnamed: 0,pred_Y,ARR_DELAY
2237,0,0
2429,0,2
2706,0,2
2526,0,1
2438,0,2


Fraction of correct classification 


0.5068637803590285

pred_Y
0    581
2    256
1    110
Name: count, dtype: int64