In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import requests
from datetime import datetime
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from xgboost import XGBClassifier
from xgboost import plot_tree


In [2]:
SAMPLE_SET = pd.read_csv('SAMPLE_SET.csv')
SAMPLE_SET = SAMPLE_SET.iloc[0:1000].copy()
SAMPLE_SET.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 41 columns):
Unnamed: 0             1000 non-null int64
DATETIME               1000 non-null object
MONTH                  1000 non-null int64
DAY                    1000 non-null int64
DAY_OF_WEEK            1000 non-null int64
AIRLINE                1000 non-null object
FLIGHT_NUMBER          1000 non-null int64
TAIL_NUMBER            1000 non-null object
DESTINATION_AIRPORT    1000 non-null object
SCHEDULED_DEPARTURE    1000 non-null int64
DEPARTURE_TIME         1000 non-null float64
TAXI_OUT               1000 non-null float64
WHEELS_OFF             1000 non-null float64
SCHEDULED_TIME         1000 non-null float64
ELAPSED_TIME           1000 non-null float64
AIR_TIME               1000 non-null float64
DISTANCE               1000 non-null int64
WHEELS_ON              1000 non-null float64
TAXI_IN                1000 non-null float64
SCHEDULED_ARRIVAL      1000 non-null int64
ARRIVAL_TIME  

In [3]:
# train test split for only numerical columns (XGB and random forest)

y = SAMPLE_SET['DELAYED']
X = SAMPLE_SET.drop(['DELAYED', 'DATETIME', 'DATE_FORMAT', 'Unnamed: 0'], axis = 1)
scaler = StandardScaler()
cat_cols_list = ['summary', 'icon', 'DESTINATION_AIRPORT','TAIL_NUMBER', 'AIRLINE']
num_cols_df = X[X.columns.difference(cat_cols_list)]
num_cols_df = pd.DataFrame(scaler.fit_transform(num_cols_df),index=num_cols_df.index, columns=num_cols_df.columns)


X_train, X_test, y_train, y_test = train_test_split(num_cols_df, y, test_size = 0.25, random_state = 123)

In [4]:
clf_xgb = xgb.XGBClassifier()
clf_xgb.fit(X_train, y_train)
training_preds_xgb = clf_xgb.predict(X_train)
test_preds_xgb = clf_xgb.predict(X_test)
training_accuracy_xgb = accuracy_score(y_train, training_preds_xgb)
test_accuracy_xgb = accuracy_score(y_test, test_preds_xgb)

# default values for xgb

print(y_test)
print(test_preds_xgb)
print(training_accuracy_xgb)
print(test_accuracy_xgb)

131    0.0
203    0.0
50     1.0
585    0.0
138    1.0
      ... 
653    0.0
11     0.0
252    1.0
521    1.0
227    0.0
Name: DELAYED, Length: 250, dtype: float64
[0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0.
 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 1. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0.
 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1. 1. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0.
 1. 0. 1. 1. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 1. 0.]
0.9346666666666666
0.796


In [5]:
# xgb with param grid

xgb_param_grid = {'learning_rate':[0.1, 0.3], 
              'max_depth': [6], 
              'min_child_weight':[1], 
              'subsample':[1],
              'n_estimators':[9,5,2]}

xgb_grid_clf = GridSearchCV(clf_xgb, xgb_param_grid, scoring = 'accuracy', cv = None, n_jobs = 1)
xgb_grid_clf.fit(num_cols_df, y)
xgb_best_parameters = xgb_grid_clf.best_params_
print('Grid seach found the following optimal parameters: ')
for param_name in sorted(xgb_best_parameters.keys()):
    print('%s:%r'%(param_name, xgb_best_parameters[param_name]))

training_preds_xgb_grid = xgb_grid_clf.predict(X_train)
test_preds_xgb_grid = xgb_grid_clf.predict(X_test)
training_accuracy_xgb_grid = accuracy_score(y_train, training_preds_xgb_grid)
test_accuracy_xgb_grid = accuracy_score(y_test, test_preds_xgb_grid)
print('Training Accuracy: ', training_accuracy_xgb_grid)
print('Test Accuracy: ', test_accuracy_xgb_grid)



print(y_test)
print(test_preds_xgb_grid)
print(training_accuracy_xgb_grid)
print(test_accuracy_xgb_grid)



Grid seach found the following optimal parameters: 
learning_rate:0.1
max_depth:6
min_child_weight:1
n_estimators:9
subsample:1
Training Accuracy:  0.8946666666666667
Test Accuracy:  0.912
131    0.0
203    0.0
50     1.0
585    0.0
138    1.0
      ... 
653    0.0
11     0.0
252    1.0
521    1.0
227    0.0
Name: DELAYED, Length: 250, dtype: float64
[0. 0. 1. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0.
 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 1. 0. 1. 1. 0. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0.
 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0.
 1. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.



In [6]:
pip install lime 

Note: you may need to restart the kernel to use updated packages.


In [None]:
from lime import lime_text
from sklearn.pipeline import make_pipeline
c = make_pipeline(vectorizer, xgb_grid_clf)

print(c.predict_proba([X_test.num_cols_df[0]]))
