In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
import scipy
import pylab
import time

import seaborn as sns

# Global plot appearance: apply seaborn's "whitegrid" style and "poster"
# context to every figure drawn after this cell.
# (rcParams is imported here but not used in the visible cells.)
from matplotlib import rcParams
sns.set_style(style="whitegrid")
sns.set_context(context="poster")

In [2]:
# Read the flight records from the CSV located in the working directory.
df = pd.read_csv(filepath_or_buffer="Encoded_last.csv")

In [3]:
# Remove the 'Unnamed: 0' column (presumably an index saved into the CSV).
# Re-assigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: the in-place version raised KeyError on a
# second execution because the column was already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# Show the first five rows as a quick sanity check of the loaded frame.
df.head(n=5)

Unnamed: 0,DATE,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,ORG_apparentTemperature,...,ARRIVAL_DELAY,DIVERTED,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,IsHoliday,Holiday,CANCELLED
0,2015-02-07 00:00:00,2015,2,7,6,OO,6222,N766SK,DFW,68.56,...,2.0,0.0,,,,,,0,02/07/2015,0.0
1,2015-11-13 00:00:00,2015,11,13,5,UA,342,N808UA,ORD,32.95,...,-13.0,0.0,,,,,,0,11/13/2015,0.0
2,2015-11-26 00:00:00,2015,11,26,4,NK,245,N528NK,ORD,57.9,...,3.0,0.0,,,,,,1,Thanksgiving Day,0.0
3,2015-09-04 00:00:00,2015,9,4,5,AA,375,N4XHAA,DEN,83.4,...,-11.0,0.0,,,,,,0,09/04/2015,0.0
4,2015-05-24 00:00:00,2015,5,24,7,US,603,N552UW,DFW,77.91,...,12.0,0.0,,,,,,0,05/24/2015,0.0


In [5]:
# Target: the CANCELLED flag. Features: every remaining column.
# NOTE(review): the preview above lists columns such as ARRIVAL_DELAY and
# DIVERTED, which look like they are only known after the flight — confirm
# they are meant to be used as predictors.
y = df['CANCELLED']
X = df.drop(columns=['CANCELLED'])

In [1]:
# Fill missing values with the per-column mean (the SimpleImputer default).
# sklearn.preprocessing.Imputer was removed from scikit-learn (0.22);
# sklearn.impute.SimpleImputer is its replacement and, with default arguments,
# performs the same mean imputation of NaN entries.
# fit_transform returns an array, so X stops being a DataFrame after this cell.
# NOTE(review): mean imputation requires numeric columns — confirm that every
# column of X is numeric at this point.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer()
X = imputer.fit_transform(X)

In [None]:
# Shrink the working set to a stratified 10% sample: only the "test" side of
# the split is kept, the remaining 90% is thrown away.
# Note that X and y are overwritten, so re-running this cell shrinks them again.
from sklearn.model_selection import train_test_split
_, X, _, y = train_test_split(
    X, y,
    test_size=0.1,
    random_state=30,
    stratify=y,
)

In [None]:
# Oversample the minority class with SMOTE so both classes are equally frequent.
# - `ratio=` was renamed to `sampling_strategy=` and `fit_sample` to
#   `fit_resample` in imbalanced-learn; the old names no longer exist in
#   current releases.
# - random_state is fixed so the synthetic samples are reproducible.
# NOTE(review): the train/test split further down is performed on X_sm/y_sm,
# i.e. after oversampling, so synthetic rows end up in the test set.
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy='minority', random_state=30)
X_sm, y_sm = smote.fit_resample(X, y)


In [None]:
# Class counts (label 1 first, then label 0): oversampled labels, then the
# labels before oversampling.
for labels in (y_sm, y):
    for cls in (1, 0):
        print(len(labels[labels == cls]))

In [None]:
# Stratified 70/30 train/test split of the SMOTE-resampled data.
# NOTE(review): because the split happens after oversampling, the test set
# contains synthetic rows — confirm this is intended.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_sm, y_sm,
    test_size=0.3,
    random_state=30,
    stratify=y_sm,
)
# Alternative kept for reference: split the data without oversampling.
#X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=30, stratify=y)

In [None]:
# Exhaustive grid search over random-forest hyper-parameters, scored by
# average precision with 3-fold cross-validation on the training split.
# NOTE(review): a RandomForestRegressor is fitted on the 0/1 CANCELLED target;
# confirm a regressor (rather than a classifier) is really intended.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
rf = RandomForestRegressor(random_state = 42)
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 100],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 5],
    'min_samples_split': [8, 10],
    'n_estimators': [100, 200]
}

# The `iid` argument was removed from GridSearchCV (scikit-learn 0.24) and
# passing it raises TypeError there; iid=False is the behaviour that remains,
# so dropping the argument keeps the scores unchanged.
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                          cv = 3, n_jobs = -1, verbose = 2, scoring='average_precision')
grid_search.fit(X_train, y_train)

In [None]:
%%time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline as pl
from imblearn.under_sampling import RandomUnderSampler
# Steps for pipeline
steps = [("rus", RandomUnderSampler(random_state=30)),
         ("model", RandomForestClassifier(random_state=40))]
pipe = pl(steps)

# Parameters
param_grid = dict(model__n_estimators = [50],
                  model__max_features = ['sqrt', 'log2'],
                  model__min_samples_leaf = [5, 10, 15]
                  #model__class_weight = ['balanced', {0:0.05, 1:0.95}, {0:0.1, 1:0.9}]
                  )
# For CV in grid search
cv = 2
# Setting up the grid search
RFrus = GridSearchCV(pipe, param_grid = param_grid, 
                           verbose = 3,
                           cv = cv,
                          n_jobs=-1,
                          scoring='average_precision',
                          iid=False)
# Training using CV
RFrus.fit(X_train, y_train)

In [None]:
# Report the winning hyper-parameters and the cross-validated score of the
# `grid_search` object.
for caption, value in (
    ("best params is : ", grid_search.best_params_),
    ("best score is : ", grid_search.best_score_),
):
    print(caption, value)

In [None]:
from sklearn.metrics import roc_curve, auc, precision_recall_curve, cohen_kappa_score
from sklearn import metrics

In [None]:
def evaluate(model, test_features, test_labels):
    """Print and return a MAPE-based accuracy score for ``model``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    test_features : array-like
        Passed unchanged to ``model.predict``.
    test_labels : array-like
        True target values, one per prediction.

    Returns
    -------
    float
        ``100 - MAPE``, where MAPE is the mean absolute percentage error taken
        over the samples whose true label is non-zero. ``nan`` when every
        label is zero, since the percentage error is undefined in that case.
    """
    predictions = np.asarray(model.predict(test_features), dtype=float)
    labels = np.asarray(test_labels, dtype=float)
    errors = np.abs(predictions - labels)

    # Dividing by a zero label yields inf/nan, which used to turn the whole
    # score into -inf or nan as soon as a single label was 0 (always the case
    # for a 0/1 target). Restrict the percentage error to non-zero labels.
    nonzero = labels != 0
    if nonzero.any():
        mape = 100 * np.mean(errors[nonzero] / labels[nonzero])
    else:
        mape = np.nan
    accuracy = 100 - mape
    print('Model Performance')
    # The mean absolute error covers all samples; no unit is printed because
    # the target's unit is not known inside this helper.
    print('Average Error: {:0.4f}.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy

In [None]:
# Score the refitted best estimator of `grid_search` on the held-out split.
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(
    model=best_grid,
    test_features=X_test,
    test_labels=y_test,
)

In [None]:
# Hard class predictions and per-class probabilities from the tuned
# undersampling pipeline (RFrus) on the held-out split.
y_pred_test = RFrus.predict(X=X_test)
y_pred_test_prob = RFrus.predict_proba(X=X_test)

In [None]:
# Classification metrics of the tuned undersampling pipeline on the test split.
y_pred_test = RFrus.predict(X_test)
# The Brier score measures the error of predicted *probabilities*; feeding it
# hard 0/1 predictions (as before) reduces it to the misclassification rate.
# Column 1 of predict_proba is used as the positive-class probability, the
# same convention as the ROC cell below.
y_prob_positive = RFrus.predict_proba(X_test)[:, 1]
print("#####################")
print("Test data")
print("#####################")
print("F1: ", metrics.f1_score(y_test, y_pred_test))
print("Cohen Kappa: ", metrics.cohen_kappa_score(y_test, y_pred_test))
print("Brier: ", metrics.brier_score_loss(y_test, y_prob_positive))
#print("LogLoss: ", metrics.log_loss(y_test, y_pred_test_prob))
print(metrics.classification_report(y_test, y_pred_test))
# Kept for the confusion-matrix plot in the next cell.
conf_matrix = metrics.confusion_matrix(y_test, y_pred_test)

In [None]:
# Draw the confusion matrix as a light grey heat map with the count written
# in the centre of every cell.
fig, ax = plt.subplots(figsize=(8, 8))
ax.matshow(conf_matrix, alpha=0.3, cmap=plt.cm.gray_r)
for (row, col), count in np.ndenumerate(conf_matrix):
    ax.text(x=col, y=row, s=count, va='center', ha='center', fontsize=24, color='k')
ax.set_title('Confusion Matrix ', size=20)
ax.set_xlabel('Predicted label', size=20)
ax.set_ylabel('True label', size=20)
ax.tick_params(axis='both', labelsize=20)
fig.tight_layout()

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# ROC curve of the positive-class probabilities, drawn together with the
# chance diagonal and the ideal (perfect classifier) curve for reference.
fig1, ax1 = plt.subplots(figsize=(8, 8))
fpr, tpr, thresholds = roc_curve(y_test, y_pred_test_prob[:, 1])
roc_auc = auc(fpr, tpr)

# Model curve, labelled with its area under the curve.
ax1.plot(fpr, tpr, lw=1, label='Model Performance (area = %0.2f)' % (roc_auc))
# Diagonal: performance of random guessing.
ax1.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
# Upper-left corner path: a perfect classifier.
ax1.plot([0, 0, 1], [0, 1, 1], lw=2, linestyle=':', color='black',
         label='Perfect Performance')

# Small margin around [0, 1] so the curves are not clipped by the frame.
ax1.set_xlim([-0.02, 1.02])
ax1.set_ylim([-0.02, 1.02])
ax1.set_xlabel('False Positive Rate', size=20)
ax1.set_ylabel('True Positive Rate', size=20)
ax1.set_title('Receiver Operating Characteristic', size=20)
plt.xticks(size = 20)
plt.yticks(size = 20)
ax1.legend(loc="lower right", fontsize=18)
   