In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import sklearn
import scipy
import pylab
import time

import seaborn as sns

# rcParams is imported for matplotlib tweaking; none of the cells shown here reference it.
from matplotlib import rcParams
# Global seaborn styling for every figure below: white background with grid lines,
# and the "poster" plotting context (one of seaborn's preset element-scaling contexts).
sns.set_style("whitegrid")
sns.set_context("poster")

In [None]:
# Read the dataset from Encoded_last.csv (path is relative to the notebook's working directory).
df = pd.read_csv("Encoded_last.csv")

In [None]:
# Remove the 'Unnamed: 0' column.
# NOTE(review): presumably a stray index column written by an earlier to_csv — confirm.
# Rebinding df (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: re-running it after the column is already gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Preview the first rows of the frame as a quick sanity check.
df.head()

In [None]:
# Target vector: the CANCELLED column. Feature matrix: every other column.
y = df['CANCELLED']
X = df.drop(columns=['CANCELLED'])

In [None]:
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed in 0.22;
# sklearn.impute.SimpleImputer is its replacement. Prefer the new class and fall back to
# the old one only on installs that predate it, so the cell runs on either.
try:
    from sklearn.impute import SimpleImputer
except ImportError:  # scikit-learn < 0.20
    from sklearn.preprocessing import Imputer as SimpleImputer

# Default strategy for both classes: replace missing values with the column mean.
imputer = SimpleImputer()
# NOTE(review): the imputer is fit on all rows before any train/test split, so the
# imputed means include information from rows that later land in the test set —
# consider fitting on the training rows only.
# X is rebound to the transformed output from here on.
X = imputer.fit_transform(X)

In [None]:
from sklearn.model_selection import train_test_split

# Shrink the working set to 10% of the rows: run a stratified split and keep only the
# 10% "test" partition, discarding the 90% "train" partition.
# X and y are overwritten, so re-running this cell shrinks the data again.
_, X, _, y = train_test_split(
    X, y,
    test_size=0.1,
    stratify=y,
    random_state=30,
)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class with synthetic samples.
# SMOTE is stochastic, so the seed is fixed (same value as the splits in this notebook)
# to make X_sm / y_sm, and everything fitted on them, repeatable across runs.
# NOTE(review): `ratio` and `fit_sample` are the older imbalanced-learn spellings
# (`sampling_strategy` / `fit_resample` in newer releases); left as-is for the
# version this notebook was written against.
smote = SMOTE(ratio='minority', random_state=30)
X_sm, y_sm = smote.fit_sample(X, y)


In [None]:
# Class balance check: for the resampled labels first, then the original labels,
# print the number of class-1 rows followed by the number of class-0 rows.
for labels in (y_sm, y):
    for cls in (1, 0):
        print(len(labels[labels == cls]))

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 70/30 train/test split of the SMOTE-resampled data, with a fixed seed.
# NOTE(review): the resampling happens before this split, so the test partition can
# contain synthetic samples — confirm this is intended before trusting the test metrics.
# Alternative that splits the non-resampled data instead:
#X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=30, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(
    X_sm, y_sm,
    test_size=0.3,
    stratify=y_sm,
    random_state=30,
)

In [None]:
%%time
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline as pl
from imblearn.under_sampling import RandomUnderSampler
# Steps for pipeline
steps = [("scaler", MinMaxScaler(feature_range=(0,1))),
         ("model", LogisticRegression(random_state=40))]
pipeline = pl(steps)

# Parameters
param_grid = dict(model__C = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
                  model__penalty = ['l1', 'l2'],
                  model__class_weight = ['balanced'])

# Setting up the grid search
LRcw = GridSearchCV(pipeline, param_grid = param_grid, 
                           verbose = 3,
                           cv = 5,
                          n_jobs=-1,
                          scoring='average_precision',
                          iid=False)
# Training using CV
LRcw.fit(X_train, y_train)

In [None]:
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, precision_recall_curve, cohen_kappa_score

# Report the hyper-parameters selected by the grid search and their best CV score.
for caption, value in (("best params is : ", LRcw.best_params_),
                       ("best score is : ", LRcw.best_score_)):
    print(caption, value)

In [None]:
from sklearn.metrics import roc_curve, auc, precision_recall_curve, cohen_kappa_score
from sklearn import metrics
print("best params is : ",LRcw.best_params_)
print("best score is : ", LRcw.best_score_)
# Hard class predictions and per-class probabilities on the held-out split.
# Column 1 of the probability matrix is the positive class (label 1).
y_pred_test = LRcw.predict(X_test)
y_pred_test_prob = LRcw.predict_proba(X_test)
print("#####################")
print("Test data")
print("#####################")
print("F1: ", metrics.f1_score(y_test, y_pred_test))
print("Cohen Kappa: ", metrics.cohen_kappa_score(y_test, y_pred_test))
# The Brier score is the mean squared error of the predicted *probability* of the
# positive class. Passing hard 0/1 predictions would merely reproduce the
# misclassification rate, so the positive-class probabilities are used here.
print("Brier: ", metrics.brier_score_loss(y_test, y_pred_test_prob[:, 1]))
print("LogLoss: ", metrics.log_loss(y_test, y_pred_test_prob))
print(metrics.classification_report(y_test, y_pred_test))
# Kept for the confusion-matrix plot in the next cell
conf_matrix = metrics.confusion_matrix(y_test, y_pred_test)

In [None]:
# Draw the confusion matrix as a lightly shaded grid with the count written in each cell.
fig, ax = plt.subplots(figsize=(8, 8))
ax.matshow(conf_matrix, cmap=plt.cm.gray_r, alpha=0.3)

# Row index = true label (y position), column index = predicted label (x position)
for (row, col), count in np.ndenumerate(conf_matrix):
    ax.text(x=col, y=row, s=count,
            ha='center', va='center',
            fontsize=24, color='k')

ax.set_title('Confusion Matrix ', size=20)
ax.set_xlabel('Predicted label', size=20)
ax.set_ylabel('True label', size=20)
ax.tick_params(axis='both', labelsize=20)
fig.tight_layout()

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# ROC curve for the positive class on the test split, drawn against two references:
# the chance diagonal and the ideal top-left corner path.
fig1, ax1 = plt.subplots(figsize=(8, 8))
fpr, tpr, thresholds = roc_curve(y_test, y_pred_test_prob[:, 1])
roc_auc = auc(fpr, tpr)

ax1.plot(fpr, tpr, lw=1, label='Model Performance (area = %0.2f)' % (roc_auc))
ax1.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
ax1.plot([0, 0, 1], [0, 1, 1], lw=2, linestyle=':', color='black',
         label='Perfect Performance')

# Small margin around the unit square so curves on the border stay visible
ax1.set_xlim([-0.02, 1.02])
ax1.set_ylim([-0.02, 1.02])
ax1.set_xlabel('False Positive Rate', size=20)
ax1.set_ylabel('True Positive Rate', size=20)
ax1.set_title('Receiver Operating Characteristic', size=20)
plt.xticks(size = 20)
plt.yticks(size = 20)
ax1.legend(loc="lower right", fontsize=18)
   

In [None]:
# PR curve: precision against recall for the positive class on the test split.
fig2, ax2 = plt.subplots(figsize=(8, 8))
prec, recall, thresholds = precision_recall_curve(y_test, y_pred_test_prob[:, 1])
# The "area" in the legend is the average precision score, not a trapezoidal auc().
plt.plot(recall, prec, lw=1, label='Model Performance (area = %0.2f)' % 
             (metrics.average_precision_score(y_test, y_pred_test_prob[:, 1])))    
plt.xlim([-0.02, 1.02])
plt.ylim([-0.02, 1.02])
plt.xlabel('Recall', size=20)
plt.ylabel('Precision', size=20)
# The original title 'PR Curve for' ended on a dangling preposition.
plt.title('PR Curve', size=20)
plt.xticks(size = 20)
plt.yticks(size = 20)
# "top right" is not a valid legend location; depending on the Matplotlib version it
# is either ignored with a warning or raises ValueError. The valid name is "upper right".
plt.legend(loc="upper right", fontsize=18)