In [0]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import jaccard_similarity_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier

In [0]:
# Read the merged dataset CSV into `df`.
# NOTE(review): the relative "drive/My Drive" path presumably requires a Colab
# session with Google Drive mounted in the working directory -- confirm.
DATA_PATH = "drive/My Drive/temp_merged.csv"
df = pd.read_csv(DATA_PATH)

In [0]:
# Columns removed from `df` before the correlation analysis and modelling.
UNUSED_COLUMNS = ['Unnamed: 0', 'FlightDate', 'Quarter', 'Year', 'Month', 'DayofMonth',
                  'Origin', 'Dest', 'ArrTime']

# `df` is rebound to the reduced frame, so a second run of this cell would
# look for columns that are already gone and raise KeyError.
# errors="ignore" makes the cell safe to re-run.
df = df.drop(columns=UNUSED_COLUMNS, errors="ignore")

# Target: the ArrDel15 column.
y = df['ArrDel15']

# Features: every remaining column except the target and ArrDelayMinutes.
# NOTE(review): ArrDelayMinutes is presumably excluded because the label is
# derived from it (target leakage) -- confirm against the data dictionary.
X = df.drop(columns=['ArrDel15', 'ArrDelayMinutes'])

In [0]:
# Feature set for training: display the column labels of X.
X.columns

In [0]:
# Heatmap of the pairwise correlations between all columns of `df`
# (features and target). `cor` is reused by the next cell.
cor = df.corr()

fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds, ax=ax)
plt.show()

In [0]:
# Absolute correlation of every column with the target ArrDel15, keeping only
# those above 0.03 and listing them from weakest to strongest.
corr_y = cor['ArrDel15'].abs()
highest_corr = corr_y.loc[corr_y > 0.03]
highest_corr.sort_values()

SPLITTING INTO TEST AND TRAIN SETS

In [0]:
# 70/30 train/test split.
# random_state pins the shuffle so every model below is trained and scored on
# the same rows across kernel restarts; without it the reported metrics are
# not comparable between runs.
# stratify=y keeps the delayed / not-delayed ratio the same in both partitions.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42, stratify=y
)

FEATURE SELECTION

In [0]:
# Univariate feature selection with SelectKBest (chi2) -- currently DISABLED:
# everything below is inside a string literal, so none of it executes and
# x_train / x_test are left unchanged by this cell.
# NOTE(review): if this is re-enabled, the scaler and selector are fit on the
# full X, y rather than x_train / y_train, so test rows would influence the
# selected features. SelectKBest is also given k=15 while nlargest() takes 21
# rows -- confirm which number is intended.
"""
min_max_scaler = preprocessing.MinMaxScaler()
Scaled_X = min_max_scaler.fit_transform(X)

bestfeatures = SelectKBest(chi2, k=15)
fit = bestfeatures.fit(Scaled_X,y)

dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
#concat two dataframes for better visualization 
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']  #naming the dataframe columns

feats = list(featureScores.nlargest(21,'Score').Specs)

x_train = x_train[feats]
x_test = x_test[feats]

"""

In [0]:
# Recursive feature elimination (RFE) with a logistic-regression estimator --
# currently DISABLED: everything below is inside a string literal, so none of
# it executes and x_train / x_test are left unchanged by this cell.
# NOTE(review): if this is re-enabled, the selector is fit on the full X, y
# rather than x_train / y_train, so test rows would influence the selected
# features.
"""

estimator = linear_model.LogisticRegression(max_iter=1000) 
selector = RFE(estimator) 
selector = selector.fit(X, y)

k=list(selector.ranking_) 
feats=[] 
for i in range(len(k)): 
  if(k[i]==1): 
    feats.append(X.columns[i])

x_train = x_train[feats] 
x_test = x_test[feats]

"""

LOGISTIC REGRESSION

In [0]:
# Fit a logistic regression on the training split. max_iter is raised to 2000.
# NOTE(review): presumably needed because the features are not scaled and the
# default iteration budget did not converge -- confirm.
lr = linear_model.LogisticRegression(max_iter=2000)
lr.fit(x_train, y_train)

# fit() returns the estimator itself, so `clf` and `lr` are the same object.
clf = lr

In [0]:
# Predict once and reuse the result: the original called lr.predict(x_test)
# three times on the same data. A dedicated name is used so the shared
# `y_pred` of the later model cells is not touched.
lr_test_pred = lr.predict(x_test)

print("Logistic regression Train Accuracy : ", clf.score(x_train,y_train))
print("Logistic regression Test Accuracy  : ", metrics.accuracy_score(y_test, lr_test_pred))

# labels=[1,0] lists class 1 (delayed) first in the matrix.
print(confusion_matrix(y_test, lr_test_pred, labels=[1,0]))

print(classification_report(y_test, lr_test_pred))

Logistic regression Train Accuracy :  0.9162363050085532
Logistic regression Test Accuracy  :  0.9157157517598977
[[ 79757  36955]
 [  9859 428859]]
              precision    recall  f1-score   support

         0.0       0.92      0.98      0.95    438718
         1.0       0.89      0.68      0.77    116712

    accuracy                           0.92    555430
   macro avg       0.91      0.83      0.86    555430
weighted avg       0.91      0.92      0.91    555430



RANDOM FOREST CLASSIFIER

In [0]:
# Random forest with entropy as the split criterion.
# random_state makes the bootstrap / feature sampling reproducible between
# runs. n_jobs=-1 builds the trees on all available cores and does not change
# the fitted model.
rf = RandomForestClassifier(criterion='entropy', random_state=42, n_jobs=-1)
clf = rf.fit(x_train, y_train)

# Test-set predictions, consumed by the report cell that follows.
y_pred = rf.predict(x_test)

In [0]:
# Report metrics for the model currently bound to `clf` / `y_pred`
# (the random forest when the notebook is run top to bottom).
train_accuracy = clf.score(x_train, y_train)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(" Train Accuracy : ", train_accuracy)
print(" Test Accuracy  : ", test_accuracy)

# labels=[1, 0] lists class 1 first in the matrix.
conf_mat = confusion_matrix(y_test, y_pred, labels=[1, 0])
print("CONFUSION MATRIX\n", conf_mat)

report = classification_report(y_test, y_pred)
print(report)

 Train Accuracy :  0.999964506255001
 Test Accuracy  :  0.9181715067605278
CONFUSION MATRIX
 [[ 81690  35022]
 [ 10428 428290]]
              precision    recall  f1-score   support

         0.0       0.92      0.98      0.95    438718
         1.0       0.89      0.70      0.78    116712

    accuracy                           0.92    555430
   macro avg       0.91      0.84      0.87    555430
weighted avg       0.92      0.92      0.91    555430



EXTRA TREES CLASSIFIER

In [0]:
# Extra-trees ensemble with entropy as the split criterion.
# random_state makes the randomized splits reproducible between runs.
# n_jobs=-1 builds the trees on all available cores and does not change the
# fitted model.
ex = ExtraTreesClassifier(criterion='entropy', random_state=42, n_jobs=-1)
clf = ex.fit(x_train, y_train)

# Test-set predictions, consumed by the report cell that follows.
y_pred = ex.predict(x_test)

In [0]:
# Report metrics for the model currently bound to `clf` / `y_pred`
# (the extra-trees ensemble when the notebook is run top to bottom).
train_accuracy = clf.score(x_train, y_train)
print(" Train Accuracy : ", train_accuracy)

test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(" Test Accuracy  : ", test_accuracy)

# labels=[1, 0] lists class 1 first in the matrix.
conf_mat = confusion_matrix(y_test, y_pred, labels=[1, 0])
print("CONFUSION MATRIX\n", conf_mat)

print(classification_report(y_test, y_pred))

 Train Accuracy :  0.9999992283968479
 Test Accuracy  :  0.912050123327872
CONFUSION MATRIX
 [[ 86705  30007]
 [ 18843 419875]]
              precision    recall  f1-score   support

         0.0       0.93      0.96      0.95    438718
         1.0       0.82      0.74      0.78    116712

    accuracy                           0.91    555430
   macro avg       0.88      0.85      0.86    555430
weighted avg       0.91      0.91      0.91    555430



XGBOOST CLASSIFIER

In [0]:
# Gradient-boosted trees with XGBoost's default hyper-parameters.
xg = XGBClassifier()
xg.fit(x_train, y_train)

# fit() returns the estimator itself, so `clf` and `xg` are the same object.
clf = xg

# Test-set predictions, consumed by the report cell that follows.
y_pred = xg.predict(x_test)

In [0]:
# Report metrics for the model currently bound to `clf` / `y_pred`
# (the XGBoost classifier when the notebook is run top to bottom).
train_accuracy = clf.score(x_train, y_train)
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(" Train Accuracy : ", train_accuracy)
print(" Test Accuracy  : ", test_accuracy)

# labels=[1, 0] lists class 1 first in the matrix.
conf_mat = confusion_matrix(y_test, y_pred, labels=[1, 0])
print("CONFUSION MATRIX\n", conf_mat)

report = classification_report(y_test, y_pred)
print(report)

 Train Accuracy :  0.9174315182912385
 Test Accuracy  :  0.9167581873503412
CONFUSION MATRIX
 [[ 79500  37212]
 [  9023 429695]]
              precision    recall  f1-score   support

         0.0       0.92      0.98      0.95    438718
         1.0       0.90      0.68      0.77    116712

    accuracy                           0.92    555430
   macro avg       0.91      0.83      0.86    555430
weighted avg       0.92      0.92      0.91    555430



DECISION TREE CLASSIFIER

In [0]:
# Single decision tree with entropy as the split criterion.
# The estimator is bound to `dt` rather than `rf`: the original reused the
# random forest's name, which silently replaced the forest fitted earlier.
# random_state makes tie-breaking between equally good splits reproducible.
dt = DecisionTreeClassifier(criterion='entropy', random_state=42)
clf = dt.fit(x_train, y_train)

# Test-set predictions, consumed by the report cell that follows.
y_pred = dt.predict(x_test)

In [12]:
# Report metrics for the model currently bound to `clf` / `y_pred`
# (the decision tree when the notebook is run top to bottom).
train_accuracy = clf.score(x_train, y_train)
print(" Train Accuracy : ", train_accuracy)

test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(" Test Accuracy  : ", test_accuracy)

# labels=[1, 0] lists class 1 first in the matrix.
conf_mat = confusion_matrix(y_test, y_pred, labels=[1, 0])
print("CONFUSION MATRIX\n", conf_mat)

print(classification_report(y_test, y_pred))

 Train Accuracy :  0.9999992798367245
 Test Accuracy  :  0.8736699513242694
CONFUSION MATRIX
 [[ 69015  28420]
 [ 30053 335371]]
              precision    recall  f1-score   support

         0.0       0.92      0.92      0.92    365424
         1.0       0.70      0.71      0.70     97435

    accuracy                           0.87    462859
   macro avg       0.81      0.81      0.81    462859
weighted avg       0.87      0.87      0.87    462859

