In [15]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
import seaborn as sns
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN 
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import CondensedNearestNeighbour
from sklearn import svm
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier

In [None]:
df = pd.read_csv('final.csv')

for i in df.keys():
    if '_dept' in i:  ## Filtering for Arrival Features
        del df[i]

df = df.drop(['ArrDelayMinutes','DepDel15','DepDelayMinutes','airport_arr','date_arr','DepTime','ArrTime','time_arr'],1)
lb = LabelEncoder()
df['Origin'] = lb.fit_transform(df['Origin'])
df['Dest'] = lb.fit_transform(df['Dest'])

### Decision Tree Classifier

In [27]:
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)

In [28]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)

print("Test Score: "+ str(clf.score(x_test,y_test))) 

Test Score: 0.871574625105775


In [29]:
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))

precision score : 0.6855666116725098
recall score : 0.7084027010108813
f1 score : 0.6967976052454035


In [30]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.92      0.91      0.92    285352
         1.0       0.69      0.71      0.70     75083

    accuracy                           0.87    360435
   macro avg       0.80      0.81      0.81    360435
weighted avg       0.87      0.87      0.87    360435



### Over Sampling

#### SMOTE

In [31]:
sm = SMOTE()
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.30, random_state=42)
x_tr, y_tr = sm.fit_resample(x_train, y_train)

In [32]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x_tr, y_tr)
y_pred = clf.predict(x_test)

In [33]:
print("Test Score: "+ str(clf.score(x_test,y_test))) 
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))

Test Score: 0.867545357188437
precision score : 0.6743821139590508
recall score : 0.7053099386154774
f1 score : 0.6894993799699958
              precision    recall  f1-score   support

         0.0       0.92      0.91      0.92    427921
         1.0       0.67      0.71      0.69    112732

    accuracy                           0.87    540653
   macro avg       0.80      0.81      0.80    540653
weighted avg       0.87      0.87      0.87    540653



#### ADAYSN

In [10]:
sm = ADASYN()
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)
x_train, y_train = sm.fit_sample(x_train, y_train)

In [11]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)

print("Test Score: "+ str(clf.score(x_test,y_test))) 

Test Score: 0.7285779682883183


In [12]:
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))

precision score : 0.3631161383600423
recall score : 0.39692152409790565
f1 score : 0.37926702367960213
              precision    recall  f1-score   support

         0.0       0.84      0.82      0.83    285138
         1.0       0.36      0.40      0.38     75297

    accuracy                           0.73    360435
   macro avg       0.60      0.61      0.60    360435
weighted avg       0.74      0.73      0.73    360435



#### Random OverSampling

In [13]:
ros = RandomOverSampler(random_state=42)
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)
x_train, y_train = ros.fit_resample(x_train, y_train)

In [14]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)

print("Test Score: "+ str(clf.score(x_test,y_test))) 

Test Score: 0.7312136723681107


In [15]:
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))

precision score : 0.3570150914896718
recall score : 0.35784958232067676
f1 score : 0.3574318498375008
              precision    recall  f1-score   support

         0.0       0.83      0.83      0.83    285138
         1.0       0.36      0.36      0.36     75297

    accuracy                           0.73    360435
   macro avg       0.59      0.59      0.59    360435
weighted avg       0.73      0.73      0.73    360435



### Under Sampling

#### Near Miss

In [16]:
nm = NearMiss()
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)
x_train, y_train = nm.fit_resample(x_train, y_train)

In [17]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)

print("Test Score: "+ str(clf.score(x_test,y_test))) 

Test Score: 0.4714775202186247


In [18]:
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))

precision score : 0.23406111924208076
recall score : 0.6732804759817789
f1 score : 0.3473637329130837
              precision    recall  f1-score   support

         0.0       0.83      0.42      0.56    285138
         1.0       0.23      0.67      0.35     75297

    accuracy                           0.47    360435
   macro avg       0.53      0.55      0.45    360435
weighted avg       0.70      0.47      0.51    360435



#### Random UnderSampling

In [19]:
rus = RandomUnderSampler()
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)
x_train, y_train = rus.fit_resample(x_train, y_train)

In [20]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)

print("Test Score: "+ str(clf.score(x_test,y_test))) 

Test Score: 0.5933718978456587


In [21]:
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))

precision score : 0.27976315561763726
recall score : 0.6011394876289892
f1 score : 0.38182807445242545
              precision    recall  f1-score   support

         0.0       0.85      0.59      0.70    285138
         1.0       0.28      0.60      0.38     75297

    accuracy                           0.59    360435
   macro avg       0.56      0.60      0.54    360435
weighted avg       0.73      0.59      0.63    360435



### Extra Trees Classifier ( Without SMOTE Oversampling )

In [None]:
sm = SMOTE()
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)

In [None]:
clf = ExtraTreesClassifier()
clf.fit(x_train, y_train) 
y_pred = clf.predict(x_test)

print("Test Scoare: "+ str(clf.score(x_test,y_test))) 

In [None]:
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))

### Extra Trees Classifier ( Applying SMOTE Oversampling )

In [None]:
sm = SMOTE()
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)
X, Y = sm.fit_resample(X, Y)

In [38]:
clf = ExtraTreesClassifier()
clf.fit(x_train, y_train) 
y_pred = clf.predict(x_test)

print("Test Scoare: "+ str(clf.score(x_test,y_test))) 



Test Score: 0.8975376975044044


In [39]:
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))

precision score : 0.8359278695453105
recall score : 0.6322203428206119
f1 score : 0.7199417603834108
              precision    recall  f1-score   support

         0.0       0.91      0.97      0.94    285352
         1.0       0.84      0.63      0.72     75083

    accuracy                           0.90    360435
   macro avg       0.87      0.80      0.83    360435
weighted avg       0.89      0.90      0.89    360435



### Gradient Boosting ( Applying SMOTE OverSampling )

In [8]:
clf = GradientBoostingClassifier()
clf.fit(x_train, y_train) 
y_pred = clf.predict(x_test)

print("Test Score: "+ str(clf.score(x_test,y_test))) 

Test Score: 0.7931027785869852


In [9]:
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))

precision score : 0.6362951807228916
recall score : 0.022444453298272174
f1 score : 0.04335946018754891
              precision    recall  f1-score   support

         0.0       0.79      1.00      0.88    285138
         1.0       0.64      0.02      0.04     75297

    accuracy                           0.79    360435
   macro avg       0.72      0.51      0.46    360435
weighted avg       0.76      0.79      0.71    360435

