In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
import seaborn as sns
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN 
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import CondensedNearestNeighbour
from sklearn import svm
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier

Using TensorFlow backend.


In [2]:
df = pd.read_csv('final.csv')
del df['Unnamed: 0']
del df['FlightDate']

for i in df.keys():
    if '_y' in i:  ## Filtering for Arrival Features
        del df[i]

df = df.drop(['ArrDelayMinutes','DepDel15','DepDelayMinutes','airport_x','date_x','DepTime','ArrTime','time_x'],1)
lb = LabelEncoder()
df['Origin'] = lb.fit_transform(df['Origin'])
df['Dest'] = lb.fit_transform(df['Dest'])

### Decision Tree Classifier

In [3]:
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)

In [4]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)

print("Test Score: "+ str(clf.score(x_test,y_test))) 

Test Score: 0.7263001650783082


In [5]:
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))

precision score : 0.3547925785913251
recall score : 0.3789128384929014
f1 score : 0.3664562367946157


In [6]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.83      0.82      0.83    285138
         1.0       0.35      0.38      0.37     75297

    accuracy                           0.73    360435
   macro avg       0.59      0.60      0.60    360435
weighted avg       0.73      0.73      0.73    360435



### Over Sampling

#### SMOTE

In [7]:
sm = SMOTE()
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])
X, Y = sm.fit_sample(X, Y)
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)

In [8]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)

print("Test Score: "+ str(clf.score(x_test,y_test))) 

Test Score: 0.8209275518712905


In [9]:
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))

precision score : 0.814814052068853
recall score : 0.8310298956801793
f1 score : 0.8228420897565879
              precision    recall  f1-score   support

         0.0       0.83      0.81      0.82    285176
         1.0       0.81      0.83      0.82    285660

    accuracy                           0.82    570836
   macro avg       0.82      0.82      0.82    570836
weighted avg       0.82      0.82      0.82    570836



#### ADAYSN

In [10]:
sm = ADASYN()
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])
X, Y = sm.fit_sample(X, Y)
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)

In [11]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)

print("Test Score: "+ str(clf.score(x_test,y_test))) 

Test Score: 0.8196729419328856


In [12]:
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))

precision score : 0.81317695853826
recall score : 0.826418506625275
f1 score : 0.8197442625034872
              precision    recall  f1-score   support

         0.0       0.83      0.81      0.82    285244
         1.0       0.81      0.83      0.82    280894

    accuracy                           0.82    566138
   macro avg       0.82      0.82      0.82    566138
weighted avg       0.82      0.82      0.82    566138



#### Random OverSampling

In [13]:
ros = RandomOverSampler(random_state=42)
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])
X, Y = ros.fit_resample(X, Y)
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)

In [14]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)

print("Test Score: "+ str(clf.score(x_test,y_test))) 

Test Score: 0.8872635923452621


In [15]:
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))

precision score : 0.8304514803466585
recall score : 0.973464958342085
f1 score : 0.8962892054006839
              precision    recall  f1-score   support

         0.0       0.97      0.80      0.88    285176
         1.0       0.83      0.97      0.90    285660

    accuracy                           0.89    570836
   macro avg       0.90      0.89      0.89    570836
weighted avg       0.90      0.89      0.89    570836



### Under Sampling

#### Near Miss

In [16]:
nm = NearMiss()
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])
X, Y = nm.fit_resample(X, Y)
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)

In [17]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)

print("Test Score: "+ str(clf.score(x_test,y_test))) 

Test Score: 0.637315541810523


In [18]:
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))

precision score : 0.6400469679996761
recall score : 0.6308934653043848
f1 score : 0.6354372542057202
              precision    recall  f1-score   support

         0.0       0.63      0.64      0.64     74866
         1.0       0.64      0.63      0.64     75168

    accuracy                           0.64    150034
   macro avg       0.64      0.64      0.64    150034
weighted avg       0.64      0.64      0.64    150034



#### Random UnderSampling

In [19]:
rus = RandomUnderSampler()
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])
X, Y = rus.fit_resample(X, Y)
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)

In [20]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)

print("Test Score: "+ str(clf.score(x_test,y_test))) 

Test Score: 0.5961515389844968


In [21]:
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))

precision score : 0.5961125103846611
recall score : 0.6013862281822052
f1 score : 0.598737756703598
              precision    recall  f1-score   support

         0.0       0.60      0.59      0.59     74866
         1.0       0.60      0.60      0.60     75168

    accuracy                           0.60    150034
   macro avg       0.60      0.60      0.60    150034
weighted avg       0.60      0.60      0.60    150034



### SMOTE OverSampling

In [22]:
sm = SMOTE()
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])
X, Y = sm.fit_sample(X, Y)
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)

### Support Vector Machines ( Applying SMOTE Oversampling )

In [None]:
clf = svm.SVC(gamma='scale')
clf.fit(x_train, y_train) 
y_pred = clf.predict(x_test)

print("Test Score: "+ str(clf.score(x_test,y_test))) 

In [None]:
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))

### Extra Trees Classifier ( Applying SMOTE Oversampling )

In [23]:
clf = ExtraTreesClassifier()
clf.fit(x_train, y_train) 
y_pred = clf.predict(x_test)

print("Test Score: "+ str(clf.score(x_test,y_test))) 



Test Score: 0.856482422271896


In [24]:
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))

precision score : 0.8824874497332247
recall score : 0.8227683259819366
f1 score : 0.8515821873261967
              precision    recall  f1-score   support

         0.0       0.83      0.89      0.86    285176
         1.0       0.88      0.82      0.85    285660

    accuracy                           0.86    570836
   macro avg       0.86      0.86      0.86    570836
weighted avg       0.86      0.86      0.86    570836



### Gradient Boosting ( Applying SMOTE OverSampling )

In [25]:
clf = GradientBoostingClassifier()
clf.fit(x_train, y_train) 
y_pred = clf.predict(x_test)

print("Test Score: "+ str(clf.score(x_test,y_test))) 

Test Score: 0.8067553553034497


In [26]:
print("precision score : "+str(precision_score(y_test,y_pred))) 
print("recall score : "+str(recall_score(y_test,y_pred))) 
print("f1 score : "+str(f1_score(y_test,y_pred)))
print(classification_report(y_test,y_pred))

precision score : 0.887215521391378
recall score : 0.7032346145767696
f1 score : 0.7845837491187952
              precision    recall  f1-score   support

         0.0       0.75      0.91      0.82    285176
         1.0       0.89      0.70      0.78    285660

    accuracy                           0.81    570836
   macro avg       0.82      0.81      0.80    570836
weighted avg       0.82      0.81      0.80    570836

