In [0]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.linear_model import RidgeCV, LassoCV, Ridge, Lasso
import seaborn as sns
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import ADASYN 
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import NearMiss
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import CondensedNearestNeighbour
from sklearn import svm
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier

In [0]:
# Colab-only: mount Google Drive at /content/drive.
# Later cells read from the relative path 'drive/My Drive/...', which presumably
# resolves under this mount when the working directory is /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# List the 'Colab Notebooks' folder on the mounted Drive
# (presumably to confirm final.csv is present before loading it).
!ls 'drive/My Drive/Colab Notebooks/'

In [0]:
# Load the dataset from the mounted Drive and reduce it to the model inputs
# plus the 'ArrDel15' target used by the cells below.
df = pd.read_csv('drive/My Drive/Colab Notebooks/final.csv')

# Drop 'Unnamed: 0' (presumably the index column written when the CSV was
# saved) and the raw flight date.
df = df.drop(columns=['Unnamed: 0', 'FlightDate'])

# Filtering for Arrival Features: remove every column whose name contains '_y'.
# Building the list first avoids deleting columns while iterating over them.
arrival_columns = [col for col in df.columns if '_y' in col]
df = df.drop(columns=arrival_columns)

# The axis is given by keyword (via `columns=`): the positional form
# `df.drop(labels, 1)` was deprecated and is rejected by pandas >= 2.0.
df = df.drop(columns=['ArrDelayMinutes', 'DepDel15', 'DepDelayMinutes', 'airport_x',
                      'date_x', 'DepTime', 'ArrTime', 'time_x'])

# Integer-encode the airport columns. The encoder is refit for 'Dest', so the
# codes of 'Origin' and 'Dest' are independent of each other and, after this
# cell, `lb` only holds the 'Dest' mapping.
lb = LabelEncoder()
df['Origin'] = lb.fit_transform(df['Origin'])
df['Dest'] = lb.fit_transform(df['Dest'])

### Decision Tree Classifier

In [0]:
# Separate the predictors from the 'ArrDel15' target, convert both to NumPy
# arrays, and hold out 20% of the rows for testing (fixed seed).
features = df.drop(columns=["ArrDel15"])
target = df["ArrDel15"]
X = np.array(features)
Y = np.array(target)
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=42,
)

In [0]:
# Decision tree fitted on the x_train / y_train split currently in scope.
# random_state pins the tie-breaking between equally good splits so the
# reported scores are reproducible across "Restart & Run All".
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Accuracy from the predictions already computed; clf.score(x_test, y_test)
# yields the same number but would run predict() a second time.
print("Test Score: " + str(accuracy_score(y_test, y_pred)))

In [0]:
# Print precision, recall and F1 for the current predictions, one per line.
metric_lines = (
    ("precision score : ", precision_score),
    ("recall score : ", recall_score),
    ("f1 score : ", f1_score),
)
for prefix, metric in metric_lines:
    print(prefix + str(metric(y_test, y_pred)))

In [0]:
# Text report of the per-class metrics for the current predictions.
report = classification_report(y_test, y_pred)
print(report)

### Over Sampling

#### SMOTE

In [0]:
# SMOTE oversampling, applied to the training split only.
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])

# Split BEFORE resampling: the held-out test set must contain only real,
# untouched rows. Resampling the full data first lets synthetic points derived
# from test rows end up in the training set and inflates the scores.
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)

# fit_resample replaces fit_sample, which was deprecated in imbalanced-learn 0.4
# and removed in later releases. random_state makes the synthetic samples
# reproducible.
sm = SMOTE(random_state=42)
x_train, y_train = sm.fit_resample(x_train, y_train)

In [0]:
# Decision tree fitted on the x_train / y_train split currently in scope.
# random_state pins the tie-breaking between equally good splits so the
# reported scores are reproducible across "Restart & Run All".
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Accuracy from the predictions already computed; clf.score(x_test, y_test)
# yields the same number but would run predict() a second time.
print("Test Score: " + str(accuracy_score(y_test, y_pred)))

In [0]:
# Print precision, recall and F1 for the current predictions, then the
# per-class classification report.
metric_lines = (
    ("precision score : ", precision_score),
    ("recall score : ", recall_score),
    ("f1 score : ", f1_score),
)
for prefix, metric in metric_lines:
    print(prefix + str(metric(y_test, y_pred)))

report = classification_report(y_test, y_pred)
print(report)

#### ADASYN

In [0]:
# ADASYN oversampling, applied to the training split only.
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])

# Split BEFORE resampling: the held-out test set must contain only real,
# untouched rows. Resampling the full data first lets synthetic points derived
# from test rows end up in the training set and inflates the scores.
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)

# fit_resample replaces fit_sample, which was deprecated in imbalanced-learn 0.4
# and removed in later releases. random_state makes the synthetic samples
# reproducible.
sm = ADASYN(random_state=42)
x_train, y_train = sm.fit_resample(x_train, y_train)

In [0]:
# Decision tree fitted on the x_train / y_train split currently in scope.
# random_state pins the tie-breaking between equally good splits so the
# reported scores are reproducible across "Restart & Run All".
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Accuracy from the predictions already computed; clf.score(x_test, y_test)
# yields the same number but would run predict() a second time.
print("Test Score: " + str(accuracy_score(y_test, y_pred)))

In [0]:
# Print precision, recall and F1 for the current predictions, then the
# per-class classification report.
metric_lines = (
    ("precision score : ", precision_score),
    ("recall score : ", recall_score),
    ("f1 score : ", f1_score),
)
for prefix, metric in metric_lines:
    print(prefix + str(metric(y_test, y_pred)))

report = classification_report(y_test, y_pred)
print(report)

#### Random OverSampling

In [0]:
# Random oversampling, applied to the training split only.
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])

# Split BEFORE resampling: random oversampling duplicates rows, so resampling
# the full data first would place copies of the same row in both the training
# and the test set and inflate the scores.
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)

ros = RandomOverSampler(random_state=42)
x_train, y_train = ros.fit_resample(x_train, y_train)

In [0]:
# Decision tree fitted on the x_train / y_train split currently in scope.
# random_state pins the tie-breaking between equally good splits so the
# reported scores are reproducible across "Restart & Run All".
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Accuracy from the predictions already computed; clf.score(x_test, y_test)
# yields the same number but would run predict() a second time.
print("Test Score: " + str(accuracy_score(y_test, y_pred)))

In [0]:
# Print precision, recall and F1 for the current predictions, then the
# per-class classification report.
metric_lines = (
    ("precision score : ", precision_score),
    ("recall score : ", recall_score),
    ("f1 score : ", f1_score),
)
for prefix, metric in metric_lines:
    print(prefix + str(metric(y_test, y_pred)))

report = classification_report(y_test, y_pred)
print(report)

### Under Sampling

#### Near Miss

In [0]:
# NearMiss undersampling, applied to the training split only.
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])

# Split BEFORE resampling: the held-out test set must keep the original class
# distribution. Undersampling the full data first would evaluate the model on
# a hand-picked, artificially balanced subset.
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)

nm = NearMiss()
x_train, y_train = nm.fit_resample(x_train, y_train)

In [0]:
# Decision tree fitted on the x_train / y_train split currently in scope.
# random_state pins the tie-breaking between equally good splits so the
# reported scores are reproducible across "Restart & Run All".
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Accuracy from the predictions already computed; clf.score(x_test, y_test)
# yields the same number but would run predict() a second time.
print("Test Score: " + str(accuracy_score(y_test, y_pred)))

In [0]:
# Print precision, recall and F1 for the current predictions, then the
# per-class classification report.
metric_lines = (
    ("precision score : ", precision_score),
    ("recall score : ", recall_score),
    ("f1 score : ", f1_score),
)
for prefix, metric in metric_lines:
    print(prefix + str(metric(y_test, y_pred)))

report = classification_report(y_test, y_pred)
print(report)

#### Random UnderSampling

In [0]:
# Random undersampling, applied to the training split only.
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])

# Split BEFORE resampling: the held-out test set must keep the original class
# distribution. Undersampling the full data first would evaluate the model on
# an artificially balanced subset.
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)

# random_state makes the randomly retained rows reproducible across reruns.
rus = RandomUnderSampler(random_state=42)
x_train, y_train = rus.fit_resample(x_train, y_train)

In [0]:
# Decision tree fitted on the x_train / y_train split currently in scope.
# random_state pins the tie-breaking between equally good splits so the
# reported scores are reproducible across "Restart & Run All".
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Accuracy from the predictions already computed; clf.score(x_test, y_test)
# yields the same number but would run predict() a second time.
print("Test Score: " + str(accuracy_score(y_test, y_pred)))

In [0]:
# Print precision, recall and F1 for the current predictions, then the
# per-class classification report.
metric_lines = (
    ("precision score : ", precision_score),
    ("recall score : ", recall_score),
    ("f1 score : ", f1_score),
)
for prefix, metric in metric_lines:
    print(prefix + str(metric(y_test, y_pred)))

report = classification_report(y_test, y_pred)
print(report)

### SMOTE OverSampling

In [0]:
# SMOTE oversampling, applied to the training split only. The classifiers in
# the following cells are fitted on the x_train / y_train produced here.
X = np.array(df.drop("ArrDel15", axis = 1))
Y = np.array(df["ArrDel15"])

# Split BEFORE resampling: the held-out test set must contain only real,
# untouched rows. Resampling the full data first lets synthetic points derived
# from test rows end up in the training set and inflates the scores.
x_train, x_test, y_train, y_test = train_test_split( X, Y, test_size=0.20, random_state=42)

# fit_resample replaces fit_sample, which was deprecated in imbalanced-learn 0.4
# and removed in later releases. random_state makes the synthetic samples
# reproducible.
sm = SMOTE(random_state=42)
x_train, y_train = sm.fit_resample(x_train, y_train)

### Support Vector Machines ( Applying SMOTE Oversampling )

In [0]:
# Support vector classifier (gamma='scale', other settings left at their
# defaults) fitted on the x_train / y_train split currently in scope.
# NOTE(review): no feature scaling is visible in this notebook; SVMs are
# usually sensitive to feature scale — consider standardising the inputs.
clf = svm.SVC(gamma='scale')
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Accuracy from the predictions already computed; clf.score(x_test, y_test)
# yields the same number but would repeat the whole (slow) SVM prediction pass.
print("Test Score: " + str(accuracy_score(y_test, y_pred)))

In [0]:
# Print precision, recall and F1 for the current predictions, then the
# per-class classification report.
metric_lines = (
    ("precision score : ", precision_score),
    ("recall score : ", recall_score),
    ("f1 score : ", f1_score),
)
for prefix, metric in metric_lines:
    print(prefix + str(metric(y_test, y_pred)))

report = classification_report(y_test, y_pred)
print(report)

### Extra Trees Classifier ( Applying SMOTE Oversampling )

In [0]:
# Extra-trees ensemble fitted on the x_train / y_train split currently in scope.
# random_state fixes the randomised split selection so the reported scores are
# reproducible across "Restart & Run All".
clf = ExtraTreesClassifier(random_state=42)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Accuracy from the predictions already computed; clf.score(x_test, y_test)
# yields the same number but would run predict() a second time.
print("Test Score: " + str(accuracy_score(y_test, y_pred)))

In [0]:
# Print precision, recall and F1 for the current predictions, then the
# per-class classification report.
metric_lines = (
    ("precision score : ", precision_score),
    ("recall score : ", recall_score),
    ("f1 score : ", f1_score),
)
for prefix, metric in metric_lines:
    print(prefix + str(metric(y_test, y_pred)))

report = classification_report(y_test, y_pred)
print(report)

### Gradient Boosting ( Applying SMOTE OverSampling )

In [0]:
# Gradient-boosted trees fitted on the x_train / y_train split currently in
# scope. random_state fixes the estimator's internal randomness so the reported
# scores are reproducible across "Restart & Run All".
clf = GradientBoostingClassifier(random_state=42)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

# Accuracy from the predictions already computed; clf.score(x_test, y_test)
# yields the same number but would run predict() a second time.
print("Test Score: " + str(accuracy_score(y_test, y_pred)))

In [0]:
# Print precision, recall and F1 for the current predictions, then the
# per-class classification report.
metric_lines = (
    ("precision score : ", precision_score),
    ("recall score : ", recall_score),
    ("f1 score : ", f1_score),
)
for prefix, metric in metric_lines:
    print(prefix + str(metric(y_test, y_pred)))

report = classification_report(y_test, y_pred)
print(report)