# CLASSIFICATION MODELS

In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier,GradientBoostingClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import NearMiss

In [2]:
dataset = pd.read_csv('Flight-Delay.csv')

In [3]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,Year,Quarter,Month,DayofMonth,FlightDate,OriginAirportID,Origin,DestAirportID,Dest,...,pressure,cloudcover,DewPointF,WindGustKmph,tempF,WindChillF,humidity,date,time_y,airports
0,0,2016,1,1,1,2016-01-01,14747,SEA,12478,JFK,...,1015,0,32,26,41,32,74,2016-01-01,1600,JFK
1,1,2016,1,1,1,2016-01-01,13303,MIA,12478,JFK,...,1015,0,32,26,41,32,74,2016-01-01,1600,JFK
2,2,2016,1,1,1,2016-01-01,13204,MCO,12478,JFK,...,1015,0,32,26,41,32,74,2016-01-01,1600,JFK
3,3,2016,1,1,1,2016-01-01,12892,LAX,12478,JFK,...,1015,0,32,26,41,32,74,2016-01-01,1600,JFK
4,4,2016,1,1,1,2016-01-01,13930,ORD,12478,JFK,...,1015,0,32,26,41,32,74,2016-01-01,1600,JFK


In [4]:
dataset.columns

Index(['Unnamed: 0', 'Year', 'Quarter', 'Month', 'DayofMonth', 'FlightDate',
       'OriginAirportID', 'Origin', 'DestAirportID', 'Dest', 'CRSDepTime',
       'DepTime', 'DepDelayMinutes', 'DepDel15', 'CRSArrTime', 'ArrTime',
       'ArrDelayMinutes', 'ArrDel15', 'time_x', 'primary', 'windspeedKmph',
       'winddirDegree', 'weatherCode', 'precipMM', 'visibility', 'pressure',
       'cloudcover', 'DewPointF', 'WindGustKmph', 'tempF', 'WindChillF',
       'humidity', 'date', 'time_y', 'airports'],
      dtype='object')

In [48]:
col = ['Unnamed: 0','FlightDate','time_y','date','ArrDelayMinutes','primary','Origin','Dest','time_x','airports','ArrTime','CRSArrTime']

In [6]:
dataset.drop(col,axis=1,inplace=True)

In [7]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1851433 entries, 0 to 1851432
Data columns (total 22 columns):
 #   Column           Dtype  
---  ------           -----  
 0   Year             int64  
 1   Quarter          int64  
 2   Month            int64  
 3   DayofMonth       int64  
 4   OriginAirportID  int64  
 5   DestAirportID    int64  
 6   CRSDepTime       int64  
 7   DepTime          float64
 8   DepDelayMinutes  float64
 9   ArrDel15         float64
 10  windspeedKmph    int64  
 11  winddirDegree    int64  
 12  weatherCode      int64  
 13  precipMM         float64
 14  visibility       int64  
 15  pressure         int64  
 16  cloudcover       int64  
 17  DewPointF        int64  
 18  WindGustKmph     int64  
 19  tempF            int64  
 20  WindChillF       int64  
 21  humidity         int64  
dtypes: float64(4), int64(18)
memory usage: 310.8 MB


In [8]:
dataset.ArrDel15.value_counts()

0.0    1463375
1.0     388058
Name: ArrDel15, dtype: int64

# Training and Test Data split

In [9]:
X = dataset.drop('ArrDel15',axis=1).values
Y = dataset['ArrDel15'].values

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,random_state=1,stratify=Y)

In [11]:
print(Y.mean(),Y_train.mean(),Y_test.mean())

0.20959872704008192 0.20959851358340095 0.20959958086565286


# Logistic Regression

In [12]:
model = LogisticRegression()

In [17]:
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
report = classification_report(Y_test, Y_pred,output_dict=True)
df = pd.DataFrame(report).transpose()
df

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Unnamed: 0,precision,recall,f1-score,support
0.0,0.921051,0.977914,0.948631,292675.0
1.0,0.89144,0.683902,0.774,77612.0
accuracy,0.916289,0.916289,0.916289,0.916289
macro avg,0.906245,0.830908,0.861316,370287.0
weighted avg,0.914844,0.916289,0.912029,370287.0


### Sampling Logistic Regression

**SMOTE**

In [11]:
sm = SMOTE(random_state=42)
x_sm, y_sm = sm.fit_resample(X_train, Y_train)
model.fit(x_sm,y_sm)
y_pred = model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [13]:
report = classification_report(Y_test, y_pred,output_dict=True)
df = pd.DataFrame(report).transpose()
df

Unnamed: 0,precision,recall,f1-score,support
0.0,0.94107,0.927032,0.933998,292675.0
1.0,0.739491,0.781091,0.759722,77612.0
accuracy,0.896442,0.896442,0.896442,0.896442
macro avg,0.840281,0.854061,0.84686,370287.0
weighted avg,0.898819,0.896442,0.89747,370287.0


**Random Over Sampler**

In [14]:
ros = RandomOverSampler(random_state=42)
x_ros, y_ros = ros.fit_resample(X_train, Y_train)
model.fit(x_sm,y_sm)
y_pred = model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [16]:
report = classification_report(Y_test, y_pred,output_dict=True)
df = pd.DataFrame(report).transpose()
df

Unnamed: 0,precision,recall,f1-score,support
0.0,0.94107,0.927032,0.933998,292675.0
1.0,0.739491,0.781091,0.759722,77612.0
accuracy,0.896442,0.896442,0.896442,0.896442
macro avg,0.840281,0.854061,0.84686,370287.0
weighted avg,0.898819,0.896442,0.89747,370287.0


**Near Miss**

In [18]:
nm  = NearMiss()
x_nm, y_nm = nm.fit_resample(X_train,Y_train)
model.fit(x_nm,y_nm)
y_pred = model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [19]:
report = classification_report(Y_test, y_pred,output_dict=True)
df = pd.DataFrame(report).transpose()
df

Unnamed: 0,precision,recall,f1-score,support
0.0,0.941223,0.837243,0.886193,292675.0
1.0,0.566738,0.80284,0.664438,77612.0
accuracy,0.830032,0.830032,0.830032,0.830032
macro avg,0.753981,0.820041,0.775316,370287.0
weighted avg,0.862731,0.830032,0.839714,370287.0


**Random Under Sampler**

In [20]:
rus = RandomUnderSampler(random_state=42)
x_rus, y_rus = rus.fit_resample(X_train, Y_train)
model.fit(x_rus, y_rus)
y_pred = model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [21]:
report = classification_report(Y_test, y_pred,output_dict=True)
df = pd.DataFrame(report).transpose()
df

Unnamed: 0,precision,recall,f1-score,support
0.0,0.940812,0.928163,0.934444,292675.0
1.0,0.742173,0.779802,0.760522,77612.0
accuracy,0.897066,0.897066,0.897066,0.897066
macro avg,0.841493,0.853982,0.847483,370287.0
weighted avg,0.899177,0.897066,0.89799,370287.0


# Decision Trees

In [13]:
clf = DecisionTreeClassifier()

In [19]:
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
report = classification_report(Y_test, Y_pred,output_dict=True)
df = pd.DataFrame(report).transpose()
df

Unnamed: 0,precision,recall,f1-score,support
0.0,0.922848,0.913758,0.91828,292675.0
1.0,0.686428,0.711926,0.698944,77612.0
accuracy,0.871454,0.871454,0.871454,0.871454
macro avg,0.804638,0.812842,0.808612,370287.0
weighted avg,0.873294,0.871454,0.872308,370287.0


### Sampling Decision Trees

**SMOTE**

In [23]:
sm = SMOTE(random_state=42)
x_sm, y_sm = sm.fit_resample(X_train, Y_train)
clf.fit(x_sm,y_sm)
y_pred = clf.predict(X_test)

In [24]:
report = classification_report(Y_test, y_pred,output_dict=True)
df = pd.DataFrame(report).transpose()
df

Unnamed: 0,precision,recall,f1-score,support
0.0,0.921763,0.911902,0.916806,292675.0
1.0,0.680666,0.708125,0.694124,77612.0
accuracy,0.869191,0.869191,0.869191,0.869191
macro avg,0.801215,0.810014,0.805465,370287.0
weighted avg,0.871229,0.869191,0.870132,370287.0


**Random Over Sampler**

In [15]:
ros = RandomOverSampler(random_state=42)
x_ros, y_ros = ros.fit_resample(X_train, Y_train)
clf.fit(x_ros, y_ros)
y_pred = clf.predict(X_test)

In [16]:
report = classification_report(Y_test, y_pred,output_dict=True)
df = pd.DataFrame(report).transpose()
df

Unnamed: 0,precision,recall,f1-score,support
0.0,0.920488,0.918934,0.91971,292675.0
1.0,0.696233,0.700665,0.698442,77612.0
accuracy,0.873185,0.873185,0.873185,0.873185
macro avg,0.80836,0.809799,0.809076,370287.0
weighted avg,0.873484,0.873185,0.873332,370287.0


**Near Miss**

In [21]:
nm  = NearMiss()
x_nm, y_nm = nm.fit_resample(X_train,Y_train)
clf.fit(x_nm,y_nm)
y_pred = clf.predict(X_test)

In [22]:
report = classification_report(Y_test, y_pred,output_dict=True)
df = pd.DataFrame(report).transpose()
df

Unnamed: 0,precision,recall,f1-score,support
0.0,0.93322,0.619523,0.744684,292675.0
1.0,0.36727,0.832822,0.509745,77612.0
accuracy,0.664231,0.664231,0.664231,0.664231
macro avg,0.650245,0.726173,0.627215,370287.0
weighted avg,0.814597,0.664231,0.695441,370287.0


**Random Under Sampler**

In [17]:
rus = RandomUnderSampler(random_state=42)
x_rus, y_rus = rus.fit_resample(X_train, Y_train)
clf.fit(x_rus, y_rus)
y_pred = clf.predict(X_test)

In [20]:
report = classification_report(Y_test, y_pred,output_dict=True)
df = pd.DataFrame(report).transpose()
df

Unnamed: 0,precision,recall,f1-score,support
0.0,0.939836,0.792931,0.860156,292675.0
1.0,0.508722,0.808586,0.624525,77612.0
accuracy,0.796212,0.796212,0.796212,0.796212
macro avg,0.724279,0.800759,0.74234,370287.0
weighted avg,0.849475,0.796212,0.810768,370287.0


# Extra Trees Classifier

In [27]:
model = ExtraTreesClassifier()

In [22]:
model.fit(X_train_etc, Y_train)

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [23]:
Y_pred = model.predict(X_test_etc)
report = classification_report(Y_test, Y_pred,output_dict=True)
df = pd.DataFrame(report).transpose()
df

Unnamed: 0,precision,recall,f1-score,support
0.0,0.921636,0.969,0.944724,292675.0
1.0,0.854997,0.689301,0.763259,77612.0
accuracy,0.910375,0.910375,0.910375,0.910375
macro avg,0.888316,0.82915,0.853992,370287.0
weighted avg,0.907668,0.910375,0.906689,370287.0


### Sampling Extra Trees Classifier

**SMOTE**

In [28]:
sm = SMOTE(random_state=42)
x_sm, y_sm = sm.fit_resample(X_train, Y_train)
model.fit(x_sm,y_sm)
y_pred = model.predict(X_test)

In [29]:
report = classification_report(Y_test, y_pred,output_dict=True)
df = pd.DataFrame(report).transpose()
df

Unnamed: 0,precision,recall,f1-score,support
0.0,0.928124,0.958466,0.943051,292675.0
1.0,0.821351,0.720095,0.767397,77612.0
accuracy,0.908503,0.908503,0.908503,0.908503
macro avg,0.874737,0.83928,0.855224,370287.0
weighted avg,0.905744,0.908503,0.906234,370287.0


**Random Over Sampler**

In [30]:
ros = RandomOverSampler(random_state=42)
x_ros, y_ros = ros.fit_resample(X_train, Y_train)
model.fit(x_ros, y_ros)
y_pred = model.predict(X_test)

In [31]:
report = classification_report(Y_test, y_pred,output_dict=True)
df = pd.DataFrame(report).transpose()
df

Unnamed: 0,precision,recall,f1-score,support
0.0,0.919164,0.970144,0.943966,292675.0
1.0,0.857639,0.678259,0.757474,77612.0
accuracy,0.908965,0.908965,0.908965,0.908965
macro avg,0.888401,0.824201,0.85072,370287.0
weighted avg,0.906268,0.908965,0.904877,370287.0


**Near Miss**

In [34]:
nm  = NearMiss()
x_nm, y_nm = nm.fit_resample(X_train,Y_train)
model.fit(x_nm,y_nm)
y_pred = model.predict(X_test)

In [35]:
report = classification_report(Y_test, y_pred,output_dict=True)
df = pd.DataFrame(report).transpose()
df

Unnamed: 0,precision,recall,f1-score,support
0.0,0.940832,0.58676,0.722762,292675.0
1.0,0.355843,0.860846,0.50354,77612.0
accuracy,0.644208,0.644208,0.644208,0.644208
macro avg,0.648337,0.723803,0.613151,370287.0
weighted avg,0.818218,0.644208,0.676813,370287.0


**Random Under Sampler**

In [32]:
rus = RandomUnderSampler(random_state=42)
x_rus, y_rus = rus.fit_resample(X_train, Y_train)
model.fit(x_rus, y_rus)
y_pred = model.predict(X_test)

In [33]:
report = classification_report(Y_test, y_pred,output_dict=True)
df = pd.DataFrame(report).transpose()
df

Unnamed: 0,precision,recall,f1-score,support
0.0,0.950935,0.877424,0.912702,292675.0
1.0,0.642098,0.829279,0.723783,77612.0
accuracy,0.867333,0.867333,0.867333,0.867333
macro avg,0.796517,0.853351,0.818242,370287.0
weighted avg,0.886203,0.867333,0.873104,370287.0


# Gradient Boost

In [36]:
clf = GradientBoostingClassifier()

In [25]:
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
report = classification_report(Y_test, Y_pred,output_dict=True)
df = pd.DataFrame(report).transpose()
df

Unnamed: 0,precision,recall,f1-score,support
0.0,0.921372,0.978925,0.949277,292675.0
1.0,0.896039,0.684971,0.776416,77612.0
accuracy,0.917313,0.917313,0.917313,0.917313
macro avg,0.908705,0.831948,0.862847,370287.0
weighted avg,0.916062,0.917313,0.913046,370287.0


### Sampling Gradient Boost

**SMOTE**

In [37]:
sm = SMOTE(random_state=42)
x_sm, y_sm = sm.fit_resample(X_train, Y_train)
clf.fit(x_sm,y_sm)
y_pred = clf.predict(X_test)

In [38]:
report = classification_report(Y_test, y_pred,output_dict=True)
df = pd.DataFrame(report).transpose()
df

Unnamed: 0,precision,recall,f1-score,support
0.0,0.931658,0.958726,0.944998,292675.0
1.0,0.825204,0.734796,0.77738,77612.0
accuracy,0.91179,0.91179,0.91179,0.91179
macro avg,0.878431,0.846761,0.861189,370287.0
weighted avg,0.909345,0.91179,0.909866,370287.0


**Random Over Sampler**

In [39]:
ros = RandomOverSampler(random_state=42)
x_ros, y_ros = ros.fit_resample(X_train, Y_train)
clf.fit(x_ros, y_ros)
y_pred = clf.predict(X_test)

In [40]:
report = classification_report(Y_test, y_pred,output_dict=True)
df = pd.DataFrame(report).transpose()
df

Unnamed: 0,precision,recall,f1-score,support
0.0,0.942979,0.92328,0.933026,292675.0
1.0,0.731816,0.789466,0.759548,77612.0
accuracy,0.895233,0.895233,0.895233,0.895233
macro avg,0.837397,0.856373,0.846287,370287.0
weighted avg,0.898719,0.895233,0.896665,370287.0


**Near Miss**

In [43]:
nm  = NearMiss()
x_nm, y_nm = nm.fit_resample(X_train,Y_train)
clf.fit(x_nm,y_nm)
y_pred = clf.predict(X_test)

In [44]:
report = classification_report(Y_test, y_pred,output_dict=True)
df = pd.DataFrame(report).transpose()
df

Unnamed: 0,precision,recall,f1-score,support
0.0,0.941449,0.754414,0.837617,292675.0
1.0,0.470547,0.823069,0.598775,77612.0
accuracy,0.768804,0.768804,0.768804,0.768804
macro avg,0.705998,0.788741,0.718196,370287.0
weighted avg,0.842748,0.768804,0.787556,370287.0


**Random Under Sampler**

In [41]:
rus = RandomUnderSampler(random_state=42)
x_rus, y_rus = rus.fit_resample(X_train, Y_train)
clf.fit(x_rus, y_rus)
y_pred = clf.predict(X_test)

In [42]:
report = classification_report(Y_test, y_pred,output_dict=True)
df = pd.DataFrame(report).transpose()
df

Unnamed: 0,precision,recall,f1-score,support
0.0,0.943125,0.923068,0.932989,292675.0
1.0,0.731428,0.790084,0.759625,77612.0
accuracy,0.895195,0.895195,0.895195,0.895195
macro avg,0.837276,0.856576,0.846307,370287.0
weighted avg,0.898753,0.895195,0.896652,370287.0
