IMPORTING LIBRARIES

In [108]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics

READING THE TRAINING DATA AND VIEWING FIRST FIVE ROWS OF THE DATASET

In [109]:
df=pd.read_csv('/content/drive/MyDrive/flight_delays_train.csv')
df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


BASIC INFORMATION OF DATASET

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Month              100000 non-null  object
 1   DayofMonth         100000 non-null  object
 2   DayOfWeek          100000 non-null  object
 3   DepTime            100000 non-null  int64 
 4   UniqueCarrier      100000 non-null  object
 5   Origin             100000 non-null  object
 6   Dest               100000 non-null  object
 7   Distance           100000 non-null  int64 
 8   dep_delayed_15min  100000 non-null  object
dtypes: int64(2), object(7)
memory usage: 6.9+ MB


In [5]:
df.describe()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,Distance
count,100000.0,100000.0,100000.0,100000.0,100000.0
mean,6.53764,15.70385,3.95183,1341.52388,729.39716
std,3.413447,8.793931,1.99164,476.378445,574.61686
min,1.0,1.0,1.0,1.0,30.0
25%,4.0,8.0,2.0,931.0,317.0
50%,7.0,16.0,4.0,1330.0,575.0
75%,9.0,23.0,6.0,1733.0,957.0
max,12.0,31.0,7.0,2534.0,4962.0


In [8]:
df.isnull().sum()

Month                0
DayofMonth           0
DayOfWeek            0
DepTime              0
UniqueCarrier        0
Origin               0
Dest                 0
Distance             0
dep_delayed_15min    0
dtype: int64

CHANGING MONTH, DAY OF MONTH, DAY OF WEEK INTO INTEGERS

In [7]:
df['Month'] = df['Month'].replace('c', '', regex=True).astype(str)
df['Month'] = df['Month'].replace('-', '', regex=True).astype(int)

df['DayofMonth'] = df['DayofMonth'].replace('c', '', regex=True).astype(str)
df['DayofMonth'] = df['DayofMonth'].replace('-', '', regex=True).astype(int)

df['DayOfWeek'] = df['DayOfWeek'].replace('c', '', regex=True).astype(str)
df['DayOfWeek'] = df['DayOfWeek'].replace('-', '', regex=True).astype(int)

CHECKING WHETHER THE DATA IS BALANCED OR IMBALANCED

In [9]:
df.dep_delayed_15min.value_counts()

N    80956
Y    19044
Name: dep_delayed_15min, dtype: int64

ENCODING CATEGORICAL DATA INTO INTEGERS

In [65]:
encoder=LabelEncoder()
df["UniqueCarrier"]=encoder.fit_transform(df["UniqueCarrier"])
df["Origin"]=encoder.fit_transform(df["Origin"])
df["Dest"]=encoder.fit_transform(df["Dest"])
df

PREDICTION FEATURES

In [67]:
df_pred=df[['Month','DayofMonth','DayOfWeek','DepTime','UniqueCarrier','Origin','Dest','Distance']]


READING TESTING DATA AND VIEWING FIRST FIVE ROWS

In [110]:
df_test=pd.read_csv('/content/drive/MyDrive/flight_delays_test.csv')
df_test.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-7,c-25,c-3,615,YV,MRY,PHX,598
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377
4,c-6,c-6,c-3,1505,UA,ORD,STL,258


PREPROCESSING THE TESTING DATA

In [111]:
df_test['Month'] = df_test['Month'].replace('c', '', regex=True).astype(str)
df_test['Month'] = df_test['Month'].replace('-', '', regex=True).astype(int)

df_test['DayofMonth'] = df_test['DayofMonth'].replace('c', '', regex=True).astype(str)
df_test['DayofMonth'] = df_test['DayofMonth'].replace('-', '', regex=True).astype(int)

df_test['DayOfWeek'] = df_test['DayOfWeek'].replace('c', '', regex=True).astype(str)
df_test['DayOfWeek'] = df_test['DayOfWeek'].replace('-', '', regex=True).astype(int)
df_test

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,7,25,3,615,YV,MRY,PHX,598
1,4,17,2,739,WN,LAS,HOU,1235
2,12,2,7,651,MQ,GSP,ORD,577
3,3,25,7,1614,WN,BWI,MHT,377
4,6,6,3,1505,UA,ORD,STL,258
...,...,...,...,...,...,...,...,...
99995,6,5,2,852,WN,CRP,HOU,187
99996,11,24,6,1446,UA,ORD,LAS,1515
99997,1,30,2,1509,OO,ORD,SGF,438
99998,1,5,5,804,DL,LGA,ATL,761


In [112]:
df_test["UniqueCarrier"]=encoder.fit_transform(df_test["UniqueCarrier"])
df_test["Origin"]=encoder.fit_transform(df_test["Origin"])
df_test["Dest"]=encoder.fit_transform(df_test["Dest"])

In [113]:
df_testdata=df_test[['Month','DayofMonth','DayOfWeek','DepTime','UniqueCarrier','Origin','Dest','Distance']]

BALANCING THE DATA USING 'SMOTE'

In [117]:
X = df_pred
y = df['dep_delayed_15min']

In [118]:
from imblearn.over_sampling import SMOTE
smote=SMOTE()
X_smote, y_smote = smote.fit_resample(X, y)

In [119]:
y_smote.value_counts()

N    80956
Y    80956
Name: dep_delayed_15min, dtype: int64

SPLITTING INTO TESTING AND TRAINING DATA AFTER USING 'SMOTE'

In [120]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_smote, y_smote, test_size=0.2, random_state=0)

LOGISTIC REGRESSION

In [122]:
logreg= LogisticRegression(solver='newton-cg')
logreg.fit(X_train1,y_train1)
y_pred= logreg.predict(X_test1)
y_pred_test_logreg_smote=logreg.predict(df_testdata)
print(metrics.classification_report(y_test1, y_pred))


              precision    recall  f1-score   support

           N       0.64      0.62      0.63     16205
           Y       0.63      0.65      0.64     16178

    accuracy                           0.63     32383
   macro avg       0.64      0.64      0.63     32383
weighted avg       0.64      0.63      0.63     32383



ADA BOOST CLASSIFIER

In [123]:
from sklearn.ensemble import AdaBoostClassifier
abc=AdaBoostClassifier()
abc.fit(X_train1,y_train1)
y_pred1 = abc.predict(X_test1)
y_pred_test_adaboost_smote=abc.predict(df_testdata)
print(metrics.classification_report(y_test1, y_pred1))

              precision    recall  f1-score   support

           N       0.71      0.64      0.68     16205
           Y       0.67      0.74      0.71     16178

    accuracy                           0.69     32383
   macro avg       0.69      0.69      0.69     32383
weighted avg       0.69      0.69      0.69     32383



DECISION TREE CLASSIFIER

In [125]:
DTC = DecisionTreeClassifier()
DTC.fit(X_train1, y_train1)
y_pred2 = DTC.predict(X_test1)
y_pred_test_dectree_smote=DTC.predict(df_testdata)
print(metrics.classification_report(y_test1, y_pred2))

              precision    recall  f1-score   support

           N       0.77      0.73      0.75     16205
           Y       0.74      0.78      0.76     16178

    accuracy                           0.75     32383
   macro avg       0.75      0.75      0.75     32383
weighted avg       0.75      0.75      0.75     32383



RANDOM FOREST CLASSIFIER

In [137]:
randomForest= RandomForestClassifier()
randomForest.fit(X_train1,y_train1)
y_pred3 = randomForest.predict(X_test1)
y_pred_test_randomfo_smote=randomForest.predict(df_testdata)
print(metrics.classification_report(y_test1, y_pred3))



              precision    recall  f1-score   support

           N       0.84      0.82      0.83     16205
           Y       0.82      0.85      0.84     16178

    accuracy                           0.83     32383
   macro avg       0.83      0.83      0.83     32383
weighted avg       0.83      0.83      0.83     32383



USING 'ADASYN' TO BALANCE THE DATA

In [129]:
from imblearn.over_sampling import ADASYN
X_adasyn, y_adasyn = ADASYN().fit_resample(X, y)
y_adasyn.value_counts()

Y    85758
N    80956
Name: dep_delayed_15min, dtype: int64

SPLITTING INTO TESTING AND TRAINING DATA AFTER BALANCING USING 'ADASYN'

In [130]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_adasyn, y_adasyn, test_size=0.2, random_state=0)

LOGISTIC REGRESSION

In [131]:
logreg1= LogisticRegression(solver='newton-cg')
logreg1.fit(X_train2,y_train2)
y_pred4= logreg.predict(X_test2)
y_pred_test_logreg_adasyn=logreg.predict(df_testdata)
print(metrics.classification_report(y_test2, y_pred4))

              precision    recall  f1-score   support

           N       0.60      0.62      0.61     16193
           Y       0.63      0.61      0.62     17150

    accuracy                           0.62     33343
   macro avg       0.62      0.62      0.62     33343
weighted avg       0.62      0.62      0.62     33343



ADABOOST CLASSIFIER

In [132]:
from sklearn.ensemble import AdaBoostClassifier
abc=AdaBoostClassifier()
abc.fit(X_train2,y_train2)
y_pred5 = abc.predict(X_test2)
y_pred_test_adaboost_adasyn=abc.predict(df_testdata)
print(metrics.classification_report(y_test2, y_pred5))

              precision    recall  f1-score   support

           N       0.70      0.61      0.65     16193
           Y       0.67      0.75      0.71     17150

    accuracy                           0.68     33343
   macro avg       0.68      0.68      0.68     33343
weighted avg       0.68      0.68      0.68     33343



DECISION TREE CLASSIFIER

In [134]:
DTC = DecisionTreeClassifier()
DTC.fit(X_train2, y_train2)
y_pred6 = DTC.predict(X_test2)
y_pred_dectree_adasyn=DTC.predict(df_testdata)
print(metrics.classification_report(y_test2, y_pred6))


              precision    recall  f1-score   support

           N       0.73      0.70      0.72     16193
           Y       0.73      0.76      0.74     17150

    accuracy                           0.73     33343
   macro avg       0.73      0.73      0.73     33343
weighted avg       0.73      0.73      0.73     33343



RANDOM FOREST CLASSIFIER

In [135]:
randomForest= RandomForestClassifier()
randomForest.fit(X_train2,y_train2)
y_pred7 = randomForest.predict(X_test2)
y_pred_randomfo_adasyn=randomForest.predict(df_testdata)
print(metrics.classification_report(y_test2, y_pred7))

              precision    recall  f1-score   support

           N       0.84      0.79      0.82     16193
           Y       0.81      0.86      0.84     17150

    accuracy                           0.83     33343
   macro avg       0.83      0.83      0.83     33343
weighted avg       0.83      0.83      0.83     33343



COVERTING THE HIGHEST ACCURACY PREDICTION TO CSV FILE

In [138]:
predicted_values=pd.DataFrame(y_pred_test_randomfo_smote)

In [139]:
predicted_values.value_counts()

N    59160
Y    40840
dtype: int64

In [141]:
predicted_values

Unnamed: 0,0
0,N
1,N
2,N
3,N
4,N
...,...
99995,N
99996,Y
99997,N
99998,Y


In [142]:
predicted_values.to_csv('prediction.csv')