In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the January 2019 and January 2020 on-time CSV files from the Kaggle input directory.
# NOTE(review): df1 (2019) is not referenced again in the visible cells; only df2 (2020) is analysed.
df1 = pd.read_csv('/kaggle/input/flight-delay-prediction/Jan_2019_ontime.csv')
df2 = pd.read_csv('/kaggle/input/flight-delay-prediction/Jan_2020_ontime.csv')

In [None]:
# Preview the first rows of the 2020 frame.
df2.head()

In [None]:
# Report the frame dimensions, then how many rows are missing a value in
# the 'Unnamed: 21' column.
frame_dims = df2.shape
print(frame_dims)
unnamed_missing = df2['Unnamed: 21'].isna().sum()
print(unnamed_missing)

In [None]:
def bar_plot(variable):
    """Draw a bar chart of value frequencies for one column of ``df2``.

    Reads the module-level ``df2`` frame at call time, plots one bar per
    distinct value of ``df2[variable]``, shows the figure, and then prints
    the column name followed by the value counts.

    Parameters
    ----------
    variable : str
        Name of the ``df2`` column to summarise.
    """
    counts = df2[variable].value_counts()
    categories = counts.index

    plt.figure(figsize=(9, 6))
    plt.bar(categories, counts.values)
    # Put one tick on every category and label it with the category value.
    plt.xticks(categories, categories.values)
    plt.ylabel("Frequency")
    plt.title(variable)
    plt.show()
    print("{} \n {}".format(variable, counts))

In [None]:
# Class balance of the CANCELLED column before any preprocessing.
bar_plot('CANCELLED')

In [None]:
# List the column labels and then the number of columns.
all_columns = df2.columns
print(all_columns)
print(len(all_columns))

In [None]:
# Print dtypes, non-null counts and memory usage for df2.
df2.info()

In [None]:
# Print the number of missing values in every column of df2.
column_names = df2.columns
for column in column_names:
    missing_total = df2[column].isnull().sum()
    print('{} has got {} Null Sample'.format(column, missing_total))

In [None]:
import missingno as msno
# Open a 4x4 figure, then draw missingno's per-column bar chart for df2.
# NOTE(review): whether msno.bar draws into this figure or creates its own
# depends on the missingno version - confirm the rendered size is as intended.
plt.figure(figsize=(4,4))
msno.bar(df2)

In [None]:
# missingno heatmap for df2 (relies on `msno` imported in the previous cell).
msno.heatmap(df2)

In [None]:
# Data preprocessing: remove the 'Unnamed: 21' column and show the
# resulting dimensions.
df2 = df2.drop(columns=['Unnamed: 21'])
df2.shape

In [None]:
# Discard rows without a TAIL_NUM, then confirm none remain and report
# the new dimensions.
df2 = df2.dropna(subset=['TAIL_NUM'])
tail_num_missing = df2['TAIL_NUM'].isna().sum()
print(tail_num_missing)
print(df2.shape)

In [None]:
# Fill missing DEP_DEL15 flags with 0.
# fillna is used instead of replace(np.NaN, 0): the `np.NaN` alias was
# removed in NumPy 2.0 and raises AttributeError there.
# NOTE(review): presumably the flag is missing for flights that never
# departed; confirm that 0 is the intended value for those rows.
df2['DEP_DEL15'] = df2['DEP_DEL15'].fillna(0)
df2['DEP_DEL15'].isnull().sum()

In [None]:
# Fill missing ARR_DEL15 flags with 0.
# fillna is used instead of replace(np.NaN, 0): the `np.NaN` alias was
# removed in NumPy 2.0 and raises AttributeError there.
# NOTE(review): presumably the flag is missing for flights that never
# arrived; confirm that 0 is the intended value for those rows.
df2['ARR_DEL15'] = df2['ARR_DEL15'].fillna(0)
df2['ARR_DEL15'].isnull().sum()

In [None]:
from sklearn.impute import SimpleImputer

# Replace missing DEP_TIME and ARR_TIME values with the mean of the
# respective column. The same imputer object is refit for each column.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
for time_column in ('DEP_TIME', 'ARR_TIME'):
    df2[time_column] = imp_mean.fit_transform(df2[[time_column]])

In [None]:
# Re-check the number of missing values per column after preprocessing.
column_names = df2.columns
for column in column_names:
    missing_total = df2[column].isnull().sum()
    print("  {} has got {} NaN Sample ".format(column, missing_total))

In [None]:
import seaborn as sns

# Correlation heatmap over the numeric columns only.
# df2 still contains string columns at this point (they are one-hot encoded
# in a later cell); since pandas 2.0 DataFrame.corr() no longer silently
# skips them and raises instead, so select the numeric columns explicitly.
numeric_corr = df2.select_dtypes(include=[np.number]).corr()

f, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(numeric_corr, linewidths=.5, annot=True, fmt='.4f', ax=ax)
plt.show()

In [None]:
# Remove both airport sequence-id columns in a single call and report the
# resulting dimensions.
df2 = df2.drop(columns=['DEST_AIRPORT_SEQ_ID', 'ORIGIN_AIRPORT_SEQ_ID'])
print(df2.shape)

In [None]:
# Class balance of CANCELLED again, now on the preprocessed df2.
bar_plot('CANCELLED')

In [None]:
# Separate the target (CANCELLED) from the remaining feature columns.
y = df2['CANCELLED']
df2 = df2.drop(columns=['CANCELLED'])
X = df2

In [None]:
# One-hot encode every categorical column and replace the originals in df2.
#
# The previous loop overwrote X_encoded on each iteration, so only the last
# column's dummies (DEP_TIME_BLK) were concatenated while the other five
# columns were dropped without being encoded. Each column's dummies are now
# kept, and prefixed with the source column name so that values shared
# between columns (e.g. the same code appearing in ORIGIN and DEST) do not
# produce duplicate feature names.
#
# NOTE(review): TAIL_NUM presumably has very high cardinality; one-hot
# encoding it may create thousands of columns and exhaust memory. Consider
# dropping it or using a different encoding if that is the case.
categorical_columns = ['OP_CARRIER','OP_UNIQUE_CARRIER','TAIL_NUM','ORIGIN','DEST','DEP_TIME_BLK']

X_encoded = pd.concat(
    [pd.get_dummies(X[col], prefix=col, prefix_sep='_') for col in categorical_columns],
    axis=1,
)
df2 = pd.concat([df2.drop(columns=categorical_columns), X_encoded], axis=1)

In [None]:
# Rebind X to the current df2 so it reflects the encoding cell above.
X = df2

In [None]:
from sklearn.model_selection import train_test_split

# Shuffle and hold out 30% of the rows for evaluation; the seed is fixed so
# the split is repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree with a fixed seed on the training split.
clf_dt = DecisionTreeClassifier(random_state=0)
model_dt = clf_dt.fit(X=X_train, y=y_train)

In [None]:
from sklearn import tree

# Draw only the top levels of the fitted tree. Rendering the full, unpruned
# tree is slow and produces an unreadable figure, and leaving plot_tree as
# the last expression dumps its list of annotation objects as cell output.
fig_tree, ax_tree = plt.subplots(figsize=(20, 10))
tree.plot_tree(
    model_dt,
    max_depth=3,
    feature_names=list(X_train.columns),
    filled=True,
    fontsize=8,
    ax=ax_tree,
)
ax_tree.set_title("Decision tree (top 3 levels)")
plt.show()

In [None]:
from sklearn import metrics

# Predict on the held-out rows with the decision tree and print the
# classification report.
y_pred = model_dt.predict(X_test)
dt_report = metrics.classification_report(y_test, y_pred)
print(dt_report)

In [None]:
# Class counts in the held-out target, for reading the report above.
y_test.value_counts()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest with trees capped at depth 50.
# random_state is fixed so the forest is reproducible across runs, matching
# the seeded train/test split and decision tree.
clf_rf = RandomForestClassifier(max_depth=50, random_state=0)
model_rf = clf_rf.fit(X_train, y_train)

In [None]:
from sklearn import metrics

# Predict on the held-out rows with the random forest and print the
# classification report.
y_pred = model_rf.predict(X_test)
rf_report = metrics.classification_report(y_test, y_pred)
print(rf_report)

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Fit an AdaBoost classifier on the training split.
# The cell previously instantiated RandomForestClassifier despite importing
# AdaBoostClassifier, so "model_ab" was a second random forest.
# random_state is fixed for reproducibility.
clf_ab = AdaBoostClassifier(random_state=0)
model_ab = clf_ab.fit(X_train, y_train)

In [None]:
from sklearn import metrics

# Predict on the held-out rows with model_ab and print the classification
# report.
y_pred = model_ab.predict(X_test)
ab_report = metrics.classification_report(y_test, y_pred)
print(ab_report)

In [None]:
import xgboost as xgb

# Fit an XGBoost classifier with library-default settings on the training
# split.
clf_xgb = xgb.XGBClassifier()
model_xgb = clf_xgb.fit(
    X_train,
    y_train,
)

In [None]:
from sklearn import metrics

# Predict on the held-out rows with the XGBoost model and print the
# classification report.
y_pred = model_xgb.predict(X_test)
xgb_report = metrics.classification_report(y_test, y_pred)
print(xgb_report)