In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))


import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import cm
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report , recall_score ,  precision_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
import seaborn as sns

# **Data Preparation:**

Let's prepare our data for modeling.

In [None]:
# Load the January 2019 on-time records into a DataFrame and preview the first rows.
JAN_2019_CSV_PATH = '/kaggle/input/flight-delay-prediction/Jan_2019_ontime.csv'
flight_details_janury_2019 = pd.read_csv(JAN_2019_CSV_PATH)

flight_details_janury_2019.head()

In [None]:
# List the column names of the raw data to see which fields are available.
flight_details_janury_2019.columns

In [None]:
# Show each column's dtype and non-null count.
flight_details_janury_2019.info()

Let's understand what we are trying to predict:
* We have a dataset of flights from January 2019 and we want to predict whether a flight will be delayed or not.
* In our dataset we have 2 delay columns: one for departure delay [DEP_DEL15] and one for arrival delay [ARR_DEL15].
* We want to predict whether a flight will be delayed at any point - on arrival or on departure. So let's create a new target column named "DELAYED" with 2 possible values:

1.  DELAYED = 1 - the flight is delayed
2.  DELAYED = 0 - the flight is not delayed

In [None]:
# A flight is labelled delayed (1) when its departure OR its arrival flag equals 1.
# The flags are compared with 1 explicitly instead of using `.astype(bool)`:
# casting NaN to bool yields True, which would mark every row with a missing
# flag as delayed (presumably cancelled/diverted flights, which are only removed
# in a later cell - confirm).
flight_details_janury_2019['DELAYED'] = (
    flight_details_janury_2019['ARR_DEL15'].eq(1)
    | flight_details_janury_2019['DEP_DEL15'].eq(1)
).astype(int)


We want to remove the cancelled and diverted flights, because we are only interested in flights that successfully departed and reached their destination.

In [None]:
rows_before = flight_details_janury_2019.shape[0]
print("The number of rows before deleted 'Cancelled' column and `DIVERTED` is " + str(rows_before) )

# Remove, in place, the rows flagged as cancelled and then the rows flagged as diverted.
for flag_column in ('CANCELLED', 'DIVERTED'):
    flagged_rows = flight_details_janury_2019.index[flight_details_janury_2019[flag_column] == 1]
    flight_details_janury_2019.drop(index=flagged_rows, inplace=True)

rows_after = flight_details_janury_2019.shape[0]
print("The number of rows after deleted 'Cancelled' column and `DIVERTED` is " + str(rows_after) )


Let's get rid of columns that are not useful and do not affect our results:
* **OP_CARRIER_AIRLINE_ID** - the numeric id of the airline. We use the airline code instead, because it is easier to read in correlations and plots; later we convert the code to a numeric value.
* **TAIL_NUM** - doesn't give us any useful information
* **OP_CARRIER_FL_NUM** - doesn't give us any useful information
* **ORIGIN_AIRPORT_ID**, **ORIGIN_AIRPORT_SEQ_ID**, **DEST_AIRPORT_ID**, **DEST_AIRPORT_SEQ_ID** - we keep the origin and destination airport codes instead, which are easier to understand in plots.
* **Unnamed: 21** - irrelevant column
* **OP_CARRIER** - same as OP_UNIQUE_CARRIER
* **DEP_DEL15** - already combined with ARR_DEL15 into the DELAYED column
* **ARR_DEL15** - already combined with DEP_DEL15 into the DELAYED column
* **CANCELLED** - we only look at delayed flights, so cancelled flights are not relevant
* **DIVERTED** - we only look at delayed flights, so diverted flights are not relevant

In [None]:
# Identifier, duplicate and already-consumed columns that are not used from here on.
columns_to_drop = [
    'OP_CARRIER_AIRLINE_ID',
    'TAIL_NUM',
    'OP_CARRIER_FL_NUM',
    'ORIGIN_AIRPORT_ID',
    'ORIGIN_AIRPORT_SEQ_ID',
    'DEST_AIRPORT_ID',
    'DEST_AIRPORT_SEQ_ID',
    'Unnamed: 21',
    'OP_CARRIER',
    'ARR_DEL15',
    'DEP_DEL15',
    'CANCELLED',
    'DIVERTED',
]
flight_details_janury_2019.drop(columns=columns_to_drop, inplace=True)


**Lets check distribution of our target variable:**


In [None]:
# Count how many rows fall in each DELAYED class (0 / 1).
flight_details_janury_2019['DELAYED'].value_counts()

We can see that there is a large difference between the number of rows with value 1 and with value 0, so we should reduce the number of rows with value 0 (undersample the majority class).

In [None]:
# Split the data into positive (delayed) and negative (not delayed) rows.
pos = flight_details_janury_2019.loc[flight_details_janury_2019.DELAYED == 1]
neg = flight_details_janury_2019.loc[flight_details_janury_2019.DELAYED == 0]

# Undersample to the size of the smaller class so that both classes end up with
# the same number of rows, whichever of the two is the majority.
# A fixed seed keeps the sampled rows identical across "Restart & Run All".
n_per_class = min(len(pos), len(neg))
data = pd.concat(
    [
        pos.sample(n=n_per_class, random_state=42),
        neg.sample(n=n_per_class, random_state=42),
    ],
    axis=0,
)

# Shuffle the rows (again with a fixed seed) and rebuild a clean 0..n-1 index.
flight_details_janury_2019 = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Re-check the class counts after balancing.
flight_details_janury_2019['DELAYED'].value_counts()

Let's rename columns where it makes them more readable:

In [None]:
# Rename OP_UNIQUE_CARRIER to AIRLINE_CODE; the rest of the notebook uses the new name.
flight_details_janury_2019 = flight_details_janury_2019.rename(columns={"OP_UNIQUE_CARRIER": "AIRLINE_CODE"})

Check for null/NaN values:

In [None]:
# Number of missing values per column.
flight_details_janury_2019.isna().sum()

The data is clean: we don't have any null values.

**Summary**:

In [None]:
# Print a heading, then the dtype / non-null summary of the prepared data.
print("The Data types is:")
flight_details_janury_2019.info()

In [None]:
# Report the number of rows and columns of the prepared data.
print("Our final data include: " + str(flight_details_janury_2019.shape[0]) + " Rows and " + str(flight_details_janury_2019.shape[1]) + " Columns" )

In [None]:
 # Preview the first rows of the prepared data.
 flight_details_janury_2019.head()

**Final Data Format:**

After carefully analyzing each column, this is the final data:

* DAY_OF_MONTH - Day of Month
* DAY_OF_WEEK - Day of Week
* AIRLINE_CODE - Airline Carrier Code
* ORIGIN - Origin airport location
* DEST - Destination airport location
* DEP_TIME - Actual Departure Time (local time: hhmm)
* DEP_TIME_BLK - Departure Time Block (hhmm-hhmm)
* ARR_TIME - Actual Arrival Time (local time: hhmm)
* DISTANCE - Distance between airports (miles)
* DELAYED - Target label - 1 if the flight was delayed, else 0

# **Exploratory Data Analysis**


Histograms:

In [None]:

# Draw one histogram per numeric column on a 15x14 inch figure.
flight_details_janury_2019.hist(figsize= (15, 14))

In [None]:
 # Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
 flight_details_janury_2019.describe()

**Let's look for correlations between the features and our target to better understand the data
and learn more about our features.**



First, let's see whether the number of delays differs between airlines; the result can tell us whether there is a problem with a specific company.

In [None]:
# Number of delayed flights per airline, ordered from most to fewest delays.
count_delayed = (
    flight_details_janury_2019
    .groupby('AIRLINE_CODE')['DELAYED']
    .apply(lambda flags: (flags == 1).sum())
    .reset_index(name='Number Delayed')
)

# 30 shades taken from the reversed "inferno" colormap, used to colour the bars.
color = cm.inferno_r(np.linspace(.4, .8, 30))

count_delayed = count_delayed.sort_values("Number Delayed", ascending=False)
count_delayed.plot.bar(x='AIRLINE_CODE', y='Number Delayed', color=color, figsize=(12, 7))


Relationship between the day of the month and the number of delays:

In [None]:
# Number of delayed flights for each day of the month, drawn as a line chart.
monthly_days_delayed = (
    flight_details_janury_2019
    .groupby('DAY_OF_MONTH')['DELAYED']
    .apply(lambda flags: (flags == 1).sum())
    .reset_index(name='Number Delayed')
)

fig, ax = plt.subplots(figsize=(10, 6))
ax.set_xticks(monthly_days_delayed['DAY_OF_MONTH'])  # one tick per day
ax.plot(monthly_days_delayed['DAY_OF_MONTH'], monthly_days_delayed['Number Delayed'])
ax.set_ylabel('Delayed')
ax.set_xlabel('Day in month')
plt.show()

Now let's see if there is any correlation between distance and delays:

In [None]:
# Mean flight distance of the delayed flights ...
delayed_mask = flight_details_janury_2019['DELAYED'] == 1
avg_distance_delay = flight_details_janury_2019.loc[delayed_mask, 'DISTANCE'].values.mean()
# ... and of the flights that were not delayed.
on_time_mask = flight_details_janury_2019['DELAYED'] == 0
avg_distance_without_delay = flight_details_janury_2019.loc[on_time_mask, 'DISTANCE'].values.mean()

print("Avergae Distance with delay: " + str(avg_distance_delay) + " mile")
print("Avergae Distance without delay: "+ str(avg_distance_without_delay) +" mile")

# Pie chart whose two slices are the two mean distances (share of their sum).
labels = ['Distance With Delay', 'Distance Without Delay']
sizes = [avg_distance_delay, avg_distance_without_delay]
colors = ['yellowgreen', 'gold']
texts = plt.pie(
    sizes,
    colors=colors,
    shadow=True,
    startangle=90,
    autopct='%1.1f%%',
)
plt.legend(labels, loc="best")
plt.axis('equal')  # keep the pie circular
plt.tight_layout()
plt.show()

Let's see which day of the week has the highest number of delays:

In [None]:
# Number of delayed flights per day of the week, ordered by the numeric day code.
# (The original called sort_values() without keeping its result, so it had no effect.)
days_values = (
    flight_details_janury_2019
    .groupby('DAY_OF_WEEK')['DELAYED']
    .apply(lambda flags: (flags == 1).sum())
    .reset_index(name='Number Delayed')
    .sort_values('DAY_OF_WEEK')
)

# Translate the numeric code into a weekday label. The BTS on-time data encodes
# DAY_OF_WEEK as 1 = Monday ... 7 = Sunday (L_WEEKDAYS lookup table); the previous
# mapping started the week on Sunday and used 'Thu' for 3 and 'Thr' for 5.
weekday_labels = {1: 'Mon', 2: 'Tue', 3: 'Wed', 4: 'Thu', 5: 'Fri', 6: 'Sat', 7: 'Sun'}
days_values['DAY_OF_WEEK'] = days_values['DAY_OF_WEEK'].map(weekday_labels)

df = pd.DataFrame({'Days': days_values['DAY_OF_WEEK'], 'Delayed': days_values['Number Delayed']})
ax = df.plot.barh(x='Days', y='Delayed', figsize=(12, 7))
ax.set_xlabel('Number of delayed flights')

We have to encode our categorical variables before we move on to modeling:
* AIRLINE_CODE (originally OP_UNIQUE_CARRIER)
* ORIGIN
* DEST
* DEP_TIME_BLK

In [None]:
def encode_categories(features):
    """Label-encode columns of ``flight_details_janury_2019`` in place.

    Each column named in ``features`` is replaced by the integer codes produced
    by ``LabelEncoder.fit_transform``. One encoder object is reused and refitted
    for every column, so no fitted encoder is kept afterwards.

    Args:
        features: sequence of column names to encode.
    """
    encoder = LabelEncoder()
    for column in features:
        encoded_values = encoder.fit_transform(flight_details_janury_2019[column])
        flight_details_janury_2019[column] = encoded_values


In [None]:
# Convert the text columns to integer codes, then confirm the resulting dtypes.
categorical_columns = ['AIRLINE_CODE', 'ORIGIN', 'DEST', 'DEP_TIME_BLK']
encode_categories(categorical_columns)
flight_details_janury_2019.info()

**Correlations between our features:**

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
correlation_matrix = flight_details_janury_2019.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
plt.show()

# Modeling

Let's first create the train set and the test set:

In [None]:
# Build the model inputs: X holds the feature columns as a NumPy array,
# y holds the DELAYED target. The train/test split itself happens in the next cell.
# NOTE(review): this notebook describes DEP_TIME and ARR_TIME as the *actual*
# departure/arrival times, so they may leak the outcome being predicted - confirm
# whether they would be known at prediction time.
feature_names = ['DAY_OF_MONTH','DAY_OF_WEEK','AIRLINE_CODE','ORIGIN','DEST','DEP_TIME','DEP_TIME_BLK','ARR_TIME','DISTANCE']
X =  flight_details_janury_2019[feature_names].values
y =  flight_details_janury_2019['DELAYED'].values

In [None]:
# Split the data: 70% for training, 30% held out for testing.
# random_state=42 makes the split itself repeatable for a given X and y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



**Random Forest**

In [None]:
# Random forest with a capped depth and minimum leaf/split sizes.
# random_state is fixed (same seed as the train/test split) so the fitted model
# and every score derived from it are reproducible across runs.
rf = RandomForestClassifier(
    n_estimators=350,
    max_depth=14,
    min_samples_leaf=15,
    min_samples_split=5,
    n_jobs=-1,  # use all available CPU cores
    random_state=42,
)
rf.fit(X_train, y_train)

**GradientBoosting**

In [None]:
# Gradient boosting with default hyper-parameters.
# random_state is fixed so the fitted model is reproducible across runs.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)

**Decision Tree**

In [None]:
# Single decision tree limited to depth 15.
# random_state is fixed so tie-breaking between equally good splits is reproducible.
dt = DecisionTreeClassifier(max_depth=15, random_state=42)
dt.fit(X_train, y_train)

**AdaBoostClassifier**

In [None]:
# AdaBoost with default hyper-parameters.
# random_state is fixed so the fitted model is reproducible across runs.
ab = AdaBoostClassifier(random_state=42)
ab.fit(X_train, y_train)

# Evaluate

Evaluating the accuracy of our models

In [None]:
# Accuracy of each model
def get_accuracy(model):
    """Return the test-set accuracy of a model and print its train/test comparison.

    Args:
        model: sequence whose first item is a fitted estimator and whose second
            item is its display name (the sequence is passed on unchanged to
            ``check_overfitting``).

    Returns:
        Accuracy of the estimator's predictions on ``X_test`` against ``y_test``.
    """
    estimator = model[0]
    test_predictions = estimator.predict(X_test)
    check_overfitting(model)  # prints train vs. test accuracy as a side effect
    return accuracy_score(y_test, test_predictions)

In [None]:
def check_overfitting(model):
    """Print the train and test accuracy of a model so the two can be compared.

    Args:
        model: sequence whose first item is a fitted estimator and whose second
            item is the name printed in the heading.
    """
    estimator, display_name = model[0], model[1]
    test_predictions = estimator.predict(X_test)
    train_predictions = estimator.predict(X_train)

    train_accuracy = accuracy_score(y_train, train_predictions)
    test_accuracy = accuracy_score(y_test, test_predictions)

    print('Checking '+ display_name + ' Overffiting:')
    print('Train Accuracy ' + str(train_accuracy))
    print('Test Accuracy ' + str(test_accuracy))
    print('--------------------------')

In [None]:
# Plot the confusion matrix for each model:
def get_confusion_matrix(model):
    """Plot a model's confusion matrix on the test set and print precision/recall.

    Args:
        model: sequence ``[estimator, title, colormap]`` - a fitted estimator,
            the title shown above the plot, and the matplotlib colormap to use.
    """
    from sklearn.metrics import ConfusionMatrixDisplay

    estimator, title, colormap = model[0], model[1], model[2]
    class_names = ['Delay-False', 'Delay-true']

    # `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2;
    # `ConfusionMatrixDisplay.from_estimator` is its replacement. Fall back to the
    # old helper only on installations that predate the new API.
    if hasattr(ConfusionMatrixDisplay, 'from_estimator'):
        disp = ConfusionMatrixDisplay.from_estimator(
            estimator, X_test, y_test,
            display_labels=class_names, values_format='d',
            cmap=colormap)
    else:
        from sklearn.metrics import plot_confusion_matrix
        disp = plot_confusion_matrix(
            estimator, X_test, y_test,
            display_labels=class_names, values_format='d',
            cmap=colormap)

    # Predict once and reuse the result for both metrics.
    test_predictions = estimator.predict(X_test)
    precision = precision_score(y_test, test_predictions, average='binary')
    recall = recall_score(y_test, test_predictions, average='binary')
    print('Avg Precision:' +  str(precision))
    print('Avg Recall:' + str(recall))

    disp.ax_.set_title(title)
    plt.show()

In [None]:
### Save the accuracy
# Each entry is [fitted estimator, display name, colormap for its confusion matrix].
models = [
    [rf, 'Random Forest', plt.cm.Blues],
    [gb, 'Gradient Boosting', plt.cm.Greens],
    [dt, 'Decision Tree', plt.cm.Reds],
    [ab, 'AdaBoost', plt.cm.Oranges],
]
# Test accuracy per model, in the same order as `models`.
accuracy = [get_accuracy(entry) for entry in models]

Show the confusion matrix for each model, together with its recall and precision:

In [None]:
# Draw one confusion matrix per model and print its precision and recall.
for model in models:
    get_confusion_matrix(model)

Plot the accuracy of each model

In [None]:
# Bar chart comparing the test accuracy of the four models.
model_names = ['Random Forest', 'Gradient Boosting', 'Decision Tree', 'AdaBoost']
accuracy_dic = dict(zip(model_names, accuracy))

fig, ax = plt.subplots(figsize=(15, 5))
sns.barplot(x=model_names, y=accuracy, ax=ax)

# Write each accuracy (3 decimals) just above the top of its bar.
for bar, score in zip(ax.patches, accuracy_dic.values()):
    label_x = bar.get_x() + bar.get_width() / 2
    label_y = bar.get_y() + bar.get_height() + 0.008
    ax.text(label_x, label_y, round(score, 3), ha="center")

ax.set_xlabel("Models")
ax.set_ylabel("Accuracy")
ax.set_title("Model vs. Accuracy")
plt.show()

In [None]:
# Per-class precision / recall / F1 of the random forest on the test set.
# target_names follow the sorted label order: label 0 is "not delayed" and label 1
# is "delayed" (the names were previously swapped, mislabelling both report rows).
print(classification_report(y_test, rf.predict(X_test), target_names=['Not Delayed', 'Delayed']))

**Feature Importance:**

In [None]:
# Importance of each feature in the fitted random forest, most important first.
importances = rf.feature_importances_
# Label the bars with `feature_names`, the list X was built from. The DataFrame's
# own columns also contain the DELAYED target and are not guaranteed to be in the
# same order as the model's inputs.
features = list(feature_names)
indices = np.argsort(importances)[::-1]

names = [features[i] for i in indices]

plt.figure(figsize=(15, 5))
plt.bar(range(len(indices)), importances[indices])
plt.xticks(range(len(indices)), names, rotation=30, fontsize=10)
plt.ylabel("Importance")
plt.title("Feature Importance")
plt.show()

We see that Random Forest gives us the best accuracy. Let's try changing the parameters of the Random Forest; maybe we will get a better result.

**Hyperparameter tuning for RandomForest**

The optimization takes about 20 minutes, so it is not run here to avoid a long run time; you can see the code and the result in the report.



Test the model again after the optimization, using the parameters it found:

In [None]:
# Keep the accuracy of the first random forest (first entry of `accuracy`).
rf_old_accur = accuracy[0]

# Refit the random forest with the tuned hyper-parameters.
# random_state is fixed so the before/after comparison is reproducible.
rf = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=5,
    max_features='sqrt',
    max_depth=45,
    random_state=42,
)
rf.fit(X_train, y_train)

pred = rf.predict(X_test)
rf_new_accur = accuracy_score(y_test, pred)

print("The Accuracy of RandomForest Model before tuning: " + str(rf_old_accur))
print("The Accuracy of RandomForest Model after tuning: " + str(rf_new_accur))

# Relative improvement measured against the OLD accuracy (the baseline).
# The previous formula divided by the new accuracy, which understates the change.
relative_increase = (rf_new_accur - rf_old_accur) / rf_old_accur * 100
print("Increase of : " + str(relative_increase) + ' %')