# Data Science Internship at Widhya

## Mission: Flight Delay Prediction

#### Importing Required Libraries

In [None]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics


#### Reading & Previewing dataset

In [None]:
# Load the flights CSV into a DataFrame. low_memory=False asks pandas to infer
# each column's dtype from the whole file rather than chunk by chunk.
df = pd.read_csv('/kaggle/input/flight-delays/flights.csv', low_memory=False)
# Bare expression so the notebook renders a preview of the frame.
df

#### Subsetting only 100000 rows for analysis

In [None]:
# Keep only the first 100000 rows (positional slice) to speed up the analysis.
df = df.iloc[:100000]

#### Shape of dataset

In [None]:
# Tuple of (number of rows, number of columns).
df.shape

#### Info of dataset

In [None]:
# Print the column names, dtypes, non-null counts and memory usage.
df.info()

#### Value counts of the 'DIVERTED' column, where 0 indicates not diverted and 1 indicates diverted.

In [None]:
# Count how many rows carry each distinct value of the DIVERTED column.
df.value_counts('DIVERTED')

#### Jointplot of 'SCHEDULED_ARRIVAL' and 'ARRIVAL_TIME'

In [None]:
# Joint plot of scheduled vs. actual arrival time, with each variable's
# marginal distribution drawn on the axes' margins.
sns.jointplot(data=df, x="SCHEDULED_ARRIVAL", y="ARRIVAL_TIME")

#### Correlation

In [None]:
# Pairwise correlation of the numeric columns only. At this point the frame
# still holds non-numeric columns (presumably text fields such as AIRLINE or
# TAIL_NUMBER), and pandas >= 2.0 raises on them instead of silently skipping
# them, so the numeric dtypes are selected explicitly before calling corr().
df.select_dtypes(include="number").corr()

#### Correlation of features with 'ARRIVAL_DELAY'

In [None]:
# Correlation of every numeric feature (skipping the first column, as before)
# with ARRIVAL_DELAY, strongest positive correlation first. Non-numeric
# columns are filtered out up front because pandas >= 2.0 no longer drops
# them implicitly inside corr().
(
    df.iloc[:, 1:]
    .select_dtypes(include="number")
    .corr()["ARRIVAL_DELAY"]
    .sort_values(ascending=False)
)

#### Dropping unwanted columns (i.e. not highly correlated)

In [None]:
# Remove the columns that are not carried forward into the model.
df = df.drop(
    columns=[
        'YEAR',
        'FLIGHT_NUMBER',
        'AIRLINE',
        'DISTANCE',
        'TAIL_NUMBER',
        'TAXI_OUT',
        'SCHEDULED_TIME',
        'DEPARTURE_TIME',
        'WHEELS_OFF',
        'ELAPSED_TIME',
        'AIR_TIME',
        'WHEELS_ON',
        'DAY_OF_WEEK',
        'TAXI_IN',
        'CANCELLATION_REASON',
    ]
)

In [None]:
#### Remaining columns after dropping unwanted columns

In [None]:
# Index of the column labels currently in the frame.
df.columns

#### Shape of dataset

In [None]:
# Tuple of (number of rows, number of columns).
df.shape

#### Identifying Null values in dataset

In [None]:
# Number of missing (NaN) entries per column.
df.isna().sum()

#### Replacing Null values with mean

In [None]:
# Replace missing values in each numeric column with that column's mean.
# numeric_only=True is required because the frame still contains non-numeric
# columns at this point and pandas >= 2.0 raises when asked for their mean;
# those columns are left untouched by the fill.
df = df.fillna(df.mean(numeric_only=True))

In [None]:
# Re-check the number of missing (NaN) entries per column after the fill.
df.isna().sum()

#### Correlation matrix using heatmap

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
# Non-numeric columns are excluded explicitly because pandas >= 2.0 raises on
# them inside corr() instead of dropping them.
sns.heatmap(
    df.select_dtypes(include="number").corr(),
    annot=True,
    cmap="PuBuGn",
    fmt='g',
)

#### Note that there is no dependent variable yet that indicates whether a flight is delayed or not.

#### So here I'm creating a new feature called 'result' which takes the value 0 or 1: 0 if the flight is not delayed and 1 if the flight is delayed.

#### These values are assigned using the following condition: if 'ARRIVAL_DELAY' is greater than 15, assign 1, otherwise 0.

In [None]:
# Binary target as a list of ints: 1 where ARRIVAL_DELAY is strictly greater
# than 15, otherwise 0. The comparison is vectorized over the whole column
# instead of appending row by row; a NaN compares as False and so maps to 0,
# exactly as it would in an explicit `if row > 15` test.
result = (df['ARRIVAL_DELAY'] > 15).astype(int).tolist()

In [None]:
# Attach the 0/1 labels as a new column named 'result'.
df['result'] = result

#### Preview after adding new column 'result'

In [None]:
# Show the first five rows, now including the 'result' column.
df.head()

In [None]:
#### Value counts of the 'result' column, where 0 indicates not delayed and 1 indicates delayed.

In [None]:
# Class balance: number of rows labelled 0 and number labelled 1.
df['result'].value_counts()

#### Selecting only following columns for Model Building.
>'MONTH', 'DAY', 'SCHEDULED_DEPARTURE', 'DEPARTURE_DELAY', 'SCHEDULED_ARRIVAL', 'DIVERTED', 'CANCELLED', 'AIR_SYSTEM_DELAY','SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY','WEATHER_DELAY', 'result'

In [None]:
# Drop the airport identifiers and the arrival columns; ARRIVAL_DELAY was used
# to derive 'result', so it must not remain as a feature.
df = df.drop(
    columns=[
        'ORIGIN_AIRPORT',
        'DESTINATION_AIRPORT',
        'ARRIVAL_TIME',
        'ARRIVAL_DELAY',
    ]
)
# Show the columns that go into model building.
df.columns

#### Splitting Dataset into Training and Testing with 70:30 ratio and with random_state = 42

In [None]:
# Switch to a plain NumPy array: every column except the last is a feature,
# and the last column ('result') is the target.
df = df.values
X = df[:, :-1]
y = df[:, -1]

# 70 % training / 30 % testing, seeded for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

#### Feature Scaling

In [None]:
# Standardize the features. The scaler is fitted on the training split only,
# so no statistics from the test split leak into training, and the same
# fitted transform is then applied to the test split.
# fit_transform's second positional argument is `y`, not a second dataset, so
# the test split has to be transformed in a separate call.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(X_train)
# Rebind the splits so the cells that follow actually use the scaled values.
X_train = scaled_features
X_test = scaler.transform(X_test)

#### Model: DecisionTreeClassifier

In [None]:
# Fit a decision tree on the training split. random_state is fixed (matching
# the seed used for the train/test split) because the tree permutes features
# randomly at each split; without a seed the scores below vary between runs.
clf = DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)

In [None]:
# Per-class probabilities for the test split, one column per class;
# pred[:, 1] is used below as the score for the positive class (label 1).
pred = clf.predict_proba(X_test)

#### AUC score of Model

In [None]:
# Area under the ROC curve, using the class-1 probability as the ranking score.
auc_score = roc_auc_score(y_true=y_test, y_score=pred[:, 1])
auc_score

In [None]:
# Report the AUC as a percentage rounded to two decimals.
auc_percent = round(auc_score * 100, 2)
print('AUC Score of Model is: {} %'.format(auc_percent))

#### Confusion Matrix and Classification Report

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# The confusion matrix, classification report and accuracy all compare class
# labels, so they need hard predictions. The probability column pred[:, 1] can
# hold fractional values (leaves that contain both classes), which these
# metrics reject as continuous targets.
y_pred = clf.predict(X_test)

# Rows are the actual classes, columns the predicted classes.
result1 = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(result1)
print('')

# Per-class precision, recall, F1 and support.
result2 = classification_report(y_test, y_pred)
print("Classification Report:")
print(result2)

# Fraction of test rows whose predicted label matches the actual label.
result3 = accuracy_score(y_test, y_pred)
print("Accuracy:", result3)

In [None]:
# Draw the confusion matrix held in result1 as an annotated heatmap.
class_names=[0,1]
fig,ax=plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# NOTE(review): seaborn presumably sets its own tick labels when it draws the
# heatmap, which would supersede the ticks configured above — confirm.
sns.heatmap(pd.DataFrame(result1), annot=True, cmap="PuBuGn" ,fmt='g')  # annotated cells, integer-style formatting
# Place the x-axis label above the plot and lift the title clear of it.
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

#### Receiver Operating Characteristic (ROC) curve of DecisionTreeClassifier

In [None]:
# ROC curve of the fitted decision tree on the test split.
plt.subplots(1, figsize=(10,6))
plt.title('Receiver Operating Characteristic - DecisionTree')

# Score each test row with its predicted probability of class 1.
y_pred_proba = clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr)

# Dashed diagonal: the reference line of a random classifier.
plt.plot([0, 1], ls="--")
# Grey guide lines along the left edge and the top of the plot.
plt.plot([0, 0], [1, 0], c=".7")
plt.plot([1, 1], c=".7")

plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

#### Accuracy, Precision, and Recall achieved by model.

In [None]:
# Accuracy, precision and recall are defined on class labels, so score the
# model's hard predictions rather than the probability column pred[:, 1],
# which may contain fractional values that these metrics reject.
y_pred = clf.predict(X_test)

print("Accuracy Achieved:", round(metrics.accuracy_score(y_test, y_pred) * 100, 2), '%')
print(' ')
print("Precision Achieved:", round(metrics.precision_score(y_test, y_pred) * 100, 2), '%')
print(' ')
print("Recall Achieved:", round(metrics.recall_score(y_test, y_pred) * 100, 2), '%')

### Thank you :)