In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn import svm;
from sklearn.linear_model import LinearRegression;
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [None]:
import os

# Show every file available under the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# Load the January on-time files for 2019 and 2020.
df_2019 = pd.read_csv('/kaggle/input/flight-delay-prediction/Jan_2019_ontime.csv')
df_2020 = pd.read_csv('/kaggle/input/flight-delay-prediction/Jan_2020_ontime.csv')

## Problem definition.

Predict whether a particular flight will be delayed or not. The data refer to flights from January-19 
and January-20, so we can use the data to predict flight delays in January for the next period (year-2020).
* Binary classification problem.
* 21 variables per dataset.
* Dataset with flights from Jan-19 and Jan-20.
* Variable response is 'ARR_DEL15'

Variable dictionary:
'DAY_OF_MONTH': Day of the month.

'DAY_OF_WEEK': Day of the week.

'OP_UNIQUE_CARRIER': Unique transport code.

'OP_CARRIER_AIRLINE_ID': Unique aviation operator code.

'OP_CARRIER': IATA code of the operator.

'TAIL_NUM': Tail number.

'OP_CARRIER_FL_NUM': Flight number.

'ORIGIN_AIRPORT_ID': Origin airport ID.

'ORIGIN_AIRPORT_SEQ_ID': Origin airport ID - SEQ.

'ORIGIN': Airport of Origin.

'DEST_AIRPORT_ID': ID of the destination airport.

'DEST_AIRPORT_SEQ_ID': Destination airport ID - SEQ.

'DEST': Destination airport.

'DEP_TIME': Flight departure time.

'DEP_DEL15': Departure delay indicator

'DEP_TIME_BLK': Time block (hour) in which the flight departure falls.

'ARR_TIME': Flight arrival time.

'ARR_DEL15': Arrival delay indicator.

'CANCELLED': Flight cancellation indicator.

'DIVERTED': Indicator if the flight has been diverted.

'DISTANCE': Distance between airports.
 

In [None]:
# Check that both yearly files expose the same set of columns.
print(set(df_2020.columns) == set(df_2019.columns))

# Tag every row with its year so the two periods stay distinguishable once stacked.
df_2019['year'] = 2019
df_2020['year'] = 2020

# ignore_index=True gives the stacked frame a fresh 0..n-1 index; without it the
# row labels of the two source frames would be carried over and repeat.
dataset = pd.concat([df_2019, df_2020], ignore_index=True)
print(dataset.shape)

# Remove the per-year frames to avoid memory problems.
del df_2019
del df_2020

# The preview must be the last expression of the cell, otherwise the notebook
# evaluates it but never renders it.
dataset.head()

We will remove the variables that have no relationship with delay.

In [None]:
# Carrier/airport identifier columns and the 'Unnamed: 21' column are discarded;
# the flight number becomes the row index.
unused_columns = [
    'OP_UNIQUE_CARRIER',
    'OP_CARRIER_AIRLINE_ID',
    'OP_CARRIER',
    'TAIL_NUM',
    'ORIGIN_AIRPORT_ID',
    'ORIGIN_AIRPORT_SEQ_ID',
    'DEST_AIRPORT_ID',
    'DEST_AIRPORT_SEQ_ID',
    'Unnamed: 21',
]
data = dataset.drop(columns=unused_columns).set_index('OP_CARRIER_FL_NUM')
data.head()

We will check every column for missing values (NAs).

In [None]:
# Number of missing values per column.
data.isnull().sum()

In [None]:
# Preview the rows whose departure time is missing.
data.loc[data['DEP_TIME'].isna()].head()

Note that all rows with a missing DEP_TIME have cancelled status. We will verify this further.

In [None]:
# `DEP_TIME != DEP_TIME` is true only where DEP_TIME is NaN (NaN never equals itself),
# so this selects flights with a missing departure time that were NOT cancelled.
data.query("DEP_TIME != DEP_TIME & CANCELLED == 0").head()

In [None]:
# Preview the rows whose departure-delay indicator is missing.
data.loc[data['DEP_DEL15'].isna()].head()

Also note that all rows with a missing DEP_DEL15 have cancelled status. This is the same case as DEP_TIME. We will verify this further.

In [None]:
# `DEP_DEL15 != DEP_DEL15` is true only where DEP_DEL15 is NaN, so this selects
# flights with a missing departure-delay indicator that were NOT cancelled.
data.query("DEP_DEL15 != DEP_DEL15 & CANCELLED == 0").head()

In [None]:
# Preview the rows whose arrival time is missing.
data.loc[data['ARR_TIME'].isna()].head()

ARR_TIME shows the same pattern as DEP_TIME and DEP_DEL15.
This makes sense because a cancelled flight has no departure time, no departure delay and no arrival time.

In [None]:
# Preview the rows whose arrival-delay indicator is missing.
data.loc[data['ARR_DEL15'].isna()].head()

Note that ARR_DEL15 is missing in both cases: cancelled and diverted flights. We will investigate this further.

In [None]:
# `ARR_DEL15 != ARR_DEL15` is true only where ARR_DEL15 is NaN, so this selects flights
# with a missing arrival-delay indicator that were neither cancelled nor diverted.
data.query("ARR_DEL15 != ARR_DEL15 & CANCELLED == 0 & DIVERTED == 0").head()

This confirms that ARR_DEL15 is missing only when the flight was cancelled or diverted; no other flight lacks an arrival-delay value.
We will now check whether the same condition applies to the departure delay.

In [None]:
# Same check for the departure-delay indicator: rows where DEP_DEL15 is NaN
# (it is not equal to itself) although the flight was neither cancelled nor diverted.
data.query("DEP_DEL15 != DEP_DEL15 & CANCELLED == 0 & DIVERTED == 0").head()

This verifies the hypothesis above.

## Exploratory Analysis

1- How many flights are delayed on arrival and departure?

2- How many flights have diverted but not delayed on arrival and departure?

In [None]:
# Q1: frequency of each value of the departure and arrival delay indicators.
f, (ax, ax1) = plt.subplots(1, 2, figsize=(12, 6))

# The column is passed through the `x=` keyword: in recent seaborn releases the
# first positional argument of countplot is `data`, so a positional Series is no
# longer interpreted as the variable to count.
dep = sns.countplot(x=data['DEP_DEL15'], ax=ax)
dep.set_title('Depatures')
dep.set_xlabel('Labels')
dep.set_ylabel('Freq')

arr = sns.countplot(x=data['ARR_DEL15'], ax=ax1)
arr.set_title('Arrivals')
arr.set_xlabel('Labels')
arr.set_ylabel('Freq');  # trailing ';' hides the Text(...) repr under the figure

In [None]:
# Q2: how many flights were diverted but not delayed on arrival and departure?

# Make sure both delay indicators are floats before summing them.
delay_flags = ['DEP_DEL15', 'ARR_DEL15']
for flag in delay_flags:
    data[flag] = data[flag].astype('float')

# Total of each delay indicator, split by the DIVERTED flag.
f_delay_div_cal = data.groupby('DIVERTED').agg(dict.fromkeys(delay_flags, sum)).head()

f_delay_div_cal.plot(kind='bar')

This shows, only Diverted flights have affected Departure Delay

In [None]:
# Our target is the delay indicator, so we keep only the rows where DEP_DEL15 is
# not NaN: `DEP_DEL15 == DEP_DEL15` is false exactly for NaN values.
data = data.query("DEP_DEL15 == DEP_DEL15 ")

In [None]:
# Number of missing values per column after the filter.
data.isnull().sum()

Before imputing null values we need to create new features from the existing ones.

In [None]:
# (low, high, label) bounds, both ends inclusive, for every arrival-time block.
_ARR_TIME_BLOCKS = (
    (0, 0, '0000-0000'),
    (1, 559, '0001-0559'),
    # One block per hour from 06:00 to 22:59, e.g. (600, 659, '0600-0659').
    *((hour * 100, hour * 100 + 59, f'{hour:02d}00-{hour:02d}59') for hour in range(6, 23)),
    (2300, 2400, '2300-2400'),
)


def arr_time(x):
    """Return the label of the time block that contains the HHMM value ``x``.

    Parameters
    ----------
    x : number
        Time encoded as HHMM (e.g. 1430). The value 0 has its own block,
        '0000-0000'.

    Returns
    -------
    str or None
        The block label such as '0600-0659', or None when ``x`` falls inside
        no block (for example 660, 2401, a negative number or NaN).
    """
    for low, high, label in _ARR_TIME_BLOCKS:
        if low <= x <= high:
            return label
    return None

In [None]:
# Create ARR_TIME_BLOCK from the arrival time.
# The filled column is assigned back explicitly: calling fillna(..., inplace=True)
# on `data['ARR_TIME']` is a chained in-place update, which under pandas
# Copy-on-Write only modifies a temporary object and leaves `data` untouched
# (the NaNs would then make astype('int') fail).
# Missing arrival times become 0, which arr_time maps to the '0000-0000' block.
data['ARR_TIME'] = data['ARR_TIME'].fillna(0).astype('int')
data['ARR_TIME_BLOCK'] = data['ARR_TIME'].apply(arr_time)

# Turn the index back into a regular column.
data = data.reset_index()
data.head()

In [None]:
# Total of ARR_DEL15 within each DEP_TIME_BLK, largest total first.
count_time_blk = (
    data[['DEP_TIME_BLK', 'ARR_DEL15']]
    .groupby('DEP_TIME_BLK')
    .sum()
    .sort_values(by='ARR_DEL15', ascending=False)
    .reset_index()
)

# Attach the per-block total to every flight. The merge suffixes the clashing
# ARR_DEL15 columns with _x (flight level) and _y (block total), which are renamed.
data1 = data.merge(count_time_blk, on='DEP_TIME_BLK').rename(
    columns={'ARR_DEL15_y': 'quant_dep_time_blk', 'ARR_DEL15_x': 'ARR_DEL15'}
)
data1.head()


Number of delays DEP_DEL15 per ORIGIN.

In [None]:
# Total of DEP_DEL15 for each ORIGIN, largest total first.
count_later_origin = (
    data[['ORIGIN', 'DEP_DEL15']]
    .groupby('ORIGIN')
    .sum()
    .sort_values(by='DEP_DEL15', ascending=False)
    .reset_index()
)
count_later_origin.head()

Merging in Dataframe 

In [None]:
# Attach the per-origin total to every flight; the merge suffixes the clashing
# DEP_DEL15 columns with _x (flight level) and _y (origin total).
data2 = data1.merge(count_later_origin, on='ORIGIN').rename(
    columns={'DEP_DEL15_y': 'count_later_origin', 'DEP_DEL15_x': 'DEP_DEL15'}
)
data2.head()


Number of delays ARR_DEL15 per DEST.

In [None]:
# Total of ARR_DEL15 for each DEST, largest total first.
count_later_dest = (
    data[['DEST', 'ARR_DEL15']]
    .groupby('DEST')
    .sum()
    .sort_values(by='ARR_DEL15', ascending=False)
    .reset_index()
)
count_later_dest.head()

In [None]:

# Attach the per-destination total to every flight; the merge suffixes the
# clashing ARR_DEL15 columns with _x (flight level) and _y (destination total).
data3 = data2.merge(count_later_dest, on='DEST').rename(
    columns={'ARR_DEL15_y': 'count_later_dest', 'ARR_DEL15_x': 'ARR_DEL15'}
)
data3.head()


In [None]:
# Data preparation: start from the fully merged frame, drop the raw time columns
# and the flight number, and index the rows by year.
base_final = (
    data3
    .drop(columns=['DEP_TIME', 'ARR_TIME', 'OP_CARRIER_FL_NUM'])
    .set_index('year')
)

# Remove the intermediate objects that are no longer needed.
del count_time_blk, count_later_origin, count_later_dest
del data1, data2, data3

In [None]:
# Check data types
# NOTE(review): this inspects `data`, not the prepared `base_final` frame that the
# following cells encode — confirm which frame was intended.
data.dtypes

Separate target, numeric and categorical variables 'ORIGIN', 'DEST'

In [None]:
# Turn every text (object) column into a pandas categorical.
text_columns = list(base_final.select_dtypes(include=['object']).columns)
for name in text_columns:
    base_final[name] = base_final[name].astype('category')

# Bucket DISTANCE into four quantile-based intervals.
base_final['DISTANCE_cat'] = pd.qcut(base_final['DISTANCE'], q=4)

# base_final_1 is a second name for the same frame (no copy is made), so the
# encoding below also changes base_final.
base_final_1 = base_final

# Replace each categorical column by its integer category codes.
categorical_columns = list(base_final.select_dtypes(include=['category']).columns)
for name in categorical_columns:
    base_final_1[name] = base_final[name].cat.codes

base_final_1.isna().sum()

In [None]:
# Missing ARR_DEL15 values are imputed with zero (per the original note, the delay
# indicator DEP_DEL15 only takes the values 0 or 1).
base_final_2 = base_final_1.fillna({'ARR_DEL15': 0.0})

# Drop column to decrease model training time
# base_final_2 = base_final_2.drop(['DEP_TIME_BLK','ORIGIN','DEST','DISTANCE'],1)

## Make Model
Make a model to predict delay

In [None]:

# Target column(s); the alternative target is ['ARR_DEL15'].
classCol = ['DEP_DEL15']

# `columns=` replaces the positional axis argument (`drop(classCol, 1)`), which
# pandas 2.x no longer accepts.
X = base_final_2.drop(columns=classCol)
Y = base_final_2[classCol]

# Hold out 20% of the rows for evaluation; random_state makes the split repeatable.
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.2, random_state=1)

clf = LinearRegression()
clf.fit(train_X, train_Y)

# LinearRegression.score returns the coefficient of determination (R^2), not a
# classification accuracy, so it is reported under its real name.
r2 = clf.score(test_X, test_Y)

# For a 0/1 target, an accuracy is obtained by thresholding the regression output
# at 0.5 and comparing the resulting labels with the true indicator.
predicted_labels = (clf.predict(test_X) >= 0.5).astype(int)
accuracy = float((predicted_labels == test_Y.to_numpy()).mean())

# clf = svm.SVC();
# clf.fit(train_X,train_Y);
# accuracy = clf.score(test_X,test_Y)

print(f"R^2: {r2}")
print(f"Accuracy (threshold 0.5): {accuracy}")