In [None]:
import os
from itertools import islice

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split

%matplotlib inline

## Create a model to predict the delays

## Through this workbook, I am going to answer the questions below:
* Create a model to predict flight delays?
* How well does weather predict plane delays?

#### Loading of data

In [None]:
# Read the yearly flight files and the lookup tables from the working directory.
df_2008=pd.read_csv('2008.csv')
df_2007=pd.read_csv('2007.csv')
airport_df=pd.read_csv('airports.csv')
carrier_df=pd.read_csv('carriers.csv')
planes_df=pd.read_csv('plane-data.csv')
# Only the 2008 frame goes into main_df; ignore_index renumbers the rows from 0.
main_df=pd.concat([df_2008],ignore_index=True)
# Assemble a datetime column from the separate year / month / day-of-month columns.
main_df['Date']=pd.to_datetime(
    main_df[['Year','Month','DayofMonth']].rename(
        columns={'Year':'year','Month':'month','DayofMonth':'day'}))

#### Problem statement - Is the flight delayed or not? If there is a delay, is it a departure delay, an arrival delay, both, or neither? If so, how large is the departure delay and how large is the arrival delay?
There are two regression problems in this question.
#### Problem Statement 1 - How much is the departure delay for this particular plane?
#### Problem Statement 2 - How much is the arrival delay for this particular plane?
The key parameters that can be considered are:
Target parameter: DepDelay or Arr Delay (prepare a Total Delay column)
#### Problem1 model below

In [None]:
main_df.columns
# Candidate columns for the departure-delay model; DepDelay (the target) is last.
key_columns=[
    'CRSDepTime','CRSArrTime','UniqueCarrier','AirTime','Diverted',
    'CarrierDelay','WeatherDelay','NASDelay','SecurityDelay','LateAircraftDelay',
    'Month','DayOfWeek','DepDelay',
]
# Reduced set: the same list without the scheduled-time and air-time columns.
key2_columns=[col for col in key_columns if col not in ('CRSDepTime','CRSArrTime','AirTime')]

In [None]:
# Independent copy of just the modelling columns, so later edits never touch main_df.
main_df_req=main_df.loc[:,key2_columns].copy()

In [None]:
# Correlation heatmap of the numeric modelling columns.
# main_df_req also carries the non-numeric UniqueCarrier column; recent pandas
# versions raise on non-numeric columns in DataFrame.corr() instead of silently
# dropping them, so restrict to numeric dtypes explicitly (works on old and new pandas).
sns.heatmap(main_df_req.select_dtypes(include='number').corr())

In [None]:
# Pandas box plot of DepDelay, one box per value of Diverted.
main_df_req.boxplot(column='DepDelay',by='Diverted')

In [None]:
# Seaborn box plot of DepDelay grouped by Diverted.
sns.boxplot(x='Diverted',y='DepDelay',data=main_df_req)

In [None]:
# Summary statistics for DepDelay, with the 1st/5th/95th/99th percentiles to expose the tails.
main_df_req['DepDelay'].describe(percentiles=[0.01,0.05,0.95,0.99])

In [None]:
print(main_df_req.columns)
# Distinct carrier codes present in the data.
lst=main_df_req['UniqueCarrier'].unique().tolist()
print(lst)
# Keep only rows that have a DepDelay value, then zero-fill whatever gaps remain.
main_df_req1=main_df_req[main_df_req['DepDelay'].notnull()].fillna(0)

In [None]:
# Distinct DayOfWeek values in the data (the DayOfWeek feature column uses a fixed vocabulary of 1-7).
main_df['DayOfWeek'].unique()

In [None]:
# --- Categorical inputs, each paired with the indicator column that wraps it ---
# Carrier code, hashed into 20 buckets.
Unq_carr=tf.feature_column.categorical_column_with_hash_bucket('UniqueCarrier',hash_bucket_size=20)
Unq_car=tf.feature_column.indicator_column(Unq_carr)
# Diverted flag.
diver=tf.feature_column.categorical_column_with_vocabulary_list('Diverted',[1,0])
diverted=tf.feature_column.indicator_column(diver)
# Calendar month, 1 through 12.
Month=tf.feature_column.categorical_column_with_vocabulary_list('Month',list(range(1,13)))
day_mnth=tf.feature_column.indicator_column(Month)
# Day of week, 1 through 7.
DayOfWeek=tf.feature_column.categorical_column_with_vocabulary_list('DayOfWeek',list(range(1,8)))
day_week=tf.feature_column.indicator_column(DayOfWeek)

# --- Numeric inputs: the per-cause delay columns ---
Car_del=tf.feature_column.numeric_column('CarrierDelay')
wt_del=tf.feature_column.numeric_column('WeatherDelay')
Nas_del=tf.feature_column.numeric_column('NASDelay')
Sec_del=tf.feature_column.numeric_column('SecurityDelay')
Late_del=tf.feature_column.numeric_column('LateAircraftDelay')
# Numeric column for DepDelay.
DepDelay=tf.feature_column.numeric_column('DepDelay')

In [None]:
# Feature columns handed to the estimator.
feat_cols=[
    Unq_car,diverted,
    Car_del,wt_del,Nas_del,Sec_del,Late_del,
    day_mnth,day_week,
]
# 80/20 split: every column but the last is a feature, the last column is the target.
x_train,x_test,y_train,y_test=train_test_split(
    main_df_req1.iloc[:,:-1],
    main_df_req1.iloc[:,-1],
    train_size=0.8,
    random_state=100,
)

Create an input function

In [None]:
# Input function over the training split: batches of 100, 11 epochs, rows kept in order.
input_func=tf.estimator.inputs.pandas_input_fn(
    x=x_train,y=y_train,
    batch_size=100,num_epochs=11,shuffle=False)
# Input function over the test split, with the same batching settings.
test_func=tf.estimator.inputs.pandas_input_fn(
    x=x_test,y=y_test,
    batch_size=100,num_epochs=11,shuffle=False)

Create DNNRegressor model

In [None]:
# DNN regressor with five ReLU hidden layers (10-30-30-30-10), optimised with
# proximal Adagrad plus L1 regularisation. Checkpoints are written to a
# 'model' folder under the current working directory.
model=tf.estimator.DNNRegressor(
    feature_columns=feat_cols,
    hidden_units=[10,30,30,30,10],
    activation_fn=tf.nn.relu,
    optimizer=tf.train.ProximalAdagradOptimizer(
        learning_rate=0.05,
        l1_regularization_strength=0.001,
    ),
    model_dir=os.getcwd()+'/model',
)

In [None]:
# Print the current working directory (the estimator's model_dir is built from os.getcwd()).
print(os.getcwd())

In [None]:
# Train for 2500 steps; input_func uses batch_size=100 and shuffle=False, so at most
# the first 250,000 training rows are consumed, in their original order.
model.train(input_fn=input_func,steps=2500)

In [None]:
# Metrics over the first 100 batches of the training input.
ev=model.evaluate(input_fn=input_func,steps=100)
print(ev)
# Metrics on the training input alone say nothing about generalisation, so also
# evaluate the first 100 batches of the held-out test input.
ev_test=model.evaluate(input_fn=test_func,steps=100)
print(ev_test)
# To further tweak the model, RMSE could be added as an evaluation metric: https://www.tensorflow.org/versions/r1.3/extend/estimators#constructing-modelfn

In [None]:
# model.predict() returns a lazy generator. Both input functions repeat their
# data for 11 epochs with shuffle=False, so each draw is capped at one pass over
# its split (and at max_rows): the predictions then line up one-to-one with the
# leading rows of y_test / y_train, and a split smaller than the cap neither
# wraps into a second epoch nor raises StopIteration.
max_rows=500000
y_pred=model.predict(input_fn=test_func)
# NOTE: despite its name, y_pred_test holds predictions on the *training* input.
y_pred_test=model.predict(input_fn=input_func)
y_pred1=list(islice(y_pred,min(max_rows,len(x_test))))
exact_pred=[row['predictions'][0] for row in y_pred1]
y_train_pred=list(islice(y_pred_test,min(max_rows,len(x_train))))
exact_train=[row['predictions'][0] for row in y_train_pred]

In [None]:
from sklearn.metrics import mean_squared_error,r2_score
# The reported "accuracy" is the R^2 score. r2_score needs labels and predictions
# of equal length, so each label series is sliced by the length of its OWN
# prediction list (the train slice previously used len(exact_pred)).
#print(mean_squared_error(y_test[0:len(exact_pred)],exact_pred))
print("Model Accuracy on test data is {} ".format(r2_score(y_test[0:len(exact_pred)],exact_pred)))
#print(mean_squared_error(y_train[0:len(exact_train)],exact_train))
print("Model Accuracy on train data is {} ".format(r2_score(y_train[0:len(exact_train)],exact_train)))

In [None]:
# Value of the first variable listed by model.get_variable_names().
model.get_variable_value(model.get_variable_names()[0])

In [None]:
# Value of the variable at index 5 of model.get_variable_names().
model.get_variable_value(model.get_variable_names()[5])

#### Prepare dummy dataset with various Weather delays to check the model delay
Q6. How well does weather predict plane delays?
Ans: As per the results below, WeatherDelay is linearly correlated with flight departure delays, and the resulting model correlation is 0.96.
Methodology: 
            * Checked the correlation coefficient in the 2008 data set; it is 0.267.
            * Generated a random dataset where all the other features are constant and only the weather delay varies, following a normal distribution.
            * Predicted departure delays for all these instances, to check the modeled relation between weather delay and DepDelay.
        Result: weather delays are strongly positively correlated with departure delays, i.e., bad weather delays flight departures.

In [None]:
# Pairwise correlations of the numeric modelling columns.
# main_df_req1 still carries the non-numeric UniqueCarrier column; recent pandas
# versions raise on non-numeric columns in DataFrame.corr(), so select numeric
# dtypes explicitly (same result as before on older pandas).
main_df_req1.select_dtypes(include='number').corr()

In [None]:
# Synthetic probe set: 20000 rows in which only WeatherDelay varies and every
# other model input is held constant. WeatherDelay is drawn as 100 * N(0, 1)
# from a locally seeded generator, so the cell yields the same rows on every
# run without touching NumPy's global random state.
# NOTE(review): a zero-mean normal produces negative WeatherDelay values - confirm that is intended.
temp_df= pd.DataFrame({'WeatherDelay':np.random.RandomState(100).randn(20000)*100})
temp_df['UniqueCarrier']='WN'
temp_df['Diverted']=0
temp_df['CarrierDelay']=0
temp_df['NASDelay']=0
temp_df['SecurityDelay']=0
temp_df['LateAircraftDelay']=0
temp_df['Month']=1
temp_df['DayOfWeek']=1
temp_df.describe()

In [None]:
# Single-epoch, unshuffled input function over the synthetic rows (no labels are passed).
pred_func=tf.estimator.inputs.pandas_input_fn(x=temp_df,batch_size=100,num_epochs=1,shuffle=False)

In [None]:
# Run the synthetic rows through the model and keep the scalar prediction from each result.
pred_temp_df=model.predict(pred_func)
temp_df_pred=[result['predictions'][0] for result in pred_temp_df]

In [None]:
# Store the model's predictions as the DepDelay column of the synthetic frame.
temp_df['DepDelay']=temp_df_pred
temp_df.info()

In [None]:
# Scatter of predicted DepDelay against WeatherDelay for the first 2000 synthetic rows.
temp_df.head(2000).plot.scatter(x='WeatherDelay',y='DepDelay',figsize=(7,4))