In [44]:
import pandas as pd
import numpy as np
import datetime
import sklearn 
import matplotlib.pyplot as plt

def common_member(a, b):
    """Return the elements that appear in both ``a`` and ``b``.

    Parameters
    ----------
    a, b : iterable of hashable
        The two collections to compare (e.g. lists or pandas column indexes).

    Returns
    -------
    set
        The intersection of ``a`` and ``b``. When nothing is shared, a message
        is printed and an empty set is returned, so callers can always wrap
        the result in ``list(...)`` / ``sorted(...)`` without a ``None`` check.
    """
    shared = set(a) & set(b)
    if not shared:
        print("No common elements")
    return shared

# Read the training and testing sets from CSV files in the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Missing values are filled in two passes. The three date/time columns get a
# placeholder shaped like a timestamp, so the later "-", " " and ":" splits
# still yield the expected number of pieces; everything else gets the string '0'.
datetime_columns = ['runDate', 'scheduledArrival', 'scheduledDeparture']
datetime_placeholder = '0-0-0 0:0:0'
train[datetime_columns] = train[datetime_columns].fillna(datetime_placeholder)
test[datetime_columns] = test[datetime_columns].fillna(datetime_placeholder)

train = train.fillna('0')
test = test.fillna('0')

def expand_datetime_columns(frame):
    """Split the three date/time text columns into separate component columns.

    Adds ``day`` (day of week of ``runDate``, Monday=0), the three pieces of
    ``runDate`` (``runday``, ``runmonth``, ``runyear``), and for each of
    ``scheduledArrival`` (prefix ``A``) and ``scheduledDeparture`` (prefix
    ``D``) the columns ``<p>day``, ``<p>month``, ``<p>year``, ``<p>hour``,
    ``<p>minute``, ``<p>second``. The source columns and the intermediate
    date/time halves are dropped from the result.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain string columns ``runDate``, ``scheduledArrival`` and
        ``scheduledDeparture``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    frame = frame.copy()

    # NOTE(review): the split below names the pieces day-month-year, but
    # pd.to_datetime parses ambiguous dates month-first by default - confirm
    # the runDate format and whether dayfirst=True is needed here.
    # NOTE(review): a runDate holding the '0-0-0 0:0:0' missing-value
    # placeholder is presumably not parseable here - TODO confirm there are
    # no missing runDate values.
    frame["day"] = pd.to_datetime(frame['runDate']).dt.dayofweek
    frame[["runday", "runmonth", "runyear"]] = frame['runDate'].str.split("-", expand = True)

    # Arrival and departure timestamps share one layout: "<date> <time>".
    for prefix, source in (("A", 'scheduledArrival'), ("D", 'scheduledDeparture')):
        date_col = prefix + "date"
        time_col = prefix + "time"
        frame[[date_col, time_col]] = frame[source].str.split(" ", expand = True)
        frame[[prefix + "day", prefix + "month", prefix + "year"]] = frame[date_col].str.split("-", expand = True)
        frame[[prefix + "hour", prefix + "minute", prefix + "second"]] = frame[time_col].str.split(":", expand = True)

    # Drop the raw text columns and the intermediate date/time halves.
    return frame.drop(['scheduledArrival','scheduledDeparture','Adate','Atime','Ddate','Dtime','runDate'], axis = 1)


# Apply the identical expansion to both data sets so their columns stay aligned.
train = expand_datetime_columns(train)
test = expand_datetime_columns(test)

# Keep only the columns present in both frames, in sorted order, so that the
# train and test feature matrices share the same column layout.
train_cols = train.columns
test_cols = test.columns
cols = sorted(common_member(train_cols, test_cols))

# Grab both targets before train is narrowed to the shared columns; they are
# used later as the labels of the arrival and departure models.
y_arrv = train['ArrivalDelay']
y_dept = train['DepartureDelay']

train = train[cols]
test = test[cols]

print('Train: ',train.columns,'\n Test : ', test.columns)

# Build the NumPy feature matrices and the target for the first model.
# The arrival delay (y_arrv) is appended as the last column and then split
# off again. The original referenced an undefined name `y` here, which
# raises NameError on a fresh kernel.
train = np.array(pd.concat([train, y_arrv], axis=1))
X_train = train[:, :-1]          # every column except the appended target
y_train = train[:, -1].copy()    # the appended arrival-delay column
X_test = np.array(test)

# Label encoding the "stations" column which contains string variables that are not supported by Machine Learning Models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Look the "stations" column up by name instead of hard-coding its position,
# so the right column is still encoded if the set of shared columns changes.
station_col = cols.index('stations')

le = LabelEncoder()
station_id = X_train[:, station_col]
le.fit(station_id)
X_train[:, station_col] = le.transform(station_id)
# NOTE(review): the encoder is fitted on the training stations only, so
# transform raises ValueError if the test set has a station never seen in training.
X_test[:, station_col] = le.transform(X_test[:, station_col])

# Convert every remaining (string) value to int so the models receive numeric input.
X_train = X_train.astype('int')
X_test = X_test.astype('int')

Train:  Index(['Aday', 'Ahour', 'Aminute', 'Amonth', 'Asecond', 'Ayear', 'Dday',
       'Dhour', 'Dminute', 'Dmonth', 'Dsecond', 'Dyear', 'day', 'dayCount',
       'distance', 'runday', 'runmonth', 'runyear', 'stations', 'trainCode',
       'trainStationId'],
      dtype='object') 
 Test :  Index(['Aday', 'Ahour', 'Aminute', 'Amonth', 'Asecond', 'Ayear', 'Dday',
       'Dhour', 'Dminute', 'Dmonth', 'Dsecond', 'Dyear', 'day', 'dayCount',
       'distance', 'runday', 'runmonth', 'runyear', 'stations', 'trainCode',
       'trainStationId'],
      dtype='object')


In [45]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# AdaBoost ensemble of deep regression trees for the arrival-delay target.
# The tree is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# later removed; the base learner is the first positional parameter either way.
# random_state fixes the boosting's random resampling so reruns are reproducible.
model_ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=30),
                              n_estimators=80,
                              random_state=42)

# Train on the training data with the arrival delay as the output variable.
model_ada.fit(X_train, y_train)

pred_train_ada = model_ada.predict(X_train)

# These scores are computed on the same rows the model was fitted on, so they
# measure fit to the training data, not performance on unseen data.
print("RMSE : ",np.sqrt(mean_squared_error(y_train,pred_train_ada)))
print("R2 Score : ",r2_score(y_train, pred_train_ada))

# Predict the test-set arrival delay that goes into the solution file.
pred_test_ada = model_ada.predict(X_test)

RMSE :  0.7279741430443643
R2 Score :  0.9998667727205047


In [46]:
#We can save the model using pickle library
import pickle
filename = 'model_ada_arival.sav'
# `with` closes the file handle even if pickling fails; the original left the
# handles returned by open() unclosed.
with open(filename, 'wb') as model_file:
    pickle.dump(model_ada, model_file)

# Load the model back from disk.
# NOTE: pickle.load can execute arbitrary code - only load files you created.
with open(filename, 'rb') as model_file:
    model_ada_arrival = pickle.load(model_file)

In [47]:
# Append the predicted arrival delay as an extra feature column to both
# matrices, so the departure delay can be predicted from it.
# reshape(-1, 1) turns the 1-D predictions into a single column of whatever
# length they have. The original used np.resize with hard-coded row counts,
# which silently repeats or truncates values when the data size differs.
# NOTE: re-running this cell appends yet another column to X_train / X_test.
X_train = np.hstack((X_train, pred_train_ada.reshape(-1, 1)))
X_test = np.hstack((X_test, pred_test_ada.reshape(-1, 1)))

# The second model is trained on the departure delay.
y_train = y_dept

from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Model initialization: AdaBoost over deep regression trees for the
# departure-delay target. The tree is passed positionally because its keyword
# was renamed from `base_estimator` to `estimator` in scikit-learn 1.2; the
# base learner is the first positional parameter either way. random_state
# makes the boosting's random resampling reproducible across reruns.
model_ada_dept = AdaBoostRegressor(DecisionTreeRegressor(max_depth=30),
                                   n_estimators=80,
                                   random_state=42)
# Model training
model_ada_dept.fit(X_train, y_train)

# Scores below are computed on the training rows themselves, so they measure
# fit to the training data, not performance on unseen data.
pred_train_ada = model_ada_dept.predict(X_train)
print("RMSE : ",np.sqrt(mean_squared_error(y_train,pred_train_ada)))
print("R2 Score : ",r2_score(y_train, pred_train_ada))

# Testing data prediction
pred_test_ada = model_ada_dept.predict(X_test)

RMSE :  0.1668572191476974
R2 Score :  0.9999926733177857


In [48]:
# Save model
import pickle
filename = 'model_ada_departure.sav'
# Persist the departure model. The original dumped `model_ada` (the arrival
# model) into this file, so the departure model was never actually saved.
with open(filename, 'wb') as model_file:
    pickle.dump(model_ada_dept, model_file)

# Load the model back from disk under its own name, so the previously loaded
# arrival model is not overwritten.
# NOTE: pickle.load can execute arbitrary code - only load files you created.
with open(filename, 'rb') as model_file:
    model_ada_departure = pickle.load(model_file)