In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt

import pickle

In [2]:
# Read the two raw inputs (cab rides and weather observations) from the
# working directory into DataFrames.
cab_df, weather_df = (
    pd.read_csv(csv_path) for csv_path in ('cab_rides.csv', 'weather.csv')
)

In [4]:
# Display the column labels of the cab rides frame to see which fields are available.
cab_df.columns

Index(['distance', 'cab_type', 'time_stamp', 'destination', 'source', 'price',
       'surge_multiplier', 'id', 'product_id', 'name'],
      dtype='object')

In [5]:
# Display the column labels of the weather frame to see which fields are available.
weather_df.columns

Index(['temp', 'location', 'clouds', 'pressure', 'rain', 'time_stamp',
       'humidity', 'wind'],
      dtype='object')

In [6]:
# Discard every cab ride row that has a missing value in ANY column and
# renumber the index from 0. (The stated goal is to remove rides without a
# price; note that the call itself is not restricted to the price column.)
cab_df = (
    cab_df
    .dropna(axis='index', how='any')
    .reset_index(drop=True)
)

# Replace every missing weather value with 0. (The stated goal is to treat
# missing rain readings as "no rain"; the fill applies to all columns.)
weather_df = weather_df.fillna(value=0)

In [7]:
# Parse the weather epoch (seconds) once, then store both the full timestamp
# and its hour-of-day component.
# NOTE(review): the parsed timestamps are timezone-naive values derived from
# the epoch, so the hour is on the UTC clock — confirm that the rush-hour
# windows applied later are meant for that clock rather than local time.
weather_timestamps = pd.to_datetime(weather_df['time_stamp'], unit='s')
weather_df['date_time'] = weather_timestamps
weather_df['time_hour'] = weather_timestamps.dt.hour

In [9]:
# Build one boolean mask per rush-hour window (start inclusive, end exclusive).
hour_of_day = weather_df['time_hour']
morning_rush = (hour_of_day >= 6) & (hour_of_day < 10)
evening_rush = (hour_of_day >= 15) & (hour_of_day < 19)

# rush_hr is 0 by default and 1 for observations inside either window.
weather_df['rush_hr'] = 0
weather_df.loc[morning_rush | evening_rush, 'rush_hr'] = 1

In [10]:
# Cab timestamps are epoch milliseconds: parse them and truncate to whole
# seconds in a single step.
cab_df['date_time'] = (
    pd.to_datetime(cab_df['time_stamp'], unit='ms')
    .dt.floor('s')
)

In [11]:
# NOTE(review): this cell recomputes the same rush_hr flag as the earlier
# rush-hour cell (In [9]); the result is identical, so one of the two cells
# could be removed.

# Start from 0 everywhere, then set 1 for every observation whose hour falls
# inside a rush-hour window (start inclusive, end exclusive).
weather_df['rush_hr'] = 0
for window_start, window_end in ((6, 10), (15, 19)):
    inside_window = (
        (weather_df['time_hour'] >= window_start)
        & (weather_df['time_hour'] < window_end)
    )
    weather_df.loc[inside_window, 'rush_hr'] = 1

In [12]:
# Store the ordinal day of the year for each weather observation.
# Possible future use: predicting rush hours around holidays.
weather_dates = pd.to_datetime(weather_df['date_time'])
weather_df['date_day'] = weather_dates.dt.dayofyear

In [13]:
# Append the city name to every neighbourhood string so the values can be
# used to look up longitude and latitude (the coordinate tables are keyed by
# '<neighbourhood> Boston').
#
# The suffix is only added where it is not already present. A plain
# `col + ' Boston'` is not idempotent: running the cell twice would yield
# '... Boston Boston', which no longer matches the coordinate keys.
CITY_SUFFIX = ' Boston'

for frame, column in ((weather_df, 'location'),
                      (cab_df, 'destination'),
                      (cab_df, 'source')):
    # na=False keeps missing values in the "needs suffix" set, where
    # NaN + suffix stays NaN, exactly as with the plain concatenation.
    needs_suffix = ~frame[column].str.endswith(CITY_SUFFIX, na=False)
    frame.loc[needs_suffix, column] = frame.loc[needs_suffix, column] + CITY_SUFFIX

In [14]:
# Attach to each ride the weather row that has the same timestamp and whose
# location equals the ride's source. The join is inner, so rides without an
# exactly matching weather row are dropped.
cab_join_keys = ['date_time', 'source']
weather_join_keys = ['date_time', 'location']

merged_df = cab_df.merge(
    weather_df,
    left_on=cab_join_keys,
    right_on=weather_join_keys,
    how='inner',
)

In [15]:
# Turn surge_multiplier into a discrete class label:
#   1 -> 1, 1.25 -> 2, 1.5 -> 3, 1.75 -> 4, 2 -> 5
# Rows whose multiplier is not one of the listed values are not assigned a
# label here (they remain missing in surge_mult).
for surge_class, multiplier in enumerate((1, 1.25, 1.5, 1.75, 2), start=1):
    has_multiplier = merged_df['surge_multiplier'] == multiplier
    merged_df.loc[has_multiplier, "surge_mult"] = surge_class

In [51]:
# Lookup from the discrete surge class (1..5) back to the surge multiplier
# value it stands for.
predictive_surge_mapping = dict(zip(range(1, 6), (1, 1.25, 1.5, 1.75, 2)))

In [16]:
# Coordinates for each location string, as (name, latitude, longitude).
# Per the original note, these values were obtained using the Google API.
_LOCATION_COORDINATES = (
    ('North End Boston', 42.3647024, -71.0542339),
    ('Beacon Hill Boston', 42.3587999, -71.0707389),
    ('North Station Boston', 42.3664424, -71.061974),
    ('Boston University Boston', 42.3504997, -71.1053991),
    ('South Station Boston', 42.3519217, -71.0550703),
    ('Fenway Boston', 42.3428653, -71.1002881),
    ('Theatre District Boston', 42.3518662, -71.0642623),
    ('West End Boston', 42.3643579, -71.0661193),
    ('Back Bay Boston', 42.3502648, -71.0809757),
    ('Northeastern University Boston', 42.3398067, -71.0891717),
    ('Haymarket Square Boston', 42.3600825, -71.0588801),
    ('Financial District Boston', 42.3559219, -71.0549768),
)

# Separate name -> latitude and name -> longitude lookups, in table order.
source_lat = {name: latitude for name, latitude, _ in _LOCATION_COORDINATES}
source_long = {name: longitude for name, _, longitude in _LOCATION_COORDINATES}

# Translate each row's location string into numeric latitude / longitude
# columns using the two lookup dictionaries.
for coordinate_column, coordinate_lookup in (
        ('location_latitude', source_lat),
        ('location_longitude', source_long)):
    merged_df[coordinate_column] = merged_df['location'].map(coordinate_lookup)

In [20]:
# Feature matrix: weather readings, the rush-hour flag and the coordinates
# of the location.
feature_columns = [
    'temp', 'clouds', 'pressure', 'rain', 'humidity', 'wind',
    'rush_hr', 'location_latitude', 'location_longitude',
]
x = merged_df[feature_columns]

# Target: the discrete surge class, kept as a one-column DataFrame.
target_columns = ['surge_mult']
y = merged_df[target_columns]

In [21]:
# Stratified 75/25 train/test split: stratifying on y keeps the proportion
# of each surge class the same in both parts.
# random_state is fixed so that the same rows land in train and test on
# every run; without it the split (and every metric computed from it)
# changes each time the cell is executed.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [22]:
# Flatten the one-column target frames into 1-D arrays, the shape the
# classifier expects for y.
# np.ravel accepts both the original one-column DataFrame and an already
# flattened array, so re-running this cell is harmless (indexing with
# ['surge_mult'] fails once y_test / y_train have become arrays).
y_test = np.ravel(y_test)
y_train = np.ravel(y_train)

In [31]:
# Per the original note: SVM and decision tree models were tried previously,
# and Random Forest gave the best results.
#
# class_weight='balanced' re-weights the classes inversely to their
# frequency. random_state is fixed so the fitted forest, and therefore its
# predictions and score, are the same on every run.
rf = RandomForestClassifier(
    n_estimators=10,
    class_weight='balanced',
    random_state=42,
)

# Fit on the training split, then predict the surge class of the test split.
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

In [47]:
# Persist the fitted model to disk with pickle.
# The context manager guarantees the file is flushed and closed even if
# pickling raises; a bare open() inside the call leaves the handle open.
filename = 'surge_classification_rf_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file)

In [49]:
# Reload the pickled model and report its mean accuracy on the test split,
# as a check that the saved file round-trips correctly.
# SECURITY: pickle.load can execute arbitrary code; only load model files
# that were produced by this notebook or another trusted source.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

result = loaded_model.score(x_test, y_test)
print(result)

0.9143179255918827
