In [35]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression

from geopy.geocoders import GoogleV3
import geopy.distance
import googlemaps

import matplotlib.pyplot as plt

In [36]:
API= add your own API

In [37]:
# loading the required datasets

cab_df = pd.read_csv('cab_rides.csv')
weather_df = pd.read_csv('weather.csv')

In [38]:
#To download the dataframe at any point, uncomment the below

#from pathlib import Path  
#filepath = Path('out_2.csv')  
#filepath.parent.mkdir(parents=True, exist_ok=True)  
#new_df.to_csv(filepath)

In [39]:
cab_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 693071 entries, 0 to 693070
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   distance          693071 non-null  float64
 1   cab_type          693071 non-null  object 
 2   time_stamp        693071 non-null  int64  
 3   destination       693071 non-null  object 
 4   source            693071 non-null  object 
 5   price             637976 non-null  float64
 6   surge_multiplier  693071 non-null  float64
 7   id                693071 non-null  object 
 8   product_id        693071 non-null  object 
 9   name              693071 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 52.9+ MB


In [40]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6276 entries, 0 to 6275
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   temp        6276 non-null   float64
 1   location    6276 non-null   object 
 2   clouds      6276 non-null   float64
 3   pressure    6276 non-null   float64
 4   rain        894 non-null    float64
 5   time_stamp  6276 non-null   int64  
 6   humidity    6276 non-null   float64
 7   wind        6276 non-null   float64
dtypes: float64(6), int64(1), object(1)
memory usage: 392.4+ KB


In [41]:
#plt.hist(cab_df['surge_multiplier'])

In [42]:
#Dropping rows where price value is missing
cab_df.isna().sum() #Returning number of missing values in each column
cab_df = cab_df.dropna(axis=0).reset_index(drop=True)
cab_df.isna().sum()

#replacing missing rain data with 0
weather_df.isna().sum()
weather_df = weather_df.fillna(0)

In [43]:
weather_df['date_time'] = pd.to_datetime(weather_df['time_stamp'], unit='s')
weather_df['time_hour'] = pd.to_datetime(weather_df['date_time']).dt.hour
weather_df['time_day'] = pd.to_datetime(weather_df['date_time']).dt.weekday
weather_df['time_month'] = pd.to_datetime(weather_df['date_time']).dt.month
weather_df['time_date'] = pd.to_datetime(weather_df['date_time']).dt.day
#Monday is defined as 0, Sunday is defined as 6

In [44]:
weather_df['rush_hr'] = 0 # add a class column with 0 as default value

# find all rows that fulfills your conditions and set class to 1
weather_df.loc[((weather_df['time_hour'] >= 6) & 
       (weather_df['time_hour'] < 10)) |
       (weather_df['time_hour'] >= 15) & 
       (weather_df['time_hour'] < 19),
       'rush_hr'] = 1 

In [45]:
cab_df['date_time'] = pd.to_datetime(cab_df['time_stamp'], unit='ms')

In [46]:
weather_df['rush_hr'] = 0 # add a class column with 0 as default value

# find all rows that fulfills your conditions and set class to 1
weather_df.loc[((weather_df['time_hour'] >= 6) & 
       (weather_df['time_hour'] < 10)) |
       (weather_df['time_hour'] >= 15) & 
       (weather_df['time_hour'] < 19),
       'rush_hr'] = 1 

In [47]:
# to convert the date into 365 day format
weather_df['DOY'] = pd.to_datetime(weather_df['date_time']).dt.dayofyear

### To extract the output file

In [48]:
#from pathlib import Path  
#filepath = Path('out.csv')  
#filepath.parent.mkdir(parents=True, exist_ok=True)  
#weather_df.to_csv(filepath)

### To append ", Boston" for the location

In [49]:
weather_df['location'] = weather_df['location'] + ' Boston'

In [50]:
cab_df['date_time'] = pd.to_datetime(cab_df['time_stamp'], unit='ms')
cab_df['date_time'] = cab_df['date_time'].dt.floor('s')
cab_df['destination'] = cab_df['destination'] + ' Boston'
cab_df['source'] = cab_df['source']+ ' Boston'

### Get the Lat and Long columns

In [51]:
geolocator = GoogleV3(api_key=API)
location = [x for x in weather_df['location'].unique().tolist() 
            if type(x) == str]
latitude = []
longitude =  []
for i in range(0, len(location)):
    # remove things that does not seem usefull here
    try:
        address = location[i]
        loc = geolocator.geocode(address)
        latitude.append(loc.latitude)
        longitude.append(loc.longitude)
    except:
        # in the case the geolocator does not work, then add nan element to list
        # to keep the right size
        latitude.append(np.nan)
        longitude.append(np.nan)
# create a dataframe with the location, latitude and longitude
loc_df = pd.DataFrame({'location':location, 
                    'location_latitude': latitude,
                    'location_longitude':longitude})
# merge on weather_df and cab_df with loc_df to get the column 
weather_df = weather_df.merge(loc_df, on='location', how='left')


In [52]:
merged_df = cab_df.merge(weather_df, how='inner', left_on=['date_time', 'source'], right_on=['date_time', 'location'])

In [53]:
merged_df.columns

Index(['distance', 'cab_type', 'time_stamp_x', 'destination', 'source',
       'price', 'surge_multiplier', 'id', 'product_id', 'name', 'date_time',
       'temp', 'location', 'clouds', 'pressure', 'rain', 'time_stamp_y',
       'humidity', 'wind', 'time_hour', 'time_day', 'time_month', 'time_date',
       'rush_hr', 'DOY', 'location_latitude', 'location_longitude'],
      dtype='object')

In [54]:
#Making surge_multiplier as a discrete variable.
merged_df.loc[merged_df.surge_multiplier == 1, "surge_mult_d"] = 1
merged_df.loc[merged_df.surge_multiplier == 1.25, "surge_mult_d"] = 2
merged_df.loc[merged_df.surge_multiplier == 1.5, "surge_mult_d"] = 3
merged_df.loc[merged_df.surge_multiplier == 1.75, "surge_mult_d"] = 4
merged_df.loc[merged_df.surge_multiplier == 2, "surge_mult_d"] = 5

In [55]:
# temp, pressure, clouds, rain, humidity, wind, rush hr, DOY, lat, long
# ground truth = surge_multiplier

In [56]:
merged_df

Unnamed: 0,distance,cab_type,time_stamp_x,destination,source,price,surge_multiplier,id,product_id,name,...,wind,time_hour,time_day,time_month,time_date,rush_hr,DOY,location_latitude,location_longitude,surge_mult_d
0,0.55,Uber,1543426327706,South Station Boston,Theatre District Boston,7.0,1.0,74b72f4c-28bc-4640-a427-ec60b4b11f82,55c66225-fbe7-4fd5-9072-eab1ece5e23e,UberX,...,9.93,17,2,11,28,1,332,42.351866,-71.064262,1.0
1,1.57,Uber,1543426327658,North End Boston,Theatre District Boston,18.0,1.0,e5aa2477-afb5-4b83-a69d-18ee7d2bfd1d,6c84fd89-3f11-4782-9b50-97c468b19529,Black,...,9.93,17,2,11,28,1,332,42.351866,-71.064262,1.0
2,2.12,Lyft,1543426327658,Northeastern University Boston,Theatre District Boston,10.5,1.0,4d24470f-167a-4d1f-9cab-c123a0713491,lyft,Lyft,...,9.93,17,2,11,28,1,332,42.351866,-71.064262,1.0
3,2.12,Lyft,1543426327658,Northeastern University Boston,Theatre District Boston,7.0,1.0,7902bfaf-5987-499b-9c04-222f4182c581,lyft_line,Shared,...,9.93,17,2,11,28,1,332,42.351866,-71.064262,1.0
4,2.12,Lyft,1543426327658,Northeastern University Boston,Theatre District Boston,16.5,1.0,91129083-ed19-41fd-b16a-06c729ff0790,lyft_plus,Lyft XL,...,9.93,17,2,11,28,1,332,42.351866,-71.064262,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3543,0.64,Lyft,1543213743704,West End Boston,Haymarket Square Boston,27.5,1.0,06fb0785-132d-4d59-a147-00655bc1b8f9,lyft_luxsuv,Lux Black XL,...,1.43,6,0,11,26,1,330,42.360082,-71.058880,1.0
3544,0.64,Lyft,1543213743704,West End Boston,Haymarket Square Boston,16.5,1.0,2be8dbc1-2a40-43e4-9e69-691420768750,lyft_lux,Lux Black,...,1.43,6,0,11,26,1,330,42.360082,-71.058880,1.0
3545,0.49,Uber,1543399584174,North Station Boston,Haymarket Square Boston,7.0,1.0,9a79355e-26e1-4f81-936e-9fa3aa06f509,997acbb5-e102-41e1-b155-9df7de0a73f2,UberPool,...,5.68,10,2,11,28,0,332,42.360082,-71.058880,1.0
3546,0.49,Uber,1543399584174,North Station Boston,Haymarket Square Boston,7.0,1.0,f92fafbf-d78b-410b-a957-f15af2022518,55c66225-fbe7-4fd5-9072-eab1ece5e23e,UberX,...,5.68,10,2,11,28,0,332,42.360082,-71.058880,1.0


In [57]:
#defininf x and y variables
x = merged_df[['temp','clouds','pressure','rain','humidity','wind',
            'rush_hr', 'DOY', 'location_latitude','location_longitude']]
y = merged_df[['surge_mult_d']]

In [58]:
merged_df.surge_multiplier.unique()

array([1.  , 1.75, 2.  , 1.25, 1.5 ])

In [59]:
#plt.hist(merged_df['surge_mult_d'])

In [60]:
#plt.hist(merged_df['surge_multiplier'])

In [61]:
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    stratify=y, 
                                                    test_size=0.25)

In [62]:
x_test

Unnamed: 0,temp,clouds,pressure,rain,humidity,wind,rush_hr,DOY,location_latitude,location_longitude
1556,40.44,0.92,1013.75,0.000,0.93,2.96,0,330,42.342865,-71.100288
682,41.11,0.98,1014.35,0.000,0.92,1.81,1,330,42.366442,-71.061974
2665,40.49,0.85,994.06,0.000,0.65,9.59,0,332,42.364702,-71.054234
1645,37.94,0.76,996.81,0.000,0.71,11.19,0,333,42.351922,-71.055070
1206,40.76,1.00,993.28,0.000,0.66,9.83,0,332,42.366442,-71.061974
...,...,...,...,...,...,...,...,...,...,...
3265,38.65,0.50,997.93,0.002,0.70,10.51,0,333,42.351922,-71.055070
3318,37.82,0.45,998.91,0.000,0.70,9.41,0,333,42.360082,-71.058880
1082,38.54,0.67,997.22,0.000,0.71,10.07,0,333,42.366442,-71.061974
2056,40.72,1.00,1014.75,0.000,0.94,1.36,1,330,42.342865,-71.100288


In [63]:
y_test

Unnamed: 0,surge_mult_d
1556,1.0
682,1.0
2665,1.0
1645,1.0
1206,1.0
...,...
3265,1.0
3318,1.0
1082,1.0
2056,2.0


In [64]:
y_train

Unnamed: 0,surge_mult_d
1651,1.0
2671,1.0
2459,1.0
2335,2.0
1675,1.0
...,...
684,1.0
3362,1.0
2534,1.0
2079,1.0


In [70]:
y_test = y_test['surge_mult_d'].values


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [71]:
y_train = y_train['surge_mult_d'].values

In [72]:
y_train

array([1., 1., 1., ..., 1., 1., 1.])

In [73]:
from sklearn.svm import SVC
svc = SVC(kernel='linear')
#svc.fit(x_train, y_train.values.ravel())
svc.fit(x_train, y_train)

SVC(kernel='linear')

In [75]:
from sklearn.tree import DecisionTreeClassifier
dec_tree = DecisionTreeClassifier()
#dec_tree.fit(x_train, y_train.values.ravel())
dec_tree.fit(x_train, y_train)

DecisionTreeClassifier()

In [76]:
svc.score(x_test, y_test)


0.9594137542277339

In [77]:
dec_tree.score(x_test, y_test)

0.9526493799323562

In [None]:
#F1 Score = 2*(Recall * Precision) / (Recall + Precision)
#F1 score reaches its best value at 1 and worst score at 0.

In [79]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier
# Instantiate model with 1000 decision trees
rf = RandomForestClassifier(n_estimators = 10, class_weight ='balanced')
# Train the model on training data
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

In [80]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.9267192784667418


In [85]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         1.0       0.98      0.94      0.96       851
         2.0       0.25      0.50      0.33        16
         3.0       0.38      0.45      0.42        11
         4.0       0.18      0.50      0.27         4
         5.0       0.31      0.80      0.44         5

    accuracy                           0.93       887
   macro avg       0.42      0.64      0.48       887
weighted avg       0.95      0.93      0.94       887



In [86]:
# Use the forest's predict method on the test data
predictions = rf.predict(x_train)
# Calculate the absolute errors
errors = abs(predictions - y_train)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 0.13 degrees.


In [87]:
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_test, 
                            y_pred)

In [88]:
conf_mat

array([[803,  24,   8,   9,   7],
       [  6,   8,   0,   0,   2],
       [  6,   0,   5,   0,   0],
       [  2,   0,   0,   2,   0],
       [  1,   0,   0,   0,   4]])