# Uber Fares Dataset

## Import Libraries and Read in Data

In [66]:
import sklearn as sk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor
from dateutil import parser

data = pd.read_csv('data/uber.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


## Custom Transformer

The custom transformer below converts the *pickup_datetime* into just the time of day so we can use this to predict fare amounts.

In [67]:
class ConvertToHours(BaseEstimator, TransformerMixin):
    def __init__(self, add_hours = True):
        self.add_hours = add_hours
    def fit(self, X, y = None):
        return self
    def transform(self, X):
        dates = X[:, 3]
        hours = []
        for date in dates:
            hours.append(parser.parse(date).hour)
        return np.c_[X, hours]
    
attr_adder = ConvertToHours(add_hours = True)
data_extra_attribs = attr_adder.transform(data.values)
data['hour'] = data_extra_attribs[:, 9]
data.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1,19
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1,20
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1,21
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3,8
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5,17


## Standardize the Data

To standardize the data we will use the sklearn StandardScaler object. First we check which columns have values that we wish to standardize as we can see below the two columns that would benefit from standardizatoin are *fare_amount* and *passenger_count*. 

In [68]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         200000 non-null  int64  
 1   key                200000 non-null  object 
 2   fare_amount        200000 non-null  float64
 3   pickup_datetime    200000 non-null  object 
 4   pickup_longitude   200000 non-null  float64
 5   pickup_latitude    200000 non-null  float64
 6   dropoff_longitude  199999 non-null  float64
 7   dropoff_latitude   199999 non-null  float64
 8   passenger_count    200000 non-null  int64  
 9   hour               200000 non-null  object 
dtypes: float64(5), int64(2), object(3)
memory usage: 15.3+ MB


In [69]:
standard = StandardScaler()
data_num = data.drop(['Unnamed: 0', 'key', 'pickup_datetime', 'pickup_longitude', 
                      'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'], 
                     axis = 1, inplace = False)

standard.fit(data_num)
X = standard.transform(data_num)
data_std = pd.DataFrame(X, columns = data_num.columns, index = data_num.index)
data['hour'] = data_std['hour']
data['fare_amount'] = data_std['fare_amount']
data['passenger_count'] = data_std['passenger_count']
data.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour
0,24238194,2015-05-07 19:52:06.0000003,-0.389826,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,-0.493895,0.845469
1,27835199,2009-07-17 20:04:56.0000002,-0.369627,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,-0.493895,0.998949
2,44984355,2009-08-24 21:45:00.00000061,0.155533,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,-0.493895,1.152428
3,25894730,2009-06-26 08:22:21.0000001,-0.612008,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,0.949114,-0.842809
4,17610152,2014-08-28 17:47:00.000000188,0.468608,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,2.392122,0.538509


## Split into Testing and Training

We will now split the data into testing and training datasets so we can fit some models.

In [70]:
features = data[['hour', 'passenger_count']] 
labels = data['fare_amount'] 

# 70% training / 30% testing
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.3) 

In [71]:
lab_enc = LabelEncoder()
encoded = lab_enc.fit_transform(y_train)

## Linear Regression and Mean Squared Error

In [72]:
lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)
lin_predictions = lin_reg.predict(x_test)
lin_reg_mse = mean_squared_error(y_true = y_test, y_pred = lin_predictions)
lin_reg_mse

1.0279742393149958

## Decision Tree Regressor and Mean Absolute Percentage Error

In [73]:
tree = DecisionTreeRegressor()
tree.fit(x_train, y_train)
tree_predictions = tree.predict(x_test)
tree_mape = mean_absolute_percentage_error(y_true = y_test, y_pred = tree_predictions)
tree_mape

1.1335728012350015

## k-fold Cross Validation

In [74]:
def display_scores(scores):
    print("Scores: ", scores)
    print("Mean: ", scores.mean())
    print("Standard deviation: ", scores.std())

lin_scores = cross_val_score(lin_reg, x_train, y_train, scoring = "neg_mean_squared_error", cv = 10)
display_scores(lin_scores)

Scores:  [-1.02209937 -0.93615849 -0.97180656 -1.0923422  -0.99805719 -0.95472025
 -1.05910992 -0.97525276 -0.95203195 -0.91130721]
Mean:  -0.9872885895147772
Standard deviation:  0.0534771507002667


## Stratified K Fold

In [75]:
skf = StratifiedKFold()
for train_index, test_index in skf.split(x_train, encoded):
    print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [   221    538    902 ... 139997 139998 139999] TEST: [     0      1      2 ... 135644 137300 138577]
TRAIN: [     0      1      2 ... 139997 139998 139999] TEST: [   538    917   1248 ... 136236 137364 138862]
TRAIN: [     0      1      2 ... 139997 139998 139999] TEST: [   221   2117   3134 ... 138301 138660 139354]
TRAIN: [     0      1      2 ... 139997 139998 139999] TEST: [   902   1452   3017 ... 138615 139049 139711]
TRAIN: [     0      1      2 ... 139049 139354 139711] TEST: [  3693   3842   5689 ... 139997 139998 139999]




## GridSearchCV

In [76]:
parameters = {"splitter":["best","random"],
            "max_depth" : [1,3,5],
            "min_samples_leaf":[1,2,3],
            "min_weight_fraction_leaf":[0.1,0.2,0.3],
            "max_features":["auto",None],
            "max_leaf_nodes":[None,10] }

tuning_model = GridSearchCV(tree, param_grid = parameters, scoring = 'neg_mean_squared_error', cv = 3)

tuning_model.fit(x_train, y_train)
print(tuning_model.best_params_)
print(tuning_model.best_estimator_)

{'max_depth': 5, 'max_features': 'auto', 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.1, 'splitter': 'best'}
DecisionTreeRegressor(max_depth=5, max_features='auto',
                      min_weight_fraction_leaf=0.1)


## Ensamble of Methods

In [77]:
ensamble_avg = (tree_predictions + lin_predictions) / 2.0
mean_squared_error(y_test, ensamble_avg)

1.0241083336383765

## Pipeline

In [80]:
num_pipeline = Pipeline(steps = [
    ('std_scaler', StandardScaler())
])

preprocess = ColumnTransformer(transformers = [
    ('trans', ConvertToHours(), []),
    ('num', num_pipeline, ['passenger_count', 'hour'])
])

final_pipeline = Pipeline(steps = [
    ('preprocess', preprocess),
    ('regression', DecisionTreeRegressor())
])

x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.3)

final_pipeline.fit(x_train, y_train) 
final_pipeline.predict(x_test)

array([-0.02762508,  0.01827076,  0.03466956, ...,  0.11037051,
       -0.05737464, -0.05289617])