# Uber Fares Dataset

## Import Libraries and Read in Data

In [101]:
import sklearn as sk
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from dateutil import parser

# Load the Uber fares CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('data/uber.csv')
# Preview the first rows to confirm the columns loaded as expected.
data.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


## Custom Transformer

The custom transformer below extracts the hour of day from *pickup_datetime* and appends it as a new column, so we can use the time of day to predict fare amounts.

In [102]:
class ConvertToHours(BaseEstimator, TransformerMixin):
    """Append the hour of day parsed from a datetime-string column.

    Parameters
    ----------
    add_hours : bool, default=True
        When True, ``transform`` appends one trailing column holding the hour
        parsed from the datetime column. When False, ``transform`` returns its
        input unchanged.
    datetime_ix : int, default=3
        Positional index of the column that holds the datetime strings.
    """
    def __init__(self, add_hours = True, datetime_ix = 3):
        # Only store the constructor arguments, so that BaseEstimator's
        # get_params / set_params keep working.
        self.add_hours = add_hours
        self.datetime_ix = datetime_ix
    def fit(self, X, y = None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self
    def transform(self, X):
        """Return ``X`` with an extra last column holding the parsed hour.

        ``X`` must be a 2-D array that supports ``X[:, i]`` indexing (for
        example ``DataFrame.values``). The hour is read from each timestamp as
        written; no timezone conversion is applied.
        """
        if not self.add_hours:
            # Honour the flag: the caller asked for no extra attribute.
            return X
        hours = [parser.parse(date).hour for date in X[:, self.datetime_ix]]
        return np.c_[X, hours]
    
# Build the transformer and run it over the raw values of the DataFrame.
attr_adder = ConvertToHours(add_hours = True)
data_extra_attribs = attr_adder.transform(data.values)
# The transformer appends the hour as the last column, so read it by position
# from the end instead of hard-coding the current column count.
data['hour'] = data_extra_attribs[:, -1]
data.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1,19
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1,20
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1,21
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3,8
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5,17


## Standardize the Data

To standardize the data we will use the sklearn `StandardScaler` object. First we check which columns hold values worth standardizing. As we can see below, the columns that would benefit from standardization are *fare_amount* and *passenger_count*; the derived *hour* column is scaled along with them.

In [103]:
# Summarize column dtypes and non-null counts to decide which columns to scale.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         200000 non-null  int64  
 1   key                200000 non-null  object 
 2   fare_amount        200000 non-null  float64
 3   pickup_datetime    200000 non-null  object 
 4   pickup_longitude   200000 non-null  float64
 5   pickup_latitude    200000 non-null  float64
 6   dropoff_longitude  199999 non-null  float64
 7   dropoff_latitude   199999 non-null  float64
 8   passenger_count    200000 non-null  int64  
 9   hour               200000 non-null  object 
dtypes: float64(5), int64(2), object(3)
memory usage: 15.3+ MB


In [104]:
# NOTE(review): the scaler is fit on the full dataset, including the
# fare_amount label, before the train/test split further down, so test-set
# statistics leak into the scaling. Consider fitting on the training split only.
standard = StandardScaler()

# Keep only the columns to be scaled by dropping the identifiers, the raw
# timestamp and the coordinates.
data_num = data.drop(columns = ['Unnamed: 0', 'key', 'pickup_datetime', 'pickup_longitude',
                                'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'])

# Learn each column's mean and standard deviation and scale in a single step.
X = standard.fit_transform(data_num)
data_std = pd.DataFrame(X, columns = data_num.columns, index = data_num.index)

# Write the scaled values back over the original columns.
for scaled_col in ['hour', 'fare_amount', 'passenger_count']:
    data[scaled_col] = data_std[scaled_col]
data.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour
0,24238194,2015-05-07 19:52:06.0000003,-0.389826,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,-0.493895,0.845469
1,27835199,2009-07-17 20:04:56.0000002,-0.369627,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,-0.493895,0.998949
2,44984355,2009-08-24 21:45:00.00000061,0.155533,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,-0.493895,1.152428
3,25894730,2009-06-26 08:22:21.0000001,-0.612008,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,0.949114,-0.842809
4,17610152,2014-08-28 17:47:00.000000188,0.468608,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,2.392122,0.538509


## Split into Testing and Training

We will now split the data into testing and training datasets so we can fit some models.

In [105]:
features = data[['hour', 'passenger_count']] 
labels = data['fare_amount'] 

# 70% training / 30% testing
# A fixed random_state makes the split, and every metric computed from it,
# reproducible across Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=42) 

## Linear Regression

In [106]:
# Fit a linear regression on the training split and score it on the held-out split.
lin_reg = LinearRegression()
lin_reg.fit(x_train, y_train)
predictions = lin_reg.predict(x_test)
# fare_amount was standardized earlier in the notebook, so this MSE is in
# standardized units rather than currency.
lin_reg_mse = mean_squared_error(y_true = y_test, y_pred = predictions)
# Backward-compatible alias: the metric used to be called sgdc_mse even though
# the model is a LinearRegression.
sgdc_mse = lin_reg_mse
lin_reg_mse

1.0033495898183182