In [52]:
import pandas as pd
import plotly.express as px
from numpy.random import randint
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import math
import geopy.distance
from sklearn.linear_model import LinearRegression
from datetime import datetime, timedelta, timezone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Importing the dataset

In [42]:
%%time
uber = pd.read_csv("uber.csv")

CPU times: user 695 ms, sys: 134 ms, total: 830 ms
Wall time: 863 ms


# Cleaning the data:
Some latitude and longitude coordinates are not in this world so they are removed

After removing the impossible data points, we can see that there are still some latitude and longitude coordinates that are far from the others, some in Antartica and some in the ocean, while the vast majority of the data points are near New York.

In order to remove these points we will remove points which have > 2 standard deviations from the mean in both longitude and latitude.

In [43]:
# fig = figpx.scatter_geo(uber, lon="pickup_longitude", lat="pickup_latitude")
# fig.show()

In [44]:
class CalculateDistanceTraveled(BaseEstimator, TransformerMixin):
    def calc_distance(self, x):
        coords_1 = (x.pickup_latitude, x.pickup_longitude)
        coords_2 = (x.dropoff_latitude, x.dropoff_longitude)
        try:
            miles = geopy.distance.distance(coords_1, coords_2).miles
        except:
            return -1
        return miles
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X["distance_traveled"] = X.apply(lambda x : self.calc_distance(x), axis=1)
        return X

In [45]:
class RemoveOutliers(BaseEstimator, TransformerMixin):
    def __init__(self, column_name):
        self.column_name = column_name
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        mean = X[self.column_name].mean()
        std = X[self.column_name].std()
        X = X[(X[self.column_name] < mean + 3*std) & (X[self.column_name] > mean - 3*std)]
        return X

In [46]:
class CalculateDistanceTraveled(BaseEstimator, TransformerMixin):
    def calc_distance(self, x):
        coords_1 = (x.pickup_latitude, x.pickup_longitude)
        coords_2 = (x.dropoff_latitude, x.dropoff_longitude)
        try:
            miles = geopy.distance.distance(coords_1, coords_2).miles
        except:
            return -1
        return miles
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X["distance_traveled"] = X.apply(lambda x : self.calc_distance(x), axis=1)
        return X

In [47]:
class TransformDateTime(BaseEstimator, TransformerMixin):
    def __init__(self, column_name):
        self.column_name = column_name
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X[self.column_name] = pd.to_datetime(X[self.column_name])
        return X

In [48]:
class ExtractWeekday(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X["weekday"] = X["pickup_datetime"].dt.weekday
        return X

In [49]:
pipeline = Pipeline([
    ("RemoveOutliers-pickup_longitude", RemoveOutliers("pickup_longitude")),
    ("RemoveOutliers-pickup_latitude", RemoveOutliers("pickup_latitude")),
    ("RemoveOutliers-dropoff_latitude", RemoveOutliers("dropoff_latitude")),
    ("RemoveOutliers-dropoff_longitude", RemoveOutliers("dropoff_longitude")),
    ("CalculateDistanceTraveled", CalculateDistanceTraveled()),
    ("CustomTransformer_CalculatingDistanceTraveled", CalculateDistanceTraveled()),
    ("RemovingOutliers-distance_traveled", RemoveOutliers("distance_traveled")),
    ("TransformDateTime-pickup_datetime", TransformDateTime("pickup_datetime")),
    ("ExtractWeekday", ExtractWeekday()),
    ("OneHotEncoder", OneHotEncoder(sparse=False)),
])

In [50]:
%%time
uber = pipeline.fit_transform(uber)

CPU times: user 2min 38s, sys: 946 ms, total: 2min 39s
Wall time: 2min 40s


# Modelling

In [57]:
target = uber.fare_amount
params = uber.drop(["fare_amount"], axis=1)

In [51]:
model = RandomForestRegressor(n_estimators=10, max_depth=10, random_state=42)
forest_scores = cross_val_score(model, params, target, scoring="neg_mean_squared_error", cv=10)

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance_traveled,weekday
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,1.044594,3
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.994710,40.750325,1,1.525071,4
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.740770,-73.962565,40.772647,1,3.131464,0
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,1.032372,4
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,2.786061,3
...,...,...,...,...,...,...,...,...,...,...,...
199995,42598914,2012-10-28 10:49:00.00000053,3.0,2012-10-28 10:49:00+00:00,-73.987042,40.739367,-73.986525,40.740297,1,0.069673,6
199996,16382965,2014-03-14 01:09:00.0000008,7.5,2014-03-14 01:09:00+00:00,-73.984722,40.736837,-74.006672,40.739620,1,1.167951,4
199997,27804658,2009-06-29 00:42:00.00000078,30.9,2009-06-29 00:42:00+00:00,-73.986017,40.756487,-73.858957,40.692588,2,7.995752,0
199998,20259894,2015-05-20 14:56:25.0000004,14.5,2015-05-20 14:56:25+00:00,-73.997124,40.725452,-73.983215,40.695415,1,2.197512,2
