In [44]:
import pandas as pd
import plotly.express as px
from numpy.random import randint
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import math
import geopy.distance
from sklearn.linear_model import LinearRegression
from datetime import datetime, timedelta, timezone
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Importing the dataset

In [45]:
%%time
uber = pd.read_csv("uber.csv")

CPU times: user 155 ms, sys: 3.93 ms, total: 159 ms
Wall time: 158 ms


# Cleaning the data:
Some latitude and longitude coordinates are not in this world so they are removed

After removing the impossible data points, we can see that there are still some latitude and longitude coordinates that are far from the others, some in Antartica and some in the ocean, while the vast majority of the data points are near New York.

In order to remove these points we will remove points which have > 2 standard deviations from the mean in both longitude and latitude.

In [46]:
# fig = figpx.scatter_geo(uber, lon="pickup_longitude", lat="pickup_latitude")
# fig.show()

In [47]:
class CalculateDistanceTraveled(BaseEstimator, TransformerMixin):
    def calc_distance(self, x):
        coords_1 = (x.pickup_latitude, x.pickup_longitude)
        coords_2 = (x.dropoff_latitude, x.dropoff_longitude)
        try:
            miles = geopy.distance.distance(coords_1, coords_2).miles
        except:
            return -1
        return miles
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X["distance_traveled"] = X.apply(lambda x : self.calc_distance(x), axis=1)
        return X

In [48]:
class RemoveOutliers(BaseEstimator, TransformerMixin):
    def __init__(self, column_name):
        self.column_name = column_name
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        mean = X[self.column_name].mean()
        std = X[self.column_name].std()
        X = X[(X[self.column_name] < mean + 3*std) & (X[self.column_name] > mean - 3*std)]
        return X

In [49]:
class CalculateDistanceTraveled(BaseEstimator, TransformerMixin):
    def calc_distance(self, x):
        coords_1 = (x.pickup_latitude, x.pickup_longitude)
        coords_2 = (x.dropoff_latitude, x.dropoff_longitude)
        try:
            miles = geopy.distance.great_circle(coords_1, coords_2).miles
        except:
            return -1
        return miles
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X["distance_traveled"] = X.apply(lambda x : self.calc_distance(x), axis=1)
        return X

In [50]:
class TransformDateTime(BaseEstimator, TransformerMixin):
    def __init__(self, column_name):
        self.column_name = column_name
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X[self.column_name] = pd.to_datetime(X[self.column_name])
        return X

In [51]:
class ExtractWeekday(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X["weekday"] = X["pickup_datetime"].dt.weekday
        return X

In [52]:
pipeline = Pipeline([
    ("RemoveOutliers-pickup_longitude", RemoveOutliers("pickup_longitude")),
    ("RemoveOutliers-pickup_latitude", RemoveOutliers("pickup_latitude")),
    ("RemoveOutliers-dropoff_latitude", RemoveOutliers("dropoff_latitude")),
    ("RemoveOutliers-dropoff_longitude", RemoveOutliers("dropoff_longitude")),
    ("CalculateDistanceTraveled", CalculateDistanceTraveled()),
    ("CustomTransformer_CalculatingDistanceTraveled", CalculateDistanceTraveled()),
    ("RemovingOutliers-distance_traveled", RemoveOutliers("distance_traveled")),
    ("TransformDateTime-pickup_datetime", TransformDateTime("pickup_datetime")),
    ("ExtractWeekday", ExtractWeekday()),
    # ("OneHotEncode-Weekday", OneHotEncoder())
])

In [53]:
%%time
uber = pipeline.fit_transform(uber)

CPU times: user 18.5 s, sys: 20.1 ms, total: 18.5 s
Wall time: 18.5 s


In [54]:
uber = pd.concat([uber, pd.get_dummies(uber["weekday"])], axis=1)

In [55]:
uber.drop(["weekday"], axis=1, inplace=True)

In [56]:
uber.drop(["key", "Unnamed: 0"], axis=1, inplace=True)

In [57]:
uber.pickup_datetime = uber.pickup_datetime.dt.hour

In [58]:
uber.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance_traveled,0,1,2,3,4,5,6
0,7.5,19,-73.999817,40.738354,-73.999512,40.723217,1,1.04597,0,0,0,1,0,0,0
1,7.7,20,-73.994355,40.728225,-73.99471,40.750325,1,1.527078,0,0,0,0,1,0,0
2,12.9,21,-74.005043,40.74077,-73.962565,40.772647,1,3.129464,1,0,0,0,0,0,0
3,5.3,8,-73.976124,40.790844,-73.965316,40.803349,3,1.032524,0,0,0,0,1,0,0
4,16.0,17,-73.925023,40.744085,-73.973082,40.761247,5,2.78092,0,0,0,1,0,0,0


# Modelling

In [59]:
target = uber.fare_amount
params = uber.drop(["fare_amount"], axis=1)

In [61]:
params = StandardScaler().fit_transform(params)

### Random Forest

In [63]:
%%time
RF_model = RandomForestRegressor(n_estimators=20, max_depth=8, random_state=42)
forest_scores = cross_val_score(RF_model, params, target, scoring="r2", cv=5)
forest_scores.mean()

CPU times: user 36.1 s, sys: 578 µs, total: 36.1 s
Wall time: 36.1 s


0.7455270506970526

### Decision Tree

In [64]:
%%time
DT_model = DecisionTreeRegressor(max_depth=7, random_state=42)
DT_scores = cross_val_score(DT_model, params, target, scoring="r2", cv=5)
DT_scores.mean()

CPU times: user 2.61 s, sys: 72 µs, total: 2.61 s
Wall time: 2.61 s


0.7226487047639051

### Linear Regression

In [65]:
LR_model = LinearRegression()
LR_scores = cross_val_score(LR_model, params, target, scoring="r2", cv=5)
LR_scores.mean()

0.6609832198633138

### Gradient Boosting

In [66]:
%%time
XGB_model = XGBRegressor(n_estimators=3000, max_depth=4, random_state=42, tree_method="gpu_hist", learning_rate=0.01)
XGB_scores = cross_val_score(XGB_model, params, target, scoring="r2", cv=10)
XGB_scores.mean()

CPU times: user 53.6 s, sys: 90.1 ms, total: 53.7 s
Wall time: 45.5 s


0.7593611538110057

In [None]:
param_grid = {
    "n_estimators": [10, 100, 1000, 3000],
    "max_depth": [2, 4, 6, 8, 10],
    "learning_rate": [0.01],
    "tree_method": ["gpu_hist"],
}
XGB_model = XGBRegressor(random_state=42)
XGB_grid = GridSearchCV(XGB_model, param_grid, scoring="r2", cv=5, verbose=3)
XGB_grid.fit(params, target)

In [None]:
XGB_grid.best_score_

In [None]:
XGB_grid.best_params_