In [1]:
import pandas as pd
import plotly.express as px
from numpy.random import randint
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import math
import geopy.distance
from sklearn.linear_model import LinearRegression
from datetime import datetime, timedelta, timezone
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor # For gradient boosting model at the end. Comment out if not running
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

  from pandas import MultiIndex, Int64Index


# Importing the dataset

In [2]:
%%time
uber = pd.read_csv("uber.csv")

CPU times: user 154 ms, sys: 17.7 ms, total: 172 ms
Wall time: 171 ms


# Cleaning the data:

In [3]:
class RemoveOutliers(BaseEstimator, TransformerMixin):
    def __init__(self, column_name):
        self.column_name = column_name
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        mean = X[self.column_name].mean()
        std = X[self.column_name].std()
        X = X[(X[self.column_name] < mean + 3*std) & (X[self.column_name] > mean - 3*std)]
        return X

In [4]:
data_cleaner = Pipeline([
    ("RemoveOutliers-pickup_longitude", RemoveOutliers("pickup_longitude")),
    ("RemoveOutliers-pickup_latitude", RemoveOutliers("pickup_latitude")),
    ("RemoveOutliers-dropoff_latitude", RemoveOutliers("dropoff_latitude")),
    ("RemoveOutliers-dropoff_longitude", RemoveOutliers("dropoff_longitude")),
])

In [5]:
uber = data_cleaner.fit_transform(uber)

# Splitting the data:

In [6]:
X_train, X_test, y_train, y_test = train_test_split(uber.drop(['fare_amount'], axis=1), uber['fare_amount'], test_size=0.2, random_state=42)

# Preparing the data:

In [7]:
class CalculateDistanceTraveled(BaseEstimator, TransformerMixin):
    def calc_distance(self, x):
        coords_1 = (x.pickup_latitude, x.pickup_longitude)
        coords_2 = (x.dropoff_latitude, x.dropoff_longitude)
        try:
            miles = geopy.distance.great_circle(coords_1, coords_2).miles
        except:
            return -1
        return miles
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X["distance_traveled"] = X.apply(lambda x : self.calc_distance(x), axis=1)
        return X

In [8]:
class TransformDateTime(BaseEstimator, TransformerMixin):
    def __init__(self, column_name):
        self.column_name = column_name
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X[self.column_name] = pd.to_datetime(X[self.column_name])
        return X

In [9]:
class ExtractWeekday(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X["weekday"] = X["pickup_datetime"].dt.weekday
        return X

In [10]:
class ExtractHour(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X["pickup_datetime"] = X["pickup_datetime"].dt.hour
        return X

In [11]:
class OneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, column_name):
        self.column_name = column_name
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X = pd.concat([X, pd.get_dummies(X[self.column_name])], axis=1)
        X.drop(self.column_name, axis=1, inplace=True)
        return X

In [12]:
class ColumnDropper(BaseEstimator, TransformerMixin):
    def __init__(self, column_name):
        self.column_name = column_name
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X.drop(self.column_name, axis=1, inplace=True)
        return X

In [13]:
data_preparation = Pipeline([
    ("CalculateDistanceTraveled", CalculateDistanceTraveled()),
    ("CustomTransformer_CalculatingDistanceTraveled", CalculateDistanceTraveled()),
    ("TransformDateTime-pickup_datetime", TransformDateTime("pickup_datetime")),
    ("ExtractWeekday", ExtractWeekday()),
    ("ExtractHour", ExtractHour()),
    ("OneHotEncode-Weekday", OneHotEncoder("weekday")),
    ("ColumnDropper-key", ColumnDropper("key")),
    ("ColumnDropper-unnamed", ColumnDropper("Unnamed: 0")),
    ("Scaler", StandardScaler())
])

# Full Pipeline:

In [14]:
full_pipeline = Pipeline([
    ("data_preparation", data_preparation),
    ("model", DecisionTreeRegressor(max_depth=7, random_state=42))
])

In [15]:
Dt_model = full_pipeline.fit(X_train, y_train)

In [16]:
Dt_model.score(X_test, y_test)

0.7939651597646665

# Data Preperation for other models

In [17]:
%%time
X_train = data_preparation.fit_transform(X_train)
X_test = data_preparation.fit_transform(X_test)

CPU times: user 8.09 s, sys: 18.4 ms, total: 8.11 s
Wall time: 8.11 s


# Model Exploration:

### Decision Tree

In [18]:
%%time
DT_model = DecisionTreeRegressor(max_depth=7, random_state=42)
DT_scores = cross_val_score(DT_model, X_train, y_train, scoring="r2", cv=5)
DT_scores.mean()

CPU times: user 1.79 s, sys: 3.54 ms, total: 1.8 s
Wall time: 1.79 s


0.7706759299559621

### Linear Regression

In [19]:
LR_model = LinearRegression()
LR_scores = cross_val_score(LR_model, X_train, y_train, scoring="r2", cv=5)
LR_scores.mean()

0.08338431372487043

### Random Forest

In [20]:
%%time
RF_model = RandomForestRegressor(n_estimators=20, max_depth=8, random_state=42)
forest_scores = cross_val_score(RF_model, X_train, y_train, scoring="r2", cv=5)
forest_scores.mean()

CPU times: user 26 s, sys: 701 ms, total: 26.7 s
Wall time: 25.7 s


0.7931103666607402

### Gradient Boosting - Do not run this section if you do not have a Cuda-Compatable GPU

In [21]:
%%time
XGB_model = XGBRegressor(n_estimators=3000, max_depth=4, random_state=42, tree_method="gpu_hist", learning_rate=0.01)
XGB_scores = cross_val_score(XGB_model, X_train, y_train, scoring="r2", cv=10)
XGB_scores.mean()

CPU times: user 40.8 s, sys: 207 ms, total: 41 s
Wall time: 33.7 s


0.7971188838124149

In [22]:
%%time
param_grid = {
    "n_estimators": [10, 100, 1000],
    "max_depth": [2, 4, 6, 8],
    "learning_rate": [0.01],
    "tree_method": ["gpu_hist"],
}
XGB_model = XGBRegressor(random_state=42)
XGB_grid = GridSearchCV(XGB_model, param_grid, scoring="r2", cv=5, verbose=3)
XGB_grid.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV 1/5] END learning_rate=0.01, max_depth=2, n_estimators=10, tree_method=gpu_hist;, score=-0.942 total time=   0.0s
[CV 2/5] END learning_rate=0.01, max_depth=2, n_estimators=10, tree_method=gpu_hist;, score=-0.889 total time=   0.0s
[CV 3/5] END learning_rate=0.01, max_depth=2, n_estimators=10, tree_method=gpu_hist;, score=-0.876 total time=   0.0s
[CV 4/5] END learning_rate=0.01, max_depth=2, n_estimators=10, tree_method=gpu_hist;, score=-0.812 total time=   0.0s
[CV 5/5] END learning_rate=0.01, max_depth=2, n_estimators=10, tree_method=gpu_hist;, score=-0.876 total time=   0.0s
[CV 1/5] END learning_rate=0.01, max_depth=2, n_estimators=100, tree_method=gpu_hist;, score=0.465 total time=   0.1s
[CV 2/5] END learning_rate=0.01, max_depth=2, n_estimators=100, tree_method=gpu_hist;, score=0.445 total time=   0.1s
[CV 3/5] END learning_rate=0.01, max_depth=2, n_estimators=100, tree_method=gpu_hist;, score=0.453 total time=   

GridSearchCV(cv=5,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, predictor=None,
                                    random_state=42, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
         

In [23]:
XGB_grid.best_score_

0.7952459077676868

In [24]:
XGB_grid.best_params_

{'learning_rate': 0.01,
 'max_depth': 6,
 'n_estimators': 1000,
 'tree_method': 'gpu_hist'}

# Ensamble of Methods:

In [25]:
class Ensemble_RandomForest_DecisionTree():
    def __init__(self):
        self.rfRegressor = RandomForestRegressor(n_estimators=20, max_depth=8, random_state=42)
        self.dtRegressor = DecisionTreeRegressor(max_depth=7, random_state=42)
    def fit(self, X, y):
        self.rfRegressor.fit(X, y)
        self.dtRegressor.fit(X, y)
    def predict(self, X):
        return .5*self.rfRegressor.predict(X) + .5*self.dtRegressor.predict(X)

In [26]:
%%time
Ensemble_RandomForest_DecisionTree_model = Ensemble_RandomForest_DecisionTree()
Ensemble_RandomForest_DecisionTree_model.fit(X_train, y_train)

CPU times: user 6.82 s, sys: 7.89 ms, total: 6.83 s
Wall time: 6.83 s


## Model Scoring

In [27]:
predictions = Ensemble_RandomForest_DecisionTree_model.predict(X_test)

In [28]:
r2_score(y_test, predictions)

0.6992457138548683

In [29]:
mean_squared_error(y_test, predictions)

28.901544437239988