In [48]:
import pandas as pd
import plotly.express as px
from numpy.random import randint
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import math
import geopy.distance
from sklearn.linear_model import LinearRegression
from datetime import datetime, timedelta, timezone
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Importing the dataset

In [49]:
%%time
uber = pd.read_csv("uber.csv")

CPU times: user 637 ms, sys: 103 ms, total: 740 ms
Wall time: 741 ms


# Cleaning the data:

In [50]:
class RemoveOutliers(BaseEstimator, TransformerMixin):
    def __init__(self, column_name):
        self.column_name = column_name
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        mean = X[self.column_name].mean()
        std = X[self.column_name].std()
        X = X[(X[self.column_name] < mean + 3*std) & (X[self.column_name] > mean - 3*std)]
        return X

In [51]:
data_cleaner = Pipeline([
    ("RemoveOutliers-pickup_longitude", RemoveOutliers("pickup_longitude")),
    ("RemoveOutliers-pickup_latitude", RemoveOutliers("pickup_latitude")),
    ("RemoveOutliers-dropoff_latitude", RemoveOutliers("dropoff_latitude")),
    ("RemoveOutliers-dropoff_longitude", RemoveOutliers("dropoff_longitude")),
])

In [52]:
uber = data_cleaner.fit_transform(uber)

# Splitting the data:

In [53]:
X_train, X_test, y_train, y_test = train_test_split(uber.drop(['fare_amount'], axis=1), uber['fare_amount'], test_size=0.2, random_state=42)

# Preparing the data:

In [54]:
class CalculateDistanceTraveled(BaseEstimator, TransformerMixin):
    def calc_distance(self, x):
        coords_1 = (x.pickup_latitude, x.pickup_longitude)
        coords_2 = (x.dropoff_latitude, x.dropoff_longitude)
        try:
            miles = geopy.distance.great_circle(coords_1, coords_2).miles
        except:
            return -1
        return miles
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X["distance_traveled"] = X.apply(lambda x : self.calc_distance(x), axis=1)
        return X

In [55]:
class TransformDateTime(BaseEstimator, TransformerMixin):
    def __init__(self, column_name):
        self.column_name = column_name
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X[self.column_name] = pd.to_datetime(X[self.column_name])
        return X

In [56]:
class ExtractWeekday(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X["weekday"] = X["pickup_datetime"].dt.weekday
        return X

In [57]:
class ExtractHour(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X["pickup_datetime"] = X["pickup_datetime"].dt.hour
        return X

In [58]:
class OneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, column_name):
        self.column_name = column_name
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X = pd.concat([X, pd.get_dummies(X[self.column_name])], axis=1)
        X.drop(self.column_name, axis=1, inplace=True)
        return X

In [59]:
class ColumnDropper(BaseEstimator, TransformerMixin):
    def __init__(self, column_name):
        self.column_name = column_name
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X.drop(self.column_name, axis=1, inplace=True)
        return X

In [60]:
data_preparation = Pipeline([
    ("CalculateDistanceTraveled", CalculateDistanceTraveled()),
    ("CustomTransformer_CalculatingDistanceTraveled", CalculateDistanceTraveled()),
    ("TransformDateTime-pickup_datetime", TransformDateTime("pickup_datetime")),
    ("ExtractWeekday", ExtractWeekday()),
    ("ExtractHour", ExtractHour()),
    ("OneHotEncode-Weekday", OneHotEncoder("weekday")),
    ("ColumnDropper-key", ColumnDropper("key")),
    ("ColumnDropper-unnamed", ColumnDropper("Unnamed: 0")),
    ("Scaler", StandardScaler())
])

# Full Pipeline:

In [61]:
full_pipeline = Pipeline([
    ("data_preparation", data_preparation),
    ("model", DecisionTreeRegressor(max_depth=7, random_state=42))
])

In [62]:
Dt_model = full_pipeline.fit(X_train, y_train)

In [63]:
Dt_model.score(X_test, y_test)

0.7939651597646665

# Data Preperation for other models

In [64]:
%%time
X_train = data_preparation.fit_transform(X_train)
X_test = data_preparation.fit_transform(X_test)

CPU times: user 31.2 s, sys: 400 ms, total: 31.6 s
Wall time: 31.7 s


# Model Exploration:

### Decision Tree

In [65]:
%%time
DT_model = DecisionTreeRegressor(max_depth=7, random_state=42)
DT_scores = cross_val_score(DT_model, X_train, y_train, scoring="r2", cv=5)
DT_scores.mean()

CPU times: user 3.91 s, sys: 51.5 ms, total: 3.96 s
Wall time: 3.97 s


0.7706759299559621

### Linear Regression

In [66]:
LR_model = LinearRegression()
LR_scores = cross_val_score(LR_model, X_train, y_train, scoring="r2", cv=5)
LR_scores.mean()

0.08338431372487234

### Random Forest

In [67]:
%%time
RF_model = RandomForestRegressor(n_estimators=20, max_depth=8, random_state=42)
forest_scores = cross_val_score(RF_model, X_train, y_train, scoring="r2", cv=5)
forest_scores.mean()

CPU times: user 54.4 s, sys: 541 ms, total: 55 s
Wall time: 54.3 s


0.7931103666607402

### Gradient Boosting

In [68]:
# %%time
# XGB_model = XGBRegressor(n_estimators=3000, max_depth=4, random_state=42, tree_method="gpu_hist", learning_rate=0.01)
# XGB_scores = cross_val_score(XGB_model, params, target, scoring="r2", cv=10)
# XGB_scores.mean()

In [69]:
# param_grid = {
#     "n_estimators": [10, 100, 1000, 3000],
#     "max_depth": [2, 4, 6, 8, 10],
#     "learning_rate": [0.01],
#     "tree_method": ["gpu_hist"],
# }
# XGB_model = XGBRegressor(random_state=42)
# XGB_grid = GridSearchCV(XGB_model, param_grid, scoring="r2", cv=5, verbose=3)
# XGB_grid.fit(params, target)

In [70]:
# XGB_grid.best_score_

In [71]:
# XGB_grid.best_params_

# Ensamble of Methods:

In [72]:
class Ensemble_RandomForest_DecisionTree():
    def __init__(self):
        self.rfRegressor = RandomForestRegressor(n_estimators=20, max_depth=8, random_state=42)
        self.dtRegressor = DecisionTreeRegressor(max_depth=7, random_state=42)
    def fit(self, X, y):
        self.rfRegressor.fit(X, y)
        self.dtRegressor.fit(X, y)
    def predict(self, X):
        return .5*self.rfRegressor.predict(X) + .5*self.dtRegressor.predict(X)

In [73]:
%%time
Ensemble_RandomForest_DecisionTree_model = Ensemble_RandomForest_DecisionTree()
Ensemble_RandomForest_DecisionTree_model.fit(X_train, y_train)

CPU times: user 10 s, sys: 64.3 ms, total: 10.1 s
Wall time: 10.1 s


## Model Scoring

In [74]:
predictions = Ensemble_RandomForest_DecisionTree_model.predict(X_test)

In [75]:
r2_score(y_test, predictions)

0.6992660192306956

In [76]:
mean_squared_error(y_test, predictions)

28.8995931542531