In [1]:
import mlflow

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import r2_score
from math import *
from datetime import datetime

In [2]:
# Display the version of the installed MLflow package.
mlflow.__version__

'2.2.2'

In [3]:
# Tracking configuration: runs are recorded in a SQLite database file
# (mlflow.db) and grouped under a single named experiment.
TRACKING_URI = 'sqlite:///mlflow.db'
EXPERIMENT_NAME = 'demo-experiment'

mlflow.set_tracking_uri(TRACKING_URI)
# Last expression of the cell: the selected Experiment object is displayed.
mlflow.set_experiment(EXPERIMENT_NAME)

2023/05/05 15:44:59 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2023/05/05 15:44:59 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

<Experiment: artifact_location='/Users/preethampathi/Desktop/case_studies/actual/mlruns/1', creation_time=1683326699382, experiment_id='1', last_update_time=1683326699382, lifecycle_stage='active', name='demo-experiment', tags={}>

In [4]:
# Load the raw trip records.
uber = pd.read_csv('../data/uber.csv')

# Parse the pickup timestamp once and reuse it for every derived feature
# (re-parsing the string column for each component is needlessly slow).
# NOTE(review): calendar parts are taken in whatever timezone the parsed
# timestamps carry -- confirm this matches local pickup time.
pickup_ts = pd.DatetimeIndex(uber['pickup_datetime'])

# The year is kept as a plain integer feature.
uber['pickup_year'] = pickup_ts.year

# Cyclical encoding for month, day, hour, minute and second: each periodic
# component is mapped to a (sin, cos) pair so that the end of a cycle sits
# next to its start (e.g. hour 23 is close to hour 0).
# The day-of-month period is fixed at 31 regardless of the month's length.
cyclic_parts = [
    ('month', pickup_ts.month, 12),
    ('day', pickup_ts.day, 31),
    ('hour', pickup_ts.hour, 24),
    ('minute', pickup_ts.minute, 60),
    ('second', pickup_ts.second, 60),
]
for part, values, period in cyclic_parts:
    # np.asarray gives a plain array, so the assignment below is positional.
    angle = np.asarray(values) * (2. * np.pi / period)
    uber[f'pickup_{part}_sin'] = np.sin(angle)
    uber[f'pickup_{part}_cos'] = np.cos(angle)

# Drop the columns that are not used as features: the CSV's unnamed leading
# column, the raw timestamp string (now encoded above) and 'key'.
uber.drop(columns=['Unnamed: 0', 'pickup_datetime', 'key'], inplace=True)
uber.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month_sin,pickup_month_cos,pickup_day_sin,pickup_day_cos,pickup_hour_sin,pickup_hour_cos,pickup_minute_sin,pickup_minute_cos,pickup_second_sin,pickup_second_cos
0,7.5,-73.999817,40.738354,-73.999512,40.723217,1,2015,0.5,-0.866025,0.988468,0.151428,-0.965926,0.258819,-0.743145,0.6691306,0.587785,0.809017
1,7.7,-73.994355,40.728225,-73.99471,40.750325,1,2009,-0.5,-0.866025,-0.299363,-0.954139,-0.866025,0.5,0.406737,0.9135455,-0.406737,0.913545
2,12.9,-74.005043,40.74077,-73.962565,40.772647,1,2009,-0.8660254,-0.5,-0.988468,0.151428,-0.707107,0.707107,-1.0,-1.83697e-16,0.0,1.0
3,5.3,-73.976124,40.790844,-73.965316,40.803349,3,2009,1.224647e-16,-1.0,-0.848644,0.528964,0.866025,-0.5,0.743145,-0.6691306,0.809017,-0.587785
4,16.0,-73.925023,40.744085,-73.973082,40.761247,5,2014,-0.8660254,-0.5,-0.571268,0.820763,-0.965926,-0.258819,-0.978148,0.2079117,0.0,1.0


In [5]:
# Summary statistics of the four coordinate columns.
coord_cols = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
uber[coord_cols].describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
count,200000.0,200000.0,199999.0,199999.0
mean,-72.527638,39.935885,-72.525292,39.92389
std,11.437787,7.720539,13.117408,6.794829
min,-1340.64841,-74.015515,-3356.6663,-881.985513
25%,-73.992065,40.734796,-73.991407,40.733823
50%,-73.981823,40.752592,-73.980093,40.753042
75%,-73.967154,40.767158,-73.963658,40.768001
max,57.418457,1644.421482,1153.572603,872.697628


In [6]:
# Row filters, combined into one mask and dropped in place.
# Comparisons against NaN are False, so rows with missing values survive the
# mask and are removed by dropna() afterwards.

# 1) Any coordinate outside [-90, 90] is treated as invalid.
# NOTE(review): the same +/-90 bound is applied to longitudes, although valid
# longitudes span +/-180 -- confirm this is intended as an outlier filter.
coord_cols = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
coords = uber[coord_cols]
bad_coords = (coords > 90).any(axis=1) | (coords < -90).any(axis=1)

# 2) Passenger count must be between 1 and 5.
passengers = uber['passenger_count']
bad_passengers = (passengers > 5) | (passengers == 0)

# 3) Fares below 2.5 are discarded.
low_fare = uber['fare_amount'] < 2.5

uber.drop(index=uber.index[bad_coords | bad_passengers | low_fare], inplace=True)

# Finally remove rows with any missing value.
uber.dropna(inplace=True)

uber.shape

(194981, 17)

In [7]:
# Summary statistics of the four coordinate columns
# (same view as the earlier describe() cell, for comparison).
coord_cols = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']
uber[coord_cols].describe()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
count,194981.0,194981.0,194981.0,194981.0
mean,-72.503114,39.920694,-72.515315,39.926626
std,10.434251,6.087745,10.388957,6.069382
min,-89.933333,-74.015515,-75.458979,-74.01575
25%,-73.992058,40.734779,-73.9914,40.733862
50%,-73.981828,40.752568,-73.98009,40.753045
75%,-73.967167,40.767145,-73.963662,40.768003
max,40.808425,48.01876,40.831932,45.031598


In [8]:
# Pair latitude and longitude into (lat, lon) tuples for each trip endpoint.
for endpoint in ('pickup', 'dropoff'):
    latitudes = uber[f'{endpoint}_latitude']
    longitudes = uber[f'{endpoint}_longitude']
    uber[f'{endpoint}_coords'] = list(zip(latitudes, longitudes))

uber.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month_sin,pickup_month_cos,pickup_day_sin,pickup_day_cos,pickup_hour_sin,pickup_hour_cos,pickup_minute_sin,pickup_minute_cos,pickup_second_sin,pickup_second_cos,pickup_coords,dropoff_coords
0,7.5,-73.999817,40.738354,-73.999512,40.723217,1,2015,0.5,-0.866025,0.988468,0.151428,-0.965926,0.258819,-0.743145,0.6691306,0.587785,0.809017,"(40.73835372924805, -73.99981689453125)","(40.72321701049805, -73.99951171875)"
1,7.7,-73.994355,40.728225,-73.99471,40.750325,1,2009,-0.5,-0.866025,-0.299363,-0.954139,-0.866025,0.5,0.406737,0.9135455,-0.406737,0.913545,"(40.728225, -73.994355)","(40.750325, -73.99471)"
2,12.9,-74.005043,40.74077,-73.962565,40.772647,1,2009,-0.8660254,-0.5,-0.988468,0.151428,-0.707107,0.707107,-1.0,-1.83697e-16,0.0,1.0,"(40.74077, -74.005043)","(40.772647, -73.962565)"
3,5.3,-73.976124,40.790844,-73.965316,40.803349,3,2009,1.224647e-16,-1.0,-0.848644,0.528964,0.866025,-0.5,0.743145,-0.6691306,0.809017,-0.587785,"(40.790844, -73.976124)","(40.803349, -73.965316)"
4,16.0,-73.925023,40.744085,-73.973082,40.761247,5,2014,-0.8660254,-0.5,-0.571268,0.820763,-0.965926,-0.258819,-0.978148,0.2079117,0.0,1.0,"(40.744085, -73.925023)","(40.761247, -73.97308199999999)"


In [9]:
import geopy.distance


def distance(row):
    """Return the geodesic distance, in miles, between a trip's endpoints.

    `row` must provide 'pickup_coords' and 'dropoff_coords'; both values are
    handed to geopy's geodesic() unchanged.
    """
    start, end = row['pickup_coords'], row['dropoff_coords']
    return geopy.distance.geodesic(start, end).miles


# Row-wise computation: one geodesic call per trip.
uber['distance'] = uber.apply(distance, axis=1)

# Discard trips longer than 130 miles and trips with zero distance.
trip_miles = uber['distance']
uber.drop(index=uber.index[(trip_miles > 130) | (trip_miles == 0)], inplace=True)

# The coordinate tuples were only needed for the distance computation.
uber.drop(columns=['pickup_coords', 'dropoff_coords'], inplace=True)

uber.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_year,pickup_month_sin,pickup_month_cos,pickup_day_sin,pickup_day_cos,pickup_hour_sin,pickup_hour_cos,pickup_minute_sin,pickup_minute_cos,pickup_second_sin,pickup_second_cos,distance
0,7.5,-73.999817,40.738354,-73.999512,40.723217,1,2015,0.5,-0.866025,0.988468,0.151428,-0.965926,0.258819,-0.743145,0.6691306,0.587785,0.809017,1.044594
1,7.7,-73.994355,40.728225,-73.99471,40.750325,1,2009,-0.5,-0.866025,-0.299363,-0.954139,-0.866025,0.5,0.406737,0.9135455,-0.406737,0.913545,1.525071
2,12.9,-74.005043,40.74077,-73.962565,40.772647,1,2009,-0.8660254,-0.5,-0.988468,0.151428,-0.707107,0.707107,-1.0,-1.83697e-16,0.0,1.0,3.131464
3,5.3,-73.976124,40.790844,-73.965316,40.803349,3,2009,1.224647e-16,-1.0,-0.848644,0.528964,0.866025,-0.5,0.743145,-0.6691306,0.809017,-0.587785,1.032372
4,16.0,-73.925023,40.744085,-73.973082,40.761247,5,2014,-0.8660254,-0.5,-0.571268,0.820763,-0.965926,-0.258819,-0.978148,0.2079117,0.0,1.0,2.786061


In [11]:
# Target is the fare; features are every remaining column except the four
# raw coordinate columns.
target_col = 'fare_amount'
coord_cols = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']

X = uber.drop(columns=[target_col, *coord_cols])
y = uber[target_col]

# 80/20 split with a fixed seed so the partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=101
)

In [12]:
# Grid search over RandomForestRegressor hyper-parameters, one MLflow run per
# combination.
ntrees = [20,40,60,80,100]            # n_estimators
mtrys = [3,4,5]                       # max_features
max_depth = [2,3,4,5,6,7,8,9,10]      # max_depth

# NOTE(review): r2 is computed on the test split, so choosing the best run
# from this grid tunes on test data -- consider a validation split.
for n_trees in ntrees:
    for n_features in mtrys:
        for depth in max_depth:
            # The context manager ends the run on exit (also on error), so no
            # explicit mlflow.end_run() is needed after the loops.
            with mlflow.start_run():
                # Tag the estimator family: several model types log to the
                # same experiment with identical parameter names.
                mlflow.set_tag("model_type", "RandomForestRegressor")
                mlflow.log_params({"ntrees": n_trees, "mtrys": n_features, "max_depth": depth})

                # Seeded so each grid point gives the same score when re-run.
                model = RandomForestRegressor(
                    n_estimators=n_trees,
                    max_features=n_features,
                    max_depth=depth,
                    random_state=101,
                )
                model.fit(X_train, y_train)
                predictions = model.predict(X_test)
                score = r2_score(y_test, predictions)

                mlflow.log_metric("r2_score", score)
                mlflow.sklearn.log_model(model, "model")
                # The tracking database (mlflow.db) is intentionally not
                # logged as an artifact: that copied the whole, growing
                # backend store into every single run.



In [15]:
# Grid search over GradientBoostingRegressor hyper-parameters, one MLflow run
# per combination.
ntrees = [20,40,60,80,100]            # n_estimators
max_depth = [2,3,4,5,6,7,8,9,10]      # max_depth

# NOTE(review): r2 is computed on the test split, so choosing the best run
# from this grid tunes on test data -- consider a validation split.
for n_trees in ntrees:
    for depth in max_depth:
        # The context manager ends the run on exit (also on error), so no
        # explicit mlflow.end_run() is needed after the loops.
        with mlflow.start_run():
            # Tag the estimator family: several model types log to the same
            # experiment with identical parameter names.
            mlflow.set_tag("model_type", "GradientBoostingRegressor")
            mlflow.log_params({"ntrees": n_trees, "max_depth": depth})

            # Seeded so each grid point gives the same score when re-run.
            model = GradientBoostingRegressor(
                n_estimators=n_trees,
                max_depth=depth,
                random_state=101,
            )
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            score = r2_score(y_test, predictions)

            mlflow.log_metric("r2_score", score)
            mlflow.sklearn.log_model(model, "model")
            # The tracking database (mlflow.db) is intentionally not logged
            # as an artifact: that copied the whole, growing backend store
            # into every single run.

In [19]:
# Baseline: random forest with default hyper-parameters.
# RandomForestRegressor is already imported in the notebook's first cell.
# A fixed seed makes the reported score reproducible across re-runs.
rf = RandomForestRegressor(random_state=101)
rf.fit(X_train, y_train)
# score() reports R^2 on the held-out test split.
rf.score(X_test, y_test)

0.7538306844295084

In [20]:
# Baseline: gradient boosting with default hyper-parameters.
# GradientBoostingRegressor and AdaBoostRegressor are already imported in the
# notebook's first cell.
# A fixed seed makes the reported score reproducible across re-runs.
gb = GradientBoostingRegressor(random_state=101)
gb.fit(X_train, y_train)
# score() reports R^2 on the held-out test split.
gb.score(X_test, y_test)

0.7620080614316104

In [21]:
# Baseline: AdaBoost with default hyper-parameters.
# A fixed seed makes the reported score reproducible across re-runs.
ada = AdaBoostRegressor(random_state=101)
ada.fit(X_train, y_train)
# score() reports R^2 on the held-out test split.
ada.score(X_test, y_test)

0.4801602492619239

In [16]:
from xgboost import XGBRegressor

# Baseline: XGBoost regressor with default settings. fit() returns the
# estimator itself, so construction and training are chained.
xgb = XGBRegressor().fit(X_train, y_train)
# score() reports R^2 on the held-out test split.
xgb.score(X_test, y_test)

0.7544926552698323

In [17]:
# Grid search over XGBRegressor hyper-parameters, one MLflow run per
# combination.
ntrees = [20,40,60,80,100]            # n_estimators
max_depth = [2,3,4,5,6,7,8,9,10]      # max_depth

# NOTE(review): r2 is computed on the test split, so choosing the best run
# from this grid tunes on test data -- consider a validation split.
for n_trees in ntrees:
    for depth in max_depth:
        # The context manager ends the run on exit (also on error), so no
        # explicit mlflow.end_run() is needed after the loops.
        with mlflow.start_run():
            # Tag the estimator family: several model types log to the same
            # experiment with identical parameter names.
            mlflow.set_tag("model_type", "XGBRegressor")
            mlflow.log_params({"ntrees": n_trees, "max_depth": depth})

            model = XGBRegressor(n_estimators=n_trees, max_depth=depth)
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            score = r2_score(y_test, predictions)

            mlflow.log_metric("r2_score", score)
            # Stored with the sklearn flavor, as before, so existing loading
            # code keeps working.
            mlflow.sklearn.log_model(model, "model")
            # The tracking database (mlflow.db) is intentionally not logged
            # as an artifact: that copied the whole, growing backend store
            # into every single run.