In [None]:
import matplotlib.pyplot as plt
import mlflow
import pandas as pd
import pickle
import seaborn as sns
import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.svm import LinearSVR


from sklearn.metrics import mean_squared_error

# Notebook-wide configuration.
# DATA_PATH: directory prefix prepended to every parquet file name read below.
DATA_PATH = "../../data/"
# DATASET: file-name prefix of the trip files (f"{DATASET}_tripdata_<year>-<month>.parquet").
DATASET = "green"
# TARGET: name of the column used as the regression label.
TARGET = "duration"

In [2]:
# Record runs in a SQLite file (mlflow.db, relative to the working directory)
# and make "nyc-taxi-experiment" the active experiment for every run started below.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment")

2023/06/04 10:53:44 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2023/06/04 10:53:44 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

<Experiment: artifact_location='/home/hilbert/zoomcamps/prefect/3_3/mlruns/1', creation_time=1685868824464, experiment_id='1', last_update_time=1685868824464, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [11]:
def read_dataframe(filename):
    """Load a trip parquet file and prepare it for duration modelling.

    Steps:
      1. parse the pickup / drop-off timestamps,
      2. derive ``duration`` (trip length in minutes),
      3. keep only trips lasting between 1 and 60 minutes (inclusive),
      4. cast the pickup / drop-off location ids to strings.

    Args:
        filename: path of the parquet file, passed to ``pd.read_parquet``.

    Returns:
        A new DataFrame (independent copy) with the filtered rows, the added
        ``duration`` column and string-typed ``PULocationID`` / ``DOLocationID``.
    """
    df = pd.read_parquet(filename)

    df.lpep_dropoff_datetime = pd.to_datetime(df.lpep_dropoff_datetime)
    df.lpep_pickup_datetime = pd.to_datetime(df.lpep_pickup_datetime)

    # Vectorised timedelta -> minutes conversion (replaces a per-row Python lambda).
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the original frame, so the column
    # assignment below (and any later ones by callers) writes to a real frame
    # rather than a slice and no longer raises SettingWithCopyWarning.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

# Train on January 2022 trips, validate on February 2022 trips.
train_file = f"{DATA_PATH}{DATASET}_tripdata_2022-01.parquet"
valid_file = f"{DATA_PATH}{DATASET}_tripdata_2022-02.parquet"

dftr = read_dataframe(train_file)
dfvl = read_dataframe(valid_file)
print(dftr.shape, dfvl.shape)

(59603, 21) (66097, 21)


In [15]:
# Combine pickup and drop-off zone into a single categorical feature per trip.
for frame in (dftr, dfvl):
    frame['PU_DO'] = frame['PULocationID'] + '_' + frame['DOLocationID']

categorical = ['PU_DO']
numerical = ['trip_distance']
features = categorical + numerical

train_dicts = dftr[features].to_dict(orient='records')
val_dicts = dfvl[features].to_dict(orient='records')

# The vectorizer is fitted on the training records only; validation records
# are transformed with the already-fitted vocabulary.
dv = DictVectorizer()
xtr = dv.fit_transform(train_dicts)
xvl = dv.transform(val_dicts)

ytr, yvl = dftr[TARGET].values, dfvl[TARGET].values
print(xtr.shape, xvl.shape, ytr.shape, yvl.shape)

(59603, 6683) (66097, 6683) (59603,) (66097,)


In [19]:
# Baseline: plain linear regression on the vectorized features.
lr = LinearRegression()
lr.fit(xtr, ytr)
ypr = lr.predict(xvl)

# RMSE computed as sqrt(MSE): the `squared=False` shortcut of
# mean_squared_error was deprecated in scikit-learn 1.4 and later removed,
# while this form works on every version.
print(mean_squared_error(yvl, ypr) ** 0.5)

# Store the fitted vectorizer together with the model so that raw feature
# dicts can be turned into predictions from this single file.
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

6.901396794757165


In [25]:
# Track one Lasso run in MLflow: data provenance, hyperparameter, metric and
# the pickled (vectorizer, model) pair produced by this run.
with mlflow.start_run():
    # NOTE(review): the tag value still reads "lin_reg" although the estimator
    # is a Lasso; confirm whether that grouping is intentional.
    mlflow.set_tag("taxi",  "lin_reg")
    mlflow.log_param("train-data-path", f"{DATA_PATH}{DATASET}_tripdata_2022-01.parquet")
    mlflow.log_param("valid-data-path", f"{DATA_PATH}{DATASET}_tripdata_2022-02.parquet")

    alpha = 0.1
    mlflow.log_param("alpha", alpha)
    lr = Lasso(alpha)
    lr.fit(xtr, ytr)

    ypr = lr.predict(xvl)
    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases
    # no longer accept.
    rmse = mean_squared_error(yvl, ypr) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # Pickle the model trained in *this* run before logging it. Previously the
    # run attached models/lin_reg.bin, i.e. the LinearRegression fitted in an
    # earlier cell, so the artifact did not match the logged alpha / rmse.
    lasso_path = "models/lasso.bin"
    with open(lasso_path, 'wb') as f_out:
        pickle.dump((dv, lr), f_out)

    mlflow.log_artifact(
        local_path=lasso_path,
        artifact_path="models_pickle"
    )

In [26]:
# Wrap the feature matrices and their labels in XGBoost's DMatrix container,
# which is what xgb.train and Booster.predict consume.
train = xgb.DMatrix(data=xtr, label=ytr)
valid = xgb.DMatrix(data=xvl, label=yvl)

def objective(params):
    """Train one XGBoost model for a hyperopt trial and report its validation RMSE.

    Every call opens its own MLflow run and logs the sampled hyperparameters
    and the resulting RMSE to it.

    Args:
        params: hyperparameter dict sampled from the search space; it is logged
            to MLflow and passed to ``xgb.train`` unchanged.

    Returns:
        dict with ``"loss"`` (validation RMSE) and ``"status"`` (``STATUS_OK``),
        the result format consumed by ``fmin``.
    """
    with mlflow.start_run():
        mlflow.set_tag("taxi", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, "valid")],
            # Stop when the validation metric has not improved for 50 rounds.
            early_stopping_rounds=50
        )
        ypr = booster.predict(valid)
        # sqrt(MSE) instead of `squared=False`: that keyword was deprecated in
        # scikit-learn 1.4 and later removed.
        rmse = mean_squared_error(yvl, ypr) ** 0.5
        mlflow.log_metric("rmse", rmse)

    return {"loss": rmse, "status": STATUS_OK}

# Hyperparameter search space for the XGBoost trials.
# hp.loguniform(label, low, high) draws exp(uniform(low, high)), so e.g.
# learning_rate lies in [exp(-3), exp(0)] ~= [0.05, 1].
search_space = {
    # quniform yields floats; scope.int casts them to the integer XGBoost expects.
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:linear' is the deprecated alias of the squared-error objective;
    # 'reg:squarederror' is its current name.
    'objective': 'reg:squarederror',
    'seed': 42
}

# Run 50 TPE-guided trials of `objective` over `search_space`.
trials = Trials()
best_result = fmin(fn=objective, space=search_space, algo=tpe.suggest, max_evals=50, trials=trials)

[0]	valid-rmse:6.64152                                
[1]	valid-rmse:6.09365                                
[2]	valid-rmse:6.04293                                
[3]	valid-rmse:6.03958                                
[4]	valid-rmse:6.03010                                
[5]	valid-rmse:6.02624                                
[6]	valid-rmse:6.01310                                
[7]	valid-rmse:6.01405                                
[8]	valid-rmse:6.01122                                
[9]	valid-rmse:6.00818                                
[10]	valid-rmse:6.00598                               
[11]	valid-rmse:6.00350                               
[12]	valid-rmse:6.00121                               
[13]	valid-rmse:5.99898                               
[14]	valid-rmse:6.00044                               
[15]	valid-rmse:5.99709                               
[16]	valid-rmse:5.99288                               
[17]	valid-rmse:5.99297                               
[18]	valid

In [None]:
# Let MLflow automatically record parameters and fitted models of sklearn estimators.
mlflow.sklearn.autolog()

# Write the fitted DictVectorizer to disk so every run below can attach it.
# Nothing else in this notebook creates this file, so without this step
# log_artifact would fail on a fresh run.
preprocessor_path = "models/preprocessor.b"
with open(preprocessor_path, "wb") as f_out:
    pickle.dump(dv, f_out)

# One MLflow run per estimator class, each trained with default hyperparameters.
for model_class in (RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, LinearSVR):
    with mlflow.start_run():
        mlflow.set_tag("taxiModel", model_class.__name__)
        mlflow.log_param("train-data-path", f"{DATA_PATH}{DATASET}_tripdata_2022-01.parquet")
        mlflow.log_param("valid-data-path", f"{DATA_PATH}{DATASET}_tripdata_2022-02.parquet")
        mlflow.log_artifact(preprocessor_path, artifact_path="preprocessor")

        # Fixed: the original instantiated the undefined name `model_class`
        # while the loop variable was called `modelClass` (NameError).
        mlmodel = model_class()
        mlmodel.fit(xtr, ytr)

        ypr = mlmodel.predict(xvl)
        # sqrt(MSE) instead of `squared=False`, which newer scikit-learn
        # releases no longer accept.
        rmse = mean_squared_error(yvl, ypr) ** 0.5
        mlflow.log_metric("rmse", rmse)