In [1]:
!python -V

Python 3.9.13


In [2]:
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
import pickle

import mlflow
# Store runs in a local SQLite database, then select the experiment that
# every run started in this notebook is recorded under.
mlflow.set_tracking_uri(uri='sqlite:///mlflow.db')
mlflow.set_experiment(experiment_name='nyc-taxi-duration-experiment')

import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope



In [3]:
def read_and_clean(df):
    """Add trip duration, drop out-of-range trips and build a pickup/drop-off feature.

    Despite the name, nothing is read from disk here; the caller passes in an
    already-loaded frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Trip records containing the columns ``lpep_pickup_datetime``,
        ``lpep_dropoff_datetime``, ``trip_distance``, ``total_amount``,
        ``passenger_count``, ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left unmodified) holding only the rows with
        duration in [1, 60] minutes, 1 < trip_distance < 25,
        1 < total_amount < 150 and passenger_count != 0, plus two added
        columns: ``duration`` (minutes) and ``PU_DO_pair``
        (``"<PULocationID>_<DOLocationID>"``).
    """
    # Vectorised timedelta -> minutes; avoids a Python-level call per row.
    duration = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.total_seconds() / 60

    keep = (
        (duration >= 1) & (duration <= 60)
        & (df.trip_distance > 1) & (df.trip_distance < 25)
        & (df.total_amount > 1) & (df.total_amount < 150)
        & (df['passenger_count'] != 0)
    )

    # Explicit copy: the new columns are written to an independent frame, so the
    # caller's data is untouched and pandas raises no SettingWithCopyWarning.
    cleaned = df.loc[keep].copy()
    cleaned['duration'] = duration[keep]
    cleaned['PU_DO_pair'] = (
        cleaned['PULocationID'].astype(str) + '_' + cleaned['DOLocationID'].astype(str)
    )
    return cleaned

In [4]:
# January 2021 trips are used for training, February 2021 for validation.
df_train, df_val = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        './data/green_tripdata_2021-01.parquet',
        './data/green_tripdata_2021-02.parquet',
    )
)

In [5]:
# Run the same cleaning step over both splits (train first, then validation).
df_train, df_val = (
    read_and_clean(df_train),
    read_and_clean(df_val),
)

In [6]:
# Rows remaining in each split after cleaning.
(df_train.shape[0], df_val.shape[0])

(61515, 51622)

In [7]:
# The combined pickup/drop-off pair is used as the single categorical feature
# in place of separate 'PULocationID' / 'DOLocationID' columns.
categorical = ['PU_DO_pair']
numerical = ['trip_distance', 'fare_amount']
target = 'duration'

# Work on copies of the feature columns so df_train / df_val keep their raw
# values. Scaling them in place would make this cell non-idempotent: a second
# run would scale already-scaled data and refit the scaler on it.
train_features = df_train[categorical + numerical].copy()
val_features = df_val[categorical + numerical].copy()

# Pre-processing - numerical: fit the scaler on train only, reuse it on validation.
scaler = StandardScaler()
train_features[numerical] = scaler.fit_transform(train_features[numerical])
val_features[numerical] = scaler.transform(val_features[numerical])

# Pre-processing - categorical: cast to str BEFORE building the record dicts,
# so the cast is actually reflected in what DictVectorizer receives.
train_features[categorical] = train_features[categorical].astype(str)
val_features[categorical] = val_features[categorical].astype(str)

train_dicts = train_features.to_dict(orient='records')
val_dicts = val_features.to_dict(orient='records')

# Fit the vectorizer on train only; validation reuses the learned feature mapping.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
y_train = df_train[target].values
X_val = dv.transform(val_dicts)
y_val = df_val[target].values

In [8]:
# Baseline: ordinary least squares on the vectorized features.
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_val)

# RMSE as sqrt(MSE): `squared=False` is deprecated in recent scikit-learn releases.
rmse_linear = mean_squared_error(y_val, y_pred_linear) ** 0.5

# Persist the full preprocessing + model bundle so it can be reloaded together.
with open('models/linear_model.bin', 'wb') as f_out:
    pickle.dump((scaler, dv, linear_model), f_out)

# Last expression of the cell, so the validation RMSE is actually displayed.
rmse_linear

In [9]:
# Ridge (L2-regularised) regression on the vectorized features.
ridge_model = Ridge(alpha=0.0001)
ridge_model.fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_val)

# RMSE as sqrt(MSE): `squared=False` is deprecated in recent scikit-learn releases.
rmse_ridge = mean_squared_error(y_val, y_pred_ridge) ** 0.5

# Persist the full preprocessing + model bundle so it can be reloaded together.
with open('models/ridge_model.bin', 'wb') as f_out:
    pickle.dump((scaler, dv, ridge_model), f_out)

# Last expression of the cell, so the validation RMSE is actually displayed.
rmse_ridge

In [10]:
# Lasso (L1-regularised) regression on the vectorized features.
lasso_model = Lasso(alpha=0.0001)
lasso_model.fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_val)

# RMSE as sqrt(MSE): `squared=False` is deprecated in recent scikit-learn releases.
rmse_lasso = mean_squared_error(y_val, y_pred_lasso) ** 0.5

# Persist the full preprocessing + model bundle so it can be reloaded together.
with open('models/lasso_model.bin', 'wb') as f_out:
    pickle.dump((scaler, dv, lasso_model), f_out)

# Last expression of the cell, so the validation RMSE is actually displayed.
rmse_lasso

In [11]:
# Track a single trial in an MLflow experiment.
# NOTE: this rebinds `lasso_model` / `y_pred_lasso` to the alpha=10 fit.
with mlflow.start_run():

    mlflow.set_tag('developer', 'sam')

    # Record which files the run was trained / validated on.
    mlflow.log_param('train-data-path', './data/green_tripdata_2021-01.parquet')
    mlflow.log_param('val-data-path', './data/green_tripdata_2021-02.parquet')

    alpha = 10
    mlflow.log_param('alpha', alpha)
    lasso_model = Lasso(alpha=alpha)
    lasso_model.fit(X_train, y_train)
    y_pred_lasso = lasso_model.predict(X_val)

    # RMSE as sqrt(MSE): `squared=False` is deprecated in recent scikit-learn releases.
    rmse = mean_squared_error(y_val, y_pred_lasso) ** 0.5
    mlflow.log_metric('rmse', rmse)

In [12]:
# Wrap the feature matrices and targets in XGBoost's DMatrix container;
# `train` and `valid` are read as globals by the tuning objective.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [13]:
def objective(params):
    """Train one XGBoost model with `params` and report its validation RMSE.

    Each call is recorded as its own MLflow run (tagged ``model=xgboost``) with
    the parameters and the resulting ``rmse`` metric logged.

    Relies on the notebook globals ``train`` / ``valid`` (DMatrix objects) and
    ``y_val``.

    Parameters
    ----------
    params : dict
        Training parameters forwarded unchanged to ``xgb.train``.

    Returns
    -------
    dict
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}`` in the form
        expected by hyperopt's ``fmin``.
    """
    with mlflow.start_run():
        mlflow.set_tag('model', 'xgboost')
        mlflow.log_params(params)
        xgb_model = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            # Stop once the validation metric has not improved for 50 rounds.
            early_stopping_rounds=50,
        )
        y_pred = xgb_model.predict(valid)
        # RMSE as sqrt(MSE): `squared=False` is deprecated in recent scikit-learn releases.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric('rmse', rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [14]:
# Hyperparameter search space for XGBoost.
# Every hp.* label (first argument) must be unique within the space; reusing
# one makes hyperopt raise DuplicateLabel.
search_space = {
    # quniform yields floats; scope.int casts to the integer XGBoost expects.
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    # loguniform bounds are exponents: exp(-3) .. exp(0).
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:linear' is a deprecated alias; 'reg:squarederror' is the current name.
    'objective': 'reg:squarederror',
    'seed': 42
}

# Keep a handle on the Trials object so the search history can be inspected afterwards.
trials = Trials()

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials
)

DuplicateLabel: reg_alpha