In [1]:
# Record the Python interpreter version this notebook was executed with
!python -V

Python 3.9.13


In [2]:
# data manipulation and storage
import pandas as pd

# plotting and graphs
import seaborn as sns
import matplotlib.pyplot as plt

# data preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler

# models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
import xgboost as xgb

# model performance metrics
from sklearn.metrics import mean_squared_error

# saving model to file
import os
import pickle

# mlflow for experiment tracking
import mlflow
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('nyc-taxi-duration-experiment')

# hyper-parameter optimization
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [3]:
# Read the training and validation splits from their parquet files
df_train, df_val = map(
    pd.read_parquet,
    ('./data/train_data.parquet', './data/validation_data.parquet'),
)

In [4]:
categorical = ['PU_DO_pair']
# ['PULocationID','DOLocationID']
numerical = ['trip_distance','fare_amount']
target = 'duration'

# Work on copies of the feature columns so that df_train / df_val are left
# untouched. Scaling them in place would make this cell non-idempotent:
# a second run would fit the scaler on already-scaled values.
train_features = df_train[categorical + numerical].copy()
val_features = df_val[categorical + numerical].copy()

# Pre Processing - Categorical
# The cast must happen BEFORE the records are built: DictVectorizer one-hot
# encodes string values only and passes numeric values through as-is.
train_features[categorical] = train_features[categorical].astype(str)
val_features[categorical] = val_features[categorical].astype(str)

# Pre Processing - Numerical
# Fit on the training split only, then apply the same scaling to validation.
scaler = StandardScaler()
train_features[numerical] = scaler.fit_transform(train_features[numerical])
val_features[numerical] = scaler.transform(val_features[numerical])

train_dicts = train_features.to_dict(orient='records')
val_dicts = val_features.to_dict(orient='records')

# Vectorize: the vocabulary is learned from the training records only
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
y_train = df_train[target].values
X_val = dv.transform(val_dicts)
y_val = df_val[target].values

In [5]:
# Wrap each (features, target) pair in XGBoost's DMatrix container
train, valid = (
    xgb.DMatrix(features, label=labels)
    for features, labels in ((X_train, y_train), (X_val, y_val))
)

In [7]:
# define objective function
def objective(params):
    """Train one XGBoost candidate and report its validation RMSE.

    Used as the ``fn`` callback of ``hyperopt.fmin``. Each call opens its own
    MLflow run, logs the sampled hyper-parameters and the resulting RMSE.

    Relies on the notebook-level ``train`` / ``valid`` DMatrix objects and
    the ``y_val`` target array.

    Parameters
    ----------
    params : dict
        XGBoost training parameters sampled from the search space.

    Returns
    -------
    dict
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}``.
    """
    with mlflow.start_run():
        mlflow.set_tag('model', 'xgboost')
        mlflow.log_params(params)
        xgb_model = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = xgb_model.predict(valid)
        # Take the square root of the MSE explicitly: the `squared=False`
        # keyword was deprecated in scikit-learn 1.4 and is scheduled for
        # removal, while this form works on every version.
        rmse = mean_squared_error(y_val, y_pred) ** 0.5
        mlflow.log_metric('rmse', rmse)

    return {'loss': rmse, 'status': STATUS_OK}

# define hyper-parameter search space
# hp.loguniform(label, low, high) samples exp(uniform(low, high)), so the
# bounds below are given in log-space.
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),            # ~[0.05, 1]
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),                   # ~[0.0067, 0.37]
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),                 # ~[0.0025, 0.37]
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),      # ~[0.37, 20]
    # 'reg:linear' is a deprecated alias of 'reg:squarederror'
    'objective': 'reg:squarederror',
    'seed': 42
}

# Keep a handle on the trial history so completed evaluations can still be
# inspected if the search is interrupted part-way through.
trials = Trials()

# Perform hyper-parameter optimization
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials
)

[0]	validation-rmse:14.64786                          
[1]	validation-rmse:10.27208                          
[2]	validation-rmse:7.97441                           
[3]	validation-rmse:6.85351                           
[4]	validation-rmse:6.29722                           
[5]	validation-rmse:6.04535                           
[6]	validation-rmse:5.91314                           
[7]	validation-rmse:5.84456                           
[8]	validation-rmse:5.80933                           
[9]	validation-rmse:5.78830                           
[10]	validation-rmse:5.77625                          
[11]	validation-rmse:5.76456                          
[12]	validation-rmse:5.76037                          
[13]	validation-rmse:5.75917                          
[14]	validation-rmse:5.75700                          
[15]	validation-rmse:5.75587                          
[16]	validation-rmse:5.75390                          
[17]	validation-rmse:5.75366                          
[18]	valid

KeyboardInterrupt: 

In [6]:
# Perform single trial
# MLflow auto logging is switched OFF here: the parameters, the metric, the
# preprocessors and the model are all logged explicitly below.
mlflow.xgboost.autolog(disable=True)

with mlflow.start_run():

    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    params = {
        'learning_rate': 0.09675711924293533,
        'max_depth': 31,
        'min_child_weight': 0.41292138902768033,
        # 'reg:linear' is a deprecated alias of 'reg:squarederror'
        'objective': 'reg:squarederror',
        'reg_alpha': 0.34559561068374306,
        'reg_lambda': 0.01739291741989833,
        'seed': 42
    }

    mlflow.log_params(params)

    booster = xgb.train(
        params=params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    y_pred = booster.predict(valid)
    # Square root of the MSE instead of the `squared=False` keyword, which
    # scikit-learn deprecated in 1.4 and scheduled for removal.
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    mlflow.log_metric('rmse', rmse)

    # The output directory may not exist on a fresh checkout
    os.makedirs('models', exist_ok=True)

    # Persist the fitted preprocessors so they can be logged next to the model
    with open('models/vectorizer.b', 'wb') as f_out:
        pickle.dump(dv, f_out)

    with open('models/scaler.b', 'wb') as f_out:
        pickle.dump(scaler, f_out)

    mlflow.log_artifact('models/vectorizer.b', artifact_path="preprocessor")
    mlflow.log_artifact('models/scaler.b', artifact_path="preprocessor")
    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

[0]	validation-rmse:20.59416
[1]	validation-rmse:18.89255
[2]	validation-rmse:17.36444
[3]	validation-rmse:15.99935
[4]	validation-rmse:14.78185
[5]	validation-rmse:13.69304
[6]	validation-rmse:12.72729
[7]	validation-rmse:11.86636
[8]	validation-rmse:11.10763
[9]	validation-rmse:10.43255
[10]	validation-rmse:9.83785
[11]	validation-rmse:9.31981
[12]	validation-rmse:8.85881
[13]	validation-rmse:8.45440
[14]	validation-rmse:8.10439
[15]	validation-rmse:7.79829
[16]	validation-rmse:7.53456
[17]	validation-rmse:7.30543
[18]	validation-rmse:7.10715
[19]	validation-rmse:6.93526
[20]	validation-rmse:6.78723
[21]	validation-rmse:6.65716
[22]	validation-rmse:6.54524
[23]	validation-rmse:6.44711
[24]	validation-rmse:6.36037
[25]	validation-rmse:6.28807
[26]	validation-rmse:6.22514
[27]	validation-rmse:6.17297
[28]	validation-rmse:6.12676
[29]	validation-rmse:6.08537
[30]	validation-rmse:6.05075
[31]	validation-rmse:6.01976
[32]	validation-rmse:5.99249
[33]	validation-rmse:5.96867
[34]	validatio

