In [7]:
# Record the Python version so the outputs below can be tied to an interpreter.
# NOTE(review): `!python` runs whichever `python` is first on the shell PATH;
# presumably that is the same interpreter as the notebook kernel — confirm.
!python -V

Python 3.9.13


In [8]:
# data manipulation and storage
import pandas as pd

# plotting and graphs
import seaborn as sns
import matplotlib.pyplot as plt

# data preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler

# models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
import xgboost as xgb

# model performance metrics
from sklearn.metrics import mean_squared_error

# saving model to file
import pickle

# mlflow for experiment tracking
import mlflow
# Point MLflow at a SQLite tracking store; the URI is relative, so the
# `mlflow.db` file is resolved against the notebook's working directory.
mlflow.set_tracking_uri('sqlite:///mlflow.db')
# Select the experiment under which the runs started in this notebook are logged.
mlflow.set_experiment('nyc-taxi-duration-experiment')

# hyper-parameter optimization
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [9]:
# Read the pre-split training and validation sets from local parquet files.
df_train, df_val = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        './data/train_data.parquet',
        './data/validation_data.parquet',
    )
)

In [10]:
# Feature / target column selection.
categorical = ['PU_DO_pair']
# ['PULocationID','DOLocationID']
numerical = ['trip_distance','fare_amount']
target = 'duration'

# Pre Processing - Categorical
# The cast to str must happen BEFORE the record dicts are built: DictVectorizer
# one-hot encodes string values only and passes any other value through as a
# single numeric feature, so a cast applied after `to_dict` never reaches it.
df_train[categorical] = df_train[categorical].astype(str)
df_val[categorical] = df_val[categorical].astype(str)

# Pre Processing - Numerical
# Fit the scaler on the training split only and reuse its statistics for the
# validation split, so no validation information leaks into the transform.
scaler = StandardScaler()
df_train[numerical] = scaler.fit_transform(df_train[numerical])
df_val[numerical] = scaler.transform(df_val[numerical])

# One dict per row, holding the (string) categorical and scaled numerical values.
train_dicts = df_train[categorical+numerical].to_dict(orient='records')
val_dicts = df_val[categorical+numerical].to_dict(orient='records')

# Learn the feature mapping from the training records and apply the same
# mapping to the validation records.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
y_train = df_train[target].values
X_val = dv.transform(val_dicts)
y_val = df_val[target].values

In [11]:
# Wrap the vectorized features and their targets in XGBoost DMatrix objects.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [None]:
# define objective function
def objective(params):
    """Train one XGBoost model for a hyperopt trial and log it to MLflow.

    Each call opens its own MLflow run, tags it as an xgboost model, logs the
    received parameters, trains on the module-level `train` DMatrix with early
    stopping monitored on `valid`, and logs the validation RMSE.

    Args:
        params: dict of XGBoost training parameters, as passed in by
            `hyperopt.fmin` (sampled from the search space).

    Returns:
        dict with the validation RMSE under 'loss' and `STATUS_OK` under
        'status', the result format hyperopt minimizes over.
    """
    with mlflow.start_run():
        mlflow.set_tag('model','xgboost')
        mlflow.log_params(params)
        xgb_model = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid,'validation')],
            early_stopping_rounds=50
        )
        # `xgb.train` hands back the booster from the LAST boosting round, which
        # with early stopping can be up to 50 rounds past the best one. Restrict
        # prediction to the best iteration so the reported loss belongs to the
        # model that early stopping actually selected.
        y_pred = xgb_model.predict(
            valid,
            iteration_range=(0, xgb_model.best_iteration + 1)
        )
        # Take the square root explicitly instead of passing `squared=False`,
        # which newer scikit-learn releases deprecate and then remove.
        rmse = float(mean_squared_error(y_val, y_pred) ** 0.5)
        mlflow.log_metric('rmse',rmse)

    return {'loss':rmse,'status':STATUS_OK}

# define hyper-parameter search space
# `hp.loguniform(label, low, high)` samples exp(uniform(low, high)), so the
# bounds below are natural-log values of the actual parameter range.
search_space = {
    # integer tree depth between 4 and 100 (quniform yields floats, hence scope.int)
    'max_depth':scope.int(hp.quniform('max_depth',4,100,1)),
    # exp(-3) .. exp(0)  ~= 0.05 .. 1
    'learning_rate':hp.loguniform('learning_rate',-3,0),
    # exp(-5) .. exp(-1) ~= 0.0067 .. 0.37
    'reg_alpha':hp.loguniform('reg_alpha',-5,-1),
    # exp(-6) .. exp(-1) ~= 0.0025 .. 0.37
    'reg_lambda':hp.loguniform('reg_lambda',-6,-1),
    # exp(-1) .. exp(3)  ~= 0.37 .. 20
    'min_child_weight':hp.loguniform('min_child_weight',-1,3),
    # 'reg:linear' is the deprecated alias of this squared-error objective
    'objective':'reg:squarederror',
    # fixed XGBoost seed so trials differ only by their sampled parameters
    'seed':42
}

# Run the TPE-guided search for 50 evaluations of `objective` and keep the
# best parameter assignment that hyperopt reports.
best_result = fmin(
    objective,
    search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials(),
)

In [12]:
# Perform single trial
# with Auto Logging for MLFlow enabled

# Fixed parameter set for this run.
# NOTE(review): presumably the best values found by the hyper-parameter
# search in this notebook — confirm against the MLflow runs.
params = {
    'learning_rate': 0.1700079110741563,
    'max_depth': 16,
    'min_child_weight': 2.37717271395477,
    # 'reg:linear' is the deprecated alias of this squared-error objective
    'objective': 'reg:squarederror',
    'reg_alpha': 0.363258077609188,
    'reg_lambda': 0.011733914718256189,
    'seed': 42
}

# Enable MLflow's XGBoost autologging before training so the `xgb.train` call
# below is captured in an automatically created run.
mlflow.xgboost.autolog()

# Train with early stopping monitored on the validation set; the returned
# Booster is left as the cell's last expression so it is displayed.
xgb.train(
    params = params,
    dtrain = train,
    num_boost_round = 1000,
    evals = [(valid,'validation')],
    early_stopping_rounds = 50
)

2022/10/20 08:17:29 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'e4bd2f876e48428aa3590f2978e95aaf', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation-rmse:19.16700
[1]	validation-rmse:16.45230
[2]	validation-rmse:14.23091
[3]	validation-rmse:12.44161
[4]	validation-rmse:11.00238
[5]	validation-rmse:9.86649
[6]	validation-rmse:8.95757
[7]	validation-rmse:8.24955
[8]	validation-rmse:7.69220
[9]	validation-rmse:7.27076
[10]	validation-rmse:6.94454
[11]	validation-rmse:6.68103
[12]	validation-rmse:6.47129
[13]	validation-rmse:6.31366
[14]	validation-rmse:6.18344
[15]	validation-rmse:6.08396
[16]	validation-rmse:6.01352
[17]	validation-rmse:5.94952
[18]	validation-rmse:5.90587
[19]	validation-rmse:5.86713
[20]	validation-rmse:5.82054
[21]	validation-rmse:5.79967
[22]	validation-rmse:5.78225
[23]	validation-rmse:5.76755
[24]	validation-rmse:5.75323
[25]	validation-rmse:5.74527
[26]	validation-rmse:5.73492
[27]	validation-rmse:5.72813
[28]	validation-rmse:5.72246
[29]	validation-rmse:5.71917
[30]	validation-rmse:5.71479
[31]	validation-rmse:5.71183
[32]	validation-rmse:5.70939
[33]	validation-rmse:5.70770
[34]	validation-rms

<xgboost.core.Booster at 0x7f9d87642cd0>