In [19]:
# Print the version of the `python` executable found on the shell PATH
!python -V

Python 3.9.13


In [12]:
# data manipulation and storage
import pandas as pd

# plotting and graphs
import seaborn as sns
import matplotlib.pyplot as plt

# data preprocessing
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler

# models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
import xgboost as xgb

# model performance metrics
from sklearn.metrics import mean_squared_error

# saving model to file
import pickle

# mlflow for experiment tracking
import mlflow
# Point MLflow at a SQLite backend store in the file `mlflow.db`
mlflow.set_tracking_uri('sqlite:///mlflow.db')
# Every run started below is recorded under this experiment name
mlflow.set_experiment('nyc-taxi-duration-experiment')

# hyper-parameter optimization
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [20]:
# Read the training and validation frames from their parquet files
df_train, df_val = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        './data/train_data.parquet',
        './data/validation_data.parquet',
    )
)

In [15]:
# Feature configuration
categorical = ['PU_DO_pair']
# alternative categorical features: ['PULocationID','DOLocationID']
numerical = ['trip_distance','fare_amount']
target = 'duration'

# Pre Processing - Categorical
# The string cast must happen BEFORE the record dicts are built; otherwise it
# never reaches DictVectorizer, which one-hot encodes string values only and
# passes non-string values through as plain numbers.
df_train[categorical] = df_train[categorical].astype(str)
df_val[categorical] = df_val[categorical].astype(str)

# Pre Processing - Numerical
# The scaler is fitted on the training split only and then applied to the
# validation split, so no validation statistics leak into training.
# NOTE(review): the frames are scaled in place, so re-running this cell without
# reloading the data re-fits the scaler on already-scaled values.
scaler = StandardScaler()
df_train[numerical] = scaler.fit_transform(df_train[numerical])
df_val[numerical] = scaler.transform(df_val[numerical])

# One dict per row, holding the categorical and numerical features
train_dicts = df_train[categorical+numerical].to_dict(orient='records')
val_dicts = df_val[categorical+numerical].to_dict(orient='records')

# Vectorize: the vocabulary is learned from the training records only
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
y_train = df_train[target].values
X_val = dv.transform(val_dicts)
y_val = df_val[target].values

In [11]:
# Track a single trial in an MLflow experiment
with mlflow.start_run():

    mlflow.set_tag('developer','sam')

    # NOTE(review): these logged paths differ from the files this notebook
    # actually reads (./data/train_data.parquet and
    # ./data/validation_data.parquet) - confirm which ones should be recorded.
    mlflow.log_param('train-data-path','./data/green_tripdata_2021-01.parquet')
    mlflow.log_param('val-data-path','./data/green_tripdata_2021-02.parquet')

    # Regularization strength for the Lasso baseline
    alpha = 10
    mlflow.log_param('alpha',alpha)
    lasso_model = Lasso(alpha=alpha)
    lasso_model.fit(X_train,y_train)
    y_pred_lasso = lasso_model.predict(X_val)
    # RMSE as the square root of MSE: the `squared=False` keyword is deprecated
    # in scikit-learn 1.4 and removed in 1.6, so it is avoided here.
    rmse = mean_squared_error(y_val,y_pred_lasso) ** 0.5
    mlflow.log_metric('rmse',rmse)

In [16]:
# Wrap the vectorized features and their targets in XGBoost DMatrix objects
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [None]:
# define objective function
def objective(params):
    """Train one XGBoost model inside its own MLflow run and report its RMSE.

    Parameters
    ----------
    params : dict
        XGBoost training parameters sampled by hyperopt; logged to MLflow
        as-is and passed straight to ``xgb.train``.

    Returns
    -------
    dict
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}`` in the form
        hyperopt's ``fmin`` expects.
    """
    with mlflow.start_run():
        mlflow.set_tag('model','xgboost')
        mlflow.log_params(params)
        xgb_model = xgb.train(
            params = params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid,'validation')],
            early_stopping_rounds=50
        )
        # xgb.train returns the booster from the LAST round, i.e. up to 50
        # rounds past the best one; restrict prediction to the best iteration
        # so the reported loss matches the early-stopping optimum.
        y_pred = xgb_model.predict(
            valid,
            iteration_range=(0, xgb_model.best_iteration + 1)
        )
        # RMSE as the square root of MSE: the `squared=False` keyword is
        # deprecated in scikit-learn 1.4 and removed in 1.6.
        rmse = mean_squared_error(y_val,y_pred) ** 0.5
        mlflow.log_metric('rmse',rmse)

    return {'loss':rmse,'status':STATUS_OK}

# define hyper-parameter search space
# hp.loguniform(label, low, high) samples exp(uniform(low, high)), so the
# bounds below are exponents: learning_rate, for example, lies in [exp(-3), 1].
search_space = {
    # quniform yields floats; scope.int casts them to the int XGBoost expects
    'max_depth':scope.int(hp.quniform('max_depth',4,100,1)),
    'learning_rate':hp.loguniform('learning_rate',-3,0),
    'reg_alpha':hp.loguniform('reg_alpha',-5,-1),
    'reg_lambda':hp.loguniform('reg_lambda',-6,-1),
    'min_child_weight':hp.loguniform('min_child_weight',-1,3),
    # 'reg:linear' is a deprecated alias; 'reg:squarederror' is the same objective
    'objective':'reg:squarederror',
    'seed':42
}

# Perform 50 Trials
# TPE proposes each candidate; the Trials history object is created inline
# and is not bound to a name.
best_result = fmin(objective, search_space, algo=tpe.suggest, max_evals=50, trials=Trials())

In [18]:
# Perform single trial
# with Auto Logging for MLFlow enabled

# Fixed hyper-parameters for this run.
# NOTE(review): presumably copied from the best trial of the hyperopt search
# above - confirm against the MLflow UI.
params = {
    'learning_rate': 0.3684464953620537,
    'max_depth': 26,
    'min_child_weight': 1.7718990147468716,
    # 'reg:linear' is a deprecated alias; 'reg:squarederror' is the same objective
    'objective': 'reg:squarederror',
    'reg_alpha': 0.007003237008779855,
    'reg_lambda': 0.2740301294083138,
    'seed': 42
}

# Enable MLflow autologging for XGBoost before training, so that the
# xgb.train call below is tracked automatically without an explicit
# mlflow.start_run().
mlflow.xgboost.autolog()

# Keep a handle on the trained model instead of discarding the return value
booster = xgb.train(
    params = params,
    dtrain = train,
    num_boost_round = 1000,
    evals = [(valid,'validation')],
    early_stopping_rounds = 50
)

2022/10/19 15:05:56 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '286cb97a58414adbad76946c4e7f65cf', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation-rmse:15.35528
[1]	validation-rmse:11.10603
[2]	validation-rmse:8.68452
[3]	validation-rmse:7.34160
[4]	validation-rmse:6.64989
[5]	validation-rmse:6.28914
[6]	validation-rmse:6.08637
[7]	validation-rmse:5.97482
[8]	validation-rmse:5.91450
[9]	validation-rmse:5.86644
[10]	validation-rmse:5.84206
[11]	validation-rmse:5.82689
[12]	validation-rmse:5.81749
[13]	validation-rmse:5.81223
[14]	validation-rmse:5.80828
[15]	validation-rmse:5.80637
[16]	validation-rmse:5.80343
[17]	validation-rmse:5.80197
[18]	validation-rmse:5.80099
[19]	validation-rmse:5.79935
[20]	validation-rmse:5.79831
[21]	validation-rmse:5.79668
[22]	validation-rmse:5.79601
[23]	validation-rmse:5.79501
[24]	validation-rmse:5.79419
[25]	validation-rmse:5.79260
[26]	validation-rmse:5.79219
[27]	validation-rmse:5.79167
[28]	validation-rmse:5.78964
[29]	validation-rmse:5.78871
[30]	validation-rmse:5.78780
[31]	validation-rmse:5.78740
[32]	validation-rmse:5.78618
[33]	validation-rmse:5.78529
[34]	validation-rmse:5



<xgboost.core.Booster at 0x7f9f22bd7f10>