In [20]:
import os
import pickle
import click

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import mlflow
# Select the MLflow experiment under which the training run below is recorded.
mlflow.set_experiment("NYC_taxi_trip")

def load_pickle(filename: str):
    """Deserialize and return the object stored in the pickle file ``filename``.

    The file is opened in binary mode and closed again before returning.
    Unpickling runs whatever the file encodes, so only use this on files
    from a trusted source.
    """
    with open(filename, "rb") as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded


# Directory holding train.pkl, val.pkl and test.pkl; passed to run_train below.
data_path = "./output"
def run_train(data_path: str):
    """Train a random-forest regressor and record its RMSE in MLflow.

    Reads ``train.pkl``, ``val.pkl`` and ``test.pkl`` from ``data_path`` (each
    is unpacked as an ``(X, y)`` pair), fits a ``RandomForestRegressor`` with
    ``max_depth=10`` and ``random_state=0`` inside a single MLflow run with
    autologging switched on, and logs two metrics: ``rmse`` (validation set)
    and ``rmse_test`` (test set).

    Args:
        data_path: Directory containing the three pickle files.
    """
    with mlflow.start_run():
        mlflow.autolog()

        # Load every split up front so a missing file fails before the fit.
        X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
        X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
        X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

        rf = RandomForestRegressor(max_depth=10, random_state=0)
        rf.fit(X_train, y_train)

        # RMSE is computed as sqrt(MSE) instead of passing `squared=False`,
        # which was deprecated in scikit-learn 1.4 and removed in 1.6.
        # NOTE(review): identical to the old value for a single-output target;
        # confirm y is 1-D.
        rmse = mean_squared_error(y_val, rf.predict(X_val)) ** 0.5
        # Log the validation score explicitly, matching the tuning cell's
        # "rmse" / "rmse_test" pair, so the value is not computed and dropped.
        mlflow.log_metric("rmse", rmse)

        rmse_test = mean_squared_error(y_test, rf.predict(X_test)) ** 0.5
        mlflow.log_metric("rmse_test", rmse_test)
# Entry point: start a training run on the pickles under `data_path` when this
# code is executed as the main module (which includes running it as a notebook cell).
if __name__ == '__main__':
    run_train(data_path)

2023/05/28 17:13:59 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


# Hyperparameter optimization (HPO)

In [28]:
import os
import pickle
import click
import mlflow
import optuna

from optuna.samplers import TPESampler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Send tracking data to the MLflow server at 127.0.0.1:5000 (assumes one is
# already listening there) and record the tuning runs under their own experiment.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("random-forest-hyperopt")


def load_pickle(filename):
    """Return the Python object unpickled from the file at ``filename``.

    Only use on trusted files: unpickling can execute arbitrary code.
    """
    with open(filename, mode="rb") as source:
        contents = pickle.load(source)
    return contents


def run_optimization(data_path: str, num_trials: int):
    """Tune random-forest hyperparameters with Optuna, logging each trial to MLflow.

    Reads ``train.pkl``, ``val.pkl`` and ``test.pkl`` from ``data_path`` (each
    is unpacked as an ``(X, y)`` pair) and runs ``num_trials`` trials of a
    seeded TPE sampler that minimizes validation RMSE. Every trial is its own
    MLflow run carrying the sampled parameters, a ``model`` tag, and the
    ``rmse`` (validation) and ``rmse_test`` (test) metrics.

    Args:
        data_path: Directory containing the three pickle files.
        num_trials: Number of Optuna trials to run.
    """
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
    # The test split is the same for every trial, so read it once here instead
    # of unpickling it again inside each objective call.
    X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

    def objective(trial):
        """Fit one candidate forest, log it to MLflow, return validation RMSE."""
        # `step` is left at its default of 1; passing it positionally is
        # deprecated in recent Optuna releases.
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 10, 50),
            'max_depth': trial.suggest_int('max_depth', 1, 20),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 4),
            'random_state': 42,
            'n_jobs': -1
        }
        with mlflow.start_run():
            mlflow.set_tag("model","RandomForestRegressor")
            mlflow.log_params(params)

            rf = RandomForestRegressor(**params)
            rf.fit(X_train, y_train)

            # RMSE is computed as sqrt(MSE) instead of passing `squared=False`,
            # which was deprecated in scikit-learn 1.4 and removed in 1.6.
            # NOTE(review): identical for a single-output target; confirm y is 1-D.
            rmse = mean_squared_error(y_val, rf.predict(X_val)) ** 0.5
            mlflow.log_metric("rmse",rmse)

            rmse_test = mean_squared_error(y_test, rf.predict(X_test)) ** 0.5
            mlflow.log_metric("rmse_test",rmse_test)
        # Only the validation score drives the search; the test score is
        # logged for reference.
        return rmse

    sampler = TPESampler(seed=42)
    study = optuna.create_study(direction="minimize", sampler=sampler)
    study.optimize(objective, n_trials=num_trials)


# Entry point: run 10 Optuna trials against the pickles in ./output.
if __name__ == '__main__':
    run_optimization("./output",10)

[32m[I 2023-05-28 18:49:44,434][0m A new study created in memory with name: no-name-7a2f4dd1-671a-4c48-9b68-627160f2bbc2[0m
[32m[I 2023-05-28 18:49:50,078][0m Trial 0 finished with value: 2.451379690825458 and parameters: {'n_estimators': 25, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf': 3}. Best is trial 0 with value: 2.451379690825458.[0m
[32m[I 2023-05-28 18:49:53,565][0m Trial 1 finished with value: 2.4667366020368333 and parameters: {'n_estimators': 16, 'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 0 with value: 2.451379690825458.[0m
[32m[I 2023-05-28 18:49:59,061][0m Trial 2 finished with value: 2.449827329704216 and parameters: {'n_estimators': 34, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 2 with value: 2.449827329704216.[0m
[32m[I 2023-05-28 18:50:03,471][0m Trial 3 finished with value: 2.460983516558473 and parameters: {'n_estimators': 44, 'max_depth': 5, 'min_samples_split': 3, '