In [1]:
# This block of code is used to add the root folder of the project to the path so that src can be imported.
import os
import sys

# The project root is taken to be the parent of the current working directory;
# it is appended to sys.path (once) so that the `src` package can be imported.
root_folder = os.path.dirname(os.path.abspath(""))
if root_folder not in sys.path:
    sys.path.append(root_folder)

In [2]:
import os
from pathlib import Path

from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import wandb

from src import read_trips, process_trips, create_pipeline, save_model

In [3]:
# Identifiers used on W&B: the project name and the name of the model artifact.
PROJECT_NAME = "tip-prediction-random-forest"
MODEL_NAME = PROJECT_NAME + "-model"

# Folders, relative to the notebook, holding the trip data and the saved models.
MODEL_DIR = Path("..") / "models"
DATA_DIR = Path("..") / "data"

# Load data

In [4]:
# Read the green-taxi trips of months 1, 2 and 3 of 2022; they become the
# training, validation and test splits, in that order.
trips_train, trips_val, trips_test = (
    read_trips(DATA_DIR, color="green", year="2022", month=month)
    for month in ("1", "2", "3")
)

# Apply the same processing step to every split, in the same order as above.
trips_train, trips_val, trips_test = map(
    process_trips, (trips_train, trips_val, trips_test)
)

# Column roles: the regression target and the feature columns passed to the model.
target = "tip_amount"
categorical_cols = ["PU_DO"]
numerical_cols = ["trip_distance"]
used_cols = [*categorical_cols, *numerical_cols]

# Features and target for the training and validation splits.
X_train, y_train = trips_train[used_cols], trips_train[target]
X_val, y_val = trips_val[used_cols], trips_val[target]

Standard deviation of duration: 78.22
Fraction of the records left after dropping the outliers: 0.9537242979438355
Standard deviation of duration: 78.88
Fraction of the records left after dropping the outliers: 0.9524200636896786
Standard deviation of duration: 78.87
Fraction of the records left after dropping the outliers: 0.948686606312948


# Using Wandb to track experiments

1. Before starting, make sure you have a wandb account and log in to it with your API key.
2. Create a new wandb run by using `wandb.init()`.
3. To log an artifact, create an Artifact object with `wandb.Artifact()`, attach the file to it with `add_file()`, and then log it with `wandb.log_artifact()`.
4. During training, log the metrics you want to track by using `wandb.log()`. The input should be a dictionary.

In [None]:
# Authenticate with Weights & Biases before any run is created.
wandb.login()

In [None]:
# Start a W&B run in the PROJECT_NAME project, labelled with job type "train".
wandb.init(project=PROJECT_NAME, job_type="train")

In [6]:
# Attach the training and validation parquet files to the active run as artifacts.
train_artifact = wandb.Artifact("green_2022_01", type="train_data")
train_artifact.add_file(DATA_DIR / "green_tripdata_2022-01.parquet")
wandb.log_artifact(train_artifact)

val_artifact = wandb.Artifact("green_2022_02", type="val_data")
val_artifact.add_file(DATA_DIR / "green_tripdata_2022-02.parquet")
wandb.log_artifact(val_artifact)

# Fit the baseline pipeline (random forest with a fixed depth and seed).
pipe = create_pipeline(RandomForestRegressor(max_depth=10, random_state=0))
pipe.fit(X_train, y_train)

# RMSE is computed as sqrt(MSE) explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = mean_squared_error(y_val, pipe.predict(X_val)) ** 0.5
wandb.log({"RMSE": rmse})

# Persist the fitted pipeline locally, then upload that file as a model artifact.
save_model(MODEL_DIR, "rf_predictor.pkl", pipe)

artifact = wandb.Artifact(MODEL_NAME, type="model")
artifact.add_file(MODEL_DIR / "rf_predictor.pkl")
wandb.log_artifact(artifact)

# Close the run explicitly so it is marked finished and its uploads complete
# before other runs (e.g. sweep trials) are started from this kernel.
wandb.finish()

<wandb.sdk.wandb_artifacts.Artifact at 0x1037d5100>

# Tuning hyperparameters by sweep

The steps to tune hyperparameters with a sweep are as follows:

1. Wrap the training code in a function.
2. Create a search space of hyperparameters.
3. Create a sweep id by `wandb.sweep()` with the search space and project name.
4. Run the sweep by `wandb.agent()` with the sweep id and the function created in step 1.

In [6]:
def run_train():
    """Train, evaluate and log one random-forest pipeline for a single sweep trial.

    Intended to be passed to ``wandb.agent``: it starts a run, reads the
    hyperparameters of the trial from the run's config, and forwards them to
    ``RandomForestRegressor``. The validation RMSE is logged under the key
    ``"rmse_val"``. The fitted pipeline is saved to
    ``MODEL_DIR / "rf_predictor.pkl"`` and uploaded as a model artifact named
    ``MODEL_NAME``.

    Relies on the notebook-level names ``X_train``, ``y_train``, ``X_val``,
    ``y_val``, ``DATA_DIR``, ``MODEL_DIR`` and ``MODEL_NAME``.
    """
    # The context manager closes the run even when training raises, so a failed
    # trial does not leave an open run behind.
    with wandb.init() as run:
        config = run.config

        # Attach the training and validation parquet files to the run.
        train_artifact = wandb.Artifact("green_2022_01", type="train_data")
        train_artifact.add_file(DATA_DIR / "green_tripdata_2022-01.parquet")
        run.log_artifact(train_artifact)

        val_artifact = wandb.Artifact("green_2022_02", type="val_data")
        val_artifact.add_file(DATA_DIR / "green_tripdata_2022-02.parquet")
        run.log_artifact(val_artifact)

        pipe = create_pipeline(RandomForestRegressor(**config, random_state=0))
        pipe.fit(X_train, y_train)

        # sqrt(MSE) instead of `squared=False`, which was deprecated in
        # scikit-learn 1.4 and removed in 1.6.
        rmse = mean_squared_error(y_val, pipe.predict(X_val)) ** 0.5
        run.log({"rmse_val": rmse})

        # Every trial writes the same local file; the uploaded artifact keeps
        # the copy that belongs to this run.
        save_model(MODEL_DIR, "rf_predictor.pkl", pipe)

        artifact = wandb.Artifact(MODEL_NAME, type="model")
        artifact.add_file(MODEL_DIR / "rf_predictor.pkl")
        run.log_artifact(artifact)

# (min, max) integer bounds searched for each RandomForestRegressor hyperparameter.
_int_bounds = {
    "max_depth": (1, 20),
    "n_estimators": (10, 50),
    "min_samples_split": (2, 10),
    "min_samples_leaf": (1, 4),
}

# Bayesian search that minimizes the "rmse_val" metric logged by run_train;
# every hyperparameter is drawn from an "int_uniform" distribution.
sweep_config = dict(
    method="bayes",
    metric=dict(name="rmse_val", goal="minimize"),
    parameters={
        name: {"distribution": "int_uniform", "min": low, "max": high}
        for name, (low, high) in _int_bounds.items()
    },
)

# Create the sweep in PROJECT_NAME from sweep_config and keep its id.
sweep_id = wandb.sweep(sweep_config, project=PROJECT_NAME)
# Launch an agent for that sweep that calls run_train, limited to 5 trials.
wandb.agent(sweep_id, function=run_train, count=5)

Create sweep with ID: ts2dft8c
Sweep URL: https://wandb.ai/shunlungchang/tip-prediction-random-forest/sweeps/ts2dft8c


[34m[1mwandb[0m: Agent Starting Run: 35fsvvw4 with config:
[34m[1mwandb[0m: 	max_depth: 2
[34m[1mwandb[0m: 	min_samples_leaf: 1
[34m[1mwandb[0m: 	min_samples_split: 9
[34m[1mwandb[0m: 	n_estimators: 39
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
rmse_val,▁

0,1
rmse_val,2.48652


[34m[1mwandb[0m: Agent Starting Run: 4jae811t with config:
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	min_samples_leaf: 3
[34m[1mwandb[0m: 	min_samples_split: 2
[34m[1mwandb[0m: 	n_estimators: 16
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
rmse_val,▁

0,1
rmse_val,2.47129


[34m[1mwandb[0m: Agent Starting Run: tkbexseg with config:
[34m[1mwandb[0m: 	max_depth: 14
[34m[1mwandb[0m: 	min_samples_leaf: 3
[34m[1mwandb[0m: 	min_samples_split: 9
[34m[1mwandb[0m: 	n_estimators: 17
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
rmse_val,▁

0,1
rmse_val,2.4486


[34m[1mwandb[0m: Agent Starting Run: 725l0ct5 with config:
[34m[1mwandb[0m: 	max_depth: 1
[34m[1mwandb[0m: 	min_samples_leaf: 2
[34m[1mwandb[0m: 	min_samples_split: 6
[34m[1mwandb[0m: 	n_estimators: 19
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
rmse_val,▁

0,1
rmse_val,2.5407


[34m[1mwandb[0m: Agent Starting Run: l9expevq with config:
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	min_samples_leaf: 1
[34m[1mwandb[0m: 	min_samples_split: 3
[34m[1mwandb[0m: 	n_estimators: 49
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


0,1
rmse_val,▁

0,1
rmse_val,2.45517


#  Registering the best model from the sweep to the model registry

1. Use `wandb.Api()` to request the information of the sweep.
2. Sort the runs by the metric you want to use to select the best model; after sorting, the first run is the best one.
3. Use `run.logged_artifacts()` to find the name of the model artifact that the best run logged to the wandb server.
4. Use `wandb.Api().artifact()` to get the artifact object and link it to the model registry with `link()`.

In [7]:
api = wandb.Api()
# NOTE: the entity ("shunlungchang") is hard-coded; change it when running this
# notebook under another account. The project part reuses PROJECT_NAME so it
# cannot drift from the project the sweep was created in.
sweep = api.sweep(f"shunlungchang/{PROJECT_NAME}/{sweep_id}")
# Sort ascending by validation RMSE so that runs[0] is the best run. Runs that
# never logged "rmse_val" (e.g. crashed trials) get +inf and therefore sort
# last; a default of 0 would rank them as the best model.
runs = sorted(
    sweep.runs,
    key=lambda run: run.summary.get("rmse_val", float("inf")),
)

In [8]:
# runs[0] is the first run of the sorted list, i.e. the run selected as best.
# Take the name of the first artifact of type "model" that it logged.
artifact_name = next(
    (logged.name for logged in runs[0].logged_artifacts() if logged.type == "model"),
    None,
)
# Fail here with a clear message instead of leaving artifact_name undefined
# (or stale from an earlier execution) for the cells below.
if artifact_name is None:
    raise LookupError(f"Run {runs[0].id} did not log an artifact of type 'model'.")

In [9]:
# "entity/project" prefix of the selected run, shared by both paths below.
best_run = runs[0]
namespace = f"{best_run.entity}/{best_run.project}"

# Fetch the model artifact by its full path and link it to the "tip-predictor"
# collection with the alias "latest".
artifact = wandb.Api().artifact(f"{namespace}/{artifact_name}")
artifact.link(f"{namespace}/tip-predictor", aliases=["latest"])

True