In [None]:
# Make the project's `src` package importable from this notebook: the parent of
# the current working directory is treated as the project root and appended to
# sys.path (only once, so re-running the cell does not add duplicates).
import os
import sys

# os.path.abspath("") resolves to the current working directory.
root_folder = os.path.dirname(os.path.abspath(""))
if root_folder not in sys.path:
    sys.path.append(root_folder)

# Homework

In [None]:
import os
from pathlib import Path

from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import wandb

from src import read_trips, process_trips, create_pipeline, save_model

In [None]:
# W&B project name; the model artifact name is derived from it.
PROJECT_NAME = "tip-prediction-random-forest"
MODEL_NAME = PROJECT_NAME + "-model"

# Input data and saved-model directories, given relative to the working directory.
DATA_DIR = Path("..") / "data"
MODEL_DIR = Path("..") / "models"

In [None]:
# Read the color="green" trips for months 1, 2 and 3 of 2022; they are used as
# the train, validation and test splits respectively.
trips_train, trips_val, trips_test = [
    read_trips(DATA_DIR, color="green", year="2022", month=month)
    for month in ("1", "2", "3")
]

# Run every split through the same processing step (all reads happen first).
trips_train, trips_val, trips_test = map(
    process_trips, (trips_train, trips_val, trips_test)
)

# Column roles: the prediction target plus the categorical / numerical inputs.
target = "tip_amount"
categorical_cols = ["PU_DO"]
numerical_cols = ["trip_distance"]
used_cols = [*categorical_cols, *numerical_cols]

# Feature / target pairs for fitting and for validation scoring.
X_train, y_train = trips_train[used_cols], trips_train[target]
X_val, y_val = trips_val[used_cols], trips_val[target]

In [None]:
# Fit a standalone DictVectorizer on the training features, passed as a list
# of per-row dicts.
dv = DictVectorizer()
train_records = trips_train[used_cols].to_dict(orient="records")
X_hw = dv.fit_transform(train_records)

# Persist the fitted vectorizer, then show the size in bytes of
# MODEL_DIR / "dv.pkl" (presumably the file save_model just wrote).
dv_filename = "dv.pkl"
save_model(MODEL_DIR, dv_filename, dv)
os.path.getsize(MODEL_DIR / dv_filename)

In [None]:
# Start the W&B run (job type "train") in the notebook's project.
wandb.init(
    project=PROJECT_NAME,
    job_type="train",
)

In [None]:
# Baseline model: pipeline around a random forest with a fixed depth and seed.
pipe = create_pipeline(RandomForestRegressor(max_depth=10, random_state=0))
pipe.fit(X_train, y_train)

# Validation RMSE, computed as sqrt(MSE). The `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so the root is taken explicitly to work on every version.
rmse = mean_squared_error(y_val, pipe.predict(X_val)) ** 0.5
wandb.log({"RMSE": rmse})

# Persist the fitted pipeline, then attach the saved file to the run as a
# "model" artifact.
save_model(MODEL_DIR, "rf_predictor.pkl", pipe)

artifact = wandb.Artifact(MODEL_NAME, type="model")
artifact.add_file(MODEL_DIR / "rf_predictor.pkl")
wandb.log_artifact(artifact)

In [None]:
def run_train():
    """Train, score and log one random-forest pipeline for a W&B sweep trial.

    Passed as the ``function`` callback to ``wandb.agent``. The
    hyperparameters are read from ``wandb.config`` and forwarded as keyword
    arguments to ``RandomForestRegressor`` (``random_state`` is fixed to 0,
    so the config must not contain that key).

    Relies on notebook globals: ``X_train``, ``y_train``, ``X_val``,
    ``y_val``, ``MODEL_DIR`` and ``MODEL_NAME``.

    Side effects: logs ``rmse_val`` to the run, overwrites
    ``rf_predictor.pkl`` in ``MODEL_DIR`` and logs it as a "model" artifact.
    """
    wandb.init()
    config = wandb.config
    pipe = create_pipeline(RandomForestRegressor(**config, random_state=0))
    pipe.fit(X_train, y_train)

    # sqrt(MSE) instead of `squared=False`: that keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    rmse = mean_squared_error(y_val, pipe.predict(X_val)) ** 0.5
    # The key must match the metric name the sweep configuration optimizes.
    wandb.log({"rmse_val": rmse})

    save_model(MODEL_DIR, "rf_predictor.pkl", pipe)

    artifact = wandb.Artifact(MODEL_NAME, type="model")
    artifact.add_file(MODEL_DIR / "rf_predictor.pkl")
    wandb.log_artifact(artifact)


# Bayesian sweep that minimizes the "rmse_val" metric. Every hyperparameter is
# an integer sampled uniformly between the (min, max) bounds listed below.
sweep_config = {
    "method": "bayes",
    "metric": {"name": "rmse_val", "goal": "minimize"},
    "parameters": {
        name: {"distribution": "int_uniform", "min": low, "max": high}
        for name, (low, high) in {
            "max_depth": (1, 20),
            "n_estimators": (10, 50),
            "min_samples_split": (2, 10),
            "min_samples_leaf": (1, 4),
        }.items()
    },
}

# Create the sweep in the project, then have an agent call run_train for
# 5 trials.
sweep_id = wandb.sweep(
    sweep_config,
    project=PROJECT_NAME,
)
wandb.agent(
    sweep_id,
    function=run_train,
    count=5,
)