In [1]:
# This block of code is used to add the root folder of the project to the path so that src can be imported.
import os
import sys

# Parent directory of the current working directory; added to sys.path only when
# missing so that re-running this cell never appends a duplicate entry.
root_folder = os.path.dirname(os.path.abspath(""))
if root_folder not in sys.path:
    sys.path.append(root_folder)

In [2]:
from pathlib import Path

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
import wandb

from src import read_trips, process_trips, save_model

In [3]:
# Authenticate this session with Weights & Biases before any run is created.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mshunlungchang[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
# Folders holding the trip data and the serialized models, relative to the notebook.
DATA_DIR, MODEL_DIR = Path("../data"), Path("../models")

# Month "1" of 2021 is used for training and month "2" for validation.
# Both files are read first and only then processed, in that order.
trips_train, trips_val = (
    read_trips(DATA_DIR, color="green", year="2021", month=month)
    for month in ("1", "2")
)
trips_train, trips_val = map(process_trips, (trips_train, trips_val))

# Column layout: one categorical and one numerical feature predict the target.
target = "duration"
categorical_cols = ["PU_DO"]
numerical_cols = ["trip_distance"]
used_cols = [*categorical_cols, *numerical_cols]

# Features are converted to one dict per row; targets stay as columns.
X_train, y_train = trips_train[used_cols].to_dict(orient="records"), trips_train[target]
X_val, y_val = trips_val[used_cols].to_dict(orient="records"), trips_val[target]

Standard deviation of duration: 59.34
Fraction of the records left after dropping the outliers: 0.9658903787344154
Standard deviation of duration: 53.17
Fraction of the records left after dropping the outliers: 0.9589450535835966


In [5]:
# W&B project that groups the Lasso runs; the artifact name is derived from it.
PROJECT_NAME = "duration-prediction-lasso"
MODEL_NAME = PROJECT_NAME + "-model"

In [6]:
# Start a W&B run of job type "train" in the Lasso project.
wandb.init(project=PROJECT_NAME, job_type="train")

In [7]:
# Baseline Lasso model: fit on the training split, score on the validation split,
# then persist the fitted pipeline and log it as a W&B artifact of type "model".
params = {"alpha": 0.1}
model_file = "lasso.pkl"  # single source of truth for the saved file name

pipe = Pipeline([("vectorizer", DictVectorizer()), ("predictor", Lasso(**params))])
pipe.fit(X_train, y_train)

# `squared=False` is deprecated since scikit-learn 1.4 and removed in 1.6; taking
# the square root of the MSE gives the RMSE on every scikit-learn version.
rmse = mean_squared_error(y_val, pipe.predict(X_val)) ** 0.5
wandb.log({"rmse_val": rmse})

save_model(MODEL_DIR, model_file, pipe)

artifact = wandb.Artifact(MODEL_NAME, type="model")
artifact.add_file(MODEL_DIR / model_file)
wandb.log_artifact(artifact)

<wandb.sdk.wandb_artifacts.Artifact at 0x2869600d0>

In [8]:
def run_train():
    """Train, score and log one Lasso pipeline for a single sweep trial.

    Used as the ``function`` callback of ``wandb.agent``. The hyperparameters
    come from the run's config and are forwarded to ``Lasso`` unchanged. The
    function reads the notebook-level ``X_train``, ``y_train``, ``X_val``,
    ``y_val``, ``MODEL_DIR``, ``MODEL_NAME`` and ``PROJECT_NAME``.

    Side effects: logs ``rmse_val`` to the run, overwrites
    ``MODEL_DIR / "lasso.pkl"`` and logs that file as an artifact of type
    ``"model"``.
    """
    # The context manager finishes the run even if training raises, so a failed
    # trial does not leave a run open when the next trial starts.
    with wandb.init(project=PROJECT_NAME) as run:
        config = run.config

        pipe = Pipeline(
            [("vectorizer", DictVectorizer()), ("predictor", Lasso(**config))]
        )
        pipe.fit(X_train, y_train)

        # `squared=False` is deprecated since scikit-learn 1.4 and removed in 1.6;
        # the square root of the MSE is the same metric on every version.
        rmse = mean_squared_error(y_val, pipe.predict(X_val)) ** 0.5
        run.log({"rmse_val": rmse})

        save_model(MODEL_DIR, "lasso.pkl", pipe)

        artifact = wandb.Artifact(MODEL_NAME, type="model")
        artifact.add_file(MODEL_DIR / "lasso.pkl")
        run.log_artifact(artifact)


# Bayesian search over the Lasso `alpha`, minimising the logged `rmse_val`.
alpha_space = {"distribution": "uniform", "min": 0.001, "max": 1.0}
objective = {"name": "rmse_val", "goal": "minimize"}
sweep_config = {
    "method": "bayes",
    "metric": objective,
    "parameters": {"alpha": alpha_space},
}

# Register the sweep, let the agent call run_train for ten trials, then finish
# whatever run is still active.
sweep_id = wandb.sweep(sweep_config, project=PROJECT_NAME)
wandb.agent(sweep_id, function=run_train, count=10)
wandb.finish()



Create sweep with ID: 7rmazmsr
Sweep URL: https://wandb.ai/shunlungchang/duration-prediction-lasso/sweeps/7rmazmsr
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>


[34m[1mwandb[0m: Agent Starting Run: k7p57t3a with config:
[34m[1mwandb[0m: 	alpha: 0.11994196148846951
Exception in thread Exception in thread NetStatThr:
Traceback (most recent call last):
  File "/Users/shun_lung_chang/.pyenv/versions/3.9.16/lib/python3.9/threading.py", line 980, in _bootstrap_inner
ChkStopThr:
Traceback (most recent call last):
  File "/Users/shun_lung_chang/.pyenv/versions/3.9.16/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/Users/shun_lung_chang/.pyenv/versions/3.9.16/lib/python3.9/threading.py", line 917, in run
    self.run()
  File "/Users/shun_lung_chang/.pyenv/versions/3.9.16/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/shun_lung_chang/python_projects/mlops_zoomcamp/.venv/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 278, in check_stop_status
    self._target(*self._args, **self._kwargs)
  File "/Users/shun_lung_chang/python_projects/mlops_zoo

0,1
rmse_val,▁

0,1
rmse_val,12.17245


[34m[1mwandb[0m: Agent Starting Run: 05hri929 with config:
[34m[1mwandb[0m: 	alpha: 0.042104232555095274


0,1
rmse_val,▁

0,1
rmse_val,11.8319


[34m[1mwandb[0m: Agent Starting Run: 00iyz1gl with config:
[34m[1mwandb[0m: 	alpha: 0.2668894206618422


0,1
rmse_val,▁

0,1
rmse_val,12.21258


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 4h7upxmf with config:
[34m[1mwandb[0m: 	alpha: 0.06143965787933759


0,1
rmse_val,▁

0,1
rmse_val,12.00166


[34m[1mwandb[0m: Agent Starting Run: b065848g with config:
[34m[1mwandb[0m: 	alpha: 0.6180267676293099


0,1
rmse_val,▁

0,1
rmse_val,12.21258


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: vzb05wk0 with config:
[34m[1mwandb[0m: 	alpha: 0.12311122640771902


0,1
rmse_val,▁

0,1
rmse_val,12.17756


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 0ahe147d with config:
[34m[1mwandb[0m: 	alpha: 0.7352033489117287


0,1
rmse_val,▁

0,1
rmse_val,12.21258


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: vjgzwef9 with config:
[34m[1mwandb[0m: 	alpha: 0.4144953927366956


0,1
rmse_val,▁

0,1
rmse_val,12.21258


[34m[1mwandb[0m: Agent Starting Run: ipilov0r with config:
[34m[1mwandb[0m: 	alpha: 0.02118705400736241


0,1
rmse_val,▁

0,1
rmse_val,11.54553


[34m[1mwandb[0m: Agent Starting Run: jcti6e4x with config:
[34m[1mwandb[0m: 	alpha: 0.5543675489305002


0,1
rmse_val,▁

0,1
rmse_val,12.21258


Error in callback <function _WandbInit._pause_backend at 0x286948dc0> (for post_run_cell):


BrokenPipeError: [Errno 32] Broken pipe

# Homework

In [2]:
import os
from pathlib import Path

from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
import wandb

from src import read_trips, process_trips, save_model

In [3]:
# W&B project for the random-forest tip model; the artifact name is derived from it.
PROJECT_NAME = "tip-prediction-random-forest"
MODEL_NAME = PROJECT_NAME + "-model"

# Folders holding the trip data and the serialized models, relative to the notebook.
DATA_DIR, MODEL_DIR = Path("../data"), Path("../models")

In [4]:
# Months "1", "2" and "3" of 2022 become the train, validation and test splits.
# All three files are read first and only then processed, in that order.
# NOTE(review): trips_test is not referenced by any later cell visible here.
trips_train, trips_val, trips_test = (
    read_trips(DATA_DIR, color="green", year="2022", month=month)
    for month in ("1", "2", "3")
)
trips_train, trips_val, trips_test = map(
    process_trips, (trips_train, trips_val, trips_test)
)

# Column layout: one categorical and one numerical feature predict the tip amount.
target = "tip_amount"
categorical_cols = ["PU_DO"]
numerical_cols = ["trip_distance"]
used_cols = [*categorical_cols, *numerical_cols]

# Features are converted to one dict per row; targets stay as columns.
X_train, y_train = trips_train[used_cols].to_dict(orient="records"), trips_train[target]
X_val, y_val = trips_val[used_cols].to_dict(orient="records"), trips_val[target]

Standard deviation of duration: 78.22
Fraction of the records left after dropping the outliers: 0.9537242979438355
Standard deviation of duration: 78.88
Fraction of the records left after dropping the outliers: 0.9524200636896786
Standard deviation of duration: 78.87
Fraction of the records left after dropping the outliers: 0.948686606312948


In [5]:
# Fit a standalone DictVectorizer on the training features, persist it, and
# display the size in bytes of the file at MODEL_DIR / "dv.pkl".
train_records = trips_train[used_cols].to_dict(orient="records")
dv = DictVectorizer()
X_hw = dv.fit_transform(train_records)

save_model(MODEL_DIR, "dv.pkl", dv)
(MODEL_DIR / "dv.pkl").stat().st_size

153660

In [8]:
# Authenticate this session with Weights & Biases before any run is created.
wandb.login()



True

In [9]:
# Start a W&B run of job type "train" in the random-forest project.
wandb.init(project=PROJECT_NAME, job_type="train")

In [10]:
# Baseline random forest: fit on the training split, score on the validation
# split, then persist the fitted pipeline and log it as a W&B "model" artifact.
model_file = "rf_predictor.pkl"  # single source of truth for the saved file name

pipe = Pipeline(
    [
        ("vectorizer", DictVectorizer()),
        ("predictor", RandomForestRegressor(max_depth=10, random_state=0)),
    ]
)
pipe.fit(X_train, y_train)

# `squared=False` is deprecated since scikit-learn 1.4 and removed in 1.6; taking
# the square root of the MSE gives the RMSE on every scikit-learn version.
rmse = mean_squared_error(y_val, pipe.predict(X_val)) ** 0.5
# Logged under the same key the sweep optimises ("rmse_val") so the baseline and
# the sweep trials appear on one chart in the W&B project.
wandb.log({"rmse_val": rmse})

save_model(MODEL_DIR, model_file, pipe)

artifact = wandb.Artifact(MODEL_NAME, type="model")
artifact.add_file(MODEL_DIR / model_file)
wandb.log_artifact(artifact)

<wandb.sdk.wandb_artifacts.Artifact at 0x28646ef70>

In [11]:
def run_train():
    """Train, score and log one random-forest pipeline for a single sweep trial.

    Used as the ``function`` callback of ``wandb.agent``. The hyperparameters
    come from the run's config and are forwarded to ``RandomForestRegressor``
    together with a fixed ``random_state=0``. The function reads the
    notebook-level ``X_train``, ``y_train``, ``X_val``, ``y_val``,
    ``MODEL_DIR``, ``MODEL_NAME`` and ``PROJECT_NAME``.

    Side effects: logs ``rmse_val`` to the run, overwrites
    ``MODEL_DIR / "rf_predictor.pkl"`` and logs that file as an artifact of
    type ``"model"``.
    """
    # The context manager finishes the run even if training raises, so a failed
    # trial does not leave a run open when the next trial starts.
    with wandb.init(project=PROJECT_NAME) as run:
        config = run.config

        pipe = Pipeline(
            [
                ("vectorizer", DictVectorizer()),
                ("predictor", RandomForestRegressor(**config, random_state=0)),
            ]
        )
        pipe.fit(X_train, y_train)

        # `squared=False` is deprecated since scikit-learn 1.4 and removed in 1.6;
        # the square root of the MSE is the same metric on every version.
        rmse = mean_squared_error(y_val, pipe.predict(X_val)) ** 0.5
        run.log({"rmse_val": rmse})

        save_model(MODEL_DIR, "rf_predictor.pkl", pipe)

        artifact = wandb.Artifact(MODEL_NAME, type="model")
        artifact.add_file(MODEL_DIR / "rf_predictor.pkl")
        run.log_artifact(artifact)


# (min, max) bounds of every integer hyperparameter explored by the sweep.
search_bounds = {
    "max_depth": (1, 20),
    "n_estimators": (10, 50),
    "min_samples_split": (2, 10),
    "min_samples_leaf": (1, 4),
}

# Bayesian search minimising the logged `rmse_val`; every hyperparameter uses the
# "int_uniform" distribution over its bounds.
sweep_config = {
    "method": "bayes",
    "metric": {"name": "rmse_val", "goal": "minimize"},
    "parameters": {
        name: {"distribution": "int_uniform", "min": low, "max": high}
        for name, (low, high) in search_bounds.items()
    },
}

# Register the sweep, let the agent call run_train for five trials, then finish
# whatever run is still active.
sweep_id = wandb.sweep(sweep_config, project=PROJECT_NAME)
wandb.agent(sweep_id, function=run_train, count=5)
wandb.finish()



Create sweep with ID: 3zle5q6w
Sweep URL: https://wandb.ai/shunlungchang/tip-prediction-random-forest/sweeps/3zle5q6w
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>
<IPython.core.display.HTML object>


[34m[1mwandb[0m: Agent Starting Run: 2d13ytc7 with config:
[34m[1mwandb[0m: 	max_depth: 7
[34m[1mwandb[0m: 	min_samples_leaf: 2
[34m[1mwandb[0m: 	min_samples_split: 2
[34m[1mwandb[0m: 	n_estimators: 21


Exception in thread NetStatThr:
Exception in thread ChkStopThr:
Traceback (most recent call last):
  File "/Users/shun_lung_chang/.pyenv/versions/3.9.16/lib/python3.9/threading.py", line 980, in _bootstrap_inner
Traceback (most recent call last):
  File "/Users/shun_lung_chang/.pyenv/versions/3.9.16/lib/python3.9/threading.py", line 980, in _bootstrap_inner
    self.run()
  File "/Users/shun_lung_chang/.pyenv/versions/3.9.16/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/shun_lung_chang/python_projects/mlops_zoomcamp/.venv/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 260, in check_network_status
    self.run()
  File "/Users/shun_lung_chang/.pyenv/versions/3.9.16/lib/python3.9/threading.py", line 917, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/shun_lung_chang/python_projects/mlops_zoomcamp/.venv/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 278, in check_stop_status
    self._l

0,1
rmse_val,▁

0,1
rmse_val,2.45475


[34m[1mwandb[0m: Agent Starting Run: p1pp68lw with config:
[34m[1mwandb[0m: 	max_depth: 8
[34m[1mwandb[0m: 	min_samples_leaf: 2
[34m[1mwandb[0m: 	min_samples_split: 4
[34m[1mwandb[0m: 	n_estimators: 14
[34m[1mwandb[0m: Currently logged in as: [33mshunlungchang[0m. Use [1m`wandb login --relogin`[0m to force relogin


0,1
rmse_val,▁

0,1
rmse_val,2.45465


[34m[1mwandb[0m: Agent Starting Run: im7zzupc with config:
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	min_samples_leaf: 2
[34m[1mwandb[0m: 	min_samples_split: 7
[34m[1mwandb[0m: 	n_estimators: 17


0,1
rmse_val,▁

0,1
rmse_val,2.46279


[34m[1mwandb[0m: Agent Starting Run: gba6rss2 with config:
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	min_samples_leaf: 2
[34m[1mwandb[0m: 	min_samples_split: 4
[34m[1mwandb[0m: 	n_estimators: 16


0,1
rmse_val,▁

0,1
rmse_val,2.4534


[34m[1mwandb[0m: Agent Starting Run: njjynwha with config:
[34m[1mwandb[0m: 	max_depth: 12
[34m[1mwandb[0m: 	min_samples_leaf: 3
[34m[1mwandb[0m: 	min_samples_split: 3
[34m[1mwandb[0m: 	n_estimators: 16


0,1
rmse_val,▁

0,1
rmse_val,2.45081


Error in callback <function _WandbInit._pause_backend at 0x2864f34c0> (for post_run_cell):


BrokenPipeError: [Errno 32] Broken pipe