In [7]:
import google.cloud.aiplatform as aiplatform
from google.cloud.aiplatform import hyperparameter_tuning as hpt

# Notebook-wide Vertex AI settings. Keeping them as named constants means the
# project, region and staging bucket are each defined in exactly one place.
PROJECT_ID = "udemy-mlops"
BUCKET_URI = "gs://sid-vertex-mlops"
REGION = "us-central1"

# Pass the region explicitly so jobs run where REGION says they do instead of
# silently relying on whatever default the SDK applies when location is omitted.
aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)

In [8]:
# Build the skeleton of the training package from scratch: custom/ holds
# setup.py and the trainer/ Python package.
# Start from an empty custom/ directory so files from earlier runs do not linger.
! rm -rf custom
! mkdir custom

# Source text of setup.py: declares the extra runtime requirements
# (cloudml-hypertune, gcsfs) and lets setuptools discover the packages.
setup_py = "import setuptools\n\nsetuptools.setup(\n\n    install_requires=[\n\n        'cloudml-hypertune','gcsfs',\n\n    ],\n\n    packages=setuptools.find_packages())"
# IPython substitutes the Python variable setup_py for $setup_py before the
# shell runs, so echo writes the string above into custom/setup.py.
! echo "$setup_py" > custom/setup.py

# An empty __init__.py marks trainer/ as an importable package.
! mkdir custom/trainer
! touch custom/trainer/__init__.py

In [None]:
%%writefile custom/trainer/task.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from google.cloud import storage
from joblib import dump
from sklearn.pipeline import make_pipeline
import hypertune
import argparse

# Cloud Storage handles for the project bucket.
storage_client = storage.Client()
bucket = storage_client.bucket("sid-vertex-mlops")

# Command-line interface: the only option is the forest size, which defaults
# to 20 trees when --n_estimators is not supplied.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--n_estimators",
    dest="n_estimators",
    type=int,
    default=20,
    help="Number of estimators",
)
args = parser.parse_args()

def load_data(filename):
    """Read a CSV file into a DataFrame.

    Args:
        filename: Path, URL or file-like object accepted by ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(filename)

def preprocess_data(df):
    """Turn the raw hourly frame into a feature matrix and a log-scaled target.

    Steps, in order: rename the abbreviated columns, drop ``instant``,
    ``dteday`` and ``year``, cast the categorical columns to ``category``,
    replace ``count`` with its natural logarithm, and one-hot encode every
    categorical column through ``one_hot_encoding``.

    Args:
        df: Raw DataFrame; it is not modified.

    Returns:
        ``(X, y)`` where ``X`` is the encoded frame without ``atemp``,
        ``windspeed``, ``casual``, ``registered`` and ``count``, and ``y`` is
        the log-transformed ``count`` column.
    """
    categorical = ['season', 'month', 'hour', 'holiday', 'weekday', 'workingday', 'weather']
    new_names = {
        'weathersit': 'weather',
        'yr': 'year',
        'mnth': 'month',
        'hr': 'hour',
        'hum': 'humidity',
        'cnt': 'count',
    }

    frame = (
        df.rename(columns=new_names)
        .drop(columns=['instant', 'dteday', 'year'])
        .astype({name: 'category' for name in categorical})
    )
    # The model is trained on log(count) rather than on the raw count.
    frame['count'] = np.log(frame['count'])

    # Each call returns a new frame, so `frame` itself is left untouched.
    encoded = frame
    for name in categorical:
        encoded = one_hot_encoding(encoded, name)

    target = encoded['count']
    features = encoded.drop(columns=['atemp', 'windspeed', 'casual', 'registered', 'count'])
    return features, target

def one_hot_encoding(data, column):
    """Replace ``column`` with its one-hot indicator columns.

    The indicators are named ``<column>_<value>`` and appended after the
    existing columns; the first category is dropped (``drop_first=True``).

    Args:
        data: Input DataFrame; it is not modified.
        column: Name of the column to encode.

    Returns:
        A new DataFrame without ``column`` and with the indicator columns added.
    """
    indicators = pd.get_dummies(data[column], prefix=column, drop_first=True)
    widened = pd.concat([data, indicators], axis=1)
    return widened.drop(columns=[column])

def train_model(x_train, y_train,n_estimators):
    """Fit a random-forest regressor wrapped in a one-step pipeline.

    Args:
        x_train: Training features.
        y_train: Training targets.
        n_estimators: Number of trees in the forest.

    Returns:
        The fitted pipeline.
    """
    forest = RandomForestRegressor(n_estimators=n_estimators, max_depth=None)
    fitted = make_pipeline(forest)
    fitted.fit(x_train, y_train)
    return fitted


# Load the hourly dataset and derive features and the log-scaled target.
filename = 'gs://sid-vertex-mlops/bike-share/hour.csv'
X, y = preprocess_data(load_data(filename))

# Hold out a quarter of the rows for evaluation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Forest size comes from the --n_estimators command-line argument.
n_estimators = args.n_estimators
pipeline = train_model(X_train, y_train, n_estimators)

# RMSE is measured on the held-out rows, in the same log space as the target.
y_pred = pipeline.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Report the metric under the tag 'RMSE' via the hypertune helper.
hpt = hypertune.HyperTune()
hpt.report_hyperparameter_tuning_metric(hyperparameter_metric_tag='RMSE', metric_value=rmse)
print('RMSE:', rmse)

In [None]:
# Package the custom/ directory and upload it to the bucket.
# Remove archives from a previous run; gzip would otherwise stop on an
# existing custom.tar.gz.
! rm -f custom.tar custom.tar.gz
# Archive custom/ and compress it in place, producing custom.tar.gz.
! tar cvf custom.tar custom
! gzip custom.tar
# IPython expands $BUCKET_URI from the notebook's Python variable before the
# shell runs; the archive is stored as trainer_bikeshare.tar.gz in the bucket.
! gsutil cp custom.tar.gz $BUCKET_URI/trainer_bikeshare.tar.gz

In [None]:
# --- Worker hardware and container ------------------------------------------
DISK_TYPE = "pd-ssd"
DISK_SIZE = 100  # GB

disk_spec = dict(boot_disk_type=DISK_TYPE, boot_disk_size_gb=DISK_SIZE)
machine_spec = dict(machine_type="n1-standard-4", accelerator_count=0)
TRAIN_IMAGE = "us-docker.pkg.dev/vertex-ai/training/scikit-learn-cpu.0-23:latest"

# --- Single-replica worker pool that runs trainer.task from the uploaded package
python_package_spec = {
    "executor_image_uri": TRAIN_IMAGE,
    "package_uris": [f"{BUCKET_URI}/trainer_bikeshare.tar.gz"],
    "python_module": "trainer.task",
}
worker_pool_spec = [
    {
        "replica_count": 1,
        "machine_spec": machine_spec,
        "disk_spec": disk_spec,
        "python_package_spec": python_package_spec,
    }
]

# --- Base job that every tuning trial launches --------------------------------
job = aiplatform.CustomJob(
    display_name="bikeshare_hpt_tuning",
    worker_pool_specs=worker_pool_spec,
)

# --- Tuning job: minimise RMSE over n_estimators in [35, 40] -------------------
# The metric key must match the tag the training script reports ('RMSE').
metric_spec = {"RMSE": "minimize"}
parameter_spec = {
    "n_estimators": hpt.IntegerParameterSpec(min=35, max=40, scale="linear"),
}

hpt_job = aiplatform.HyperparameterTuningJob(
    display_name="bikeshare_hpt_job",
    custom_job=job,
    metric_spec=metric_spec,
    parameter_spec=parameter_spec,
    search_algorithm=None,  # None: no search algorithm is requested explicitly
    max_trial_count=3,
    parallel_trial_count=3,
)

hpt_job.run()

In [None]:
# Pick the best trial of the tuning job.
#
# The job's metric_spec asks for RMSE to be *minimised*, so the best trial is
# the one with the SMALLEST reported metric value. `best` keeps the shape
# (trial id, first parameter, second parameter or None, metric value); it stays
# (None, None, None, None) when no trial reported a metric.
best = (None, None, None, None)
for trial in hpt_job.trials:
    metrics = trial.final_measurement.metrics
    # A trial with no reported metric cannot be ranked; skip it instead of
    # failing on metrics[0].
    if len(metrics) == 0:
        continue

    rmse = float(metrics[0].value)
    if best[3] is not None and rmse >= best[3]:
        continue

    params = trial.parameters
    # Only some studies tune a second parameter; check the length explicitly
    # rather than hiding the IndexError behind a bare `except`.
    second_param = float(params[1].value) if len(params) > 1 else None
    best = (trial.id, float(params[0].value), second_param, rmse)

print(best)