# Distributed Hyperparameter Tuning with Hyperopt

Note: this approach applies only to single-node models (such as the scikit-learn model used below), not to Spark ML models.

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

# df = pd.read_csv(f"{DA.paths.datasets}/airbnb/sf-listings/airbnb-cleaned.csv".replace("dbfs:/", "/dbfs/")).drop(["zipcode"], axis=1)

# Location of the Delta table that is loaded below.
file_path = 'dbfs:/mnt/dbacademy-datasets/scalable-machine-learning-with-apache-spark/v02/airbnb/sf-listings/sf-listings-2019-03-06-clean.delta/'
print(file_path)

# Read the table with Spark, then convert it to a pandas DataFrame so it can
# be handed to scikit-learn.
airbnb_df = spark.read.format("delta").load(file_path)
df = airbnb_df.toPandas()

# Hold out 20% of the rows for testing; "price" is the target and every other
# column is a feature. The fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=["price"]),
    df["price"].values,
    test_size=0.2,
    random_state=42,
)

### Objective Function

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, r2_score
from numpy import mean
  
def objective_function(params):
    """Return the loss of a random forest for one hyperparameter sample.

    A ``RandomForestRegressor`` is built from the sampled hyperparameters and
    scored with 3-fold cross-validation on the module-level ``X_train`` and
    ``y_train``.

    Args:
        params: Mapping with the keys ``"max_depth"`` and ``"max_features"``.

    Returns:
        The negated mean cross-validation score, so that minimizing the
        returned value maximizes the score.
    """
    # The depth may arrive as a float (hp.quniform samples floats such as
    # 5.0), while scikit-learn requires an integer tree depth.
    max_depth = int(params["max_depth"])
    max_features = params["max_features"]

    regressor = RandomForestRegressor(
        max_depth=max_depth,
        max_features=max_features,
        random_state=42,
    )

    # No `scoring` is given, so cross_val_score falls back to the estimator's
    # own score method, which is R^2 for regressors.
    r2 = mean(cross_val_score(regressor, X_train, y_train, cv=3))

    # The optimizer minimizes its objective; negate R^2 so a better fit
    # yields a smaller loss.
    return -r2

### Search Space

In [None]:
from hyperopt import hp

# Candidate values for max_features. hp.choice reports the *index* of the
# selected entry, so this list is also what maps that index back to a value.
# 1.0 (consider every feature) stands in for the former "auto" option: for a
# regressor the two are equivalent, and the "auto" string was deprecated in
# scikit-learn 1.1 and removed in 1.3.
max_features_choices = [1.0, "sqrt", "log2"]
search_space = {
    # quniform with q=1 samples whole-number depths between 2 and 10, but
    # returns them as floats (e.g. 5.0).
    "max_depth": hp.quniform("max_depth", 2, 10, 1),
    "max_features": hp.choice("max_features", max_features_choices),
}

In [None]:
### Train models in parallel for different hyperparameter configurations

from hyperopt import fmin, tpe, SparkTrials
import mlflow
import numpy as np

# Number of models to evaluate
num_evals = 8
# Number of models to train concurrently. A value of 1 evaluates the trials
# one at a time; raise it to train several configurations simultaneously.
spark_trials = SparkTrials(parallelism=1)
# Automatically logs to MLflow
best_hyperparam = fmin(
    fn=objective_function,
    space=search_space,
    max_evals=num_evals,
    trials=spark_trials,
    algo=tpe.suggest,
)

# Re-train best model and log metrics on test dataset
with mlflow.start_run(run_name="best_model"):
    # Get optimal hyperparameter values. The depth comes back as a float
    # (hp.quniform), so cast it to the integer scikit-learn requires.
    best_max_depth = int(best_hyperparam["max_depth"])
    # For an hp.choice parameter fmin returns the index of the winning
    # option, so look the actual value up in the list of choices.
    best_max_features = max_features_choices[best_hyperparam["max_features"]]
    print(best_max_features)

    # Train model on entire training data
    regressor = RandomForestRegressor(
        max_depth=best_max_depth,
        max_features=best_max_features,
        random_state=42,
    )
    regressor.fit(X_train, y_train)

    # Evaluate on holdout/test data (score() is R^2 for regressors)
    r2 = regressor.score(X_test, y_test)

    # Log params and metrics for the final model
    mlflow.log_param("max_depth", best_max_depth)
    mlflow.log_param("max_features", best_max_features)
    # The tuning objective defines loss as -R^2; keep the same sign here so
    # this run's "loss" is comparable with the losses of the tuning trials.
    mlflow.log_metric("loss", -r2)
    mlflow.log_metric("r2", r2)