In [0]:
 %sh
 rm -r /dbfs/hyperopt_lab
 mkdir /dbfs/hyperopt_lab
 wget -O /dbfs/hyperopt_lab/penguins.csv https://raw.githubusercontent.com/MicrosoftLearning/mslearn-databricks/main/data/penguins.csv


rm: cannot remove '/dbfs/hyperopt_lab': No such file or directory
--2024-03-14 04:15:12--  https://raw.githubusercontent.com/MicrosoftLearning/mslearn-databricks/main/data/penguins.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9533 (9.3K) [text/plain]
Saving to: ‘/dbfs/hyperopt_lab/penguins.csv’

     0K .........                                             100% 1.34M=0.007s

2024-03-14 04:15:13 (1.34 MB/s) - ‘/dbfs/hyperopt_lab/penguins.csv’ saved [9533/9533]



In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

data = spark.read.format("csv").option("header", "true").load("/hyperopt_lab/penguins.csv")

data = data.dropna().select(col("Island").astype("string"),
                            col("CulmenLength").astype("float"),
                            col("CulmenDepth").astype("float"),
                            col("FlipperLength").astype("float"),
                            col("BodyMass").astype("float"),
                            col("Species").astype("int"))

display(data.sample(0.2).head(4))

Island,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
Torgersen,40.29999923706055,18.0,195.0,3250.0,0
Torgersen,38.59999847412109,21.200000762939453,191.0,3800.0,0
Biscoe,37.79999923706055,18.299999237060547,174.0,3400.0,0
Biscoe,35.29999923706055,18.899999618530277,187.0,3800.0,0


In [0]:
# Split the data into two datasets: One for training, and another for testing
splits = data.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1]

print("Training Rows: ", train.count(), "Testing Rows: ", test.count())

Training Rows:  234 Testing Rows:  108


#### Optimize hyperparameter values for training a model

In [0]:
from hyperopt import STATUS_OK
import mlflow
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


def objective(params):
    # Train a model using the provided hyperparameter value
    catFeature = "Island"
    numFeatures = ["CulmenLength", "CulmenDepth", "FlipperLength", "BodyMass"]
    catIndexer = StringIndexer(inputCol=catFeature, outputCol=catFeature + "Idx")
    numVector = VectorAssembler(inputCols=numFeatures, outputCol="numericFeatures")
    numScaler = MinMaxScaler(inputCol = numVector.getOutputCol(), outputCol="normalizedFeatures")
    featureVector = VectorAssembler(inputCols=["IslandIdx", "normalizedFeatures"], outputCol="Features")
    mlAlgo = DecisionTreeClassifier(labelCol="Species",    
                                    featuresCol="Features",
                                    maxDepth=params['MaxDepth'], maxBins=params['MaxBins'])
    pipeline = Pipeline(stages=[catIndexer, numVector, numScaler, featureVector, mlAlgo])
    model = pipeline.fit(train)
       
    # Evaluate the model to get the target metric
    prediction = model.transform(test)
    eval = MulticlassClassificationEvaluator(labelCol="Species", predictionCol="prediction", metricName="accuracy")
    accuracy = eval.evaluate(prediction)
       
    # Hyperopt tries to minimize the objective function, so you must return the negative accuracy.
    return {'loss': -accuracy, 'status': STATUS_OK}

In [0]:
from hyperopt import fmin, tpe, hp
   
# Define a search space for two hyperparameters (maxDepth and maxBins)
search_space = {
    'MaxDepth': hp.randint('MaxDepth', 10),
    'MaxBins': hp.choice('MaxBins', [10, 20, 30])
}
   
# Specify an algorithm for the hyperparameter optimization process
algo=tpe.suggest
   
# Call the training function iteratively to find the optimal hyperparameter values
argmin = fmin(
  fn=objective,
  space=search_space,
  algo=algo,
  max_evals=6)
   
print("Best param values: ", argmin)

  0%|          | 0/6 [00:00<?, ?trial/s, best loss=?] 17%|█▋        | 1/6 [00:06<00:32,  6.48s/trial, best loss: -0.9722222222222222] 33%|███▎      | 2/6 [00:09<00:18,  4.55s/trial, best loss: -0.9907407407407407] 50%|█████     | 3/6 [00:12<00:11,  3.87s/trial, best loss: -0.9907407407407407] 67%|██████▋   | 4/6 [00:15<00:06,  3.35s/trial, best loss: -0.9907407407407407] 83%|████████▎ | 5/6 [00:17<00:03,  3.07s/trial, best loss: -0.9907407407407407]100%|██████████| 6/6 [00:20<00:00,  2.78s/trial, best loss: -0.9907407407407407]100%|██████████| 6/6 [00:20<00:00,  3.35s/trial, best loss: -0.9907407407407407]
Best param values:  {'MaxBins': 0, 'MaxDepth': 7}


#### Use the Trials class to log run details
In addition to using MLflow experiment runs to log details of each iteration, you can also use the hyperopt.Trials class to record and view details of each run.

In [0]:
from hyperopt import Trials
   
# Create a Trials object to track each run
trial_runs = Trials()
   
argmin = fmin(
  fn=objective,
  space=search_space,
  algo=algo,
  max_evals=3,
  trials=trial_runs)
   
print("Best param values: ", argmin)

  0%|          | 0/3 [00:00<?, ?trial/s, best loss=?] 33%|███▎      | 1/3 [00:02<00:05,  2.68s/trial, best loss: -0.9814814814814815] 67%|██████▋   | 2/3 [00:05<00:02,  2.65s/trial, best loss: -0.9814814814814815]100%|██████████| 3/3 [00:07<00:00,  2.30s/trial, best loss: -0.9814814814814815]100%|██████████| 3/3 [00:07<00:00,  2.40s/trial, best loss: -0.9814814814814815]
Best param values:  {'MaxBins': 2, 'MaxDepth': 9}


In [0]:
# Get details from each trial run
print ("trials:")
for trial in trial_runs.trials:
    print ("\n", trial)

trials:

 {'state': 2, 'tid': 0, 'spec': None, 'result': {'loss': -0.9814814814814815, 'status': 'ok'}, 'misc': {'tid': 0, 'cmd': ('domain_attachment', 'FMinIter_Domain'), 'workdir': None, 'idxs': {'MaxBins': [0], 'MaxDepth': [0]}, 'vals': {'MaxBins': [2], 'MaxDepth': [9]}}, 'exp_key': None, 'owner': None, 'version': 0, 'book_time': datetime.datetime(2024, 3, 14, 4, 57, 55, 658000), 'refresh_time': datetime.datetime(2024, 3, 14, 4, 57, 58, 336000)}

 {'state': 2, 'tid': 1, 'spec': None, 'result': {'loss': -0.9629629629629629, 'status': 'ok'}, 'misc': {'tid': 1, 'cmd': ('domain_attachment', 'FMinIter_Domain'), 'workdir': None, 'idxs': {'MaxBins': [1], 'MaxDepth': [1]}, 'vals': {'MaxBins': [1], 'MaxDepth': [9]}}, 'exp_key': None, 'owner': None, 'version': 0, 'book_time': datetime.datetime(2024, 3, 14, 4, 57, 58, 340000), 'refresh_time': datetime.datetime(2024, 3, 14, 4, 58, 0, 966000)}

 {'state': 2, 'tid': 2, 'spec': None, 'result': {'loss': -0.9629629629629629, 'status': 'ok'}, 'misc':