In [0]:
from pyspark.sql.types import  StringType
from pyspark.sql.functions import col
from pyspark.ml.feature import StringIndexer,VectorAssembler,MinMaxScaler
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from hyperopt import STATUS_OK
import mlflow
from pyspark.ml import Pipeline
from hyperopt import fmin, tpe,hp
from hyperopt import Trials

In [0]:
pip install hyperopt mlflow

[43mNote: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.[0m
Collecting hyperopt
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 8.7 MB/s eta 0:00:00
Collecting mlflow
  Downloading mlflow-2.14.1-py3-none-any.whl (25.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 25.8/25.8 MB 36.7 MB/s eta 0:00:00
Collecting networkx>=2.2
  Downloading networkx-3.3-py3-none-any.whl (1.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.7/1.7 MB 55.6 MB/s eta 0:00:00
Collecting tqdm
  Downloading tqdm-4.66.4-py3-none-any.whl (78 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.3/78.3 kB 7.1 MB/s eta 0:00:00
Collecting cloudpickle
  Downloading cloudpickle-3.0.0-py3-none-any.whl (20 kB)
Collecting future
  Downloading future-1.0.0-py3-none-any.whl (491 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 491.3/491.3 kB 32.8 MB/s eta 0:00:00
Collecting py4j
  Downloading py4j-0.10.

In [0]:
 %sh
 rm -r /dbfs/hyperopt_lab
 mkdir /dbfs/hyperopt_lab
 wget -O /dbfs/hyperopt_lab/penguins.csv https://raw.githubusercontent.com/MicrosoftLearning/mslearn-databricks/main/data/penguins.csv

rm: cannot remove '/dbfs/hyperopt_lab': No such file or directory
--2024-06-29 05:18:04--  https://raw.githubusercontent.com/MicrosoftLearning/mslearn-databricks/main/data/penguins.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9533 (9.3K) [text/plain]
Saving to: ‘/dbfs/hyperopt_lab/penguins.csv’

     0K .........                                             100% 1.02M=0.009s

2024-06-29 05:18:05 (1.02 MB/s) - ‘/dbfs/hyperopt_lab/penguins.csv’ saved [9533/9533]



In [0]:
dbutils.fs.ls("hyperopt_lab")

[FileInfo(path='dbfs:/hyperopt_lab/penguins.csv', name='penguins.csv', size=9533, modificationTime=1719638285000)]

In [0]:
data = spark.read.format("csv").option("header", "true").load("dbfs:/hyperopt_lab/penguins.csv")
data = data.dropna().select(col("Island").astype("string"),
                          col("CulmenLength").astype("float"),
                          col("CulmenDepth").astype("float"),
                          col("FlipperLength").astype("float"),
                          col("BodyMass").astype("float"),
                          col("Species").astype("int")
)

In [0]:
display(data.sample(fraction=0.1))

Island,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
Biscoe,37.9,18.6,172.0,3150.0,0
Dream,40.9,18.9,184.0,3900.0,0
Dream,36.0,17.9,190.0,3450.0,0
Biscoe,35.5,16.2,195.0,3350.0,0
Torgersen,36.7,18.8,187.0,3800.0,0
Dream,37.3,17.8,191.0,3350.0,0
Dream,39.6,18.1,186.0,4450.0,0
Biscoe,37.9,18.6,193.0,2925.0,0
Biscoe,42.7,18.3,196.0,4075.0,0
Torgersen,36.2,17.2,187.0,3150.0,0


In [0]:
splits = data.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1]
print ("Training Rows:", train.count(), " Testing Rows:", test.count())

Training Rows: 238  Testing Rows: 104


In [0]:
def objective(params):
    """
        Define the objective Function
    """

    # Train a model using the provided hyperparameter value
    catFeature = "Island"
    numFeatures = ["CulmenLength", "CulmenDepth", "FlipperLength", "BodyMass"]
    catIndexer = StringIndexer(inputCol=catFeature, outputCol=catFeature + "Idx")
    numVector = VectorAssembler(inputCols=numFeatures, outputCol="numericFeatures")
    numScaler = MinMaxScaler(inputCol = numVector.getOutputCol(), outputCol="normalizedFeatures")
    featureVector = VectorAssembler(inputCols=["IslandIdx", "normalizedFeatures"], outputCol="Features")
    mlAlgo=DecisionTreeClassifier(
        labelCol="Species",
        featuresCol="Features",
        maxDepth=params["MaxDepth"],
        maxBins=params["MaxBins"]
    )

    pipeline=Pipeline(
        stages=[catIndexer,numVector,numScaler,featureVector,mlAlgo]
        )
    model=pipeline.fit(dataset=train)

    prediction=model.transform(dataset=test)
    eval=MulticlassClassificationEvaluator(
        labelCol="Species",
        predictionCol="prediction",
        metricName="accuracy"
    )

    accuracy=eval.evaluate(dataset=prediction)

    # Hyperopt tries to Minimize the objective function, so you must return the negative accuracy
    return {
            "loss":-accuracy,
            "status":STATUS_OK
            }

In [0]:
searchSpace={
    "MaxDepth":hp.randint('MaxDepth',10),
    'MaxBins': hp.choice('MaxBins',[10,20,30])
}


In [0]:
# Specify an algorithm for the hyperparameter optimization process
algo=tpe.suggest

In [0]:
# Call the Training function iteratively to find the optimal hyperparameter values
argmin=fmin(
    fn=objective,
    space=searchSpace,
    algo=algo,
    max_evals=6
)

  0%|          | 0/6 [00:00<?, ?trial/s, best loss=?] 17%|█▋        | 1/6 [00:04<00:23,  4.63s/trial, best loss: -0.9903846153846154] 33%|███▎      | 2/6 [00:08<00:17,  4.36s/trial, best loss: -0.9903846153846154] 50%|█████     | 3/6 [00:12<00:11,  3.89s/trial, best loss: -0.9903846153846154] 67%|██████▋   | 4/6 [00:16<00:07,  3.95s/trial, best loss: -0.9903846153846154] 83%|████████▎ | 5/6 [00:19<00:03,  3.78s/trial, best loss: -0.9903846153846154]100%|██████████| 6/6 [00:23<00:00,  3.71s/trial, best loss: -0.9903846153846154]100%|██████████| 6/6 [00:23<00:00,  3.87s/trial, best loss: -0.9903846153846154]


In [0]:
print(f"Best Param Value: {argmin}")

Best Param Value: {'MaxBins': 2, 'MaxDepth': 7}


In [0]:
trialRuns=Trials()

argmin=fmin(
    fn=objective,
    space=searchSpace,
    algo=algo,
    max_evals=3,
    trials=trialRuns
)

  0%|          | 0/3 [00:00<?, ?trial/s, best loss=?] 33%|███▎      | 1/3 [00:03<00:07,  3.85s/trial, best loss: -0.9711538461538461] 67%|██████▋   | 2/3 [00:06<00:03,  3.42s/trial, best loss: -0.9903846153846154]100%|██████████| 3/3 [00:10<00:00,  3.49s/trial, best loss: -0.9903846153846154]100%|██████████| 3/3 [00:10<00:00,  3.51s/trial, best loss: -0.9903846153846154]


In [0]:
print("Trials:")

for trial in trialRuns.trials:
    print(f"\n{trial}")

Trials:

{'state': 2, 'tid': 0, 'spec': None, 'result': {'loss': -0.9711538461538461, 'status': 'ok'}, 'misc': {'tid': 0, 'cmd': ('domain_attachment', 'FMinIter_Domain'), 'workdir': None, 'idxs': {'MaxBins': [0], 'MaxDepth': [0]}, 'vals': {'MaxBins': [1], 'MaxDepth': [8]}}, 'exp_key': None, 'owner': None, 'version': 0, 'book_time': datetime.datetime(2024, 6, 29, 5, 53, 36, 884000), 'refresh_time': datetime.datetime(2024, 6, 29, 5, 53, 40, 735000)}

{'state': 2, 'tid': 1, 'spec': None, 'result': {'loss': -0.9903846153846154, 'status': 'ok'}, 'misc': {'tid': 1, 'cmd': ('domain_attachment', 'FMinIter_Domain'), 'workdir': None, 'idxs': {'MaxBins': [1], 'MaxDepth': [1]}, 'vals': {'MaxBins': [2], 'MaxDepth': [9]}}, 'exp_key': None, 'owner': None, 'version': 0, 'book_time': datetime.datetime(2024, 6, 29, 5, 53, 40, 738000), 'refresh_time': datetime.datetime(2024, 6, 29, 5, 53, 43, 846000)}

{'state': 2, 'tid': 2, 'spec': None, 'result': {'loss': -0.9807692307692307, 'status': 'ok'}, 'misc': {

In [0]:
for trial in trialRuns.trials:
    print(f"\n{trial['result']}")


{'loss': -0.9711538461538461, 'status': 'ok'}

{'loss': -0.9903846153846154, 'status': 'ok'}

{'loss': -0.9807692307692307, 'status': 'ok'}


In [0]:
for trial in trialRuns.trials:
    print(f"\n{trial['misc']['vals']}")


{'MaxBins': [1], 'MaxDepth': [8]}

{'MaxBins': [2], 'MaxDepth': [9]}

{'MaxBins': [1], 'MaxDepth': [2]}
