### Ingest data

In [0]:
 %sh
 rm -r /dbfs/ml_lab
 mkdir /dbfs/ml_lab
 wget -O /dbfs/ml_lab/penguins.csv https://raw.githubusercontent.com/MicrosoftLearning/mslearn-databricks/main/data/penguins.csv

--2024-12-20 05:09:33--  https://raw.githubusercontent.com/MicrosoftLearning/mslearn-databricks/main/data/penguins.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9533 (9.3K) [text/plain]
Saving to: ‘/dbfs/ml_lab/penguins.csv’

     0K .........                                             100% 1.61M=0.006s

2024-12-20 05:09:35 (1.61 MB/s) - ‘/dbfs/ml_lab/penguins.csv’ saved [9533/9533]



### Explore and clean up the data

The data itself consists of measurements of the following details of penguins that have been observed in Antarctica:

- Island: The island in Antarctica where the penguin was observed.
- CulmenLength: The length in mm of the penguin’s culmen (bill).
- CulmenDepth: The depth in mm of the penguin’s culmen.
- FlipperLength: The length in mm of the penguin’s flipper.
- BodyMass: The body mass of the penguin in grams.
- Species: An integer value that represents the species of the penguin:
  - 0: Adelie
  - 1: Gentoo
  - 2: Chinstrap

In [0]:
df = spark.read.format("csv").option("header", "true").load("/ml_lab/penguins.csv")
display(df.limit(10))

Island,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
Torgersen,39.1,18.7,181.0,3750.0,0
Torgersen,39.5,17.4,186.0,3800.0,0
Torgersen,40.3,18.0,195.0,3250.0,0
Torgersen,,,,,0
Torgersen,36.7,19.3,193.0,3450.0,0
Torgersen,39.3,20.6,190.0,3650.0,0
Torgersen,38.9,17.8,181.0,3625.0,0
Torgersen,39.2,19.6,195.0,4675.0,0
Torgersen,34.1,18.1,193.0,3475.0,0
Torgersen,42.0,20.2,190.0,4250.0,0


### Remove the rows with incomplete data

In [0]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
   
data = df.dropna().select(col("Island").astype("string"),
                           col("CulmenLength").astype("float"),
                          col("CulmenDepth").astype("float"),
                          col("FlipperLength").astype("float"),
                          col("BodyMass").astype("float"),
                          col("Species").astype("int")
                          )
display(data.limit(10))

Island,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
Torgersen,39.1,18.7,181.0,3750.0,0
Torgersen,39.5,17.4,186.0,3800.0,0
Torgersen,40.3,18.0,195.0,3250.0,0
Torgersen,36.7,19.3,193.0,3450.0,0
Torgersen,39.3,20.6,190.0,3650.0,0
Torgersen,38.9,17.8,181.0,3625.0,0
Torgersen,39.2,19.6,195.0,4675.0,0
Torgersen,34.1,18.1,193.0,3475.0,0
Torgersen,42.0,20.2,190.0,4250.0,0
Torgersen,37.8,17.1,186.0,3300.0,0


### Split the data

We’ll use 70% of the data for training, and hold back 30% for testing.

In [0]:
splits = data.randomSplit([0.7, 0.3])
train = splits[0]
test = splits[1]
print ("Training Rows:", train.count(), " Testing Rows:", test.count())

Training Rows: 232  Testing Rows: 110


### Perform feature engineering
**Encode categorical features**

In [0]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="Island", outputCol="IslandIdx")
indexedData = indexer.fit(train).transform(train).drop("Island")
display(indexedData.limit(10))

CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species,IslandIdx
34.5,18.1,187.0,2900.0,0,0.0
35.0,17.9,190.0,3450.0,0,0.0
35.0,17.9,192.0,3725.0,0,0.0
35.5,16.2,195.0,3350.0,0,0.0
35.9,19.2,189.0,3800.0,0,0.0
36.4,17.1,184.0,2850.0,0,0.0
37.6,19.1,194.0,3750.0,0,0.0
37.7,16.0,183.0,3075.0,0,0.0
37.7,18.7,180.0,3600.0,0,0.0
37.8,18.3,174.0,3400.0,0,0.0


**Normalize (scale) numeric features**

In [0]:
from pyspark.ml.feature import VectorAssembler, MinMaxScaler

# Create a vector column containing all numeric features
numericFeatures = ["CulmenLength", "CulmenDepth", "FlipperLength", "BodyMass"]
numericColVector = VectorAssembler(inputCols=numericFeatures, outputCol="numericFeatures")
vectorizedData = numericColVector.transform(indexedData)
   
# Use a MinMax scaler to normalize the numeric values in the vector
minMax = MinMaxScaler(inputCol = numericColVector.getOutputCol(), outputCol="normalizedFeatures")
scaledData = minMax.fit(vectorizedData).transform(vectorizedData)
   
# Display the data with numeric feature vectors (before and after scaling)
compareNumerics = scaledData.select("numericFeatures", "normalizedFeatures")
display(compareNumerics.limit(10))

numericFeatures,normalizedFeatures
"Map(vectorType -> dense, length -> 4, values -> List(34.5, 18.100000381469727, 187.0, 2900.0))","Map(vectorType -> dense, length -> 4, values -> List(0.10084038731674229, 0.5952381222696814, 0.2542372881355932, 0.05555555555555555))"
"Map(vectorType -> dense, length -> 4, values -> List(35.0, 17.899999618530273, 190.0, 3450.0))","Map(vectorType -> dense, length -> 4, values -> List(0.12184878798428217, 0.5714285065527647, 0.3050847457627119, 0.20833333333333334))"
"Map(vectorType -> dense, length -> 4, values -> List(35.0, 17.899999618530273, 192.0, 3725.0))","Map(vectorType -> dense, length -> 4, values -> List(0.12184878798428217, 0.5714285065527647, 0.3389830508474576, 0.2847222222222222))"
"Map(vectorType -> dense, length -> 4, values -> List(35.5, 16.200000762939453, 195.0, 3350.0))","Map(vectorType -> dense, length -> 4, values -> List(0.14285718865182206, 0.3690476812202672, 0.3898305084745763, 0.18055555555555555))"
"Map(vectorType -> dense, length -> 4, values -> List(35.900001525878906, 19.200000762939453, 189.0, 3800.0))","Map(vectorType -> dense, length -> 4, values -> List(0.15966397329840482, 0.726190554582076, 0.288135593220339, 0.3055555555555556))"
"Map(vectorType -> dense, length -> 4, values -> List(36.400001525878906, 17.100000381469727, 184.0, 2850.0))","Map(vectorType -> dense, length -> 4, values -> List(0.1806723739659447, 0.47619049781574513, 0.2033898305084746, 0.041666666666666664))"
"Map(vectorType -> dense, length -> 4, values -> List(37.599998474121094, 19.100000381469727, 194.0, 3750.0))","Map(vectorType -> dense, length -> 4, values -> List(0.23109240734293868, 0.7142857467236177, 0.3728813559322034, 0.2916666666666667))"
"Map(vectorType -> dense, length -> 4, values -> List(37.70000076293945, 16.0, 183.0, 3075.0))","Map(vectorType -> dense, length -> 4, values -> List(0.23529418364527296, 0.34523806550335046, 0.1864406779661017, 0.10416666666666667))"
"Map(vectorType -> dense, length -> 4, values -> List(37.70000076293945, 18.700000762939453, 180.0, 3600.0))","Map(vectorType -> dense, length -> 4, values -> List(0.23529418364527296, 0.6666667423551079, 0.13559322033898305, 0.25))"
"Map(vectorType -> dense, length -> 4, values -> List(37.79999923706055, 18.299999237060547, 174.0, 3400.0))","Map(vectorType -> dense, length -> 4, values -> List(0.23949579966623008, 0.6190475109212744, 0.03389830508474576, 0.19444444444444445))"


The **numericFeatures** column in the results contains a vector for each row. The vector includes four unscaled numeric values (the original measurements of the penguin). You can use the ▸ toggle to see the discrete values more clearly.

The **normalizedFeatures** column also contains a vector for each penguin observation, but this time the values in the vector are normalized to a relative scale based on the minimum and maximum values for each measurement.

### Prepare features and labels for training

In [0]:
featVect = VectorAssembler(inputCols=["IslandIdx", "normalizedFeatures"], outputCol="featuresVector")
preparedData = featVect.transform(scaledData)[col("featuresVector").alias("features"), col("Species").alias("label")]
display(preparedData.limit(10))

features,label
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.10084038731674229, 0.5952381222696814, 0.2542372881355932, 0.05555555555555555))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.12184878798428217, 0.5714285065527647, 0.3050847457627119, 0.20833333333333334))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.12184878798428217, 0.5714285065527647, 0.3389830508474576, 0.2847222222222222))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.14285718865182206, 0.3690476812202672, 0.3898305084745763, 0.18055555555555555))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.15966397329840482, 0.726190554582076, 0.288135593220339, 0.3055555555555556))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.1806723739659447, 0.47619049781574513, 0.2033898305084746, 0.041666666666666664))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.23109240734293868, 0.7142857467236177, 0.3728813559322034, 0.2916666666666667))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.23529418364527296, 0.34523806550335046, 0.1864406779661017, 0.10416666666666667))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.23529418364527296, 0.6666667423551079, 0.13559322033898305, 0.25))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.23949579966623008, 0.6190475109212744, 0.03389830508474576, 0.19444444444444445))",0


The **features** vector contains five values (the encoded island and the normalized culmen length, culmen depth, flipper length, and body mass). The label contains a simple integer code that indicates the class of penguin species.

### Train a machine learning model

Models are trained using an algorithm that tries to establish a relationship between the features and labels.<br>
Since in this case you want to train a model that predicts a category of class , you need to use a classification algorithm.<br>

To train the model, you will fit the logistic regression algorithm to the training data.

In [0]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10, regParam=0.3)
model = lr.fit(preparedData)
print ("Model trained!")

Model trained!


### Test the model

Now that you have a trained model, you can test it with the data you held back. Before you can do this, you need to perform the same feature engineering transformations to the test data as you applied to the training data

In [0]:
# Prepare the test data
indexedTestData = indexer.fit(test).transform(test).drop("Island")
vectorizedTestData = numericColVector.transform(indexedTestData)
scaledTestData = minMax.fit(vectorizedTestData).transform(vectorizedTestData)
preppedTestData = featVect.transform(scaledTestData)[col("featuresVector").alias("features"), col("Species").alias("label")]
   
# Get predictions
prediction = model.transform(preppedTestData)
predicted = prediction.select("features", "probability", col("prediction").astype("Int"), col("label").alias("trueLabel"))
display(predicted)

features,probability,prediction,trueLabel
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.08301889671469634, 0.7215189109378106, 0.2037037037037037, 0.296875))","Map(vectorType -> dense, length -> 3, values -> List(0.8725526455576519, 0.05256123761421712, 0.07488611682813091))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.09811329391767394, 0.4683543723127361, 0.16666666666666666, 0.09375))","Map(vectorType -> dense, length -> 3, values -> List(0.8509166775972573, 0.06307135535758125, 0.08601196704516141))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.1283019443727889, 0.43037978809358224, 0.09259259259259259, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(0.8525456273739734, 0.05153240974231355, 0.09592196288371302))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.16981132075471697, 0.48101264753129347, 0.16666666666666666, 0.234375))","Map(vectorType -> dense, length -> 3, values -> List(0.8027768072037349, 0.09423485687015784, 0.1029883359261072))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.17735851935620578, 0.8607594554689053, 0.25925925925925924, 0.4375))","Map(vectorType -> dense, length -> 3, values -> List(0.8301686404006263, 0.06894069815809299, 0.10089066144128057))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.18867924528301885, 0.48101264753129347, 0.09259259259259259, 0.1015625))","Map(vectorType -> dense, length -> 3, values -> List(0.8235891407374437, 0.06343954671837863, 0.11297131254417773))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.24528301886792453, 0.9493671405622887, 0.2777777777777778, 0.328125))","Map(vectorType -> dense, length -> 3, values -> List(0.8117819385227097, 0.05314979078383194, 0.1350682706934584))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.279245340599204, 0.5949366416252734, 0.2037037037037037, 0.109375))","Map(vectorType -> dense, length -> 3, values -> List(0.7635579192075426, 0.07506086829211074, 0.16138121250034662))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.279245340599204, 0.7215189109378106, 0.07407407407407407, 0.34375))","Map(vectorType -> dense, length -> 3, values -> List(0.7993068905557995, 0.06510997294883684, 0.13558313649536377))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.32075471698113206, 0.6075949168438307, 0.2962962962962963, 0.34375))","Map(vectorType -> dense, length -> 3, values -> List(0.6807784267188504, 0.15128180967862878, 0.16793976360252086))",0,0


**The results include the following columns:**

- **features**: The prepared features data from the test dataset.
- **probability**: The probability calculated by the model for each class. This consists of a vector containing three probability values (because there are three classes) which add up to a total of 1.0 (its assumed that there’s a 100% probability that the penguin belongs to one of the three species classes).
- **prediction**: The predicted class label (the one with the highest probability).
- **trueLabel**: The actual known label value from the test data.

Get evaluation metrics for a classification model based on the results from the test data:

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
   
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
   
# Simple accuracy
accuracy = evaluator.evaluate(prediction, {evaluator.metricName:"accuracy"})
print("Accuracy:", accuracy)
   
# Individual class metrics
labels = [0,1,2]
print("\nIndividual class metrics:")
for label in sorted(labels):
    print ("Class %s" % (label))
   
    # Precision
    precision = evaluator.evaluate(prediction, {evaluator.metricLabel:label,
                                                evaluator.metricName:"precisionByLabel"})
    print("\tPrecision:", precision)
   
    # Recall
    recall = evaluator.evaluate(prediction, {evaluator.metricLabel:label,
                                             evaluator.metricName:"recallByLabel"})
    print("\tRecall:", recall)
   
    # F1 score
    f1 = evaluator.evaluate(prediction, {evaluator.metricLabel:label,
                                         evaluator.metricName:"fMeasureByLabel"})
    print("\tF1 Score:", f1)
   
# Weighted (overall) metrics
overallPrecision = evaluator.evaluate(prediction, {evaluator.metricName:"weightedPrecision"})
print("Overall Precision:", overallPrecision)
overallRecall = evaluator.evaluate(prediction, {evaluator.metricName:"weightedRecall"})
print("Overall Recall:", overallRecall)
overallF1 = evaluator.evaluate(prediction, {evaluator.metricName:"weightedFMeasure"})
print("Overall F1 Score:", overallF1)

Accuracy: 0.8909090909090909

Individual class metrics:
Class 0
	Precision: 0.8095238095238095
	Recall: 1.0
	F1 Score: 0.8947368421052632
Class 1
	Precision: 1.0
	Recall: 1.0
	F1 Score: 1.0
Class 2
	Precision: 1.0
	Recall: 0.29411764705882354
	F1 Score: 0.45454545454545453
Overall Precision: 0.9116883116883117
Overall Recall: 0.8909090909090909
Overall F1 Score: 0.8668986515876469


The evaluation metrics that are calculated for multiclass classification include:

- **Accuracy**: The proportion of overall predictions that were correct.
- Per-class metrics:
  - **Precision**: The proportion of predictions of this class that were correct.
  - **Recall**: The proportion of actual instances of this class that were correctly predicted.
  - **F1 score**: A combined metric for precision and recall
- Combined (weighted) precision, recall, and F1 metrics for all classes.

### Use a pipeline

A more efficient way to build and use models is to encapsulate the transformers used to prepare the data and the model used to train it in a pipeline.

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler
from pyspark.ml.classification import LogisticRegression
   
catFeature = "Island"
numFeatures = ["CulmenLength", "CulmenDepth", "FlipperLength", "BodyMass"]
   
# Define the feature engineering and model training algorithm steps
catIndexer = StringIndexer(inputCol=catFeature, outputCol=catFeature + "Idx")
numVector = VectorAssembler(inputCols=numFeatures, outputCol="numericFeatures")
numScaler = MinMaxScaler(inputCol = numVector.getOutputCol(), outputCol="normalizedFeatures")
featureVector = VectorAssembler(inputCols=["IslandIdx", "normalizedFeatures"], outputCol="Features")
algo = LogisticRegression(labelCol="Species", featuresCol="Features", maxIter=10, regParam=0.3)
   
# Chain the steps as stages in a pipeline
pipeline = Pipeline(stages=[catIndexer, numVector, numScaler, featureVector, algo])
   
# Use the pipeline to prepare data and fit the model algorithm
model = pipeline.fit(train)
print ("Model trained!")

Model trained!


**Apply the pipeline to the test data**

In [0]:
prediction = model.transform(test)
predicted = prediction.select("Features", "probability", col("prediction").astype("Int"), col("Species").alias("trueLabel"))
display(predicted.limit(10))

Features,probability,prediction,trueLabel
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.13445379632853066, 0.6904761310067009, 0.2542372881355932, 0.3055555555555556))","Map(vectorType -> dense, length -> 3, values -> List(0.8343748232932918, 0.07321510217287593, 0.09241007453383218))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.15126058097511344, 0.4523808820988284, 0.22033898305084745, 0.125))","Map(vectorType -> dense, length -> 3, values -> List(0.8062933109314849, 0.0887624963245295, 0.10494419274398556))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.18487398998690183, 0.416666685588777, 0.15254237288135594, 0.041666666666666664))","Map(vectorType -> dense, length -> 3, values -> List(0.8060558551933897, 0.07543643104273086, 0.1185077137638796))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.23109240734293868, 0.46428568995728675, 0.22033898305084745, 0.25))","Map(vectorType -> dense, length -> 3, values -> List(0.7436768681300743, 0.12855197945523753, 0.12777115241468823))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.23949579966623008, 0.8214285633190956, 0.3050847457627119, 0.4305555555555556))","Map(vectorType -> dense, length -> 3, values -> List(0.7774307645824201, 0.09469051669940122, 0.12787871871817852))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.25210080801047857, 0.46428568995728675, 0.15254237288135594, 0.13194444444444445))","Map(vectorType -> dense, length -> 3, values -> List(0.7663484151314317, 0.09169398082984477, 0.14195760403872348))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.3151260100130982, 0.9047619912629805, 0.3220338983050847, 0.3333333333333333))","Map(vectorType -> dense, length -> 3, values -> List(0.7501739617873091, 0.07627757064917157, 0.17354846756351935))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.35294119532722085, 0.5714285065527647, 0.2542372881355932, 0.1388888888888889))","Map(vectorType -> dense, length -> 3, values -> List(0.6887886233281758, 0.10735127722674817, 0.2038600994450761))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.35294119532722085, 0.6904761310067009, 0.13559322033898305, 0.3472222222222222))","Map(vectorType -> dense, length -> 3, values -> List(0.729508897725432, 0.09366552729004732, 0.17682557498452067))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.3991596126832577, 0.583333314411223, 0.3389830508474576, 0.3472222222222222))","Map(vectorType -> dense, length -> 3, values -> List(0.593441799921908, 0.19682465306148222, 0.20973354701660973))",0,0


### Try a different algorithm

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler
from pyspark.ml.classification import DecisionTreeClassifier
   
catFeature = "Island"
numFeatures = ["CulmenLength", "CulmenDepth", "FlipperLength", "BodyMass"]
   
# Define the feature engineering and model steps
catIndexer = StringIndexer(inputCol=catFeature, outputCol=catFeature + "Idx")
numVector = VectorAssembler(inputCols=numFeatures, outputCol="numericFeatures")
numScaler = MinMaxScaler(inputCol = numVector.getOutputCol(), outputCol="normalizedFeatures")
featureVector = VectorAssembler(inputCols=["IslandIdx", "normalizedFeatures"], outputCol="Features")
algo = DecisionTreeClassifier(labelCol="Species", featuresCol="Features", maxDepth=10)
   
# Chain the steps as stages in a pipeline
pipeline = Pipeline(stages=[catIndexer, numVector, numScaler, featureVector, algo])
   
# Use the pipeline to prepare data and fit the model algorithm
model = pipeline.fit(train)
print ("Model trained!")

Model trained!


In [0]:
# Get predictions
prediction = model.transform(test)
predicted = prediction.select("Features", "probability", col("prediction").astype("Int"), col("Species").alias("trueLabel"))
   
# Generate evaluation metrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
   
evaluator = MulticlassClassificationEvaluator(labelCol="Species", predictionCol="prediction")
   
# Simple accuracy
accuracy = evaluator.evaluate(prediction, {evaluator.metricName:"accuracy"})
print("Accuracy:", accuracy)
   
# Class metrics
labels = [0,1,2]
print("\nIndividual class metrics:")
for label in sorted(labels):
    print ("Class %s" % (label))
   
    # Precision
    precision = evaluator.evaluate(prediction, {evaluator.metricLabel:label,
                                                    evaluator.metricName:"precisionByLabel"})
    print("\tPrecision:", precision)
   
    # Recall
    recall = evaluator.evaluate(prediction, {evaluator.metricLabel:label,
                                             evaluator.metricName:"recallByLabel"})
    print("\tRecall:", recall)
   
    # F1 score
    f1 = evaluator.evaluate(prediction, {evaluator.metricLabel:label,
                                         evaluator.metricName:"fMeasureByLabel"})
    print("\tF1 Score:", f1)
   
# Weighed (overall) metrics
overallPrecision = evaluator.evaluate(prediction, {evaluator.metricName:"weightedPrecision"})
print("Overall Precision:", overallPrecision)
overallRecall = evaluator.evaluate(prediction, {evaluator.metricName:"weightedRecall"})
print("Overall Recall:", overallRecall)
overallF1 = evaluator.evaluate(prediction, {evaluator.metricName:"weightedFMeasure"})
print("Overall F1 Score:", overallF1)

Accuracy: 0.9545454545454546

Individual class metrics:
Class 0
	Precision: 0.9423076923076923
	Recall: 0.9607843137254902
	F1 Score: 0.9514563106796117
Class 1
	Precision: 1.0
	Recall: 1.0
	F1 Score: 1.0
Class 2
	Precision: 0.875
	Recall: 0.8235294117647058
	F1 Score: 0.8484848484848485
Overall Precision: 0.9539335664335664
Overall Recall: 0.9545454545454545
Overall F1 Score: 0.9540774024445693


### Save the model

In [0]:
model.save("/models/newpenguin.model")

**Load the model and use it to predict the species for a new penguin observation.**

In [0]:
from pyspark.ml.pipeline import PipelineModel

persistedModel = PipelineModel.load("/models/newpenguin.model")
   
newData = spark.createDataFrame ([{"Island": "Biscoe",
                                  "CulmenLength": 47.6,
                                  "CulmenDepth": 14.5,
                                  "FlipperLength": 215,
                                  "BodyMass": 5400}])
   
   
predictions = persistedModel.transform(newData)
display(predictions.select("Island", "CulmenDepth", "CulmenLength", "FlipperLength", "BodyMass", col("prediction").alias("PredictedSpecies")))

Island,CulmenDepth,CulmenLength,FlipperLength,BodyMass,PredictedSpecies
Biscoe,14.5,47.6,215,5400,1.0
