In [None]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import col

from pyspark.ml.feature import StringIndexer,VectorAssembler,MinMaxScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.pipeline import PipelineModel

In [None]:
 %sh
 # Start from a clean lab folder. -f keeps rm from failing when the folder does
 # not exist yet (first run); -p keeps mkdir from failing when it already does.
 rm -rf /dbfs/ml_lab
 mkdir -p /dbfs/ml_lab
 # Download the penguins dataset into the lab folder.
 wget -O /dbfs/ml_lab/penguins.csv https://raw.githubusercontent.com/MicrosoftLearning/mslearn-databricks/main/data/penguins.csv

rm: cannot remove '/dbfs/ml_lab': No such file or directory
--2024-06-25 08:47:11--  https://raw.githubusercontent.com/MicrosoftLearning/mslearn-databricks/main/data/penguins.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9533 (9.3K) [text/plain]
Saving to: ‘/dbfs/ml_lab/penguins.csv’

     0K .........                                             100% 1.07M=0.009s

2024-06-25 08:47:11 (1.07 MB/s) - ‘/dbfs/ml_lab/penguins.csv’ saved [9533/9533]



In [None]:
# List the lab folder to confirm that penguins.csv was downloaded.
dbutils.fs().ls(path="./ml_lab")

[FileInfo(path='dbfs:/ml_lab/penguins.csv', name='penguins.csv', size=9533, modificationTime=1719305231000)]

In [None]:
# Load the downloaded CSV; the first line of the file holds the column names.
# No schema is supplied here; columns are cast to their real types in a later cell.
df=(
    spark.read
    .format("csv")
    .option("header", True)
    .load("dbfs:/ml_lab/penguins.csv")
)

In [None]:
# Preview the first ten rows of the raw data.
display(df.limit(10))

Island,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
Torgersen,39.1,18.7,181.0,3750.0,0
Torgersen,39.5,17.4,186.0,3800.0,0
Torgersen,40.3,18.0,195.0,3250.0,0
Torgersen,,,,,0
Torgersen,36.7,19.3,193.0,3450.0,0
Torgersen,39.3,20.6,190.0,3650.0,0
Torgersen,38.9,17.8,181.0,3625.0,0
Torgersen,39.2,19.6,195.0,4675.0,0
Torgersen,34.1,18.1,193.0,3475.0,0
Torgersen,42.0,20.2,190.0,4250.0,0


In [None]:
# Drop rows with missing values, then cast every column to the type the
# model-building steps expect (string island, float measurements, int species).
data=df.dropna().select(*[
    col(name).cast(dtype)
    for name, dtype in [
        ("Island", "String"),
        ("CulmenLength", "float"),
        ("CulmenDepth", "float"),
        ("FlipperLength", "float"),
        ("BodyMass", "float"),
        ("Species", "int"),
    ]
])

In [None]:
# Preview the cleaned, typed data.
display(data.limit(10))

Island,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
Torgersen,39.1,18.7,181.0,3750.0,0
Torgersen,39.5,17.4,186.0,3800.0,0
Torgersen,40.3,18.0,195.0,3250.0,0
Torgersen,36.7,19.3,193.0,3450.0,0
Torgersen,39.3,20.6,190.0,3650.0,0
Torgersen,38.9,17.8,181.0,3625.0,0
Torgersen,39.2,19.6,195.0,4675.0,0
Torgersen,34.1,18.1,193.0,3475.0,0
Torgersen,42.0,20.2,190.0,4250.0,0
Torgersen,37.8,17.1,186.0,3300.0,0


In [None]:
# Number of rows left after dropping incomplete records.
data.count()

342

#### Splitting the Data

In [None]:
# Split the cleaned data 70/30 into training and test sets.
# A fixed seed makes the split, and every metric computed from it, repeatable
# between runs instead of changing each time the notebook is executed.
splits=data.randomSplit(weights=[0.7,0.3], seed=42)
train=splits[0]
test=splits[1]

In [None]:
# randomSplit returns a Python list holding one DataFrame per weight.
type(splits)

list

In [None]:
# Preview the first ten training rows.
display(train.limit(10))

Island,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
Biscoe,34.5,18.1,187.0,2900.0,0
Biscoe,35.0,17.9,190.0,3450.0,0
Biscoe,35.0,17.9,192.0,3725.0,0
Biscoe,35.7,16.9,185.0,3150.0,0
Biscoe,35.9,19.2,189.0,3800.0,0
Biscoe,36.4,17.1,184.0,2850.0,0
Biscoe,36.5,16.6,181.0,2850.0,0
Biscoe,37.6,17.0,185.0,3600.0,0
Biscoe,37.7,16.0,183.0,3075.0,0
Biscoe,37.7,18.7,180.0,3600.0,0


#### String Indexer

In [None]:
# Encode the categorical Island column as a numeric index column (IslandIdx).
index=StringIndexer(inputCol="Island", outputCol="IslandIdx")

# Learn the mapping from the training rows, apply it to them, and drop the
# original string column now that its indexed form exists.
indexedData=index.fit(train).transform(train).drop("Island")

display(indexedData.limit(10))



CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species,IslandIdx
34.5,18.1,187.0,2900.0,0,0.0
35.0,17.9,190.0,3450.0,0,0.0
35.0,17.9,192.0,3725.0,0,0.0
35.7,16.9,185.0,3150.0,0,0.0
35.9,19.2,189.0,3800.0,0,0.0
36.4,17.1,184.0,2850.0,0,0.0
36.5,16.6,181.0,2850.0,0,0.0
37.6,17.0,185.0,3600.0,0,0.0
37.7,16.0,183.0,3075.0,0,0.0
37.7,18.7,180.0,3600.0,0,0.0


#### Vectorizing and Scaling

In [None]:
# Pack the four numeric measurements into a single vector column so they can be scaled together.
numericFeatures=["CulmenLength","CulmenDepth","FlipperLength","BodyMass"]
numericColVector=VectorAssembler(inputCols=numericFeatures, outputCol="numericFeatures")
vectorizedData=numericColVector.transform(indexedData)

display(vectorizedData.limit(10))

CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species,IslandIdx,numericFeatures
34.5,18.1,187.0,2900.0,0,0.0,"Map(vectorType -> dense, length -> 4, values -> List(34.5, 18.100000381469727, 187.0, 2900.0))"
35.0,17.9,190.0,3450.0,0,0.0,"Map(vectorType -> dense, length -> 4, values -> List(35.0, 17.899999618530273, 190.0, 3450.0))"
35.0,17.9,192.0,3725.0,0,0.0,"Map(vectorType -> dense, length -> 4, values -> List(35.0, 17.899999618530273, 192.0, 3725.0))"
35.7,16.9,185.0,3150.0,0,0.0,"Map(vectorType -> dense, length -> 4, values -> List(35.70000076293945, 16.899999618530273, 185.0, 3150.0))"
35.9,19.2,189.0,3800.0,0,0.0,"Map(vectorType -> dense, length -> 4, values -> List(35.900001525878906, 19.200000762939453, 189.0, 3800.0))"
36.4,17.1,184.0,2850.0,0,0.0,"Map(vectorType -> dense, length -> 4, values -> List(36.400001525878906, 17.100000381469727, 184.0, 2850.0))"
36.5,16.6,181.0,2850.0,0,0.0,"Map(vectorType -> dense, length -> 4, values -> List(36.5, 16.600000381469727, 181.0, 2850.0))"
37.6,17.0,185.0,3600.0,0,0.0,"Map(vectorType -> dense, length -> 4, values -> List(37.599998474121094, 17.0, 185.0, 3600.0))"
37.7,16.0,183.0,3075.0,0,0.0,"Map(vectorType -> dense, length -> 4, values -> List(37.70000076293945, 16.0, 183.0, 3075.0))"
37.7,18.7,180.0,3600.0,0,0.0,"Map(vectorType -> dense, length -> 4, values -> List(37.70000076293945, 18.700000762939453, 180.0, 3600.0))"


In [None]:
# Rescale the numeric vector column; the scaler reads the assembler's output column.
minMax=MinMaxScaler(inputCol=numericColVector.getOutputCol(), outputCol="normalizedFeatures")

# Fit the scaler on the training vectors and apply it to the same rows.
scaledData=minMax.fit(vectorizedData).transform(vectorizedData)

In [None]:
# Show the raw and the scaled numeric vectors side by side.
compareNumerics=scaledData.select('numericFeatures','normalizedFeatures')
display(compareNumerics.limit(10))

numericFeatures,normalizedFeatures
"Map(vectorType -> dense, length -> 4, values -> List(34.5, 18.100000381469727, 187.0, 2900.0))","Map(vectorType -> dense, length -> 4, values -> List(0.08727278275923295, 0.5903615011568758, 0.22807017543859648, 0.05555555555555555))"
"Map(vectorType -> dense, length -> 4, values -> List(35.0, 17.899999618530273, 190.0, 3450.0))","Map(vectorType -> dense, length -> 4, values -> List(0.10545460094105114, 0.5662650242480307, 0.2807017543859649, 0.20833333333333334))"
"Map(vectorType -> dense, length -> 4, values -> List(35.0, 17.899999618530273, 192.0, 3725.0))","Map(vectorType -> dense, length -> 4, values -> List(0.10545460094105114, 0.5662650242480307, 0.3157894736842105, 0.2847222222222222))"
"Map(vectorType -> dense, length -> 4, values -> List(35.70000076293945, 16.899999618530273, 185.0, 3150.0))","Map(vectorType -> dense, length -> 4, values -> List(0.13090917413884942, 0.44578309930587445, 0.19298245614035087, 0.125))"
"Map(vectorType -> dense, length -> 4, values -> List(35.900001525878906, 19.200000762939453, 189.0, 3800.0))","Map(vectorType -> dense, length -> 4, values -> List(0.13818192915482955, 0.7228916645534547, 0.2631578947368421, 0.3055555555555556))"
"Map(vectorType -> dense, length -> 4, values -> List(36.400001525878906, 17.100000381469727, 184.0, 2850.0))","Map(vectorType -> dense, length -> 4, values -> List(0.1563637473366477, 0.46987957621471965, 0.17543859649122806, 0.041666666666666664))"
"Map(vectorType -> dense, length -> 4, values -> List(36.5, 16.600000381469727, 181.0, 2850.0))","Map(vectorType -> dense, length -> 4, values -> List(0.16000005548650567, 0.4096386137436415, 0.12280701754385964, 0.041666666666666664))"
"Map(vectorType -> dense, length -> 4, values -> List(37.599998474121094, 17.0, 185.0, 3600.0))","Map(vectorType -> dense, length -> 4, values -> List(0.19999999999999998, 0.45783133776029705, 0.19298245614035087, 0.25))"
"Map(vectorType -> dense, length -> 4, values -> List(37.70000076293945, 16.0, 183.0, 3075.0))","Map(vectorType -> dense, length -> 4, values -> List(0.20363644686612214, 0.33734941281814085, 0.15789473684210525, 0.10416666666666667))"
"Map(vectorType -> dense, length -> 4, values -> List(37.70000076293945, 18.700000762939453, 180.0, 3600.0))","Map(vectorType -> dense, length -> 4, values -> List(0.20363644686612214, 0.6626507020823765, 0.10526315789473684, 0.25))"


### Assembling the Indexed Island Column and the Normalized Numeric Features

In [None]:
# Combine the island index and the scaled numeric vector into one feature vector.
featVect=VectorAssembler(inputCols=["IslandIdx","normalizedFeatures"], outputCol="featuresVector")

# Keep only what the classifier needs, under the column names it is configured with.
preppedData=featVect.transform(scaledData).select(
    col("featuresVector").alias("features"),
    col("Species").alias("label"),
)

In [None]:
# Preview the features/label pairs that will be used for training.
display(preppedData.limit(10))

features,label
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.08727278275923295, 0.5903615011568758, 0.22807017543859648, 0.05555555555555555))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.10545460094105114, 0.5662650242480307, 0.2807017543859649, 0.20833333333333334))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.10545460094105114, 0.5662650242480307, 0.3157894736842105, 0.2847222222222222))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.13090917413884942, 0.44578309930587445, 0.19298245614035087, 0.125))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.13818192915482955, 0.7228916645534547, 0.2631578947368421, 0.3055555555555556))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.1563637473366477, 0.46987957621471965, 0.17543859649122806, 0.041666666666666664))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.16000005548650567, 0.4096386137436415, 0.12280701754385964, 0.041666666666666664))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.19999999999999998, 0.45783133776029705, 0.19298245614035087, 0.25))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.20363644686612214, 0.33734941281814085, 0.15789473684210525, 0.10416666666666667))",0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.20363644686612214, 0.6626507020823765, 0.10526315789473684, 0.25))",0


In [None]:
# Logistic regression classifier reading the "features" and "label" columns,
# limited to 10 iterations with a regularization parameter of 0.3.
lr=LogisticRegression(labelCol="label", featuresCol="features", maxIter=10, regParam=0.3)

# Train on the prepared training data.
model=lr.fit(preppedData)
print("Model Trained")

Model Trained


### Test

In [None]:
# Prepare the test split with the transformations learned from the TRAINING split.
# Fitting the indexer or the scaler on the test rows would give the test set its
# own island-to-index mapping and its own min/max ranges, so its features would
# not be comparable with the ones the model was trained on (and would leak test
# statistics into preprocessing).
# NOTE(review): with StringIndexer's default settings an island value that is
# absent from the training split raises an error here - confirm this is acceptable.
indexedTestData=index.fit(dataset=train).transform(dataset=test).drop("Island")
vectorizedTestData=numericColVector.transform(dataset=indexedTestData)
# Scale with the min/max of the training vectors (vectorizedData), not of the test vectors.
scaledTestData=minMax.fit(dataset=vectorizedData).transform(dataset=vectorizedTestData)
preppedTestData=featVect.transform(dataset=scaledTestData)\
  .select(
    col("featuresVector").alias("features"),
    col("Species").alias("label")
    )
   

In [None]:
# Score the prepared test rows with the trained classifier.
prediction=model.transform(dataset=preppedTestData)

In [None]:
# Preview the raw prediction output for the test rows.
display(prediction.limit(10))

features,label,rawPrediction,probability,prediction
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.0, 0.5853658905347008, 0.20754716981132074, 0.05555555555555555))",0,"Map(vectorType -> dense, length -> 3, values -> List(1.8988715333094563, -1.161127821925246, -0.7377437113842138))","Map(vectorType -> dense, length -> 3, values -> List(0.8940617756037744, 0.04192052307124133, 0.0640177013249843))",0.0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.02732240551067438, 0.5609755530235373, 0.3018867924528302, 0.2847222222222222))",0,"Map(vectorType -> dense, length -> 3, values -> List(1.595420829602848, -0.6416654036572383, -0.9537554259456141))","Map(vectorType -> dense, length -> 3, values -> List(0.8439422758098211, 0.09010700111338511, 0.06595072307679374))",0.0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.07650281881125273, 0.7195122816391056, 0.24528301886792453, 0.3055555555555556))",0,"Map(vectorType -> dense, length -> 3, values -> List(1.6738833043037042, -0.9288841468754764, -0.7449991574282337))","Map(vectorType -> dense, length -> 3, values -> List(0.859779165924335, 0.06368239312336502, 0.07653844095230002))",0.0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.10928962204269752, 0.4024390570114661, 0.09433962264150943, 0.041666666666666664))",0,"Map(vectorType -> dense, length -> 3, values -> List(1.5418720546282072, -0.9520134678556232, -0.589858586772587))","Map(vectorType -> dense, length -> 3, values -> List(0.8324867982795087, 0.06875378855772167, 0.09875941316276968))",0.0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.16939883078481668, 0.4512194994302959, 0.16981132075471697, 0.25))",0,"Map(vectorType -> dense, length -> 3, values -> List(1.255952885155436, -0.591645860915911, -0.6643070242395299))","Map(vectorType -> dense, length -> 3, values -> List(0.7667629433397387, 0.12085348420566365, 0.11238357245459746))",0.0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.19672123629549104, 0.3902438882558843, 0.4150943396226415, 0.3125))",0,"Map(vectorType -> dense, length -> 3, values -> List(0.8485323503549025, -0.03689950458395129, -0.8116328457709561))","Map(vectorType -> dense, length -> 3, values -> List(0.6239690659944775, 0.2574096895711211, 0.11862124443440145))",0.0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.20218584246967264, 0.8170731664767653, 0.2641509433962264, 0.3333333333333333))",0,"Map(vectorType -> dense, length -> 3, values -> List(1.4103292296689123, -0.9445779168950867, -0.465751312773833))","Map(vectorType -> dense, length -> 3, values -> List(0.8012231999482174, 0.07603793901610158, 0.12273886103568109))",0.0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.23497264570111742, 0.4756098369414594, 0.07547169811320754, 0.3055555555555556))",0,"Map(vectorType -> dense, length -> 3, values -> List(1.1737353364659078, -0.6413153054048361, -0.5324200310610775))","Map(vectorType -> dense, length -> 3, values -> List(0.7438305822782277, 0.12111768243351854, 0.13505173528825373))",0.0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.2786884528275142, 0.5365854481158709, 0.18867924528301885, 0.2222222222222222))",0,"Map(vectorType -> dense, length -> 3, values -> List(1.0505824960828787, -0.6769659419072593, -0.3736165541756257))","Map(vectorType -> dense, length -> 3, values -> List(0.7050094306102545, 0.12529397065679002, 0.1696965987329555))",0.0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.2841530590016958, 0.6829267753723604, 0.1509433962264151, 0.2361111111111111))",0,"Map(vectorType -> dense, length -> 3, values -> List(1.216569832087365, -0.9565199018187437, -0.2600499302686286))","Map(vectorType -> dense, length -> 3, values -> List(0.7450266623984118, 0.08480294511184581, 0.17017039248974228))",0.0


In [None]:
# Keep the columns needed to compare each predicted class with the true label;
# the prediction is cast to an integer for readability.
predicted=prediction.select(
    "features",
    "probability",
    col("prediction").cast("Int"),
    col("label").alias("trueLabel"),
)

In [None]:
# Preview predicted versus true labels.
display(predicted.limit(10))

features,probability,prediction,trueLabel
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.08333338910375374, 0.5662650242480307, 0.2962962962962963, 0.2777777777777778))","Map(vectorType -> dense, length -> 3, values -> List(0.7795338225317141, 0.12084252410048975, 0.0996236533677962))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.09649124861717243, 0.6867469491901869, 0.2037037037037037, 0.30158730158730157))","Map(vectorType -> dense, length -> 3, values -> List(0.8181973804688984, 0.08049349121581105, 0.10130912831529064))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.10526321072987195, 0.36144588972698605, 0.35185185185185186, 0.15873015873015872))","Map(vectorType -> dense, length -> 3, values -> List(0.7066087430848458, 0.18236395941253772, 0.11102729750261643))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.1491228539821084, 0.4096386137436415, 0.09259259259259259, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(0.786772468645995, 0.08040374596525818, 0.1328237853887468))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.19736839463506403, 0.45783133776029705, 0.16666666666666666, 0.23809523809523808))","Map(vectorType -> dense, length -> 3, values -> List(0.7136206719439336, 0.14218101511088402, 0.14419831294518234))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.22368428097316262, 0.5903615011568758, 0.16666666666666666, 0.3492063492063492))","Map(vectorType -> dense, length -> 3, values -> List(0.7135263452112417, 0.13473532653737852, 0.15173832825137987))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.25, 0.48192781466914225, 0.07407407407407407, 0.30158730158730157))","Map(vectorType -> dense, length -> 3, values -> List(0.6964720786502637, 0.1391327965512316, 0.1643951247985048))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.2850876811395369, 0.9036145519666889, 0.2777777777777778, 0.3333333333333333))","Map(vectorType -> dense, length -> 3, values -> List(0.7175980205870682, 0.0889081365152727, 0.19349384289765917))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.2894737458515173, 0.6867469491901869, 0.14814814814814814, 0.2222222222222222))","Map(vectorType -> dense, length -> 3, values -> List(0.7083988911805076, 0.0937516135185035, 0.1978494953009889))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.3245614269910542, 0.6867469491901869, 0.07407407407407407, 0.3492063492063492))","Map(vectorType -> dense, length -> 3, values -> List(0.6837211213066857, 0.10880617949178766, 0.20747269920152672))",0,0


In [None]:
# Evaluator that compares the "label" column with the model's "prediction" column.
# Each metric below is selected by passing a param map to evaluate(), so the
# evaluator object itself is never reconfigured.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
)

# Simple accuracy
accuracy = evaluator.evaluate(prediction, {evaluator.metricName: "accuracy"})
print("Accuracy:", accuracy)

# Individual class metrics
labels = [0,1,2]
print("\nIndividual class metrics:")
for label in sorted(labels):
    print(f"Class {label}")

    # Precision
    precision = evaluator.evaluate(
        prediction,
        {evaluator.metricName: "precisionByLabel", evaluator.metricLabel: label},
    )
    print("\tPrecision:", precision)

    # Recall
    recall = evaluator.evaluate(
        prediction,
        {evaluator.metricName: "recallByLabel", evaluator.metricLabel: label},
    )
    print("\tRecall:", recall)

    # F1 score
    f1 = evaluator.evaluate(
        prediction,
        {evaluator.metricName: "fMeasureByLabel", evaluator.metricLabel: label},
    )
    print("\tF1 Score:", f1)

# Weighted (overall) metrics
overallPrecision = evaluator.evaluate(prediction, {evaluator.metricName: "weightedPrecision"})
print("Overall Precision:", overallPrecision)
overallRecall = evaluator.evaluate(prediction, {evaluator.metricName: "weightedRecall"})
print("Overall Recall:", overallRecall)
overallF1 = evaluator.evaluate(prediction, {evaluator.metricName: "weightedFMeasure"})
print("Overall F1 Score:", overallF1)

Accuracy: 0.9714285714285714

Individual class metrics:
Class 0
	Precision: 0.9591836734693877
	Recall: 0.9791666666666666
	F1 Score: 0.9690721649484536
Class 1
	Precision: 1.0
	Recall: 1.0
	F1 Score: 1.0
Class 2
	Precision: 0.9230769230769231
	Recall: 0.8571428571428571
	F1 Score: 0.888888888888889
Overall Precision: 0.9710846976153098
Overall Recall: 0.9714285714285715
Overall F1 Score: 0.9710467463044783


In [None]:
# Preview the first ten rows of the held-out test split.
display(test.limit(10))

Island,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species
Biscoe,34.5,18.1,187.0,2900.0,0
Biscoe,35.0,17.9,192.0,3725.0,0
Biscoe,35.9,19.2,189.0,3800.0,0
Biscoe,36.5,16.6,181.0,2850.0,0
Biscoe,37.6,17.0,185.0,3600.0,0
Biscoe,38.1,16.5,198.0,3825.0,0
Biscoe,38.2,20.0,190.0,3900.0,0
Biscoe,38.8,17.2,180.0,3800.0,0
Biscoe,39.6,17.7,186.0,3500.0,0
Biscoe,39.7,18.9,184.0,3550.0,0


In [None]:
# Score the prepared test rows and keep the columns needed to compare
# each predicted class with the true label.
testPrediction=model.transform(dataset=preppedTestData)

predicted=testPrediction.select(
    "features",
    "probability",
    col("prediction").cast("Int"),
    col("label").alias("trueLabel"),
)


In [None]:
# Preview predicted versus true labels for the test rows.
display(predicted.limit(10))

features,probability,prediction,trueLabel
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.08333338910375374, 0.5662650242480307, 0.2962962962962963, 0.2777777777777778))","Map(vectorType -> dense, length -> 3, values -> List(0.7795338225317141, 0.12084252410048975, 0.0996236533677962))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.09649124861717243, 0.6867469491901869, 0.2037037037037037, 0.30158730158730157))","Map(vectorType -> dense, length -> 3, values -> List(0.8181973804688984, 0.08049349121581105, 0.10130912831529064))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.10526321072987195, 0.36144588972698605, 0.35185185185185186, 0.15873015873015872))","Map(vectorType -> dense, length -> 3, values -> List(0.7066087430848458, 0.18236395941253772, 0.11102729750261643))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.1491228539821084, 0.4096386137436415, 0.09259259259259259, 0.0))","Map(vectorType -> dense, length -> 3, values -> List(0.786772468645995, 0.08040374596525818, 0.1328237853887468))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.19736839463506403, 0.45783133776029705, 0.16666666666666666, 0.23809523809523808))","Map(vectorType -> dense, length -> 3, values -> List(0.7136206719439336, 0.14218101511088402, 0.14419831294518234))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.22368428097316262, 0.5903615011568758, 0.16666666666666666, 0.3492063492063492))","Map(vectorType -> dense, length -> 3, values -> List(0.7135263452112417, 0.13473532653737852, 0.15173832825137987))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.25, 0.48192781466914225, 0.07407407407407407, 0.30158730158730157))","Map(vectorType -> dense, length -> 3, values -> List(0.6964720786502637, 0.1391327965512316, 0.1643951247985048))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.2850876811395369, 0.9036145519666889, 0.2777777777777778, 0.3333333333333333))","Map(vectorType -> dense, length -> 3, values -> List(0.7175980205870682, 0.0889081365152727, 0.19349384289765917))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.2894737458515173, 0.6867469491901869, 0.14814814814814814, 0.2222222222222222))","Map(vectorType -> dense, length -> 3, values -> List(0.7083988911805076, 0.0937516135185035, 0.1978494953009889))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.3245614269910542, 0.6867469491901869, 0.07407407407407407, 0.3492063492063492))","Map(vectorType -> dense, length -> 3, values -> List(0.6837211213066857, 0.10880617949178766, 0.20747269920152672))",0,0


#### Pipeline

In [None]:
# Column roles for the pipeline: one categorical feature and four numeric features.
catFeature="Island"
numFeatures=["CulmenLength", "CulmenDepth", "FlipperLength", "BodyMass"]

In [None]:
# Feature-preparation stages; each stage reads the column the previous one produces.
# 1. Index the categorical column.
catIndexer=StringIndexer(inputCol=catFeature, outputCol=f"{catFeature}Idx")
# 2. Assemble the numeric columns into one vector.
numVector=VectorAssembler(inputCols=numFeatures, outputCol="numericFeatures")
# 3. Rescale that numeric vector.
numScaler=MinMaxScaler(inputCol=numVector.getOutputCol(), outputCol="normalizedFeatures")
# 4. Combine the indexed category and the scaled numerics into the final feature vector.
featureVector=VectorAssembler(
    inputCols=[f"{catFeature}Idx", "normalizedFeatures"],
    outputCol="Features",
)

In [None]:
# Classifier stage: logistic regression on the assembled "Features" column,
# predicting the "Species" label.
algo=LogisticRegression(
    labelCol="Species",
    featuresCol="Features",
    maxIter=10,
    regParam=0.3,
)

In [None]:
# Chain the preparation stages and the classifier, in execution order.
pipeline=Pipeline(stages=[catIndexer, numVector, numScaler, featureVector, algo])

In [None]:
# Fit every stage of the pipeline on the training split in one call.
model=pipeline.fit(train)
print("Model Trained")

Model Trained


In [None]:
# The fitted pipeline takes the raw test rows directly and applies every stage.
pipelinePrediction=model.transform(test)

# Keep the columns needed to compare each predicted class with the true species.
predicted=pipelinePrediction.select(
    "Features",
    "probability",
    col("prediction").cast("Int"),
    col("Species").alias("trueLabel"),
)


In [None]:
# Preview the pipeline's predictions next to the true labels.
display(predicted.limit(10))

Features,probability,prediction,trueLabel
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.08727278275923295, 0.5903615011568758, 0.22807017543859648, 0.05555555555555555))","Map(vectorType -> dense, length -> 3, values -> List(0.8526225672189949, 0.055296650667445844, 0.09208078211355922))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.10545460094105114, 0.5662650242480307, 0.3157894736842105, 0.2847222222222222))","Map(vectorType -> dense, length -> 3, values -> List(0.7977670807761029, 0.11242186899701208, 0.08981105022688503))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.13818192915482955, 0.7228916645534547, 0.2631578947368421, 0.3055555555555556))","Map(vectorType -> dense, length -> 3, values -> List(0.824263358979145, 0.0774344383386742, 0.09830220268218076))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.16000005548650567, 0.4096386137436415, 0.12280701754385964, 0.041666666666666664))","Map(vectorType -> dense, length -> 3, values -> List(0.7972146587668646, 0.08172809670764468, 0.12105724452549084))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.19999999999999998, 0.45783133776029705, 0.19298245614035087, 0.25))","Map(vectorType -> dense, length -> 3, values -> List(0.7398618653164828, 0.13393891971637015, 0.12619921496714712))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.21818181818181817, 0.397590375289219, 0.42105263157894735, 0.3125))","Map(vectorType -> dense, length -> 3, values -> List(0.6059402028499907, 0.2668137794235992, 0.12724601772641012))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.22181826504794033, 0.8192771125867657, 0.2807017543859649, 0.3333333333333333))","Map(vectorType -> dense, length -> 3, values -> List(0.7850239692762913, 0.08220923982191257, 0.13276679090179616))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.24363639137961646, 0.48192781466914225, 0.10526315789473684, 0.3055555555555556))","Map(vectorType -> dense, length -> 3, values -> List(0.7306674653983385, 0.12919539492270948, 0.14013713967895194))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.2727272727272727, 0.5421687771402204, 0.21052631578947367, 0.2222222222222222))","Map(vectorType -> dense, length -> 3, values -> List(0.7054708523879442, 0.12753061184759618, 0.16699853576445978))",0,0
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.2763637195933949, 0.6867469491901869, 0.17543859649122806, 0.2361111111111111))","Map(vectorType -> dense, length -> 3, values -> List(0.746493435008094, 0.08687020410640581, 0.16663636088550018))",0,0


### Save the model

In [None]:
  # Persist the fitted pipeline. overwrite() replaces a model saved by an earlier
  # run; a plain save() fails when the target path already exists.
  model.write().overwrite().save("dbfs:/models/penguinsPipeline.model")

### Reuse the Model

In [None]:
# Load the saved pipeline model back from storage.
persistedModel=PipelineModel.load("dbfs:/models/penguinsPipeline.model")

In [None]:
# A single unlabeled penguin observation to classify with the reloaded model.
newData=spark.createDataFrame(
    data=[
        dict(
            Island="Biscoe",
            CulmenLength=47.6,
            CulmenDepth=14.5,
            FlipperLength=215,
            BodyMass=5400,
        ),
    ]
)

In [None]:
# Run the new observation through the reloaded pipeline.
predictions=persistedModel.transform(newData)

In [None]:
# Keep the feature vector, class probabilities and the predicted class (as an integer).
predictedNew=predictions.select("Features", "probability", col("prediction").cast("Int"))

In [None]:
# Show the prediction for the new observation.
display(predictedNew)

Features,probability,prediction
"Map(vectorType -> dense, length -> 5, values -> List(0.0, 0.5636364191228693, 0.15662652540490654, 0.7192982456140351, 0.75))","Map(vectorType -> dense, length -> 3, values -> List(0.06793173171539242, 0.8579324915671119, 0.07413577671749547))",1
