# SSDS5. Supervised ML with PySpark

## Regression & Classification

In [None]:
!wget -q https://archive.apache.org/dist/spark/spark-3.2.4/spark-3.2.4-bin-hadoop3.2.tgz
!tar xf spark-3.2.4-bin-hadoop3.2.tgz
!pip install -q findspark

In [None]:
import findspark

# Register the Spark distribution unpacked under /content with findspark
# before anything from pyspark is imported.
findspark.init(spark_home="/content/spark-3.2.4-bin-hadoop3.2")

In [None]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (reuses an existing one if present).
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

## Mount Google Drive

In [None]:
from google.colab import drive

# Mount Google Drive at /gdrive.
drive.mount(mountpoint='/gdrive')

Mounted at /gdrive


In [None]:
# Base directory on the mounted Drive; CSV file names are appended to this
# prefix by plain string concatenation, so the trailing slash is required.
gpath = '/gdrive/MyDrive/data/'

# Load Data

Data source: https://www.kaggle.com/uciml/student-alcohol-consumption

In [None]:
# Read student-por.csv: the first row is treated as the header and column
# types are inferred from the data.
por_reader = spark.read.options(header="true", inferSchema="true")
df1 = por_reader.csv(gpath + 'student-por.csv')

# Preview the first three rows.
df1.show(3)

+------+---+---+-------+-------+-------+----+----+-------+-------+------+--------+----------+---------+--------+---------+------+----+----------+-------+------+--------+--------+------+--------+-----+----+----+------+--------+---+---+---+
|school|sex|age|address|famsize|Pstatus|Medu|Fedu|   Mjob|   Fjob|reason|guardian|traveltime|studytime|failures|schoolsup|famsup|paid|activities|nursery|higher|internet|romantic|famrel|freetime|goout|Dalc|Walc|health|absences| G1| G2| G3|
+------+---+---+-------+-------+-------+----+----+-------+-------+------+--------+----------+---------+--------+---------+------+----+----------+-------+------+--------+--------+------+--------+-----+----+----+------+--------+---+---+---+
|    GP|  F| 18|      U|    GT3|      A|   4|   4|at_home|teacher|course|  mother|         2|        2|       0|      yes|    no|  no|        no|    yes|   yes|      no|      no|     4|       3|    4|   1|   1|     3|       4|  0| 11| 11|
|    GP|  F| 17|      U|    GT3|      T|   1

In [None]:
# Read student-mat.csv with the same settings: header row present, column
# types inferred from the data.
mat_reader = spark.read.options(header="true", inferSchema="true")
df2 = mat_reader.csv(gpath + 'student-mat.csv')

# Preview the first three rows.
df2.show(3)

+------+---+---+-------+-------+-------+----+----+-------+-------+------+--------+----------+---------+--------+---------+------+----+----------+-------+------+--------+--------+------+--------+-----+----+----+------+--------+---+---+---+
|school|sex|age|address|famsize|Pstatus|Medu|Fedu|   Mjob|   Fjob|reason|guardian|traveltime|studytime|failures|schoolsup|famsup|paid|activities|nursery|higher|internet|romantic|famrel|freetime|goout|Dalc|Walc|health|absences| G1| G2| G3|
+------+---+---+-------+-------+-------+----+----+-------+-------+------+--------+----------+---------+--------+---------+------+----+----------+-------+------+--------+--------+------+--------+-----+----+----+------+--------+---+---+---+
|    GP|  F| 18|      U|    GT3|      A|   4|   4|at_home|teacher|course|  mother|         2|        2|       0|      yes|    no|  no|        no|    yes|   yes|      no|      no|     4|       3|    4|   1|   1|     3|       6|  5|  6|  6|
|    GP|  F| 17|      U|    GT3|      T|   1

In [None]:
# Stack df1 and df2 into a single DataFrame.
# unionByName aligns columns by name, whereas union() aligns them purely by
# position and would silently mix up columns if the two CSV files ever
# differed in column order.
df = df1.unionByName(df2)

# Total number of rows after the union.
df.count()

1044

In [None]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder

# String columns with two levels are only indexed; string columns with more
# than two levels are indexed and then one-hot encoded.
binaryCols = ['school','sex','address','famsize','Pstatus','schoolsup','famsup',\
              'paid','activities','nursery','higher','internet','romantic']
categoryCols = ['Mjob', 'Fjob', 'reason', 'guardian']

stringCols = binaryCols + categoryCols
indexedCols = [name + "_indexed" for name in stringCols]
encodedCols = [name + "_encoded" for name in categoryCols]

# Remove outputs left over from a previous execution of this cell (a no-op on
# the first run) so the cell can be re-run without "column already exists"
# errors and without the generated columns leaking into numericCols.
df = df.drop(*indexedCols, *encodedCols)

# Everything that is not a string column is treated as numeric.
numericCols = [name for name in df.columns if name not in stringCols]

# One multi-column StringIndexer replaces a fit/transform pass per column;
# output columns are appended in the same order as before.
indexer = StringIndexer(inputCols=stringCols, outputCols=indexedCols)
df = indexer.fit(df).transform(df)

# One-hot encode the indexed multi-level categorical columns.
encoder = OneHotEncoder(inputCols=[name + "_indexed" for name in categoryCols],
                        outputCols=encodedCols)
df = encoder.fit(df).transform(df)

df.show(3)

+------+---+---+-------+-------+-------+----+----+-------+-------+------+--------+----------+---------+--------+---------+------+----+----------+-------+------+--------+--------+------+--------+-----+----+----+------+--------+---+---+---+--------------+-----------+---------------+---------------+---------------+-----------------+--------------+------------+------------------+---------------+--------------+----------------+----------------+------------+------------+--------------+----------------+-------------+-------------+--------------+----------------+
|school|sex|age|address|famsize|Pstatus|Medu|Fedu|   Mjob|   Fjob|reason|guardian|traveltime|studytime|failures|schoolsup|famsup|paid|activities|nursery|higher|internet|romantic|famrel|freetime|goout|Dalc|Walc|health|absences| G1| G2| G3|school_indexed|sex_indexed|address_indexed|famsize_indexed|Pstatus_indexed|schoolsup_indexed|famsup_indexed|paid_indexed|activities_indexed|nursery_indexed|higher_indexed|internet_indexed|romantic_ind

In [None]:
# Keep only the model-ready columns: the numeric columns, the indexed binary
# columns, and the one-hot encoded categorical columns (in that order).
binary_feature_cols = [name + "_indexed" for name in binaryCols]
category_feature_cols = [name + "_encoded" for name in categoryCols]
df_encoded = df.select(*(numericCols + binary_feature_cols + category_feature_cols))

df_encoded.show(3)

+---+----+----+----------+---------+--------+------+--------+-----+----+----+------+--------+---+---+---+--------------+-----------+---------------+---------------+---------------+-----------------+--------------+------------+------------------+---------------+--------------+----------------+----------------+-------------+-------------+--------------+----------------+
|age|Medu|Fedu|traveltime|studytime|failures|famrel|freetime|goout|Dalc|Walc|health|absences| G1| G2| G3|school_indexed|sex_indexed|address_indexed|famsize_indexed|Pstatus_indexed|schoolsup_indexed|famsup_indexed|paid_indexed|activities_indexed|nursery_indexed|higher_indexed|internet_indexed|romantic_indexed| Mjob_encoded| Fjob_encoded|reason_encoded|guardian_encoded|
+---+----+----+----------+---------+--------+------+--------+-----+----+----+------+--------+---+---+---+--------------+-----------+---------------+---------------+---------------+-----------------+--------------+------------+------------------+-------------

In [None]:
# 90% / 10% train/test split with a fixed seed.
trainData, testData = df_encoded.randomSplit(weights=[0.9, 0.1], seed=123)

# Label column used by the classifiers in this notebook.
classification_target = 'schoolsup_indexed'

# Class counts of the label over the full (unsplit) data set.
label_counts = df_encoded.groupBy(classification_target).count()
label_counts.show()

+-----------------+-----+
|schoolsup_indexed|count|
+-----------------+-----+
|              0.0|  925|
|              1.0|  119|
+-----------------+-----+



In [None]:
from pyspark.ml.feature import VectorAssembler

# Every column except the target label is used as a feature.
inputCols = [name for name in trainData.columns if name != classification_target]

# Assemble the feature columns into a single vector column.
assembler = VectorAssembler(outputCol="featureVector", inputCols=inputCols)

# Apply the same assembler to both splits.
assembledTrainData, assembledTestData = (
    assembler.transform(split) for split in (trainData, testData)
)

feature_preview = assembledTrainData.select('featureVector')
feature_preview.show(3, truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|featureVector                                                                                                                                                      |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|(41,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,26,27,30,33,39],[15.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,3.0,1.0,1.0,2.0,6.0,10.0,10.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|(41,[0,1,2,3,4,6,7,8,9,10,11,12,13,14,15,18,24,27,28,32,38,39],[15.0,1.0,1.0,1.0,2.0,3.0,3.0,4.0,2.0,4.0,5.0,2.0,13.0,11.0,11.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])      |
|(41,[0,1,2,3,4,6,7,8,9,10,11,12,13,14,15,23,24,28,32,37,40],[15.0,1.0,1.0,1.0,2.0,4.0,3.0,2.0,2.0,3.0,4.0,2.0,13.0,12.0,12.0,1.0,1.0,1.0,1.0,1.0,1.0])             |
+---

# Logistic Regression

In [None]:
# Logistic regression classifier
from pyspark.ml.classification import LogisticRegression

# Estimator configuration: label and feature columns come from the assembled
# data; the multinomial family is requested explicitly.
logi = LogisticRegression(
    featuresCol="featureVector",
    labelCol=classification_target,
    predictionCol="prediction",
    family="multinomial",
)

# Fit on the training split.
model = logi.fit(assembledTrainData)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator comparing the "prediction" column against the label column.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol=classification_target,
)

# Score the held-out split with the fitted model and preview ten rows.
predictions = model.transform(assembledTestData)
predictions.select(classification_target, "prediction", "probability").show(10, truncate=False)

# Report accuracy first, then F1.
for metric_name in ("accuracy", "f1"):
    print(evaluator.setMetricName(metric_name).evaluate(predictions))

+-----------------+----------+-----------------------------------------+
|schoolsup_indexed|prediction|probability                              |
+-----------------+----------+-----------------------------------------+
|1.0              |0.0       |[0.7792854532818779,0.22071454671812205] |
|0.0              |0.0       |[0.6998501175962517,0.3001498824037483]  |
|0.0              |0.0       |[0.8434799517360372,0.1565200482639628]  |
|1.0              |0.0       |[0.7688379466099926,0.23116205339000734] |
|0.0              |0.0       |[0.7593475356306848,0.24065246436931534] |
|1.0              |0.0       |[0.5207725260740741,0.4792274739259258]  |
|0.0              |0.0       |[0.9928122216995908,0.007187778300409177]|
|0.0              |0.0       |[0.6869434359666374,0.3130565640333626]  |
|0.0              |0.0       |[0.9010676164142403,0.09893238358575972] |
|1.0              |0.0       |[0.8385072761180825,0.16149272388191746] |
+-----------------+----------+---------------------

# Decision tree

In [None]:
# Decision tree classifier
from pyspark.ml.classification import DecisionTreeClassifier

# Estimator configuration; tree hyper-parameters are left at library defaults.
tree_params = {
    "labelCol": classification_target,
    "featuresCol": "featureVector",
    "predictionCol": "prediction",
}
classifier = DecisionTreeClassifier(**tree_params)

# Fit on the training split.
model = classifier.fit(assembledTrainData)

In [None]:
# Print the fitted tree's node count, depth, feature count and class count,
# one value per line.
for attribute in ("numNodes", "depth", "numFeatures", "numClasses"):
    print(getattr(model, attribute))

43
5
41
2


In [None]:
# Print the fitted model's feature-importance vector.
# NOTE(review): indices presumably follow the order of the assembler's input
# columns, with one-hot columns expanded -- confirm before mapping to names.
print(model.featureImportances)

(41,[0,1,2,3,4,5,7,9,10,13,15,18,27,36],[0.14587189047267993,0.03265254456162728,0.08604528829489784,0.0204366916399975,0.0481774310038254,0.030673814095909992,0.046324452888293725,0.09964285654907307,0.03924873030417919,0.12242204879185466,0.20239972742176757,0.043359687903442856,0.0368615684497604,0.04588326762269085])


In [None]:
# Print the fitted tree as nested If/Else rules over feature indices.
print(model.toDebugString)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_2936e047b992, depth=5, numNodes=43, numClasses=2, numFeatures=41
  If (feature 0 <= 15.5)
   If (feature 15 <= 12.5)
    If (feature 2 <= 1.5)
     If (feature 5 <= 0.5)
      Predict: 0.0
     Else (feature 5 > 0.5)
      If (feature 4 <= 1.5)
       Predict: 0.0
      Else (feature 4 > 1.5)
       Predict: 1.0
    Else (feature 2 > 1.5)
     If (feature 9 <= 1.5)
      If (feature 7 <= 4.5)
       Predict: 1.0
      Else (feature 7 > 4.5)
       Predict: 0.0
     Else (feature 9 > 1.5)
      If (feature 13 <= 12.5)
       Predict: 0.0
      Else (feature 13 > 12.5)
       Predict: 1.0
   Else (feature 15 > 12.5)
    If (feature 1 <= 1.5)
     If (feature 36 in {1.0})
      Predict: 0.0
     Else (feature 36 not in {1.0})
      Predict: 1.0
    Else (feature 1 > 1.5)
     Predict: 0.0
  Else (feature 0 > 15.5)
   If (feature 10 <= 1.5)
    If (feature 27 in {1.0})
     If (feature 0 <= 19.5)
      Predict: 0.0
     Else (feat

In [None]:
# Score the held-out split with the fitted model and preview ten rows.
predictions = model.transform(assembledTestData)
preview_cols = [classification_target, "prediction", "probability"]
predictions.select(preview_cols).show(10, truncate=False)

# Report accuracy first, then F1, using the evaluator created earlier.
for metric_name in ("accuracy", "f1"):
    print(evaluator.setMetricName(metric_name).evaluate(predictions))

+-----------------+----------+-----------------------------------------+
|schoolsup_indexed|prediction|probability                              |
+-----------------+----------+-----------------------------------------+
|1.0              |0.0       |[0.9523809523809523,0.047619047619047616]|
|0.0              |0.0       |[1.0,0.0]                                |
|0.0              |0.0       |[1.0,0.0]                                |
|1.0              |0.0       |[0.9523809523809523,0.047619047619047616]|
|0.0              |1.0       |[0.375,0.625]                            |
|1.0              |1.0       |[0.375,0.625]                            |
|0.0              |0.0       |[1.0,0.0]                                |
|0.0              |0.0       |[1.0,0.0]                                |
|0.0              |0.0       |[0.9183673469387755,0.08163265306122448] |
|1.0              |0.0       |[0.9545454545454546,0.045454545454545456]|
+-----------------+----------+---------------------

# Random Forest

In [None]:
# load library
from pyspark.ml.classification import RandomForestClassifier

# Create the random forest estimator.
# The seed is fixed (same value as the train/test split) so that the forest is
# reproducible after a kernel restart; without it this notebook does not pin
# the randomness used when growing the trees.
rf = RandomForestClassifier(
	labelCol=classification_target,
	featuresCol="featureVector",
	predictionCol="prediction",
	seed=123)

# train model with train data
model = rf.fit(assembledTrainData)

In [None]:
# Print the list of individual decision trees that make up the fitted forest.
print(model.trees)

[DecisionTreeClassificationModel: uid=dtc_2f921b57b341, depth=5, numNodes=29, numClasses=2, numFeatures=41, DecisionTreeClassificationModel: uid=dtc_cc7ab391f570, depth=5, numNodes=35, numClasses=2, numFeatures=41, DecisionTreeClassificationModel: uid=dtc_84079df73ead, depth=5, numNodes=21, numClasses=2, numFeatures=41, DecisionTreeClassificationModel: uid=dtc_680caa899c99, depth=5, numNodes=35, numClasses=2, numFeatures=41, DecisionTreeClassificationModel: uid=dtc_5792f18f21ee, depth=5, numNodes=27, numClasses=2, numFeatures=41, DecisionTreeClassificationModel: uid=dtc_c536d2452fb3, depth=5, numNodes=29, numClasses=2, numFeatures=41, DecisionTreeClassificationModel: uid=dtc_05c7a9c08b7a, depth=5, numNodes=33, numClasses=2, numFeatures=41, DecisionTreeClassificationModel: uid=dtc_ceb091686ef2, depth=5, numNodes=25, numClasses=2, numFeatures=41, DecisionTreeClassificationModel: uid=dtc_de72992cd668, depth=5, numNodes=37, numClasses=2, numFeatures=41, DecisionTreeClassificationModel: uid

In [None]:
# Score the held-out split with the fitted model and preview ten rows.
predictions = model.transform(assembledTestData)
preview_cols = [classification_target, "prediction", "probability"]
predictions.select(preview_cols).show(10, truncate=False)

# Report accuracy first, then F1, using the evaluator created earlier.
for metric_name in ("accuracy", "f1"):
    print(evaluator.setMetricName(metric_name).evaluate(predictions))

+-----------------+----------+----------------------------------------+
|schoolsup_indexed|prediction|probability                             |
+-----------------+----------+----------------------------------------+
|1.0              |0.0       |[0.8253246504880927,0.17467534951190727]|
|0.0              |0.0       |[0.8200655705516198,0.1799344294483803] |
|0.0              |0.0       |[0.823760968498476,0.17623903150152403] |
|1.0              |0.0       |[0.7954855504168619,0.20451444958313814]|
|0.0              |0.0       |[0.8439807588501779,0.1560192411498222] |
|1.0              |0.0       |[0.6500636152871364,0.3499363847128635] |
|0.0              |0.0       |[0.8923081322136035,0.10769186778639658]|
|0.0              |0.0       |[0.9301223777311517,0.06987762226884825]|
|0.0              |0.0       |[0.917293552178123,0.08270644782187697] |
|1.0              |0.0       |[0.8702751631423876,0.12972483685761232]|
+-----------------+----------+----------------------------------

# Linear SVM

In [None]:
# Linear support vector classifier
from pyspark.ml.classification import LinearSVC

# Estimator configuration: 10 iterations at most, regularization 0.01.
svm_params = {
    "labelCol": classification_target,
    "featuresCol": "featureVector",
    "maxIter": 10,
    "regParam": 0.01,
}
svm = LinearSVC(**svm_params)

# Fit on the training split.
model = svm.fit(assembledTrainData)

In [None]:
# Print the fitted SVM's intercept, feature count, class count and
# coefficient vector, one value per line.
for attribute in ("intercept", "numFeatures", "numClasses", "coefficients"):
    print(getattr(model, attribute))

0.5812293329590792
41
2
[-0.07555188455875586,-0.02124535262719366,0.03245179104980747,-0.014932693391515296,0.047793379535950656,0.014871347530887355,0.013644876283934612,-0.017953568169342973,0.0005317384153969861,0.02382702839779569,-0.027755825328125785,-0.019973764317399872,0.000544265452230042,-0.04576680869572514,0.006662307626350617,0.004408922430463545,-0.1741476996860488,-0.04770620742066615,0.07513363498171298,-0.001932444513423723,0.037740796114173476,0.012814267863411418,-0.016153547269958678,0.05796721775951888,-0.047525071613227526,-0.14846115847078872,0.0291581365467498,-0.10251891926123428,0.07295434259449478,0.0680779415554777,0.015639169438180084,-0.03542969964409015,-0.10026675364755987,-0.12204851528372729,0.029645314653254002,0.03234799984639835,-0.009741478045352333,0.04149248185876255,-0.02609545748001934,-0.01053147538733971,0.0028401459207304308]


In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator comparing the "prediction" column against the label column.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol=classification_target,
)

# Score the held-out split with the fitted model and preview ten rows
# (only label and prediction are selected here).
predictions = model.transform(assembledTestData)
predictions.select(classification_target, "prediction").show(10, truncate=False)

# Report accuracy first, then F1.
for metric_name in ("accuracy", "f1"):
    print(evaluator.setMetricName(metric_name).evaluate(predictions))

+-----------------+----------+
|schoolsup_indexed|prediction|
+-----------------+----------+
|1.0              |0.0       |
|0.0              |0.0       |
|0.0              |0.0       |
|1.0              |0.0       |
|0.0              |0.0       |
|1.0              |0.0       |
|0.0              |0.0       |
|0.0              |0.0       |
|0.0              |0.0       |
|1.0              |0.0       |
+-----------------+----------+
only showing top 10 rows

0.8409090909090909
0.7682379349046016
