# SSDS5. Supervised ML with PySpark

## Regression & Classification

In [None]:
!wget -q https://archive.apache.org/dist/spark/spark-3.2.4/spark-3.2.4-bin-hadoop3.2.tgz
!tar xf spark-3.2.4-bin-hadoop3.2.tgz
!pip install -q findspark

In [None]:
import findspark

# Location of the Spark distribution unpacked by the install cell.
spark_home = "/content/spark-3.2.4-bin-hadoop3.2"
# Register that distribution with findspark before pyspark is imported.
findspark.init(spark_home)

In [None]:
from pyspark.sql import SparkSession

# Reuse the active session if there is one, otherwise create it.
session_builder = SparkSession.builder
spark = session_builder.getOrCreate()
# SparkContext that backs the session.
sc = spark.sparkContext

## Mount

In [None]:
from google.colab import drive

# Mount Google Drive at /gdrive.
mount_point = '/gdrive'
drive.mount(mount_point)

Mounted at /gdrive


In [None]:
# Directory prefix prepended to the CSV file names when loading data.
gpath = "/gdrive/MyDrive/data/"

# Load Data

Data source:
https://www.kaggle.com/uciml/student-alcohol-consumption

In [None]:
# Read student-por.csv: the first row supplies the column names and the
# column types are inferred from the data.
por_reader = spark.read.options(header="true", inferSchema="true")
df1 = por_reader.csv(gpath + 'student-por.csv')

# Preview the first three rows.
df1.show(3)

+------+---+---+-------+-------+-------+----+----+-------+-------+------+--------+----------+---------+--------+---------+------+----+----------+-------+------+--------+--------+------+--------+-----+----+----+------+--------+---+---+---+
|school|sex|age|address|famsize|Pstatus|Medu|Fedu|   Mjob|   Fjob|reason|guardian|traveltime|studytime|failures|schoolsup|famsup|paid|activities|nursery|higher|internet|romantic|famrel|freetime|goout|Dalc|Walc|health|absences| G1| G2| G3|
+------+---+---+-------+-------+-------+----+----+-------+-------+------+--------+----------+---------+--------+---------+------+----+----------+-------+------+--------+--------+------+--------+-----+----+----+------+--------+---+---+---+
|    GP|  F| 18|      U|    GT3|      A|   4|   4|at_home|teacher|course|  mother|         2|        2|       0|      yes|    no|  no|        no|    yes|   yes|      no|      no|     4|       3|    4|   1|   1|     3|       4|  0| 11| 11|
|    GP|  F| 17|      U|    GT3|      T|   1

In [None]:
# Read student-mat.csv: the first row supplies the column names and the
# column types are inferred from the data.
mat_reader = spark.read.options(header="true", inferSchema="true")
df2 = mat_reader.csv(gpath + 'student-mat.csv')

# Preview the first three rows.
df2.show(3)

+------+---+---+-------+-------+-------+----+----+-------+-------+------+--------+----------+---------+--------+---------+------+----+----------+-------+------+--------+--------+------+--------+-----+----+----+------+--------+---+---+---+
|school|sex|age|address|famsize|Pstatus|Medu|Fedu|   Mjob|   Fjob|reason|guardian|traveltime|studytime|failures|schoolsup|famsup|paid|activities|nursery|higher|internet|romantic|famrel|freetime|goout|Dalc|Walc|health|absences| G1| G2| G3|
+------+---+---+-------+-------+-------+----+----+-------+-------+------+--------+----------+---------+--------+---------+------+----+----------+-------+------+--------+--------+------+--------+-----+----+----+------+--------+---+---+---+
|    GP|  F| 18|      U|    GT3|      A|   4|   4|at_home|teacher|course|  mother|         2|        2|       0|      yes|    no|  no|        no|    yes|   yes|      no|      no|     4|       3|    4|   1|   1|     3|       6|  5|  6|  6|
|    GP|  F| 17|      U|    GT3|      T|   1

In [None]:
# Union df1 & df2: stack the rows of both frames into one DataFrame.
# Columns are matched by name, so the result does not depend on the order
# in which the two CSV files list their columns.
df = df1.unionByName(df2)
df.count()

1044

In [None]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder

# String columns treated as binary: the numeric index is used directly.
binaryCols = ['school','sex','address','famsize','Pstatus','schoolsup','famsup',\
              'paid','activities','nursery','higher','internet','romantic']
# String columns with several categories: indexed, then one-hot encoded.
categoryCols = ['Mjob', 'Fjob', 'reason', 'guardian']
# Every remaining column is used as-is.
numericCols = [x for x in df.columns if x not in (binaryCols+categoryCols)]

# Add a "<name>_indexed" column for each string column. The loop variable is
# not called `col` so it cannot shadow pyspark.sql.functions.col.
for colName in binaryCols+categoryCols:
  indexer = StringIndexer(inputCol=colName, outputCol=colName+"_indexed")
  df = indexer.fit(df).transform(df)

# Turn the indexed multi-category columns into "<name>_encoded" vectors.
encoder = OneHotEncoder(inputCols=[x+"_indexed" for x in categoryCols],\
                        outputCols=[x+"_encoded" for x in categoryCols])
df = encoder.fit(df).transform(df)

df.show(3)

+------+---+---+-------+-------+-------+----+----+-------+-------+------+--------+----------+---------+--------+---------+------+----+----------+-------+------+--------+--------+------+--------+-----+----+----+------+--------+---+---+---+--------------+-----------+---------------+---------------+---------------+-----------------+--------------+------------+------------------+---------------+--------------+----------------+----------------+------------+------------+--------------+----------------+-------------+-------------+--------------+----------------+
|school|sex|age|address|famsize|Pstatus|Medu|Fedu|   Mjob|   Fjob|reason|guardian|traveltime|studytime|failures|schoolsup|famsup|paid|activities|nursery|higher|internet|romantic|famrel|freetime|goout|Dalc|Walc|health|absences| G1| G2| G3|school_indexed|sex_indexed|address_indexed|famsize_indexed|Pstatus_indexed|schoolsup_indexed|famsup_indexed|paid_indexed|activities_indexed|nursery_indexed|higher_indexed|internet_indexed|romantic_ind

In [None]:
# Keep only the model-ready columns: the numeric columns, the indexed binary
# columns and the one-hot encoded category columns (in that order).
indexedBinaryCols = [name + "_indexed" for name in binaryCols]
encodedCategoryCols = [name + "_encoded" for name in categoryCols]
df_encoded = df.select(numericCols + indexedBinaryCols + encodedCategoryCols)
df_encoded.show(3)

+---+----+----+----------+---------+--------+------+--------+-----+----+----+------+--------+---+---+---+--------------+-----------+---------------+---------------+---------------+-----------------+--------------+------------+------------------+---------------+--------------+----------------+----------------+-------------+-------------+--------------+----------------+
|age|Medu|Fedu|traveltime|studytime|failures|famrel|freetime|goout|Dalc|Walc|health|absences| G1| G2| G3|school_indexed|sex_indexed|address_indexed|famsize_indexed|Pstatus_indexed|schoolsup_indexed|famsup_indexed|paid_indexed|activities_indexed|nursery_indexed|higher_indexed|internet_indexed|romantic_indexed| Mjob_encoded| Fjob_encoded|reason_encoded|guardian_encoded|
+---+----+----+----------+---------+--------+------+--------+-----+----+----+------+--------+---+---+---+--------------+-----------+---------------+---------------+---------------+-----------------+--------------+------------+------------------+-------------

In [None]:
# Split the rows roughly 90% / 10% into train and test sets with a fixed seed.
(trainData, testData) = df_encoded.randomSplit([0.9, 0.1], seed=123)

# Label column for the classification models below.
classification_target = 'schoolsup_indexed'
# Show how many rows fall into each class of the label (full data set).
df_encoded.groupBy(classification_target).count().show()

+-----------------+-----+
|schoolsup_indexed|count|
+-----------------+-----+
|              0.0|  925|
|              1.0|  119|
+-----------------+-----+



In [None]:
from pyspark.ml.feature import VectorAssembler
# Use every column except the target label as a feature.
inputCols = trainData.drop(classification_target).columns

# Assemble the feature columns into a single vector column.
assembler = VectorAssembler(inputCols=inputCols,
    outputCol="featureVector")

# VectorAssembler is a pure transformer, so the same instance is applied to
# both splits without fitting.
assembledTrainData = assembler.transform(trainData)
assembledTestData = assembler.transform(testData)

assembledTrainData.select('featureVector').show(3, truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|featureVector                                                                                                                                                      |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|(41,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,26,27,30,33,39],[15.0,1.0,1.0,1.0,1.0,1.0,4.0,1.0,3.0,1.0,1.0,2.0,6.0,10.0,10.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|(41,[0,1,2,3,4,6,7,8,9,10,11,12,13,14,15,18,24,27,28,32,38,39],[15.0,1.0,1.0,1.0,2.0,3.0,3.0,4.0,2.0,4.0,5.0,2.0,13.0,11.0,11.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])      |
|(41,[0,1,2,3,4,6,7,8,9,10,11,12,13,14,15,19,21,30,32,39],[15.0,1.0,1.0,1.0,2.0,4.0,3.0,2.0,2.0,3.0,3.0,6.0,12.0,13.0,12.0,1.0,1.0,1.0,1.0,1.0])                    |
+---

# Logistic Regression

In [None]:
# import library
from pyspark.ml.classification import LogisticRegression

# Create the logistic regression estimator: it reads the assembled feature
# vector and writes its class decision to the "prediction" column.
logi = LogisticRegression(labelCol=classification_target, featuresCol="featureVector",
    family="multinomial",
    predictionCol="prediction")

# Train the model on the training split.
model = logi.fit(assembledTrainData)

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Create the evaluator that compares the label column with "prediction".
evaluator = MulticlassClassificationEvaluator(
    labelCol=classification_target,
    predictionCol="prediction")

# Transform the test data using the fitted model.
predictions = model.transform(assembledTestData)
predictions.select([classification_target, "prediction", "probability"]).show(10, truncate=False)

# Report accuracy, then F1, on the test split.
print(evaluator.setMetricName("accuracy").evaluate(predictions))
print(evaluator.setMetricName("f1").evaluate(predictions))

+-----------------+----------+----------------------------------------+
|schoolsup_indexed|prediction|probability                             |
+-----------------+----------+----------------------------------------+
|1.0              |0.0       |[0.7006229339028991,0.29937706609710096]|
|1.0              |0.0       |[0.7172540679256684,0.2827459320743315] |
|0.0              |0.0       |[0.8171311766702615,0.18286882332973856]|
|0.0              |1.0       |[0.26879821700077755,0.7312017829992224]|
|0.0              |1.0       |[0.3024921707286409,0.6975078292713591] |
|0.0              |0.0       |[0.872327668692888,0.127672331307112]   |
|0.0              |1.0       |[0.4767009071525369,0.5232990928474632] |
|0.0              |0.0       |[0.6373742503752765,0.36262574962472355]|
|0.0              |0.0       |[0.9145049298319794,0.0854950701680207] |
|0.0              |0.0       |[0.8687481034834917,0.13125189651650832]|
+-----------------+----------+----------------------------------

# Decision tree

In [None]:
# load library
from pyspark.ml.classification import DecisionTreeClassifier

# Create the decision tree estimator on the assembled feature vector.
classifier = DecisionTreeClassifier(
    labelCol=classification_target,
    featuresCol="featureVector",
    predictionCol="prediction")

# Train the model on the training split.
model = classifier.fit(assembledTrainData)

In [None]:
# Size and shape of the fitted tree, one value per line:
# node count, depth, number of input features, number of classes.
print(model.numNodes, model.depth, model.numFeatures, model.numClasses, sep="\n")

31
5
41
2


In [None]:
# Per-feature importance vector of the fitted tree; positions follow the
# order of the assembled feature columns.
importances = model.featureImportances
print(importances)

(41,[0,2,3,7,9,11,15,17,26,27,34],[0.23063507162489671,0.022841027490475434,0.1054218670216423,0.13235036762452695,0.029080141282753318,0.03197743848666559,0.10970164706117255,0.06933732789452403,0.14724305221538622,0.04838713001196313,0.07302492928599376])


In [None]:
# Text rendering of the tree's split rules and leaf predictions.
tree_description = model.toDebugString
print(tree_description)

DecisionTreeClassificationModel: uid=DecisionTreeClassifier_d0077e2d3550, depth=5, numNodes=31, numClasses=2, numFeatures=41
  If (feature 0 <= 16.5)
   If (feature 15 <= 12.5)
    If (feature 0 <= 15.5)
     If (feature 3 <= 1.5)
      If (feature 17 in {1.0})
       Predict: 0.0
      Else (feature 17 not in {1.0})
       Predict: 1.0
     Else (feature 3 > 1.5)
      If (feature 7 <= 1.5)
       Predict: 1.0
      Else (feature 7 > 1.5)
       Predict: 0.0
    Else (feature 0 > 15.5)
     If (feature 27 in {1.0})
      If (feature 3 <= 2.5)
       Predict: 0.0
      Else (feature 3 > 2.5)
       Predict: 1.0
     Else (feature 27 not in {1.0})
      Predict: 0.0
   Else (feature 15 > 12.5)
    If (feature 9 <= 4.5)
     Predict: 0.0
    Else (feature 9 > 4.5)
     Predict: 1.0
  Else (feature 0 > 16.5)
   If (feature 34 in {1.0})
    If (feature 26 in {0.0})
     If (feature 7 <= 4.5)
      Predict: 0.0
     Else (feature 7 > 4.5)
      Predict: 1.0
    Else (feature 26 not in {0.0}

In [None]:
# Transform the test data using the fitted model.
predictions = model.transform(assembledTestData)
predictions.select([classification_target, "prediction", "probability"]).show(10, truncate=False)

# Report accuracy, then F1, on the test split.
print(evaluator.setMetricName("accuracy").evaluate(predictions))
print(evaluator.setMetricName("f1").evaluate(predictions))

+-----------------+----------+----------------------------------------+
|schoolsup_indexed|prediction|probability                             |
+-----------------+----------+----------------------------------------+
|1.0              |0.0       |[0.9239766081871345,0.07602339181286549]|
|1.0              |1.0       |[0.3611111111111111,0.6388888888888888] |
|0.0              |0.0       |[0.8611111111111112,0.1388888888888889] |
|0.0              |0.0       |[0.9239766081871345,0.07602339181286549]|
|0.0              |0.0       |[0.8611111111111112,0.1388888888888889] |
|0.0              |0.0       |[0.9239766081871345,0.07602339181286549]|
|0.0              |1.0       |[0.3611111111111111,0.6388888888888888] |
|0.0              |0.0       |[0.9239766081871345,0.07602339181286549]|
|0.0              |1.0       |[0.3611111111111111,0.6388888888888888] |
|0.0              |0.0       |[0.7096774193548387,0.2903225806451613] |
+-----------------+----------+----------------------------------

# Random Forest

In [None]:
# load library
from pyspark.ml.classification import RandomForestClassifier

# Create the random forest estimator on the assembled feature vector.
rf = RandomForestClassifier(
    labelCol=classification_target,
    featuresCol="featureVector",
    predictionCol="prediction")

# Train the model on the training split.
model = rf.fit(assembledTrainData)

In [None]:
# List the individual decision trees that make up the fitted forest.
forest_trees = model.trees
print(forest_trees)

[DecisionTreeClassificationModel: uid=dtc_99b94760970d, depth=5, numNodes=37, numClasses=2, numFeatures=41, DecisionTreeClassificationModel: uid=dtc_7cbc9a4051ac, depth=5, numNodes=33, numClasses=2, numFeatures=41, DecisionTreeClassificationModel: uid=dtc_d738e39f2492, depth=5, numNodes=13, numClasses=2, numFeatures=41, DecisionTreeClassificationModel: uid=dtc_46794a32992b, depth=5, numNodes=33, numClasses=2, numFeatures=41, DecisionTreeClassificationModel: uid=dtc_fa4bd46c76d6, depth=5, numNodes=27, numClasses=2, numFeatures=41, DecisionTreeClassificationModel: uid=dtc_c1878c753290, depth=5, numNodes=29, numClasses=2, numFeatures=41, DecisionTreeClassificationModel: uid=dtc_ddd04035216d, depth=5, numNodes=33, numClasses=2, numFeatures=41, DecisionTreeClassificationModel: uid=dtc_beaf0e132c26, depth=5, numNodes=23, numClasses=2, numFeatures=41, DecisionTreeClassificationModel: uid=dtc_1b2254c3c318, depth=5, numNodes=29, numClasses=2, numFeatures=41, DecisionTreeClassificationModel: uid

In [None]:
# Transform the test data using the fitted model.
predictions = model.transform(assembledTestData)
predictions.select([classification_target, "prediction", "probability"]).show(10, truncate=False)

# Report accuracy, then F1, on the test split.
print(evaluator.setMetricName("accuracy").evaluate(predictions))
print(evaluator.setMetricName("f1").evaluate(predictions))

+-----------------+----------+----------------------------------------+
|schoolsup_indexed|prediction|probability                             |
+-----------------+----------+----------------------------------------+
|1.0              |0.0       |[0.7915116532338421,0.20848834676615793]|
|1.0              |0.0       |[0.8656730436731449,0.13432695632685515]|
|0.0              |0.0       |[0.8648176643564911,0.13518233564350876]|
|0.0              |0.0       |[0.7513402022776586,0.2486597977223414] |
|0.0              |0.0       |[0.8107547856166505,0.18924521438334957]|
|0.0              |0.0       |[0.9163184814616422,0.08368151853835779]|
|0.0              |0.0       |[0.7743637958643838,0.22563620413561628]|
|0.0              |0.0       |[0.8528990828043886,0.14710091719561133]|
|0.0              |0.0       |[0.7630521691885042,0.23694783081149584]|
|0.0              |0.0       |[0.8088288102085344,0.19117118979146563]|
+-----------------+----------+----------------------------------

# Linear SVM

In [None]:
# load library
from pyspark.ml.classification import LinearSVC

# Create the linear SVM estimator: at most 10 optimizer iterations with
# regularization strength 0.01.
svm = LinearSVC(labelCol=classification_target,
    featuresCol="featureVector",
    maxIter=10, regParam=0.01)

# Train the model on the training split.
model = svm.fit(assembledTrainData)

In [None]:
# Fitted SVM parameters, one value per line:
# intercept, number of input features, number of classes, coefficient vector.
print(model.intercept, model.numFeatures, model.numClasses, model.coefficients, sep="\n")

0.7264888518969668
41
2
[-0.08305692668912139,-0.062166029291977086,0.04949225537191316,-0.02086490567919316,0.041243238810911666,-0.019308160691226713,0.00886571063856346,-0.012009237181384056,-0.00015180273366486807,0.07135729199642894,-0.023972075380649292,0.0016383883590500195,0.0008803544151410975,-0.05095083298974979,0.004935530580463163,0.008782232767775783,-0.2025487833048737,-0.12824879707215914,0.01754573031274595,-0.011107056721032474,0.014822401752398266,0.005581527221941247,-0.0824225400962137,0.05490736400710084,-0.07498957845920681,-0.02056035463381418,0.03509561633653505,-0.11337438885816198,0.09618216347054666,0.09892006701617986,0.033312339902130825,-0.10847624339652896,-0.06976497725409384,-0.12669070707351496,0.15056535174186114,-0.0,-0.0656818796628918,-0.018394154531339798,-0.017167292508489272,0.0,0.016075211583198183]


In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Create the evaluator that compares the label column with "prediction".
evaluator = MulticlassClassificationEvaluator(
    labelCol=classification_target,
    predictionCol="prediction")

# Transform the test data using the fitted model. Only label and prediction
# are shown here; unlike the earlier cells, no "probability" column is selected.
predictions = model.transform(assembledTestData)
predictions.select([classification_target, "prediction"]).show(10, truncate=False)

# Report accuracy, then F1, on the test split.
print(evaluator.setMetricName("accuracy").evaluate(predictions))
print(evaluator.setMetricName("f1").evaluate(predictions))

+-----------------+----------+
|schoolsup_indexed|prediction|
+-----------------+----------+
|1.0              |0.0       |
|1.0              |0.0       |
|0.0              |0.0       |
|0.0              |0.0       |
|0.0              |0.0       |
|0.0              |0.0       |
|0.0              |0.0       |
|0.0              |0.0       |
|0.0              |0.0       |
|0.0              |0.0       |
+-----------------+----------+
only showing top 10 rows

0.8899082568807339
0.8380689409459339
