# Boston Housing Classification SVM Evaluation

In [21]:
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import expr
from pyspark.sql.session import SparkSession
from pyspark.sql.types import BooleanType
from helpers.helper_functions import translate_to_file_string

In [22]:
inputFile = translate_to_file_string("../data/Boston_Housing_Data.csv")

Spark session creation 

In [23]:
spark = (SparkSession
       .builder
       .appName("BostonHoudingSVNEval")
       .getOrCreate())

DataFrame creation using an ifered Schema 

In [24]:
df = spark.read.option("header", "true") \
       .option("inferSchema", "true") \
       .option("delimiter", ";") \
       .csv(inputFile) \
       .withColumn("CATBOOL", expr("CAT").cast(BooleanType()))
print(df.printSchema())

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: integer (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: integer (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)
 |-- CAT: integer (nullable = true)
 |-- CATBOOL: boolean (nullable = true)

None


In [25]:
splits = df.randomSplit([0.9, 0.1 ], 12345)
training = splits[0]
test = splits[1]

Prepare training and test data.

Data preprocessing

In [26]:
featureCols = df.columns.copy()
featureCols.remove("MEDV")
featureCols.remove("CAT")
featureCols.remove("CATBOOL") 
print(featureCols)

assembler =  VectorAssembler(outputCol="features", inputCols=featureCols)

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


Build the evaluator

In [27]:
evaluator = BinaryClassificationEvaluator(labelCol="CAT",rawPredictionCol="prediction", metricName="areaUnderROC")
#evaluator = MulticlassClassificationEvaluator(labelCol="CAT", predictionCol="prediction", metricName='weightedPrecision')

Support Vector Machine Classifier

In [28]:
lsvc = LinearSVC(labelCol="CAT",aggregationDepth=2, featuresCol="features" ) 

Build the pipeline

In [29]:
pipeline = Pipeline(stages= [assembler, lsvc] )

Build the paramGrid

In [30]:
# TODO Add your settings there
paramGrid = ParamGridBuilder().addGrid(lsvc.maxIter, [100])\
                                 .addGrid(lsvc.regParam, [0.1]) \
                                 .build()

Build the CrossValidator 

In [31]:
cvSVM = CrossValidator(estimator=pipeline, evaluator=evaluator, \
                          estimatorParamMaps=paramGrid, numFolds=5, parallelism=2)

Train the model 

In [32]:
cvSVMModel = cvSVM.fit(training)

Test the model

In [33]:
predictions = cvSVMModel.transform(test)
predictions.show()

+-------+----+-----+----+-----+-----+----+-------+---+---+-------+------+-----+----+---+-------+--------------------+--------------------+----------+
|   CRIM|  ZN|INDUS|CHAS|  NOX|   RM| AGE|    DIS|RAD|TAX|PTRATIO|     B|LSTAT|MEDV|CAT|CATBOOL|            features|       rawPrediction|prediction|
+-------+----+-----+----+-----+-----+----+-------+---+---+-------+------+-----+----+---+-------+--------------------+--------------------+----------+
|0.02731| 0.0| 7.07|   0|0.469|6.421|78.9| 4.9671|  2|242|   17.8| 396.9| 9.14|21.6|  0|  false|[0.02731,0.0,7.07...|[1.13875273084909...|       0.0|
|0.02763|75.0| 2.95|   0|0.428|6.595|21.8| 5.4011|  3|252|   18.3|395.63| 4.32|30.8|  1|   true|[0.02763,75.0,2.9...|[0.30397961423077...|       0.0|
|0.03427| 0.0| 5.19|   0|0.515|5.869|46.3| 5.2311|  5|224|   20.2| 396.9|  9.8|19.5|  0|  false|[0.03427,0.0,5.19...|[1.97489235604983...|       0.0|
|0.03659|25.0| 4.86|   0|0.426|6.302|32.2| 5.4007|  4|281|   19.0| 396.9| 6.72|24.8|  0|  false|[0.0

# Evaluate the Model
## Area under ROC

In [34]:
accuracy = evaluator.evaluate(predictions)
print("Test Error",(1.0 - accuracy))

Test Error 0.25


## Confusion Matrix 

In [35]:
predictionAndLabels = predictions.select("prediction", "CAT").rdd.map(lambda p: [p[0], float(p[1])]) # Map to RDD prediction|label
labels = predictionAndLabels.map(lambda x: x[1]).distinct().collect() # List of all labels 
metrics =  MulticlassMetrics(predictionAndLabels)

In [36]:
# TODO print and evaluate on MulticlassMetrics metrics
# Confusion Matrix

## Statistics per label

In [37]:
for label in  labels:
   print("Class %f precision = %f\n" % (label , metrics.precision(label))) 
  # TODO add additional statistics for the label (recall, ...)

Class 0.000000 precision = 0.906250

Class 1.000000 precision = 1.000000



## Weighted stats

In [38]:
#TODO print weighted Stats 

In [39]:
## Summary stats

In [40]:
print(f"Recall = {metrics.recall(1.0)}")
print(f"Precision = {metrics.precision(1.0)}")
print(f"Accuracy = {metrics.accuracy}") 
print(f"F1 = {metrics.fMeasure(1.0)}")

Recall = 0.5
Precision = 1.0
Accuracy = 0.9142857142857143
F1 = 0.6666666666666666


In [41]:
spark.stop()