# Boston Housing Classification SVM Evaluation

In [None]:
import sys
sys.path.append("..")
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import expr
from pyspark.sql.session import SparkSession
from pyspark.sql.types import BooleanType
from helpers.path_translation import translate_to_file_string
from helpers.data_prep_and_print import add_weight_col, print_confusion_matrix

In [None]:
# Path to the Boston housing CSV, relative to this notebook.
# NOTE(review): translate_to_file_string presumably turns the relative path
# into a location string Spark can read — confirm in helpers.path_translation.
inputFile = translate_to_file_string("../data/Boston_Housing_Data.csv")

Spark session creation 

In [None]:
# Reuse the active SparkSession if one exists, otherwise start a new one.
# NOTE(review): the app name spells "SVN"; presumably "SVM" was intended —
# left unchanged because it is a runtime string.
spark = SparkSession.builder.appName("BostonHousingSVNEval").getOrCreate()

DataFrame creation using an inferred schema

In [None]:
# Load the semicolon-delimited CSV (with header row), let Spark infer the
# column types, and add CATBOOL as a boolean copy of the CAT column.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
    .withColumn("CATBOOL", expr("CAT").cast(BooleanType()))
)
# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
df.printSchema()

Create the weight column to handle the biased distribution of labels 

In [None]:
# Attach the "classWeightCol" column computed from the "CAT" label.
# NOTE(review): the exact weighting scheme lives in
# helpers.data_prep_and_print.add_weight_col — confirm there.
df_with_weight = add_weight_col(df, "CAT", "classWeightCol")

Prepare training and test data.

In [None]:
# Random 90 % / 10 % split with a fixed seed; the first part is used for
# training, the second is held out for testing.
splits = df_with_weight.randomSplit([0.9, 0.1], seed=12345)
training, test = splits

Data preprocessing

In [None]:
# Start from every column and drop the ones that must not be used as model
# inputs: the CAT label, its boolean copy CATBOOL, the class-weight column,
# and MEDV (presumably the raw target the label is derived from — confirm).
# list.remove raises ValueError if one of these columns is missing.
featureCols = list(df_with_weight.columns)
for non_feature in ("MEDV", "CAT", "CATBOOL", "classWeightCol"):
    featureCols.remove(non_feature)
print(featureCols)

# Combine the remaining columns into the single "features" vector column.
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

Build the evaluator

In [None]:
#TODO Add weight column

# Scores predictions by the area under the ROC curve, comparing the
# "rawPrediction" column against the "CAT" label.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    rawPredictionCol="rawPrediction",
    labelCol="CAT",
)
# Alternative evaluator (weighted precision on the "prediction" column):
#evaluator = MulticlassClassificationEvaluator(labelCol="CAT", predictionCol="prediction", metricName='weightedPrecision')

Support Vector Machine Classifier

In [None]:
# Linear SVM on the assembled "features" vector with "CAT" as the label.
# weightCol wires in the "classWeightCol" column that was added to the data to
# counter the skewed label distribution; without it that column is never used
# and the model is trained unweighted.
# NOTE(review): assumes "classWeightCol" is numeric and non-null — confirm
# against helpers.data_prep_and_print.add_weight_col.
lsvc = LinearSVC(
    labelCol="CAT",
    featuresCol="features",
    weightCol="classWeightCol",
    aggregationDepth=2,
)

Build the pipeline

In [None]:
# Two-stage pipeline: assemble the feature vector, then fit the linear SVM.
pipeline = Pipeline(stages=[assembler, lsvc])

Build the paramGrid

In [None]:
# TODO Add your settings there
# Hyper-parameter grid for the SVM; as written it holds a single combination
# (maxIter=100, regParam=0.1).
paramGrid = (
    ParamGridBuilder()
    .addGrid(lsvc.maxIter, [100])
    .addGrid(lsvc.regParam, [0.1])
    .build()
)

Build the CrossValidator 

In [None]:
# 5-fold cross-validation over the parameter grid, scoring each candidate
# pipeline with the evaluator and fitting up to two models in parallel.
cvSVM = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
    parallelism=2,
)

Train the model 

In [None]:
# Run the cross-validation on the training split; the result is the fitted
# cross-validator model used for prediction below.
cvSVMModel = cvSVM.fit(training)

Test the model

In [None]:
# Apply the fitted model to the held-out test split and display the first
# rows of the result (including the model's prediction columns).
predictions = cvSVMModel.transform(test)
predictions.show()

# Evaluate the Model
## Area under ROC

In [None]:
# Score the test predictions with the evaluator's configured metric.
# NOTE: the variable keeps its historical name `accuracy`, but it holds the
# evaluator's metric (areaUnderROC for the evaluator configured in this
# notebook), not classification accuracy. For that reason the value is
# reported under the evaluator's own metric name instead of being printed as
# "Test Error" = 1 - value, which is not an error rate for an AUC score.
accuracy = evaluator.evaluate(predictions)
print(f"Test {evaluator.getMetricName()} = {accuracy}")

## Confusion Matrix 

In [None]:
# Build an RDD of [prediction, label] pairs, casting the CAT label to float.
predictionAndLabels = (
    predictions
    .select("prediction", "CAT")
    .rdd
    .map(lambda row: [row[0], float(row[1])])
)
# Distinct label values present in the test predictions; reused for the
# per-label statistics.
labels = predictionAndLabels.map(lambda pair: pair[1]).distinct().collect()
metrics = MulticlassMetrics(predictionAndLabels)
print_confusion_matrix(spark, metrics.confusionMatrix())


In [None]:
# TODO print and evaluate on MulticlassMetrics metrics
# Confusion Matrix

## Statistics per label

In [None]:
# Report the precision achieved for every label value found in the test set.
for label in labels:
    label_precision = metrics.precision(label)
    print("Class %f precision = %f\n" % (label, label_precision))
    # TODO add additional statistics for the label (recall, ...)

## Weighted stats

In [None]:
#TODO print weighted Stats 

In [None]:
## Summary stats

In [None]:
# Headline numbers: recall, precision and F1 for label 1.0, plus the overall
# accuracy across all labels.
positive_label = 1.0
print(f"Recall = {metrics.recall(positive_label)}")
print(f"Precision = {metrics.precision(positive_label)}")
print(f"Accuracy = {metrics.accuracy}")
print(f"F1 = {metrics.fMeasure(positive_label)}")

In [None]:
# Shut down the Spark session now that the evaluation is finished.
spark.stop()