# Boston Housing Classification SVM Evaluation

In [None]:
import sys
sys.path.append("..")
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import expr,col
from pyspark.sql.session import SparkSession
from pyspark.sql.types import BooleanType
from helpers.path_translation import translate_to_file_string
from helpers.data_prep_and_print import print_df, add_weight_col, print_confusion_matrix

In [None]:
# Location of the Boston housing CSV, passed through the project's
# path-translation helper before it is handed to Spark.
boston_csv_path = "../data/Boston_Housing_Data.csv"
inputFile = translate_to_file_string(boston_csv_path)

Spark session creation 

In [None]:
# Reuse an already running Spark session if there is one, otherwise start a
# new session under this application name.
session_builder = SparkSession.builder.appName("BostonHousingSVNEval")
spark = session_builder.getOrCreate()

DataFrame creation using an inferred schema

In [None]:
# Read the semicolon-delimited CSV (first row is the header) and let Spark
# infer the column types, then add CATBOOL as a boolean cast of the CAT column.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
    .withColumn("CATBOOL", expr("CAT").cast(BooleanType()))
)
# printSchema() writes the schema to stdout itself and returns None, so it must
# not be wrapped in print() -- that would emit an extra "None" line.
df.printSchema()

In [None]:
# Add a class-weight column derived from the CAT label via the project helper.
# NOTE(review): presumably this is the "classWeightCol" column the evaluator
# reads -- confirm against helpers.data_prep_and_print.
label_column = "CAT"
balanced_df = add_weight_col(df, label_column)

Prepare training and test data.

In [None]:
# 90 % / 10 % random split; the fixed seed keeps the partition repeatable
# between runs.
splits = balanced_df.randomSplit([0.9, 0.1], seed=12345)
training, test = splits[0], splits[1]

Data preprocessing

In [None]:
# Start from every column of the raw DataFrame and drop the ones that must not
# be fed to the classifier: MEDV plus the two label columns CAT / CATBOOL.
featureCols = list(df.columns)
for non_feature in ("MEDV", "CAT", "CATBOOL"):
    featureCols.remove(non_feature)
print(featureCols)

# Packs the remaining columns into the single vector column "features".
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

Build the evaluator

In [None]:
# Scores a model by the area under the ROC curve, computed from the raw SVM
# scores against the CAT label and weighted by the classWeightCol column.
evaluator = BinaryClassificationEvaluator(
    labelCol="CAT",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
    weightCol="classWeightCol",
)
# Alternative evaluator (weighted precision on the hard predictions):
#evaluator = MulticlassClassificationEvaluator(labelCol="CAT", predictionCol="prediction", metricName='weightedPrecision', weightCol="classWeightCol")

Support Vector Machine Classifier

In [None]:
# Linear SVM trained on the assembled "features" vector with CAT as the label.
# NOTE(review): no weightCol is passed here, so the class weights are only used
# by the evaluator, not during training -- confirm this is intended.
lsvc = LinearSVC(
    featuresCol="features",
    labelCol="CAT",
    aggregationDepth=2,
)

Build the pipeline

In [None]:
# Stage 0 assembles the feature vector, stage 1 fits the linear SVM on it.
pipeline_stages = [assembler, lsvc]
pipeline = Pipeline(stages=pipeline_stages)

Build the paramGrid

In [None]:
# Hyper-parameter grid: 1 iteration limit x 3 regularisation strengths x
# 2 standardisation settings = 6 candidate parameter maps.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lsvc.maxIter, [100])
    .addGrid(lsvc.regParam, [0.1, 0.001, 0.0001])
    .addGrid(lsvc.standardization, [True, False])
    .build()
)

Build the CrossValidator 

In [None]:
# 5-fold cross-validation of the whole pipeline over the parameter grid,
# scored with the evaluator above; parallelism=2 is passed so that candidate
# models may be fitted two at a time.
cvSVM = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
    parallelism=2,
)

Train the model 

In [None]:
# Run the cross-validated grid search on the training split; the result holds
# the best pipeline model found (read back later via .bestModel).
cvSVMModel = cvSVM.fit(training)

Show best Model 

In [None]:
# The winning pipeline has two stages; index 1 is the fitted SVM model
# (index 0 is the VectorAssembler).
best_pipeline_model = cvSVMModel.bestModel
linearSVCModel = best_pipeline_model.stages[1]
print("Best Params: \n", linearSVCModel.explainParams())
print("Param Map: \n", linearSVCModel.extractParamMap())

Test the model

In [None]:
# Apply the best cross-validated pipeline to the held-out test split and show
# the resulting rows with the project's display helper.
predictions = cvSVMModel.transform(dataset=test)
print_df(predictions)

# Evaluate the Model
## Area under ROC

In [None]:
# The evaluator is configured with metricName="areaUnderROC", so the value it
# returns is the (weighted) area under the ROC curve, not an accuracy. Report
# it under its real name instead of printing 1 - AUC as a "Test Error".
area_under_roc = evaluator.evaluate(predictions)
print("Area under ROC", area_under_roc)

## Confusion Matrix 

In [None]:
# MulticlassMetrics works on an RDD of (prediction, label) pairs; the CAT label
# is converted to float so both entries of each pair are floating point.
prediction_label_rows = predictions.select("prediction", "CAT").rdd
predictionAndLabels = prediction_label_rows.map(
    lambda row: [row[0], float(row[1])]
)
metrics = MulticlassMetrics(predictionAndLabels)

In [None]:
# Confusion matrix computed by MulticlassMetrics from the prediction/label pairs.
confusion = metrics.confusionMatrix()
# Display it with the project helper (which also needs the Spark session).
print_confusion_matrix(spark, confusion)

In [None]:
# Cross-check the confusion matrix by counting the four outcome types directly.
# Class 1 is treated as the positive class, matching the summary statistics
# that are reported for label 1.0. The original captions had the positive and
# negative names swapped, and the last caption contradicted its own filter.
print_df(predictions.select("prediction", "CAT"))

# Rows where the prediction agrees / disagrees with the CAT label.
correct = predictions.filter(predictions.prediction == predictions.CAT)
wrong = predictions.filter(predictions.prediction != predictions.CAT)

# count() returns an integer, so the counts are formatted with %d.
print("True Positives (Pred. = 1 & Label = 1) %d " % correct.filter(predictions.CAT == 1).count())
print("True Negatives (Pred. = 0 & Label = 0) %d " % correct.filter(predictions.CAT == 0).count())
print("False Positives (Pred. = 1 & Label = 0) %d " % wrong.filter(predictions.CAT == 0).count())
print("False Negatives (Pred. = 0 & Label = 1) %d " % wrong.filter(predictions.CAT == 1).count())

## Statistics per label

In [None]:
# Collect the distinct label values that occur in the test predictions and
# report precision, recall and F1 score for each of them.
labels = predictionAndLabels.map(lambda pair: pair[1]).distinct().collect()
print(labels)
for label in labels:
    per_label_report = (
        ("Class %f precision = %f\n", metrics.precision(label)),
        ("Class %f recall = %f\n", metrics.recall(label)),
        ("Class %f F1 score = %f\n", metrics.fMeasure(label)),
    )
    for template, value in per_label_report:
        print(template % (label, value))

## Weighted stats

In [None]:
# Metrics averaged over all classes, as provided by MulticlassMetrics.
weighted_report = (
    ("Weighted precision = %s\n", metrics.weightedPrecision),
    ("Weighted recall = %s\n", metrics.weightedRecall),
    ("Weighted false positive rate = %s\n", metrics.weightedFalsePositiveRate),
)
for template, value in weighted_report:
    print(template % value)

In [None]:
## Summary stats

In [None]:
# Headline numbers for label 1.0, plus the overall accuracy.
positive_label = 1.0
positive_recall = metrics.recall(positive_label)
print(f"Recall = {positive_recall}")
positive_precision = metrics.precision(positive_label)
print(f"Precision = {positive_precision}")
overall_accuracy = metrics.accuracy
print(f"Accuracy = {overall_accuracy}")
positive_f1 = metrics.fMeasure(positive_label)
print(f"F1 = {positive_f1}")

In [None]:
# Shut down the Spark session now that the evaluation is finished.
spark.stop()