# Boston Housing Classification SVM Cross Validation

In [None]:
import sys
sys.path.append("..")
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql.functions import expr
from pyspark.sql.session import SparkSession
from pyspark.sql.types import BooleanType
from helpers.path_translation import translate_to_file_string

In [None]:
# Location of the Boston Housing CSV, passed through the project's path helper.
# NOTE(review): presumably the helper turns the relative path into a form Spark's
# reader accepts — confirm in helpers.path_translation.
inputFile = translate_to_file_string("../data/Boston_Housing_Data.csv")

Spark session creation 

In [None]:
# Obtain the SparkSession used by the rest of this notebook.
# The application name is what shows up in the Spark UI and logs, so it should
# describe this notebook (the previous name referred to an unrelated churn job).
spark = (
    SparkSession.builder
    .appName("BostonHousingSVMCrossValidation")
    .getOrCreate()
)

DataFrame creation using an inferred schema

In [None]:
# Read the semicolon-delimited CSV (first row is the header) and let Spark infer
# the column types; CATBOOL is added as a boolean-typed cast of the CAT column.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
    .withColumn("CATBOOL", expr("CAT").cast(BooleanType()))
)
# printSchema() prints the schema itself and returns None, so it must not be
# wrapped in print() — that would emit an extra "None" line.
df.printSchema()

In [None]:
# Randomly split the rows with weights 0.9 / 0.1 using a fixed seed;
# the larger part is used for training, the smaller for testing.
splits = df.randomSplit(weights=[0.9, 0.1], seed=12345)
training, test = splits[0], splits[1]

Prepare training and test data.

Data preprocessing

In [None]:
# Start from all DataFrame columns and drop the ones that must not be features:
# CAT (used as the label column), CATBOOL (its boolean cast) and MEDV
# (presumably the raw value CAT was derived from — confirm against the dataset).
featureCols = list(df.columns)
for excluded in ("MEDV", "CAT", "CATBOOL"):
    # list.remove raises ValueError if the column is unexpectedly missing
    featureCols.remove(excluded)
print(featureCols)

# Combine the remaining columns into a single vector column named "features".
assembler = VectorAssembler(
    inputCols=featureCols,
    outputCol="features",
)

Build the evaluator

In [None]:
# Evaluator reporting the area under the ROC curve, computed from the
# "rawPrediction" column against the "CAT" label column.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    labelCol="CAT",
    rawPredictionCol="rawPrediction",
)

Support Vector Machine Classifier

In [None]:
# Linear SVM that learns the "CAT" label from the assembled "features" vector.
lsvc = LinearSVC(
    featuresCol="features",
    labelCol="CAT",
    aggregationDepth=2,
)

Build the pipeline

In [None]:
# Stage 0 assembles the feature vector, stage 1 is the linear SVM classifier.
pipeline = Pipeline(
    stages=[
        assembler,
        lsvc,
    ]
)

Build the paramGrid

In [None]:
# Hyper-parameter grid for the SVM: 1 maxIter value x 3 regParam values
# x 2 standardization settings = 6 candidate parameter maps.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lsvc.maxIter, [100])
    .addGrid(lsvc.regParam, [0.1, 0.001, 0.0001])
    .addGrid(lsvc.standardization, [True, False])
    .build()
)

Build the CrossValidator 

In [None]:
# 5-fold cross validation over the parameter grid, scoring each candidate
# pipeline with the evaluator and fitting up to 2 models in parallel.
cvSVM = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5,
    parallelism=2,
)

Train the model 

In [None]:
# Run the cross-validated search on the training split; the returned model
# exposes the best-scoring fitted pipeline as `bestModel`.
cvSVMModel = cvSVM.fit(training)

Show best Model 

In [None]:
# bestModel is the winning fitted pipeline; its stages mirror the pipeline
# definition, so index 1 holds the fitted SVM model (index 0 is the assembler).
bestPipelineModel = cvSVMModel.bestModel
linearSVCModel = bestPipelineModel.stages[1]

# Dump the parameters of the selected SVM model.
print("Best Params: \n", linearSVCModel.explainParams())
print("Param Map: \n", linearSVCModel.extractParamMap())

Test the model

In [None]:
# Apply the cross-validated model to the held-out test split.
predictions = cvSVMModel.transform(test)
# Print the first rows of the result for a quick visual check.
predictions.show()

In [None]:
# `evaluator` is configured with metricName="areaUnderROC", so its result is the
# area under the ROC curve — not an accuracy. Report it under its real name.
areaUnderROC = evaluator.evaluate(predictions)
print("Test areaUnderROC", areaUnderROC)

# The test error is 1 - accuracy, so compute the actual accuracy (fraction of
# rows whose "prediction" equals the "CAT" label) with a dedicated evaluator.
accuracyEvaluator = MulticlassClassificationEvaluator(
    labelCol="CAT",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = accuracyEvaluator.evaluate(predictions)
print("Test Error",(1.0 - accuracy))