# Boston Housing Classification SVM

In [None]:
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LinearSVC
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import expr
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from helpers.helper_functions import translate_to_file_string
# for pretty printing
def printDf(sprkDF):
    """Render a Spark DataFrame as an HTML table for notebook display.

    The DataFrame is converted with ``toPandas()``, which brings all of its
    rows to the driver, so this is only suitable for small DataFrames.

    Args:
        sprkDF: Object exposing ``toPandas()`` (here a Spark DataFrame).

    Returns:
        An ``IPython.display.HTML`` object wrapping the table markup. A
        notebook renders it when it is the last expression of a cell.
    """
    # Imported inside the function so IPython is only required when the
    # helper is actually called.
    from IPython.display import HTML

    return HTML(sprkDF.toPandas().to_html())

In [None]:
# Location of the Boston housing CSV. translate_to_file_string is a project
# helper; presumably it turns the relative path into a form Spark can read
# (TODO confirm against helpers.helper_functions).
inputFile = translate_to_file_string(
    "../data/Boston_Housing_Data.csv"
)

Spark session creation 

In [None]:
# Reuse the active Spark session if there is one, otherwise start a new one.
# The application name is what identifies this job in the Spark UI and logs,
# so it names this notebook (it previously read "ChurnDataPreprocessing").
spark = (
    SparkSession.builder
    .appName("BostonHousingClassificationSVM")
    .getOrCreate()
)

DataFrame creation using an inferred schema

In [None]:
# Load the semicolon-delimited CSV with a header row and let Spark infer the
# column types.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
    # Boolean cast of the CAT column, added as an extra column.
    .withColumn("CATBOOL", expr("CAT").cast(BooleanType()))
)
# printSchema() writes the schema to stdout itself and returns None, so
# wrapping it in print() would emit an extra "None" line.
df.printSchema()

Prepare training and test data.

In [None]:
# Use every column as a feature except MEDV and the two encodings of the
# class label (CAT and its boolean cast CATBOOL).
# NOTE(review): MEDV is presumably the raw value CAT was derived from and is
# excluded to avoid leaking the label — confirm against the data set.
featureCols = list(df.columns)
for excluded in ("MEDV", "CAT", "CATBOOL"):
    # list.remove raises ValueError if the column is not present.
    featureCols.remove(excluded)
print(featureCols)

# Packs the remaining columns into one vector column named "features".
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

In [None]:
# Append the assembled "features" vector column, then split the rows with
# weights 0.9 / 0.1 into training and test sets using a fixed seed.
labledPointDataSet = assembler.transform(df)
splits = labledPointDataSet.randomSplit([0.9, 0.1], seed=12345)
training, test = splits

Support Vector Machine Classifier

In [None]:
# Linear SVM reading the assembled "features" vector, with CAT as the label.
lsvc = LinearSVC(
    featuresCol="features",
    labelCol="CAT",
    maxIter=100,
    regParam=0.001,
    standardization=True,
    aggregationDepth=2,
)

Train the model 

In [None]:
# Fit the linear SVM on the training split, then show the learned
# coefficients and intercept.
lsvcModel = lsvc.fit(training)
print(f"Coefficients: {lsvcModel.coefficients}")
print(f"Intercept: {lsvcModel.intercept}")

Test the model

In [None]:
# Score the held-out split; the resulting DataFrame carries the model's
# output columns (rawPrediction and prediction are read by later cells).
predictions = lsvcModel.transform(test)
# Kept as the bare last expression of the cell so the notebook renders the
# HTML object that printDf returns.
printDf(predictions)

In [None]:
# The rawPrediction column holds [-margin, margin]:
# positive margin => 1
# negative margin => 0

# areaUnderROC is a ranking metric, not an accuracy, so it is reported under
# its own name instead of being converted into a "test error" via 1 - value.
evaluator = BinaryClassificationEvaluator(
    labelCol="CAT",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
areaUnderROC = evaluator.evaluate(predictions)
print("Area under ROC", areaUnderROC)

In [None]:
# Keep only the predicted class and the true label, and peek at 20 rows.
predictionAndLabels = predictions.select("prediction", "CAT")
print(predictionAndLabels.take(20))

In [None]:
# Same metric as before, but scored on the hard 0/1 prediction column
# instead of the raw margin. The result is still an area under the ROC curve,
# so it is printed as such rather than as "1 - accuracy".
evaluator2 = BinaryClassificationEvaluator(
    labelCol="CAT",
    rawPredictionCol="prediction",
    metricName="areaUnderROC",
)

areaUnderROCHard = evaluator2.evaluate(predictionAndLabels)
print("Area under ROC (hard predictions)", areaUnderROCHard)

In [None]:
# Accuracy computed by hand: count all rows, the rows where the prediction
# equals the label, and the rows where it differs. Each count() is a
# separate Spark action.
countall = predictionAndLabels.count()
countcorrect = predictionAndLabels.where("CAT == prediction").count()
countincorrect = predictionAndLabels.where("CAT != prediction").count()

# Fraction of correctly classified test rows.
accuracy = countcorrect / countall

print(f"countcorrect: {countcorrect}")
print(f"countincorrect: {countincorrect}")
print(f"countall: {countall}")
print(f"accuracy: {accuracy}")
print(f"Test Error {1-accuracy}")

In [None]:
# Number of test rows predicted as class 1. The filter selects prediction
# 1.0, so the variable is named for that (it was previously called
# countpredzero). The comparison uses a numeric literal rather than the
# string '1.0' because prediction is a numeric column.
countpredone = predictionAndLabels.filter("prediction == 1.0").count()
print(countpredone)

In [None]:
# Stop the Spark session; every cell that uses `spark` or a DataFrame built
# from it has to run before this one.
spark.stop()