# Boston Housing Classification SVM

In [1]:
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LinearSVC
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import expr
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [2]:
inputFile = "../data/Boston_Housing_Data.csv"

Spark session creation 

In [3]:
spark = (SparkSession
       .builder
       .appName("ChurnDataPreprocessing")
       .getOrCreate())

DataFrame creation using an ifered Schema 

In [4]:
df = spark.read.option("header", "true") \
       .option("inferSchema", "true") \
       .option("delimiter", ";") \
       .csv(inputFile) \
       .withColumn("CATBOOL", expr("CAT").cast(BooleanType()))
print(df.printSchema())

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: integer (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: integer (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)
 |-- CAT: integer (nullable = true)
 |-- CATBOOL: boolean (nullable = true)

None


Prepare training and test data.

In [5]:
featureCols = df.columns.copy()
featureCols.remove("MEDV")
featureCols.remove("CAT")
featureCols.remove("CATBOOL") 
print(featureCols)

assembler =  VectorAssembler(outputCol="features", inputCols=featureCols)

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


In [6]:
labledPointDataSet = assembler.transform(df)
splits = labledPointDataSet.randomSplit([0.9, 0.1 ], 12345)
training = splits[0]
test = splits[1]

Support Vector Machine Classifier

In [7]:
lsvc = LinearSVC(labelCol="CAT",aggregationDepth=2, featuresCol="features",maxIter=1000
                    ,regParam=0.001, standardization=True ) 

Train the model 

In [8]:
lsvcModel = lsvc.fit(training)

Test the model

In [9]:
predictions = lsvcModel.transform(test)
predictions.show()

+-------+----+-----+----+------+-----+----+------+---+---+-------+------+-----+----+---+-------+--------------------+--------------------+----------+
|   CRIM|  ZN|INDUS|CHAS|   NOX|   RM| AGE|   DIS|RAD|TAX|PTRATIO|     B|LSTAT|MEDV|CAT|CATBOOL|            features|       rawPrediction|prediction|
+-------+----+-----+----+------+-----+----+------+---+---+-------+------+-----+----+---+-------+--------------------+--------------------+----------+
|0.01311|90.0| 1.22|   0| 0.403|7.249|21.9|8.6966|  5|226|   17.9|395.93| 4.81|35.4|  1|   true|[0.01311,90.0,1.2...|[-1.1242835635457...|       1.0|
|0.01439|60.0| 2.93|   0| 0.401|6.604|18.8|6.2196|  1|265|   15.6| 376.7| 4.38|29.1|  0|  false|[0.01439,60.0,2.9...|[-0.5981373936435...|       1.0|
|0.03871|52.5| 5.32|   0| 0.405|6.209|31.3|7.3172|  6|293|   16.6| 396.9| 7.14|23.2|  0|  false|[0.03871,52.5,5.3...|[1.36317782240409...|       0.0|
| 0.0456| 0.0|13.89|   1|  0.55|5.888|56.0|3.1121|  5|276|   16.4| 392.8|13.51|23.3|  0|  false|[0.0

In [10]:
evaluator = BinaryClassificationEvaluator(labelCol="CAT",rawPredictionCol="prediction", metricName="areaUnderROC")
accuracy = evaluator.evaluate(predictions)
print("Test Error",(1.0 - accuracy))

Test Error 0.1026785714285714
