# Boston Housing Classification SVM Cross Validation

In [25]:
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.pipeline import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql.functions import expr
from pyspark.sql.session import SparkSession
from pyspark.sql.types import BooleanType

In [26]:
inputFile = "../data/Boston_Housing_Data.csv"

Spark session creation 

In [27]:
spark = (SparkSession
       .builder
       .appName("ChurnDataPreprocessing")
       .getOrCreate())

DataFrame creation using an ifered Schema 

In [28]:
df = spark.read.option("header", "true") \
       .option("inferSchema", "true") \
       .option("delimiter", ";") \
       .csv(inputFile) \
       .withColumn("CATBOOL", expr("CAT").cast(BooleanType()))
print(df.printSchema())

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: integer (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: integer (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)
 |-- CAT: integer (nullable = true)
 |-- CATBOOL: boolean (nullable = true)

None


In [29]:
splits = df.randomSplit([0.9, 0.1 ], 12345)
training = splits[0]
test = splits[1]

Prepare training and test data.

Data preprocessing

In [30]:
featureCols = df.columns.copy()
featureCols.remove("MEDV")
featureCols.remove("CAT")
featureCols.remove("CATBOOL") 
print(featureCols)

assembler =  VectorAssembler(outputCol="features", inputCols=featureCols)

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


Build the evaluator

In [31]:
evaluator = BinaryClassificationEvaluator(labelCol="CAT",rawPredictionCol="prediction", metricName="areaUnderROC")

Support Vector Machine Classifier

In [32]:
lsvc = LinearSVC(labelCol="CAT",aggregationDepth=2, featuresCol="features" ) 

Build the pipeline

In [33]:
pipeline = Pipeline(stages= [assembler, lsvc] )

Build the paramGrid

In [34]:
paramGrid = ParamGridBuilder().addGrid(lsvc.maxIter, [100])\
                                 .addGrid(lsvc.regParam, [0.1, 0.001, 0.0001]) \
                                 .addGrid(lsvc.standardization, [True, False]) \
                                 .build()

Build the CrossValidator 

In [35]:
cvSVM = CrossValidator(estimator=pipeline, evaluator=evaluator, \
                          estimatorParamMaps=paramGrid, numFolds=5, parallelism=2)

Train the model 

In [36]:
cvSVMModel = cvSVM.fit(training)

Show best Model 

In [39]:
linearSVCModel = cvSVMModel.bestModel.stages[1] # the stage at index 1 in the pipeline is the SVMModel
print("Best Params: \n", linearSVCModel.explainParams())
print("Param Map: \n", linearSVCModel.extractParamMap())

Best Params: 
 aggregationDepth: suggested depth for treeAggregate (>= 2) (default: 2, current: 2)
featuresCol: features column name (default: features, current: features)
fitIntercept: whether to fit an intercept term (default: True)
labelCol: label column name (default: label, current: CAT)
maxIter: maximum number of iterations (>= 0) (default: 100, current: 100)
predictionCol: prediction column name (default: prediction)
rawPredictionCol: raw prediction (a.k.a. confidence) column name (default: rawPrediction)
regParam: regularization parameter (>= 0) (default: 0.0, current: 0.0001)
standardization: whether to standardize the training features before fitting the model (default: True, current: True)
threshold: threshold in binary classification prediction applied to rawPrediction (default: 0.0)
tol: the convergence tolerance for iterative algorithms (>= 0) (default: 1e-06)
weightCol: weight column name. If this is not set or empty, we treat all instance weights as 1.0 (undefined)
Para

Test the model

In [40]:
predictions = cvSVMModel.transform(test)
predictions.show()

+-------+----+-----+----+------+-----+----+------+---+---+-------+------+-----+----+---+-------+--------------------+--------------------+----------+
|   CRIM|  ZN|INDUS|CHAS|   NOX|   RM| AGE|   DIS|RAD|TAX|PTRATIO|     B|LSTAT|MEDV|CAT|CATBOOL|            features|       rawPrediction|prediction|
+-------+----+-----+----+------+-----+----+------+---+---+-------+------+-----+----+---+-------+--------------------+--------------------+----------+
|0.01311|90.0| 1.22|   0| 0.403|7.249|21.9|8.6966|  5|226|   17.9|395.93| 4.81|35.4|  1|   true|[0.01311,90.0,1.2...|[-0.9912976507421...|       1.0|
|0.01439|60.0| 2.93|   0| 0.401|6.604|18.8|6.2196|  1|265|   15.6| 376.7| 4.38|29.1|  0|  false|[0.01439,60.0,2.9...|[-0.8161967243092...|       1.0|
|0.03871|52.5| 5.32|   0| 0.405|6.209|31.3|7.3172|  6|293|   16.6| 396.9| 7.14|23.2|  0|  false|[0.03871,52.5,5.3...|[1.64551746979580...|       0.0|
| 0.0456| 0.0|13.89|   1|  0.55|5.888|56.0|3.1121|  5|276|   16.4| 392.8|13.51|23.3|  0|  false|[0.0

In [41]:
accuracy = evaluator.evaluate(predictions)
print("Test Error",(1.0 - accuracy))

Test Error 0.1026785714285714
