# Boston Housing Classification SVM

In [1]:
import sys
sys.path.append("..")
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LinearSVC
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import expr
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from helpers.data_prep_and_print import print_df
from helpers.path_translation import translate_to_file_string

In [2]:
# Location of the Boston housing CSV, relative to this notebook.
# NOTE(review): translate_to_file_string is a project helper; presumably it
# turns the relative path into a file string Spark can open — confirm.
relative_csv_path = "../data/Boston_Housing_Data.csv"
inputFile = translate_to_file_string(relative_csv_path)

Spark session creation 

In [3]:
# Configure the session builder with this notebook's application name, then
# obtain the session (getOrCreate reuses an existing session if there is one).
session_builder = SparkSession.builder.appName("BostonHousingClassSVN")
spark = session_builder.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/25 12:40:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


DataFrame creation using an inferred schema

In [4]:
# Read the semicolon-delimited CSV (first row is the header), letting Spark
# infer the column types, and add CATBOOL as a boolean cast of the CAT column.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
    .withColumn("CATBOOL", expr("CAT").cast(BooleanType()))
)

# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" line.
df.printSchema()

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: integer (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: integer (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)
 |-- CAT: integer (nullable = true)
 |-- CATBOOL: boolean (nullable = true)

None


Prepare training and test data.

In [5]:
# Start from a copy of all column names and drop the ones that must not be
# used as model inputs: MEDV, CAT and CATBOOL.
featureCols = list(df.columns)
for excluded_col in ("MEDV", "CAT", "CATBOOL"):
    # .remove raises ValueError if the column is unexpectedly missing.
    featureCols.remove(excluded_col)
print(featureCols)

# Combines the remaining columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


In [6]:
# Add the assembled "features" vector column to the data.
labledPointDataSet = assembler.transform(df)

# Random split with weights 0.9 / 0.1; the fixed seed keeps the split the
# same from run to run.
splits = labledPointDataSet.randomSplit([0.9, 0.1], seed=12345)
training, test = splits

Support Vector Machine Classifier

In [7]:
# TODO: try out and optimize different settings.
# Linear support vector classifier trained on the assembled "features" vector
# with the integer CAT column as the label.
lsvc = LinearSVC(
    featuresCol="features",
    labelCol="CAT",
    maxIter=10,            # upper bound on optimizer iterations
    regParam=0.5,          # regularization parameter
    standardization=True,  # standardize features before fitting
)

Train the model 

In [8]:
# Fit the linear SVM on the training split; the fitted model is used below to
# produce predictions for the test split.
lsvc_model = lsvc.fit(training)

22/05/25 12:41:04 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/05/25 12:41:04 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


Test the model

In [9]:
# Apply the fitted model to the held-out test split; the result gains the
# rawPrediction and prediction columns.
predictions = lsvc_model.transform(test)
# NOTE(review): print_df is a project helper; presumably it displays the first
# 10 rows of the DataFrame — confirm in helpers.data_prep_and_print.
print_df(predictions,10)

22/05/25 12:41:06 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,CAT,CATBOOL,features,rawPrediction,prediction
0,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6,0,False,"[0.02731, 0.0, 7.07, 0.0, 0.469, 6.421, 78.9, 4.9671, 2.0, 242.0, 17.8, 396.9, 9.14]","[0.9393735721340299, -0.9393735721340299]",0.0
1,0.02763,75.0,2.95,0,0.428,6.595,21.8,5.4011,3,252,18.3,395.63,4.32,30.8,1,True,"[0.02763, 75.0, 2.95, 0.0, 0.428, 6.595, 21.8, 5.4011, 3.0, 252.0, 18.3, 395.63, 4.32]","[0.590970461757879, -0.590970461757879]",0.0
2,0.03427,0.0,5.19,0,0.515,5.869,46.3,5.2311,5,224,20.2,396.9,9.8,19.5,0,False,"[0.03427, 0.0, 5.19, 0.0, 0.515, 5.869, 46.3, 5.2311, 5.0, 224.0, 20.2, 396.9, 9.8]","[1.4028886335121546, -1.4028886335121546]",0.0
3,0.03659,25.0,4.86,0,0.426,6.302,32.2,5.4007,4,281,19.0,396.9,6.72,24.8,0,False,"[0.03659, 25.0, 4.86, 0.0, 0.426, 6.302, 32.2, 5.4007, 4.0, 281.0, 19.0, 396.9, 6.72]","[1.0099532761223131, -1.0099532761223131]",0.0
4,0.03768,80.0,1.52,0,0.404,7.274,38.3,7.309,2,329,12.6,392.2,6.62,34.6,1,True,"[0.03768, 80.0, 1.52, 0.0, 0.404, 7.274, 38.3, 7.309, 2.0, 329.0, 12.6, 392.2, 6.62]","[-0.06308728969790733, 0.06308728969790733]",1.0
5,0.05789,12.5,6.07,0,0.409,5.878,21.4,6.498,4,345,18.9,396.21,8.1,22.0,0,False,"[0.05789, 12.5, 6.07, 0.0, 0.409, 5.878, 21.4, 6.498, 4.0, 345.0, 18.9, 396.21, 8.1]","[1.3684396618406895, -1.3684396618406895]",0.0
6,0.06127,40.0,6.41,1,0.447,6.826,27.6,4.8628,4,254,17.6,393.45,4.16,33.1,1,True,"[0.06127, 40.0, 6.41, 1.0, 0.447, 6.826, 27.6, 4.8628, 4.0, 254.0, 17.6, 393.45, 4.16]","[0.41597045942923305, -0.41597045942923305]",0.0
7,0.06162,0.0,4.39,0,0.442,5.898,52.3,8.0136,3,352,18.8,364.61,12.67,17.2,0,False,"[0.06162, 0.0, 4.39, 0.0, 0.442, 5.898, 52.3, 8.0136, 3.0, 352.0, 18.8, 364.61, 12.67]","[1.4903030337651904, -1.4903030337651904]",0.0
8,0.07875,45.0,3.44,0,0.437,6.782,41.1,3.7886,5,398,15.2,393.87,6.68,32.0,1,True,"[0.07875, 45.0, 3.44, 0.0, 0.437, 6.782, 41.1, 3.7886, 5.0, 398.0, 15.2, 393.87, 6.68]","[0.3820601205986125, -0.3820601205986125]",0.0
9,0.0795,60.0,1.69,0,0.411,6.579,35.9,10.7103,4,411,18.3,370.78,5.49,24.1,0,False,"[0.0795, 60.0, 1.69, 0.0, 0.411, 6.579, 35.9, 10.7103, 4.0, 411.0, 18.3, 370.78, 5.49]","[0.9143800468262839, -0.9143800468262839]",0.0


In [10]:
# Compare the prediction column with the CAT label using the accuracy metric.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="CAT",
)
accuracy = evaluator.evaluate(predictions)

# Report the error rate, i.e. the complement of the accuracy.
test_error = 1.0 - accuracy
print("Test Error", test_error)

Test Error 0.1428571428571429


In [11]:
# Recompute the accuracy by hand as a cross-check of the evaluator result:
# count the test rows whose prediction equals the CAT label.
correctly_classified = predictions.filter("CAT == prediction")
countcorrect = correctly_classified.count()
countall = predictions.count()
accuracy = countcorrect / countall

for report_line in (
    f"countcorrect: {countcorrect}",
    f"countall: {countall}",
    f"accuracy: {accuracy}",
    f"Test Error {1.0-accuracy}",
):
    print(report_line)

countcorrect: 30
countall: 35
accuracy: 0.8571428571428571
Test Error 0.1428571428571429
