# Boston Housing Classification SVM

In [1]:
import sys
sys.path.append("..")
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LinearSVC
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import expr
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from helpers.path_translation import translate_to_file_string
from helpers.data_prep_and_print import print_df

In [2]:
inputFile = translate_to_file_string("../data/Boston_Housing_Data.csv")

Spark session creation 

In [3]:
spark = (SparkSession
       .builder
       .appName("ChurnDataPreprocessing")
       .getOrCreate())

22/05/10 15:36:13 WARN Utils: Your hostname, Christophs-MacBook-Pro-2.local resolves to a loopback address: 127.0.0.1; using 10.154.97.185 instead (on interface en0)
22/05/10 15:36:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/10 15:36:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/05/10 15:36:15 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/05/10 15:36:15 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


DataFrame creation using an ifered Schema 

In [4]:
df = spark.read.option("header", "true") \
       .option("inferSchema", "true") \
       .option("delimiter", ";") \
       .csv(inputFile) \
       .withColumn("CATBOOL", expr("CAT").cast(BooleanType()))
print(df.printSchema())

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: integer (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: integer (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)
 |-- CAT: integer (nullable = true)
 |-- CATBOOL: boolean (nullable = true)

None


Prepare training and test data.

In [5]:
featureCols = df.columns.copy()
featureCols.remove("MEDV")
featureCols.remove("CAT")
featureCols.remove("CATBOOL") 
print(featureCols)

assembler =  VectorAssembler(outputCol="features", inputCols=featureCols)

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


In [6]:
labledPointDataSet = assembler.transform(df)
splits = labledPointDataSet.randomSplit([0.9, 0.1 ], 12345)
training = splits[0]
test = splits[1]

Support Vector Machine Classifier

In [7]:
lsvc = LinearSVC(labelCol="CAT",aggregationDepth=2, featuresCol="features",maxIter=100
                    ,regParam=0.001, standardization=True ) 
# nox rausgenommen, regParam 0.4 standadization = false

Train the model 

In [8]:
lsvcModel = lsvc.fit(training)
print("Coefficients: " + str(lsvcModel.coefficients))
print("Intercept: " + str(lsvcModel.intercept))

22/05/10 15:36:24 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/05/10 15:36:24 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


Coefficients: [0.009883732517471192,0.008024957184771007,-0.07697983943305368,0.18475720601192872,1.2072019870702853,2.315451034677114,0.007127599555774839,-0.09125186865512716,0.09336659142322308,-0.002799083364356597,-0.2507837055101058,-0.0012911424319540215,-0.2525554918564154]
Intercept: -9.505592941401014


Test the model

In [9]:
predictions = lsvcModel.transform(test)
print_df(predictions)

22/05/10 15:36:32 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,CAT,CATBOOL,features,rawPrediction,prediction
0,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6,0,False,"[0.02731, 0.0, 7.07, 0.0, 0.469, 6.421, 78.9, 4.9671, 2.0, 242.0, 17.8, 396.9, 9.14]","[2.2821777839241095, -2.2821777839241095]",0.0
1,0.02763,75.0,2.95,0,0.428,6.595,21.8,5.4011,3,252,18.3,395.63,4.32,30.8,1,True,"[0.02763, 75.0, 2.95, 0.0, 0.428, 6.595, 21.8, 5.4011, 3.0, 252.0, 18.3, 395.63, 4.32]","[0.2974008142137201, -0.2974008142137201]",0.0
2,0.03427,0.0,5.19,0,0.515,5.869,46.3,5.2311,5,224,20.2,396.9,9.8,19.5,0,False,"[0.03427, 0.0, 5.19, 0.0, 0.515, 5.869, 46.3, 5.2311, 5.0, 224.0, 20.2, 396.9, 9.8]","[4.054519056612792, -4.054519056612792]",0.0
3,0.03659,25.0,4.86,0,0.426,6.302,32.2,5.4007,4,281,19.0,396.9,6.72,24.8,0,False,"[0.03659, 25.0, 4.86, 0.0, 0.426, 6.302, 32.2, 5.4007, 4.0, 281.0, 19.0, 396.9, 6.72]","[2.2233979808772304, -2.2233979808772304]",0.0
4,0.03768,80.0,1.52,0,0.404,7.274,38.3,7.309,2,329,12.6,392.2,6.62,34.6,1,True,"[0.03768, 80.0, 1.52, 0.0, 0.404, 7.274, 38.3, 7.309, 2.0, 329.0, 12.6, 392.2, 6.62]","[-1.8837509291312102, 1.8837509291312102]",1.0
5,0.05789,12.5,6.07,0,0.409,5.878,21.4,6.498,4,345,18.9,396.21,8.1,22.0,0,False,"[0.05789, 12.5, 6.07, 0.0, 0.409, 5.878, 21.4, 6.498, 4.0, 345.0, 18.9, 396.21, 8.1]","[4.09772610631079, -4.09772610631079]",0.0
6,0.06127,40.0,6.41,1,0.447,6.826,27.6,4.8628,4,254,17.6,393.45,4.16,33.1,1,True,"[0.06127, 40.0, 6.41, 1.0, 0.447, 6.826, 27.6, 4.8628, 4.0, 254.0, 17.6, 393.45, 4.16]","[-0.2952727074903141, 0.2952727074903141]",1.0
7,0.06162,0.0,4.39,0,0.442,5.898,52.3,8.0136,3,352,18.8,364.61,12.67,17.2,0,False,"[0.06162, 0.0, 4.39, 0.0, 0.442, 5.898, 52.3, 8.0136, 3.0, 352.0, 18.8, 364.61, 12.67]","[5.101847195500794, -5.101847195500794]",0.0
8,0.07875,45.0,3.44,0,0.437,6.782,41.1,3.7886,5,398,15.2,393.87,6.68,32.0,1,True,"[0.07875, 45.0, 3.44, 0.0, 0.437, 6.782, 41.1, 3.7886, 5.0, 398.0, 15.2, 393.87, 6.68]","[-0.11493402495915639, 0.11493402495915639]",1.0
9,0.0795,60.0,1.69,0,0.411,6.579,35.9,10.7103,4,411,18.3,370.78,5.49,24.1,0,False,"[0.0795, 60.0, 1.69, 0.0, 0.411, 6.579, 35.9, 10.7103, 4.0, 411.0, 18.3, 370.78, 5.49]","[1.3769055223037867, -1.3769055223037867]",0.0


In [10]:
# in column rawpredicition is [-margin, margin] 
# positive margin => 1 
# negative margin => 0

evaluator = BinaryClassificationEvaluator(labelCol="CAT",rawPredictionCol="rawPrediction", metricName="areaUnderROC")
accuracy = evaluator.evaluate(predictions)
print("Test Error",(1.0 - accuracy))

Test Error 0.08045977011494243


In [11]:
predictionAndLabels = predictions.select(predictions.prediction, predictions.CAT)
print (predictionAndLabels.take(20))

[Row(prediction=0.0, CAT=0), Row(prediction=0.0, CAT=1), Row(prediction=0.0, CAT=0), Row(prediction=0.0, CAT=0), Row(prediction=1.0, CAT=1), Row(prediction=0.0, CAT=0), Row(prediction=1.0, CAT=1), Row(prediction=0.0, CAT=0), Row(prediction=1.0, CAT=1), Row(prediction=0.0, CAT=0), Row(prediction=0.0, CAT=0), Row(prediction=0.0, CAT=0), Row(prediction=0.0, CAT=0), Row(prediction=0.0, CAT=0), Row(prediction=0.0, CAT=0), Row(prediction=0.0, CAT=0), Row(prediction=0.0, CAT=0), Row(prediction=0.0, CAT=0), Row(prediction=0.0, CAT=0), Row(prediction=0.0, CAT=0)]


In [12]:
evaluator2 = BinaryClassificationEvaluator(labelCol="CAT",rawPredictionCol="prediction", metricName="areaUnderROC")

accuracy = evaluator2.evaluate(predictionAndLabels)
print("Test Error",(1.0 - accuracy))

Test Error 0.16666666666666674


In [13]:
countcorrect = predictionAndLabels.filter("CAT == prediction").count()
countincorrect = predictionAndLabels.filter("CAT != prediction").count()
countall = predictionAndLabels.count()
accuracy = countcorrect/countall
print(f"countcorrect: {countcorrect}")
print(f"countincorrect: {countincorrect}")
print(f"countall: {countall}")
print(f"accuracy: {accuracy}")
print(f"Test Error {1-accuracy}")

countcorrect: 33
countincorrect: 2
countall: 35
accuracy: 0.9428571428571428
Test Error 0.05714285714285716


In [14]:
countpredzero =  predictionAndLabels.filter("prediction == '1.0'").count()
print (countpredzero)

4


In [15]:
spark.stop()