# Boston Housing Classification Logistic Regression

In [11]:
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import expr
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [12]:
inputFile = "../data/Boston_Housing_Data.csv"

Spark session creation 

In [13]:
spark = (SparkSession
       .builder
       .appName("BostonHoudingClass")
       .getOrCreate())

DataFrame creation using an ifered Schema 

In [14]:
df = spark.read.option("header", "true") \
       .option("inferSchema", "true") \
       .option("delimiter", ";") \
       .csv(inputFile) \
       .withColumn("CATBOOL", expr("CAT").cast(BooleanType()))
print(df.printSchema())

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: integer (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: integer (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)
 |-- CAT: integer (nullable = true)
 |-- CATBOOL: boolean (nullable = true)

None


Prepare training and test data.

In [15]:
featureCols = df.columns.copy()
featureCols.remove("MEDV")
featureCols.remove("CAT")
featureCols.remove("CATBOOL") 
print(featureCols)

assembler =  VectorAssembler(outputCol="features", inputCols=featureCols)

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


In [16]:
labledPointDataSet = assembler.transform(df)
splits = labledPointDataSet.randomSplit([0.9, 0.1 ], 12345)
training = splits[0]
test = splits[1]

Logistic regression

In [17]:
lr = LogisticRegression(labelCol="CAT",featuresCol="features", maxIter=100, \
                           regParam=0, standardization=True, aggregationDepth=2)

Train the model 

In [18]:
lrModel = lr.fit(training)

Test the model

In [19]:
predictions = lrModel.transform(test)
predictions.show()

+-------+----+-----+----+------+-----+----+------+---+---+-------+------+-----+----+---+-------+--------------------+--------------------+--------------------+----------+
|   CRIM|  ZN|INDUS|CHAS|   NOX|   RM| AGE|   DIS|RAD|TAX|PTRATIO|     B|LSTAT|MEDV|CAT|CATBOOL|            features|       rawPrediction|         probability|prediction|
+-------+----+-----+----+------+-----+----+------+---+---+-------+------+-----+----+---+-------+--------------------+--------------------+--------------------+----------+
|0.01311|90.0| 1.22|   0| 0.403|7.249|21.9|8.6966|  5|226|   17.9|395.93| 4.81|35.4|  1|   true|[0.01311,90.0,1.2...|[-2.3687234279125...|[0.08558899561916...|       1.0|
|0.01439|60.0| 2.93|   0| 0.401|6.604|18.8|6.2196|  1|265|   15.6| 376.7| 4.38|29.1|  0|  false|[0.01439,60.0,2.9...|[-0.1511796981422...|[0.46227689585302...|       1.0|
|0.03871|52.5| 5.32|   0| 0.405|6.209|31.3|7.3172|  6|293|   16.6| 396.9| 7.14|23.2|  0|  false|[0.03871,52.5,5.3...|[2.13769388418023...|[0.8945

In [20]:
evaluator = BinaryClassificationEvaluator(labelCol="CAT",rawPredictionCol="prediction", metricName="areaUnderROC")
accuracy = evaluator.evaluate(predictions)
print("Test Error",(1.0 - accuracy))

Test Error 0.1026785714285714
