# Boston Housing Classification Logistic Regression

In [1]:
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import expr
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.evaluation import MulticlassMetrics

In [2]:
# Location of the Boston housing CSV, given relative to the working directory.
inputFile = "../data/Boston_Housing_Data.csv"

Spark session creation 

In [3]:
# Configure the session builder first, then obtain the session
# (getOrCreate returns an already-running session if there is one).
# NOTE(review): the app name mentions "churn" although this notebook works on
# the Boston housing data - consider renaming it.
sessionBuilder = SparkSession.builder.appName("ChurnDataPreprocessing")
spark = sessionBuilder.getOrCreate()

DataFrame creation using an inferred schema

In [4]:
# Read the semicolon-delimited CSV with a header row, let Spark infer the
# column types, and add CATBOOL: the integer CAT column cast to boolean.
df = (spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .option("delimiter", ";")
      .csv(inputFile)
      .withColumn("CATBOOL", expr("CAT").cast(BooleanType())))

# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() emitted an extra "None" line.
df.printSchema()

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: integer (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: integer (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)
 |-- CAT: integer (nullable = true)
 |-- CATBOOL: boolean (nullable = true)

None


In [5]:
# Every column except MEDV, CAT and CATBOOL is used as a model input feature.
# list.remove raises ValueError if one of the excluded columns is missing.
featureCols = list(df.columns)
for excludedCol in ("MEDV", "CAT", "CATBOOL"):
    featureCols.remove(excludedCol)
print(featureCols)

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


In [6]:
# Pack the individual feature columns into one vector column named "features".
assembler = VectorAssembler(
    inputCols=featureCols,
    outputCol="features",
)
featureSet = assembler.transform(df)


Build Labeled Point Feature Set

In [7]:
def _row_to_labeled_point(row):
    """Wrap one (CAT, features) row in an mllib LabeledPoint, with CAT as the label."""
    return LabeledPoint(row.CAT, DenseVector(row.features))


# Keep only the label and the assembled feature vector, then switch to the
# RDD API, which the mllib trainer consumes.
labelAndFeatures = featureSet.select(featureSet.CAT, featureSet.features)
featureSetLP = labelAndFeatures.rdd.map(_row_to_labeled_point)

Prepare training and test data.

In [8]:
# Random 90 % / 10 % split into training and test data, using a fixed seed.
splits = featureSetLP.randomSplit(weights=[0.9, 0.1], seed=12345)
training, test = splits

Logistic regression

Train the model 

In [9]:
# TODO: optimize the parameters of LR
# Fit a two-class logistic regression on the training split using L-BFGS.
modelLRLB = LogisticRegressionWithLBFGS.train(
    data=training,
    numClasses=2,
)

Test the model

In [10]:
def _predict_with_label(point):
    """Return [predicted class, true label] as floats for one test point."""
    predicted = modelLRLB.predict(point.features)
    return [float(predicted), float(point.label)]


# Pair each model prediction with the true label, then peek at the first 20 pairs.
predictionAndLabels = test.map(_predict_with_label)
print(predictionAndLabels.take(20))

[[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0], [0.0, 0.0], [0.0, 0.0], [1.0, 1.0], [1.0, 1.0], [0.0, 1.0]]


Evaluate the result

In [11]:
# The test error is reported as the complement of the accuracy.
metrics = MulticlassMetrics(predictionAndLabels)
testError = 1.0 - metrics.accuracy
print("Test Error LRLBFG =", testError)

Test Error LRLBFG = 0.04878048780487809


In [12]:
# Stop the Spark session; this is the last cell of the notebook.
spark.stop()