# Boston Housing Clustering

In [106]:
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import StringIndexer, VectorAssembler, BucketedRandomProjectionLSH, MinHashLSH
from pyspark.ml.clustering import KMeans, BisectingKMeans
from pyspark.sql.session import SparkSession, Row
from pyspark.sql.functions import desc, expr
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml import Pipeline

In [107]:
# Path to the CSV dataset, relative to the directory the notebook is run from.
inputFile = "../data/Boston_Housing_Data.csv"

Spark session creation 

In [108]:
# Create the Spark session, or reuse one that is already running.
# The app name is only a label; it previously read "ChurnDataPreprocessing",
# a leftover from another notebook that mislabelled this job.
spark = (
    SparkSession.builder
    .appName("BostonHousingClustering")
    .getOrCreate()
)

DataFrame creation using an inferred schema

In [109]:
# Read the semicolon-delimited CSV (with a header row), letting Spark infer the
# column types, and add CATBOOL as a boolean cast of the CAT column.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .csv(inputFile)
    .withColumn("CATBOOL", expr("CAT").cast(BooleanType()))
)
# printSchema() writes the schema tree itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the cell output.
df.printSchema()

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: integer (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: integer (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)
 |-- CAT: integer (nullable = true)
 |-- CATBOOL: boolean (nullable = true)

None


## Data Preparation

In [110]:
# Every column is a clustering feature except MEDV, CAT and the CATBOOL column
# derived from CAT.
excludedCols = ("MEDV", "CAT", "CATBOOL")
featureCols = list(df.columns)
for excludedCol in excludedCols:
    # list.remove raises ValueError if an expected column is missing.
    featureCols.remove(excludedCol)
print(featureCols)

# Pack the selected columns into the single vector column "features".
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


### KMeans 

In [111]:
# KMeans estimator with a fixed seed; k and maxIter are supplied by the param grid.
km = KMeans(featuresCol="features", predictionCol="prediction", seed=12345)

### Build Param Grid

In [112]:
# Hyper-parameter grid for KMeans: 5 values of k x 2 values of maxIter.
paramGrid = (
    ParamGridBuilder()
    .addGrid(km.k, [2, 3, 4, 5, 6])
    .addGrid(km.maxIter, [10, 100])
    .build()
)

### Build the pipeline

In [113]:
# Two-stage pipeline: assemble the feature vector, then cluster it with KMeans.
pipeline = Pipeline(stages=[assembler, km])

### Build the evaluator

In [114]:
# Clustering evaluator left at its defaults; later cells report its score as
# "Silhouette with squared euclidean distance".
evaluator = ClusteringEvaluator()

### Build the cross validator

In [115]:
 cv = CrossValidator(estimator=pipeline, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=2, parallelism=2)

### Train the model 

In [116]:
# Run the cross-validated grid search on the full DataFrame and keep the fitted result.
cvModel = cv.fit(df)

### Find the best model

In [117]:
# stages[1] is the second pipeline stage, i.e. the fitted clustering model
# (stage 0 is the VectorAssembler).
kmModel = cvModel.bestModel.stages[1]
# Print each param of the selected model with its default and current value.
print(kmModel.explainParams())
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
# One array per cluster center.
for center in centers:
   print(center)

distanceMeasure: The distance measure. Supported options: 'euclidean' and 'cosine' (default: euclidean)
featuresCol: features column name (default: features, current: features)
initMode: The initialization algorithm. Supported options: 'random' and 'k-means||'. (default: k-means||)
initSteps: The number of steps for k-means|| initialization mode. Must be > 0. (default: 2)
k: The number of clusters to create. Must be > 1. (default: 2, current: 3)
maxIter: maximum number of iterations (>= 0) (default: 20, current: 10)
predictionCol: prediction column name (default: prediction, current: prediction)
seed: random seed (default: -5495454462330629519, current: 12345)
tol: the convergence tolerance for iterative algorithms (>= 0) (default: 0.0001)
Cluster Centers: 
[1.09105113e+01 0.00000000e+00 1.85725490e+01 7.84313725e-02
 6.71225490e-01 5.98226471e+00 8.99137255e+01 2.07716373e+00
 2.30196078e+01 6.68205882e+02 2.01950980e+01 3.71803039e+02
 1.78740196e+01]
[3.74992678e-01 1.57103825e+01 8

## Test the model

In [118]:
# Assign a cluster to every row with the best model found by cross-validation.
# Note: this scores the same DataFrame that was used for fitting, not held-out data.
predictions = cvModel.transform(df)
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " , silhouette)

Silhouette with squared euclidean distance =  0.8838838565015775


## Bisecting k-means

### Build BisectingKMeans

In [119]:
# BisectingKMeans estimator with the same seed and column names as the KMeans estimator.
bkm = BisectingKMeans(featuresCol="features", predictionCol="prediction", seed=12345)

### Build param grid

In [120]:
paramGridBkm = ParamGridBuilder().addGrid(km.k, [2, 3, 4, 5, 6 ]) \
				                 .addGrid(km.maxIter, [10, 100 ]) \
                                 .build()

### Build pipeline

In [121]:
pipelineBkm =  Pipeline(stages = [assembler, km ])

### Build Cross Validator

In [122]:
# 2-fold cross-validation over paramGridBkm with parallelism set to 2.
cvbkm = CrossValidator(
    estimator=pipelineBkm,
    estimatorParamMaps=paramGridBkm,
    evaluator=evaluator,
    numFolds=2,
    parallelism=2,
)

### Train the model

In [123]:
# Run the cross-validated grid search for pipelineBkm on the full DataFrame.
cvModebkml = cvbkm.fit(df)

### Find the best model

In [124]:
# stages[1] is the second pipeline stage, i.e. the fitted clustering model
# (stage 0 is the VectorAssembler).
kmModelbkm = cvModebkml.bestModel.stages[1]
# Print each param of the selected model with its default and current value.
print(kmModelbkm.explainParams())
centers = kmModelbkm.clusterCenters()
print("Cluster Centers: ")
# One array per cluster center.
for center in centers:
   print(center)

distanceMeasure: The distance measure. Supported options: 'euclidean' and 'cosine' (default: euclidean)
featuresCol: features column name (default: features, current: features)
initMode: The initialization algorithm. Supported options: 'random' and 'k-means||'. (default: k-means||)
initSteps: The number of steps for k-means|| initialization mode. Must be > 0. (default: 2)
k: The number of clusters to create. Must be > 1. (default: 2, current: 3)
maxIter: maximum number of iterations (>= 0) (default: 20, current: 10)
predictionCol: prediction column name (default: prediction, current: prediction)
seed: random seed (default: -5495454462330629519, current: 12345)
tol: the convergence tolerance for iterative algorithms (>= 0) (default: 0.0001)
Cluster Centers: 
[1.09105113e+01 0.00000000e+00 1.85725490e+01 7.84313725e-02
 6.71225490e-01 5.98226471e+00 8.99137255e+01 2.07716373e+00
 2.30196078e+01 6.68205882e+02 2.01950980e+01 3.71803039e+02
 1.78740196e+01]
[3.74992678e-01 1.57103825e+01 8

In [125]:
### Test the model

In [126]:
# Assign a cluster to every row with the best model from the second cross-validation run.
# Note: this scores the same DataFrame that was used for fitting, not held-out data.
predictions = cvModebkml.transform(df)
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " , silhouette)

Silhouette with squared euclidean distance =  0.8838838565015775


In [127]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()