# Boston Housing Clustering

In [1]:
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import StringIndexer, VectorAssembler, BucketedRandomProjectionLSH, MinHashLSH
from pyspark.ml.clustering import KMeans, BisectingKMeans
from pyspark.sql.session import SparkSession, Row
from pyspark.sql.functions import desc, expr
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml import Pipeline

In [2]:
inputFile = "../data/Boston_Housing_Data.csv"

Spark session creation 

In [3]:
spark = (SparkSession
       .builder
       .appName("BostonHousingClustering")
       .getOrCreate())

DataFrame creation using an ifered Schema 

In [4]:
df = spark.read.option("header", "true") \
       .option("inferSchema", "true") \
       .option("delimiter", ";") \
       .csv(inputFile) \
       .withColumn("CATBOOL", expr("CAT").cast(BooleanType()))
print(df.printSchema())

root
 |-- CRIM: double (nullable = true)
 |-- ZN: double (nullable = true)
 |-- INDUS: double (nullable = true)
 |-- CHAS: integer (nullable = true)
 |-- NOX: double (nullable = true)
 |-- RM: double (nullable = true)
 |-- AGE: double (nullable = true)
 |-- DIS: double (nullable = true)
 |-- RAD: integer (nullable = true)
 |-- TAX: integer (nullable = true)
 |-- PTRATIO: double (nullable = true)
 |-- B: double (nullable = true)
 |-- LSTAT: double (nullable = true)
 |-- MEDV: double (nullable = true)
 |-- CAT: integer (nullable = true)
 |-- CATBOOL: boolean (nullable = true)

None


## Data Preparation

In [5]:
featureCols = df.columns.copy()
featureCols.remove("MEDV")
featureCols.remove("CAT")
featureCols.remove("CATBOOL") 
print(featureCols)

assembler =  VectorAssembler(outputCol="features", inputCols=featureCols)

['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']


### KMeans 

In [6]:
# TODO add usefull params
km = KMeans (seed=12345, featuresCol="features",predictionCol="prediction")   

### Build Param Grid

In [7]:
# TODO adde usefull settings and appropriate params
paramGrid = ParamGridBuilder().addGrid(km.k, [2  ]) \
				                 .addGrid(km.maxIter, [10 ]) \
                                 .build()

### Build the pipeline

In [8]:
pipeline = Pipeline(stages = [assembler, km ])

### Build the evaluator

In [9]:
evaluator = ClusteringEvaluator()

### Build the cross validator

In [10]:
# TODO Ajust settings
cv = CrossValidator(estimator=pipeline, evaluator=evaluator, estimatorParamMaps=paramGrid, numFolds=2, parallelism=2)

### Train the model 

In [11]:
cvModel = cv.fit(df)

### Find the best model

In [12]:
kmModel = cvModel.bestModel.stages[1]
print(kmModel.explainParams())
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
   print(center)

distanceMeasure: The distance measure. Supported options: 'euclidean' and 'cosine' (default: euclidean)
featuresCol: features column name (default: features, current: features)
initMode: The initialization algorithm. Supported options: 'random' and 'k-means||'. (default: k-means||)
initSteps: The number of steps for k-means|| initialization mode. Must be > 0. (default: 2)
k: The number of clusters to create. Must be > 1. (default: 2, current: 2)
maxIter: maximum number of iterations (>= 0) (default: 20, current: 10)
predictionCol: prediction column name (default: prediction, current: prediction)
seed: random seed (default: 3783669506817089249, current: 12345)
tol: the convergence tolerance for iterative algorithms (>= 0) (default: 0.0001)
Cluster Centers: 
[1.22991617e+01 0.00000000e+00 1.84518248e+01 5.83941606e-02
 6.70102190e-01 6.00621168e+00 8.99678832e+01 2.05447007e+00
 2.32700730e+01 6.67642336e+02 2.01963504e+01 2.91039051e+02
 1.86745255e+01]
[3.88774444e-01 1.55826558e+01 8.

## Test the model

In [13]:
predictions = cvModel.transform(df)
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " , silhouette)

Silhouette with squared euclidean distance =  0.8537784087478455


## Bisecting k-means

### Build BisectingKMeans

In [14]:
# TODO add usefull settings 
bkm = BisectingKMeans(seed=12345, featuresCol="features",predictionCol="prediction")   

### Build pararm grid

In [15]:
# TODO add usefull settings and appropriate params 
paramGridBkm = ParamGridBuilder().addGrid(km.k, [2]) \
				                 .addGrid(km.maxIter, [10 ]) \
                                 .build()

### Build pipeline

In [16]:
pipelineBkm =  Pipeline(stages = [assembler, km ])

### Build Cross Validator

In [17]:
# TODO ajust settings
cvbkm = CrossValidator(estimator=pipelineBkm,evaluator=evaluator,estimatorParamMaps=paramGridBkm, numFolds=2,parallelism=2)

### Train the model

In [18]:
cvModebkml = cvbkm.fit(df)

### Find the best model

In [19]:
kmModelbkm = cvModebkml.bestModel.stages[1]
print(kmModelbkm.explainParams())
centers = kmModelbkm.clusterCenters()
print("Cluster Centers: ")
for center in centers:
   print(center)

distanceMeasure: The distance measure. Supported options: 'euclidean' and 'cosine' (default: euclidean)
featuresCol: features column name (default: features, current: features)
initMode: The initialization algorithm. Supported options: 'random' and 'k-means||'. (default: k-means||)
initSteps: The number of steps for k-means|| initialization mode. Must be > 0. (default: 2)
k: The number of clusters to create. Must be > 1. (default: 2, current: 2)
maxIter: maximum number of iterations (>= 0) (default: 20, current: 10)
predictionCol: prediction column name (default: prediction, current: prediction)
seed: random seed (default: 3783669506817089249, current: 12345)
tol: the convergence tolerance for iterative algorithms (>= 0) (default: 0.0001)
Cluster Centers: 
[1.22991617e+01 0.00000000e+00 1.84518248e+01 5.83941606e-02
 6.70102190e-01 6.00621168e+00 8.99678832e+01 2.05447007e+00
 2.32700730e+01 6.67642336e+02 2.01963504e+01 2.91039051e+02
 1.86745255e+01]
[3.88774444e-01 1.55826558e+01 8.

In [20]:
### Test the model

In [21]:
predictions = cvModebkml.transform(df)
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " , silhouette)

Silhouette with squared euclidean distance =  0.8537784087478455


In [22]:
spark.stop()