# Boston Housing Clustering

In [None]:
import sys
sys.path.append("..")
from pyspark.sql.types import BooleanType
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans, BisectingKMeans
from pyspark.sql.session import SparkSession
from pyspark.sql.functions import desc, expr
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml import Pipeline
from helpers.path_translation import translate_to_file_string

In [None]:
# Location of the Boston housing CSV, given relative to this notebook.
# NOTE(review): translate_to_file_string presumably turns the relative path into
# a file string Spark can open -- confirm in helpers.path_translation.
inputFile = translate_to_file_string("../data/Boston_Housing_Data.csv")

Spark session creation 

In [None]:
# Get the SparkSession for this notebook under the application name
# "BostonHousingClustering" (getOrCreate returns an existing session if any).
sessionBuilder = SparkSession.builder.appName("BostonHousingClustering")
spark = sessionBuilder.getOrCreate()

DataFrame creation using an inferred schema

In [None]:
# Load the semicolon-delimited CSV (with header row), let Spark infer the column
# types, and add CATBOOL as the CAT column cast to boolean.
df = (spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .option("delimiter", ";")
      .csv(inputFile)
      .withColumn("CATBOOL", expr("CAT").cast(BooleanType())))
# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() emitted an extra "None" line.
df.printSchema()

## Data Preparation

In [None]:
# Use every column as a clustering feature except MEDV, CAT and CATBOOL.
# list.remove raises ValueError if one of these columns is missing.
featureCols = list(df.columns)
for excludedCol in ("MEDV", "CAT", "CATBOOL"):
    featureCols.remove(excludedCol)
print(featureCols)

# Pack the selected columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=featureCols, outputCol="features")
# Scale by standard deviation without centering on the mean; the result is
# written to "scaledFeatures".
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaledFeatures",
    withMean=False,
    withStd=True,
)

### KMeans 

In [None]:
# TODO: add useful params
# KMeans estimator with a fixed seed; it reads the "scaledFeatures" vector and
# writes the assigned cluster to "prediction".
km = KMeans(
    featuresCol="scaledFeatures",
    predictionCol="prediction",
    seed=12345,
)

### Build Param Grid

In [None]:
# TODO: add useful settings and appropriate params
# Hyper-parameter candidates for km; currently one combination (k=2, maxIter=10).
paramGrid = (
    ParamGridBuilder()
    .addGrid(km.k, [2])
    .addGrid(km.maxIter, [10])
    .build()
)

### Build the pipeline

In [None]:
# Stage order: assemble -> scale -> cluster. The clustering stage sits at
# index 2, which is where the fitted model is read back from later on.
pipeline = Pipeline(stages=[assembler, scaler, km])

### Build the evaluator

In [None]:
# Evaluator that scores a clustering using the "scaledFeatures" column. No
# metric is passed, so the evaluator's default is used (reported further down
# as the silhouette with squared euclidean distance).
evaluator = ClusteringEvaluator(featuresCol="scaledFeatures")

### Build the cross validator

In [None]:
# TODO: adjust settings
# Cross-validate the pipeline over paramGrid using 2 folds and a parallelism
# of 2, scoring each candidate with the clustering evaluator.
cv = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=2,
    parallelism=2,
)

### Train the model 

In [None]:
# Run the cross-validation on the full DataFrame; the result exposes the best
# fitted pipeline as cvModel.bestModel.
cvModel = cv.fit(df)

### Find the best model

In [None]:
# The best pipeline's fitted stages follow the pipeline order, so index 2 is
# the clustering model.
fittedStages = cvModel.bestModel.stages
kmModel = fittedStages[2]
print(kmModel.explainParams())

# Print one cluster center per line.
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

## Test the model

In [None]:
# Apply the best model to the same DataFrame it was fitted on and score the
# resulting cluster assignments with the clustering evaluator.
predictions = cvModel.transform(df)
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " , silhouette)

## Bisecting k-means

### Build BisectingKMeans

In [None]:
# TODO: add useful settings
# BisectingKMeans estimator with a fixed seed; it reads the "scaledFeatures"
# vector and writes the assigned cluster to "prediction".
bkm = BisectingKMeans(
    featuresCol="scaledFeatures",
    predictionCol="prediction",
    seed=12345,
)

### Build param grid

In [None]:
# TODO add usefull settings and appropriate params 
paramGridBkm = ParamGridBuilder().addGrid(km.k, [2]) \
				                 .addGrid(km.maxIter, [10 ]) \
                                 .build()

### Build pipeline

In [None]:
pipelineBkm =  Pipeline(stages = [assembler, scaler, km ])

### Build Cross Validator

In [None]:
# TODO: adjust settings
# Fresh evaluator on the "scaledFeatures" column, then cross-validate
# pipelineBkm over paramGridBkm using 2 folds and a parallelism of 2.
evaluator = ClusteringEvaluator(featuresCol="scaledFeatures")
cvbkm = CrossValidator(
    estimator=pipelineBkm,
    estimatorParamMaps=paramGridBkm,
    evaluator=evaluator,
    numFolds=2,
    parallelism=2,
)

### Train the model

In [None]:
# Run the second cross-validation on the full DataFrame; the result exposes the
# best fitted pipeline as cvModebkml.bestModel.
cvModebkml = cvbkm.fit(df)

### Find the best model

In [None]:
# The best pipeline's fitted stages follow the pipeline order, so index 2 is
# the clustering model.
fittedStagesBkm = cvModebkml.bestModel.stages
kmModelbkm = fittedStagesBkm[2]
print(kmModelbkm.explainParams())

# Print one cluster center per line.
centers = kmModelbkm.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

### Test the model

In [None]:
# Apply the best model of the second cross-validation run to the same DataFrame
# it was fitted on and score the resulting cluster assignments.
predictions = cvModebkml.transform(df)
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " , silhouette)

In [None]:
# Shut down the Spark session now that the notebook is finished with it.
spark.stop()