# Churn Clustering

In [130]:
from pyspark.sql import DataFrameReader
from pyspark.sql import SparkSession
from pyspark.ml.feature import IndexToString, Normalizer, StringIndexer, VectorAssembler, VectorIndexer
from pyspark.ml.clustering import KMeans, BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline

## Select the churn file 

In [131]:
# Relative path to the churn dataset; it is read below as a ';'-delimited CSV with a header row.
inputFile = "../data/churn.csv"

## Create the Spark Session 

In [132]:
# Create (or reuse) the SparkSession for this notebook.
# The application name is what Spark shows for this job, so it is named after
# what the notebook actually does (clustering, not a decision tree).
spark = (SparkSession
         .builder
         .appName("ChurnClustering")
         .getOrCreate())
# Create a DataFrame using an inferred schema: the first row is the header,
# column types are inferred from the data, and fields are separated by ';'.
df = (spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .option("delimiter", ";")
      .csv(inputFile))

## Data Preparation
### Transform labels into index

In [133]:
# Fit a StringIndexer on df for every column that needs a numeric index.
# "LEAVE" is indexed into "label"; each other column gets a "<NAME>_NUM" counterpart.
labelIndexer = StringIndexer(inputCol="LEAVE", outputCol="label").fit(df)
collegeIndexer = StringIndexer(inputCol="COLLEGE", outputCol="COLLEGE_NUM").fit(df)
satIndexer = StringIndexer(
    inputCol="REPORTED_SATISFACTION",
    outputCol="REPORTED_SATISFACTION_NUM",
).fit(df)
usageIndexer = StringIndexer(
    inputCol="REPORTED_USAGE_LEVEL",
    outputCol="REPORTED_USAGE_LEVEL_NUM",
).fit(df)
changeIndexer = StringIndexer(
    inputCol="CONSIDERING_CHANGE_OF_PLAN",
    outputCol="CONSIDERING_CHANGE_OF_PLAN_NUM",
).fit(df)

### Build the list of feature columns

In [134]:
# Start from every column of df, drop the raw columns that are replaced by
# indexed versions (plus the "LEAVE" column, which is indexed into "label"),
# then append the names of the indexed replacements.
droppedCols = (
    "LEAVE",
    "COLLEGE",
    "REPORTED_SATISFACTION",
    "REPORTED_USAGE_LEVEL",
    "CONSIDERING_CHANGE_OF_PLAN",
)
featureCols = list(df.columns)
for rawCol in droppedCols:
    # list.remove raises ValueError if the column is missing from the file.
    featureCols.remove(rawCol)
featureCols.extend([
    "COLLEGE_NUM",
    "REPORTED_SATISFACTION_NUM",
    "REPORTED_USAGE_LEVEL_NUM",
    "CONSIDERING_CHANGE_OF_PLAN_NUM",
])

### Build the feature Vector Assembler

In [135]:
# Combine the selected feature columns into a single vector column named "features".
assembler = VectorAssembler(
    inputCols=list(featureCols),
    outputCol="features",
)

### Build a featureIndexer 

Automatically identify categorical features, and index them.
Features with > 6 distinct values are treated as continuous (`maxCategories=6`).

In [136]:
# Index the categorical entries of the "features" vector into "indexedFeatures".
# With maxCategories=6, a feature with more than 6 distinct values is treated as continuous.
featureIndexer = VectorIndexer(
    inputCol="features",
    outputCol="indexedFeatures",
    maxCategories=6,
)

## Build KMeans

In [137]:
# KMeans estimator reading "indexedFeatures" and writing the cluster id to "prediction".
# The seed is fixed so that repeated runs start from the same initialization.
km = KMeans(
    featuresCol="indexedFeatures",
    predictionCol="prediction",
    seed=12345,
)

## Model Selection
### Build a parameter grid

In [138]:
# Candidate settings explored by the cross validator:
# k (number of clusters) from 2 to 10, each combined with maxIter of 10 and 100.
paramGrid = (
    ParamGridBuilder()
    .addGrid(km.k, list(range(2, 11)))
    .addGrid(km.maxIter, [10, 100])
    .build()
)

### Build a pipeline

In [139]:
# Chain the fitted indexers, the assembler, the vector indexer and KMeans
# into one pipeline; KMeans is deliberately the last stage.
pipeline = Pipeline(stages=[
    labelIndexer,
    collegeIndexer,
    satIndexer,
    usageIndexer,
    changeIndexer,
    assembler,
    featureIndexer,
    km,
])

## Build an evaluator

In [140]:
# Evaluator with default settings; the cells further down print its result
# as a silhouette score with squared euclidean distance.
evaluator = ClusteringEvaluator()

## Build the Cross Validator

In [141]:
 # 2-fold cross validation of the KMeans pipeline over paramGrid, with parallelism set to 2.
 cv = CrossValidator(
     estimator=pipeline,
     estimatorParamMaps=paramGrid,
     evaluator=evaluator,
     numFolds=2,
     parallelism=2,
 )

## Train the Model 

In [142]:
# Fit the cross validator on the full DataFrame; cvModel.bestModel is inspected in the next section.
cvModel = cv.fit(df)

## Find out what is the best model

In [143]:
# The clustering model is the last stage of the best pipeline found by cross validation.
# Indexing from the end avoids a hard-coded position that would silently point
# at the wrong stage if pipeline stages are added or removed.
kmModel = cvModel.bestModel.stages[-1]
# Show the parameters selected for the best model (k, maxIter, ...).
print(kmModel.explainParams())
centers = kmModel.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

distanceMeasure: The distance measure. Supported options: 'euclidean' and 'cosine' (default: euclidean)
featuresCol: features column name (default: features, current: indexedFeatures)
initMode: The initialization algorithm. Supported options: 'random' and 'k-means||'. (default: k-means||)
initSteps: The number of steps for k-means|| initialization mode. Must be > 0. (default: 2)
k: The number of clusters to create. Must be > 1. (default: 2, current: 2)
maxIter: maximum number of iterations (>= 0) (default: 20, current: 10)
predictionCol: prediction column name (default: prediction, current: prediction)
seed: random seed (default: 3128251570395763966, current: 12345)
tol: the convergence tolerance for iterative algorithms (>= 0) (default: 0.0001)
Cluster Centers: 
[8.06789022e+04 8.55489204e+01 2.37184548e+01 3.09949923e+05
 3.92366650e+02 7.93631916e+00 6.00750675e+00 4.95360999e-01
 1.16810054e+00 1.15494265e+00 1.16464238e+00]
[7.97028353e+04 8.66064587e+01 2.41609774e+01 7.59864808e

## Test the model 

In [144]:
# Assign every row of df to a cluster with the best KMeans pipeline, then score
# the resulting clustering with the evaluator built above.
predictions = cvModel.transform(df)
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " , silhouette)

Silhouette with squared euclidean distance =  0.7948187722321147


## Bisecting k-means

Build the Bisecting KMeans

In [145]:
# Bisecting k-means estimator on the same "indexedFeatures" column.
# k=2 is only the starting value; the parameter grid below overrides it.
bkm = BisectingKMeans(
    featuresCol="indexedFeatures",
    predictionCol="prediction",
    k=2,
    seed=12345,
)

Param Grid for Bisecting

In [146]:
# Same search space as for KMeans: k from 2 to 10, each combined with maxIter of 10 and 100.
paramGridBkm = (
    ParamGridBuilder()
    .addGrid(bkm.k, list(range(2, 11)))
    .addGrid(bkm.maxIter, [10, 100])
    .build()
)

Pipeline for Bisecting

In [147]:
# Same preparation stages as the KMeans pipeline, with BisectingKMeans as the last stage.
pipelineBkm = Pipeline(stages=[
    labelIndexer,
    collegeIndexer,
    satIndexer,
    usageIndexer,
    changeIndexer,
    assembler,
    featureIndexer,
    bkm,
])

Build the cross validation

In [148]:
# 2-fold cross validation of the bisecting k-means pipeline over paramGridBkm, with parallelism set to 2.
cvbkm = CrossValidator(
    estimator=pipelineBkm,
    estimatorParamMaps=paramGridBkm,
    evaluator=evaluator,
    numFolds=2,
    parallelism=2,
)

## Train the Model

In [149]:
# Fit the bisecting k-means cross validator on the full DataFrame.
# NOTE(review): the variable name looks like a typo for "cvModelBkm"; it is kept
# unchanged because the following cells refer to it by this name.
cvModebkml = cvbkm.fit(df)

### Find the best model

In [150]:
# The bisecting k-means model is the last stage of the best pipeline found by cross validation.
# Indexing from the end avoids a hard-coded position that would silently point
# at the wrong stage if pipeline stages are added or removed.
kmModelbkm = cvModebkml.bestModel.stages[-1]
# Show the parameters selected for the best model (k, maxIter, ...).
print(kmModelbkm.explainParams())
centers = kmModelbkm.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

distanceMeasure: The distance measure. Supported options: 'euclidean' and 'cosine' (default: euclidean)
featuresCol: features column name (default: features, current: indexedFeatures)
k: The desired number of leaf clusters. Must be > 1. (default: 4, current: 2)
maxIter: maximum number of iterations (>= 0) (default: 20, current: 10)
minDivisibleClusterSize: The minimum number of points (if >= 1.0) or the minimum proportion of points (if < 1.0) of a divisible cluster. (default: 1.0)
predictionCol: prediction column name (default: prediction, current: prediction)
seed: random seed (default: -772008126836755120, current: 12345)
Cluster Centers: 
[8.06702658e+04 8.55429463e+01 2.37207222e+01 3.09873992e+05
 3.92378164e+02 7.93520081e+00 6.00683429e+00 4.95359433e-01
 1.16807290e+00 1.15507931e+00 1.16452919e+00]
[7.97158769e+04 8.66146294e+01 2.41574620e+01 7.59754385e+05
 3.85595974e+02 8.09597447e+00 5.99558174e+00 5.00859107e-01
 1.14837997e+00 1.15868925e+00 1.16163476e+00]


### Test the model

In [151]:
# Assign every row of df to a cluster with the best bisecting k-means pipeline and score it.
# Note: this overwrites the `predictions` and `silhouette` values computed for KMeans above.
predictions = cvModebkml.transform(df)
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " , silhouette)

Silhouette with squared euclidean distance =  0.7948187722321147


In [152]:
# Stop the SparkSession now that all models have been trained and evaluated.
spark.stop()