<a href="https://colab.research.google.com/github/roitraining/SparkforDataEngineers/blob/Development/Ch04_ClusterAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Initialize the Spark environment and load the provided helper functions.

In [0]:
# Extend the module search path so that pyspark_helpers can be imported
# (presumably the helper module lives in this directory — confirm).
import sys
sys.path.append('/home/student/ROI/SparkProgram')

# Data-handling and plotting libraries; plt is used by the plotting cells below.
import pandas as pd
import matplotlib as mp
import numpy
from matplotlib import pyplot as plt
# The IPython display import is commented out; the display helper from
# pyspark_helpers (imported below) is used instead.
#from IPython.display import display

# initspark() returns three objects, unpacked here as sc, spark and conf.
# `spark` is used by the next cell to read the CSV file.
import pyspark_helpers as pyh
sc, spark, conf = pyh.initspark()
from pyspark_helpers import display

### Read in a simple dataset of latitudes and longitudes.

In [0]:
# Name of the CSV file holding the latitude/longitude records.
filename = 'superchargers.csv'

# Read the file with a header row and let Spark infer the column types.
df = (
    spark.read
    .csv(
        f'/home/student/ROI/Spark/datasets/finance/{filename}',
        header=True,
        inferSchema=True,
    )
)
display(df)

# Keep a second reference to the untouched DataFrame so the raw data
# stays reachable even if `df` is rebound later.
dfRawFile = df



### Visualize this dataset using pandas. Normally you don't do this in Spark, but it is helpful here.

In [0]:
%matplotlib inline
# Collect the Spark DataFrame into a local pandas DataFrame for plotting.
p = df.toPandas()
import matplotlib.pyplot as plt

# Scatter the points: 'lat' on the x-axis, 'lng' on the y-axis, round markers.
plt.plot(p['lat'], p['lng'], 'o')



### Assemble the feature columns into a single vector column.

In [0]:
from pyspark.ml.feature import VectorAssembler

# Combine the 'lat' and 'lng' columns into one vector column named 'features',
# which is the column the clustering estimators below are fitted on.
vecAssembler = (
    VectorAssembler()
    .setInputCols(["lat", "lng"])
    .setOutputCol("features")
)
dfML = vecAssembler.transform(df)
display(dfML)


### Load the KMeans class and train the model. Evaluate how well it performs for several different cluster counts.

In [0]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

def evaluateCluster(model, df):
    """Print quality metrics and the cluster centers of a fitted model.

    Parameters
    ----------
    model
        A fitted clustering model exposing ``computeCost``, ``transform``
        and ``clusterCenters`` (a KMeans model in this notebook).
    df
        The DataFrame to score; it must contain the feature column the
        model was fitted on.

    Returns
    -------
    None. The within-set sum of squared errors, the silhouette score and
    every cluster center are printed to stdout.
    """
    # Score the DataFrame that was passed in. The previous version ignored
    # `df` here and always read the notebook-global `dfML`, so the function
    # could not be reused on any other dataset.
    # NOTE(review): computeCost is deprecated since Spark 2.4 and removed in
    # Spark 3.0; this call needs replacing before upgrading.
    wssse = model.computeCost(df)
    print("Within Set Sum of Squared Errors = " + str(wssse))

    evaluator = ClusteringEvaluator()

    # The evaluator works on the model's predictions for the same data.
    predictions = model.transform(df)
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Shows the result.
    centers = model.clusterCenters()
    print("Cluster Centers: ")
    for center in centers:
        print(center)

# Fit one KMeans model for each candidate cluster count (2, 3 and 4) with a
# fixed seed, and print the evaluation report for each of them.
for k in (2, 3, 4):
    print('Number of clusters', k)
    kmeans = KMeans(k=k, seed=1)
    model = kmeans.fit(dfML.select('features'))
    evaluateCluster(model, dfML.select('features'))
    print()


### Visualize the cluster results graphically.

In [0]:
%matplotlib inline
import matplotlib.pyplot as plt

# Number of clusters to fit and draw.
CLUSTERS = 2

# Fit on the feature vectors only, then predict on the full DataFrame so the
# 'lat' and 'lng' columns remain available for plotting.
kmeans = KMeans(k=CLUSTERS, seed=1)
model = kmeans.fit(dfML.select('features'))
predictions = model.transform(dfML)
centroids = model.clusterCenters()

for i in range(CLUSTERS):
    # Points assigned to cluster i, collected into pandas for plotting.
    p = (
        predictions
        .select('lat', 'lng')
        .where(f'prediction = {i}')
        .toPandas()
    )
    plt.plot(p['lat'], p['lng'], 'o')

    # Mark this cluster's center with a black 'x'.
    center = centroids[i]
    plt.plot(center[0], center[1], 'kx')


### Use an elbow chart to help visualize the optimal number of clusters.

In [0]:
%matplotlib inline
def plot_elbow(df, cluster_cnt = 6):
    """Plot the KMeans cost for each cluster count from 2 up to cluster_cnt - 1.

    Parameters
    ----------
    df
        DataFrame the models are fitted on and scored against.
    cluster_cnt
        Exclusive upper bound of the cluster counts to try (default 6,
        i.e. k = 2, 3, 4, 5).

    The list of costs is printed and then drawn as a line chart titled
    'Elbow Curve'. Nothing is returned.
    """
    import numpy as np

    candidate_ks = range(2, cluster_cnt)

    # One fitted model per k, each with the same seed; record its cost on df.
    scores = []
    for num_clusters in candidate_ks:
        fitted = KMeans().setK(num_clusters).setSeed(1).fit(df)
        scores.append(fitted.computeCost(df))
    print(scores)

    plt.plot(candidate_ks, scores)
    plt.xlabel('Number of Clusters')
    plt.ylabel('Score')
    plt.title('Elbow Curve')
    # One tick per evaluated cluster count.
    plt.xticks(np.arange(2, cluster_cnt))

# Draw the elbow curve for the assembled feature vectors, using the default cluster_cnt.
plot_elbow(dfML.select('features'))

### Work in progress below.

In [0]:
from pyspark.ml.clustering import LDA

# Work in progress: fit an LDA model with 10 topics and at most 10 iterations
# on the assembled feature vectors.
# NOTE(review): LDA is a topic model normally applied to term-count vectors;
# confirm that running it on lat/lng features is intended.
# Note that this rebinds `model`, replacing the KMeans model fitted earlier.
lda = LDA(k=10, maxIter=10)
model = lda.fit(dfML.select('features'))

ll = model.logLikelihood(dfML)
lp = model.logPerplexity(dfML)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

# Describe topics.
topics = model.describeTopics(3)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

# Shows the result
# The original referenced an undefined name `dataset`, which raised NameError;
# the DataFrame holding the feature vectors in this notebook is dfML.
transformed = model.transform(dfML.select('features'))
transformed.show()

In [0]:
from pyspark.ml.clustering import GaussianMixture

# Fit a two-component Gaussian mixture (fixed seed) on the feature vectors.
# Note that this rebinds `model` once more.
gmm = GaussianMixture(k=2, seed=1)
model = gmm.fit(dfML.select('features'))

# Show the fitted Gaussians without truncating the columns.
print("Gaussians shown as a DataFrame: ")
model.gaussiansDF.show(truncate=False)