### Initialize the Spark environment and load the provided helper functions.

In [None]:
import sys

# Base directory for this notebook; the dataset directory is derived from it.
rootpath = '/class/'
datapath = f'{rootpath}datasets/'
# Put rootpath on the module search path before the import below
# (presumably pyspark_helpers lives there — confirm).
sys.path.append(rootpath)
from pyspark_helpers import *
# initspark is not defined in this file; presumably it comes from the star
# import above. Its three return values are unpacked into sc, spark and conf.
sc, spark, conf = initspark()

import pandas as pd
import matplotlib as mp
import numpy
from matplotlib import pyplot as plt


### Read in a simple dataset of latitudes and longitudes.

In [None]:
filename = 'superchargers.csv'

# Load the CSV with its first row as the header and let Spark infer column
# types. Keep a second reference to the untouched load under dfRawFile.
dfRawFile = spark.read.csv(f'{datapath}{filename}', header=True, inferSchema=True)
df = dfRawFile
display(df)

# Row count of the loaded dataset (shown as the cell's output).
df.count()


### Visualize this dataset using pandas. You would not normally do this in Spark, but it is helpful here.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Convert the Spark DataFrame into a pandas DataFrame on the driver.
p = df.toPandas()
print(p)
# Longitude on the x-axis, latitude on the y-axis, drawn as circle markers.
plt.plot(p['lng'], p['lat'], 'o')



### Assemble the feature columns into a single vector column.

In [None]:
from pyspark.ml.feature import VectorAssembler

# Pack lat and lng (in that order) into one vector column named "features".
vecAssembler = VectorAssembler().setInputCols(["lat", "lng"]).setOutputCol("features")
dfML = vecAssembler.transform(df)
display(dfML)


### Fit a KMeans model and display the cluster results.

In [None]:
import matplotlib.pyplot as plt
from pyspark.ml.clustering import KMeans

# Number of clusters to fit; the plotting cell further down loops over this.
CLUSTERS = 2

# Build the estimator once. The original built an equivalent estimator with
# the setter chain and then immediately overwrote it with this one.
kmeans = KMeans(k=CLUSTERS, seed=1)
model = kmeans.fit(dfML)

# The transformed frame gains a `prediction` column that later cells filter on.
predictions = model.transform(dfML)

# One center per cluster, in the same [lat, lng] order as the assembled
# features vector.
centroids = model.clusterCenters()
print(centroids)
display(predictions)

### Visualize the cluster results graphically.

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from pyspark.ml.clustering import KMeans

print(centroids)

# Collect the labelled points to the driver once, instead of running a
# separate Spark filter + toPandas() for every cluster.
clusteredPoints = predictions.select('lng', 'lat', 'prediction').toPandas()

for i in range(CLUSTERS):
    # Points assigned to cluster i; each plot call takes the next default color.
    p = clusteredPoints.loc[clusteredPoints['prediction'] == i, ['lng', 'lat']]
    plt.plot(p.loc[:, 'lng'], p.loc[:, 'lat'], 'o')
    # Centers are stored as [lat, lng], so index 1 is x and index 0 is y.
    # 'kx' draws the center as a black cross.
    plt.plot(centroids[i][1], centroids[i][0], 'kx')


### Load the KMeans class and train the model. Evaluate how well it performs for several different cluster counts.

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

def evaluateCluster(model, df):
    """Print clustering quality metrics and the cluster centers for a model.

    Parameters
    ----------
    model
        A fitted clustering model exposing ``computeCost``, ``transform``
        and ``clusterCenters``.
    df
        The DataFrame to score. It is passed as-is to ``model.computeCost``
        and ``model.transform``.

    Returns
    -------
    None
        All results are printed.
    """
    # Score the DataFrame the caller passed in. The previous version ignored
    # `df` here and always used the notebook-global dfML, so the reported
    # cost was wrong for any other input.
    # NOTE(review): computeCost is believed to be deprecated in Spark 2.4 and
    # missing from Spark 3.x — confirm against the installed version.
    wssse = model.computeCost(df)
    print("Within Set Sum of Squared Errors = " + str(wssse))

    evaluator = ClusteringEvaluator()

    # Label every row, then let the evaluator score the labelled frame.
    predictions = model.transform(df)
    silhouette = evaluator.evaluate(predictions)
    print("Silhouette with squared euclidean distance = " + str(silhouette))

    # Show one center per line.
    print("Cluster Centers: ")
    for center in model.clusterCenters():
        print(center)

# Fit and report on a model for k = 2, 3 and 4. Each pass rebinds the
# notebook-level `kmeans` and `model` names.
for k in (2, 3, 4):
    print('Number of clusters', k)
    kmeans = KMeans(k=k, seed=1)
    # Only the assembled vector column is handed to fit and to the evaluation.
    model = kmeans.fit(dfML.select('features'))
    evaluateCluster(model, dfML.select('features'))
    print()


### Use an elbow chart to help visualize the optimal number of clusters.

In [None]:
%matplotlib inline
def plot_elbow(df, cluster_cnt = 7):
    """Plot the KMeans cost against the number of clusters.

    A model is fitted on ``df`` for every cluster count from 2 up to, but
    not including, ``cluster_cnt``. The cost of each model on ``df`` is
    printed as a list and then drawn as a line chart.

    Parameters
    ----------
    df
        DataFrame used both to fit each model and to compute its cost.
    cluster_cnt
        Exclusive upper bound on the cluster counts tried (default 7).
    """
    import numpy as np

    candidate_ks = range(2, cluster_cnt)
    scores = []
    for n_clusters in candidate_ks:
        fitted = KMeans(k=n_clusters, seed=1).fit(df)
        scores.append(fitted.computeCost(df))
    print(scores)

    plt.plot(candidate_ks, scores)
    plt.title('Elbow Curve')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Score')
    # One tick per cluster count that was tried.
    plt.xticks(np.arange(2, cluster_cnt))

# Draw the elbow curve from the assembled feature column only, using the
# default cluster_cnt of 7.
plot_elbow(dfML.select('features'))