In [None]:
import os
# Default Splice Machine JDBC host used by the setup cell below.
# Only set it when the variable is absent, so a JDBC_HOST already exported
# into the kernel's environment is respected instead of being overwritten.
if 'JDBC_HOST' not in os.environ:
    os.environ['JDBC_HOST'] = 'jrtest01-splice-hregion'

In [None]:
# setup-- 
import os
import pyspark
from splicemachine.spark.context import PySpliceContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
!conda install plotly

# make sure pyspark tells workers to use python3 not 2 if both are installed
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
jdbc_host = os.environ['JDBC_HOST']

conf = pyspark.SparkConf()
sc = pyspark.SparkContext(conf=conf)

spark = SparkSession.builder.config(conf=conf).getOrCreate()

splicejdbc=f"jdbc:splice://{jdbc_host}:1527/splicedb;user=splice;password=admin"

splice = PySpliceContext(spark, splicejdbc)


# KMeans


# KMeans
KMeans is an unsupervised-learning clustering algorithm used to discover groups of similar datapoints and trends within a given dataset.
KMeans is an iterative process: the user chooses the number of clusters, K, and the clusters are continually recomputed on the dataset until the assignments converge and the algorithm ends.
## Setting up a KMeans
To implement KMeans, you will need two things:

* A dataset of structured data. Learn about structured data [here](https://www.quora.com/What-are-Structured-semi-structured-and-unstructured-data-in-Big-Data/answer/Manoj-R-Patil?srid=33JGI)
* A value for K. K can be computed a number of ways, none of which are necessarily incorrect. It is dependent on the specific dataset you are working with. It is suggested to first plot your data and do trials with multiple values of K. Learn more about choosing a good K [here](https://en.wikipedia.org/wiki/Determining_the_number_of_clusters_in_a_data_set)

## Computing KMeans
A KMeans algorithm is computed in three main steps:
1. K cluster centers are created and assigned locations, either randomly generated or randomly taken from K datapoints
2. For each datapoint in your dataset, the squared Euclidean distance to every cluster center is computed. The datapoint is assigned to the cluster whose center is at the minimum distance
3. After all datapoints are assigned, each cluster center is recomputed and moved to the mean of its assigned datapoints.

#### Steps 2 and 3 are repeated until one of the following:
* A set number of iterations occurs
* No datapoints are reassigned to new clusters
* The cluster centers move less than a minimum distance between iterations

Learn more about KMeans algorithm [here](https://www.datascience.com/blog/introduction-to-k-means-clustering-algorithm-learn-data-science-tutorials)
In-depth KMeans clustering documentation [here](http://scikit-learn.org/stable/modules/clustering.html#k-means)

#### We will now create a simple KMeans example.
To follow along with this data, download [here](https://raw.githubusercontent.com/datascienceinc/learn-data-science/master/Introduction-to-K-means-Clustering/Data/data_1024.csv)
More KMeans and Scala examples [here](https://github.com/apache/spark/tree/master/examples/src/main/scala/org/apache/spark/examples/mllib)

Learn about Sum of Squares for Errors (used later in algorithm) [here](http://www.wikihow.com/Calculate-the-Sum-of-Squares-for-Error-(SSE))

In [None]:
%%scala 
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.mllib.linalg.Vectors

// Load the sample data and parse every line into a dense feature vector.
// NOTE(review): assumes each line holds space-separated numeric values — confirm against the S3 file.
val data = sc.textFile("s3a://splice-demo/sample_kmeans_data.txt")
// Vectors.dense needs an Array[Double], so each token is converted explicitly
val formatted = data.map(s => s.split(" ").map(_.toDouble))
val parsedData = formatted.map(values => Vectors.dense(values))
// parsedData is already an RDD (sc.parallelize only accepts a local collection);
// cache it because KMeans makes repeated passes over its input.
val rddData = parsedData.cache()

// Clustering data into K groups
val K = 4
val maxIterations = 5000
val clusters = KMeans.train(rddData, K, maxIterations)

// Compute cost: Within Set Sum of Squared Errors
val WSSSE = clusters.computeCost(rddData)
println("Within Set Sum of Squared Errors: " + WSSSE)

## And in PySpark

In [None]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
import plotly.express as px

# Load plotly's iris sample into a Spark DataFrame, discarding the species_id column
data = spark.createDataFrame(px.data.iris()).drop('species_id')

# Stage 1: encode the species label as a numeric index
si = StringIndexer(inputCol='species', outputCol='species_vec')

# Stage 2: pack every column except the label and petal_length into one feature vector
excluded = {'species', 'petal_length'}
cols = [c for c in data.columns if c not in excluded]
va = VectorAssembler(inputCols=cols, outputCol='features')

# Chain both stages into a single Spark Pipeline
pipeline = Pipeline(stages=[si, va])

# Fit the stages on the raw data, then apply them to append the new columns
fitted = pipeline.fit(data)
data = fitted.transform(data)

# Display the prepared dataset, ordered by sepal width
data.orderBy('sepal_width').show()

## Build the kmeans algorithm and print cluster results

In [None]:
# Configure k-means for 3 clusters with a fixed seed, then fit it on the prepared data
kmeans = KMeans(k=3, seed=1)
model = kmeans.fit(data)

# Run the fitted model over the data to obtain its predictions
predictions = model.transform(data)

# Score the clustering with the Silhouette measure
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette with squared euclidean distance = {silhouette}")

# Print the coordinates of each cluster center
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

## Now we will combine the cluster data with the dataset for plotting

In [None]:
import pandas as pd
# Collect the fitted cluster centers; position i in the list is the center of cluster i
centers = list(model.clusterCenters())
# Match the schema of the centers dataframe to the predictions dataframe
cents = pd.DataFrame(centers,columns=['sepal_length','sepal_width','petal_width'])
# Label each center with the most common species among the points assigned to it.
# KMeans numbers its clusters arbitrarily, so a fixed label order is not reliable.
species_counts = predictions.groupBy('prediction', 'species').count().toPandas()
majority_species = (
    species_counts.sort_values('count', ascending=False)
                  .drop_duplicates('prediction')   # keep the top species per cluster
                  .set_index('prediction')['species']
)
# A cluster with no assigned points falls back to a generic label
cents['species'] = [majority_species.get(i, f'cluster {i}') for i in range(len(cents))]
# Add a column with the value 10 to all cluster center datapoints. This is for plotting purposes
cents.insert(0,'center',[10]*len(cents))
cents

In [None]:
# Pull the Spark predictions into pandas so the cluster centers can be added for plotting
preds = predictions.toPandas()
# Add a column with the value of 2 to all non-cluster-center datapoints. Again, for plotting purposes
preds.insert(0, "center", [2]*len(preds), allow_duplicates=True)
# Insert the cluster center data. DataFrame.append was removed in pandas 2.0;
# pd.concat with the same sort flag produces the same combined frame.
preds = pd.concat([preds, cents], sort=False)
preds[:10]

## Now we can plot the datapoints, colored by cluster, with the cluster centers as the large, diamond datapoints in the "center"
The cluster center datapoints are larger due to the data formatting we performed in the cells above

In [None]:
# Visualise the datapoints and cluster centers with plotly in 3-D:
# color follows the species column, marker symbol and size follow the center column
fig = px.scatter_3d(
    data_frame=preds,
    x='sepal_length',
    y='sepal_width',
    z='petal_width',
    color='species',
    symbol='center',
    size='center',
)
fig
