# Machine Learning - Clustering

In [1]:
from pyspark.sql import SparkSession

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

In [2]:
# Reuse the active SparkSession if one exists, otherwise create a new one.
spark = SparkSession.builder.getOrCreate()

In [3]:
# Location of the input data, relative to the notebook's working directory.
data_path = '../Data'
# Server utilization CSV that is loaded in the next step.
file_path = f'{data_path}/utilization.csv'

In [4]:
# Load the headerless CSV (Spark auto-names the columns _c0, _c1, ... and
# infers their types), then give each column a descriptive name.
df = (
    spark.read.csv(file_path, header=False, inferSchema=True)
    .withColumnRenamed('_c0', 'event_datetime')
    .withColumnRenamed('_c1', 'server_id')
    .withColumnRenamed('_c2', 'cpu_utilization')
    .withColumnRenamed('_c3', 'free_memory')
    .withColumnRenamed('_c4', 'session_count')
)

In [5]:
# Preview the first 5 rows to confirm the renamed columns.
df.show(5)

+-------------------+---------+---------------+-----------+-------------+
|     event_datetime|server_id|cpu_utilization|free_memory|session_count|
+-------------------+---------+---------------+-----------+-------------+
|03/05/2019 08:06:14|      100|           0.57|       0.51|           47|
|03/05/2019 08:11:14|      100|           0.47|       0.62|           43|
|03/05/2019 08:16:14|      100|           0.56|       0.57|           62|
|03/05/2019 08:21:14|      100|           0.57|       0.56|           50|
|03/05/2019 08:26:14|      100|           0.35|       0.46|           43|
+-------------------+---------+---------------+-----------+-------------+
only showing top 5 rows



---------

# Vectorizing Features: transform using VectorAssembler
- Spark ML algorithms expect their inputs in vector format.

In [6]:
# Combine the three numeric columns into a single vector column named
# 'features'; the vector keeps the order given in inputCols.
vector_assembler = VectorAssembler(
    outputCol='features',
    inputCols=[
        'cpu_utilization',
        'free_memory',
        'session_count',
    ],
)

In [8]:
# Apply the assembler to the original DataFrame; the result is a new
# DataFrame, bound to a separate name so `df` itself is not reassigned.
vcluster_df = vector_assembler.transform(df)

In [9]:
# Preview the first 5 rows of the assembled DataFrame.
vcluster_df.show(5)

+-------------------+---------+---------------+-----------+-------------+----------------+
|     event_datetime|server_id|cpu_utilization|free_memory|session_count|        features|
+-------------------+---------+---------------+-----------+-------------+----------------+
|03/05/2019 08:06:14|      100|           0.57|       0.51|           47|[0.57,0.51,47.0]|
|03/05/2019 08:11:14|      100|           0.47|       0.62|           43|[0.47,0.62,43.0]|
|03/05/2019 08:16:14|      100|           0.56|       0.57|           62|[0.56,0.57,62.0]|
|03/05/2019 08:21:14|      100|           0.57|       0.56|           50|[0.57,0.56,50.0]|
|03/05/2019 08:26:14|      100|           0.35|       0.46|           43|[0.35,0.46,43.0]|
+-------------------+---------+---------------+-----------+-------------+----------------+
only showing top 5 rows



We can see that there is a new column 'features' in the newly created DataFrame.

In [11]:
# type(vcluster_df)

# Model Creation

In [13]:
# Configure the k-means estimator: 3 clusters, with a fixed random seed (1)
# so that centroid initialization is repeatable across runs.
kmeans = KMeans(k=3, seed=1)

In [14]:
# Fit k-means on the assembled DataFrame (no featuresCol is set on the
# estimator, so Spark's default input column 'features' is used).
# NOTE(review): the features are not scaled. session_count is in the tens while
# cpu_utilization and free_memory are fractions, so session_count likely
# dominates the distance computation -- consider scaling first; TODO confirm.
k_model = kmeans.fit(vcluster_df)

# Get Cluster Centers

In [16]:
# Display the fitted cluster centers: one array per cluster.
k_model.clusterCenters()

[array([ 0.71542187,  0.28469012, 87.5516823 ]),
 array([ 0.51439668,  0.48445202, 50.49452021]),
 array([ 0.62129573,  0.37851014, 69.19070448])]

We can see that there are 3 cluster centers, matching the number of clusters we specified.
- Each cluster center is represented by 3 feature values, in the order ('cpu_utilization', 'free_memory', 'session_count').

--------