In [1]:
# download the file containing the data in PARQUET format
!wget https://github.com/IBM/coursera/raw/master/hmp.parquet

# create a dataframe out of it
df = spark.read.parquet('hmp.parquet')

Waiting for a Spark session to start...
Spark Initialization Done! ApplicationId = app-20200214024744-0000
KERNEL_ID = 2d6ce03e-9f3e-4ac3-91ce-f52b6d45b2ab
--2020-02-14 02:47:46--  https://github.com/IBM/coursera/raw/master/hmp.parquet
Resolving github.com (github.com)... 192.30.253.112
Connecting to github.com (github.com)|192.30.253.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/IBM/coursera/master/hmp.parquet [following]
--2020-02-14 02:47:46--  https://raw.githubusercontent.com/IBM/coursera/master/hmp.parquet
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 199.232.8.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|199.232.8.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 932997 (911K) [application/octet-stream]
Saving to: 'hmp.parquet'


2020-02-14 02:47:47 (21.3 MB/s) - 'hmp.parquet' saved [932997/932997]



In [3]:
df.createOrReplaceTempView('df')

In [4]:
df.show(5)

+---+---+---+--------------------+-----------+
|  x|  y|  z|              source|      class|
+---+---+---+--------------------+-----------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|
+---+---+---+--------------------+-----------+
only showing top 5 rows



In [6]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, Normalizer, MinMaxScaler
from pyspark.ml import Pipeline

indexer = StringIndexer(inputCol='class', outputCol='class_index')
encoder = OneHotEncoder(inputCol='class_index', outputCol='class_encoded')
vectorAssembler = VectorAssembler(inputCols=['x','y','z'], outputCol='features')
normalizer = Normalizer(inputCol='features', outputCol='features_norm', p=1.0)
minmaxscaler = MinMaxScaler(inputCol='features_norm', outputCol='features_rescaled')

pipeline = Pipeline(stages=[indexer, encoder, vectorAssembler, normalizer, minmaxscaler])
preprocessed_df = pipeline.fit(df).transform(df)
preprocessed_df.show(5)

+---+---+---+--------------------+-----------+-----------+--------------+----------------+--------------------+--------------------+
|  x|  y|  z|              source|      class|class_index| class_encoded|        features|       features_norm|   features_rescaled|
+---+---+---+--------------------+-----------+-----------+--------------+----------------+--------------------+--------------------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|[0.20754716981132...|[0.26684636118598...|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|[0.20754716981132...|[0.26684636118598...|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|[0.20183486238532...|[0.25950196592398...|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|[0.20183486238532...|[0.25950196592398...|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.

In [8]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

kmeans = KMeans(featuresCol='features').setK(14).setSeed(2020)
pipeline = Pipeline(stages=[vectorAssembler, kmeans])
predictions = pipeline.fit(df).transform(df)

evaluator = ClusteringEvaluator()

silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

Silhouette with squared euclidean distance = 0.3907092517982006


In [10]:
# working with normalized data
kmeans = KMeans(featuresCol='features_rescaled').setK(14).setSeed(2020)
pipeline = Pipeline(stages=[vectorAssembler, normalizer, minmaxscaler, kmeans])
predictions = pipeline.fit(df).transform(df)
silhouette = ClusteringEvaluator().evaluate(predictions)
print(f'silhouette score is {silhouette}')

silhouette score is 0.21450412971430025


In [12]:
from pyspark.ml.clustering import GaussianMixture

gmm = GaussianMixture(featuresCol='features_rescaled').setK(14).setSeed(2020)

pipeline = Pipeline(stages=[vectorAssembler, normalizer, minmaxscaler, gmm])
predictions = pipeline.fit(df).transform(df)
silhouette = ClusteringEvaluator().evaluate(predictions)
print(f'silhouette score is {silhouette}')

silhouette score is 0.0840287050008519
