In [1]:
# get parquet data

!wget https://github.com/IBM/coursera/raw/master/hmp.parquet
    
df = spark.read.parquet('hmp.parquet')

df.show(10)

Waiting for a Spark session to start...
Spark Initialization Done! ApplicationId = app-20200213031336-0000
KERNEL_ID = 137bc8da-c0ac-435b-a1ef-be91da6bbd28
--2020-02-13 03:13:39--  https://github.com/IBM/coursera/raw/master/hmp.parquet
Resolving github.com (github.com)... 192.30.253.112
Connecting to github.com (github.com)|192.30.253.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/IBM/coursera/master/hmp.parquet [following]
--2020-02-13 03:13:39--  https://raw.githubusercontent.com/IBM/coursera/master/hmp.parquet
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 199.232.8.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|199.232.8.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 932997 (911K) [application/octet-stream]
Saving to: 'hmp.parquet'


2020-02-13 03:13:39 (27.5 MB/s) - 'hmp.parquet' saved [932997/932997]

+---+---+---+--------------------+-

In [2]:
# Expose the DataFrame to Spark SQL as a temporary view called 'df',
# so it can be queried with spark.sql("... from df ...").
df.createOrReplaceTempView(name='df')

In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import MinMaxScaler, OneHotEncoder, StringIndexer, VectorAssembler

# Label side: map the string 'class' column to a numeric index, then one-hot encode it.
indexer = StringIndexer(inputCol='class', outputCol='class_index')
encoder = OneHotEncoder(inputCol='class_index', outputCol='class_onehot')

# Feature side: pack x/y/z into a single vector column, then min-max rescale that vector.
vectorAssembler = VectorAssembler(inputCols=['x', 'y', 'z'], outputCol='features')
rescaler = MinMaxScaler(inputCol='features', outputCol='feature_rescaled')

# Stage order matters: each stage consumes a column produced by an earlier one.
preprocessing_stages = [indexer, encoder, vectorAssembler, rescaler]
pipeline = Pipeline(stages=preprocessing_stages)

# Fit on the full dataset and apply the fitted stages to the same data.
preprocessing_model = pipeline.fit(df)
preprocessed_df = preprocessing_model.transform(df)
preprocessed_df.show(10)

+---+---+---+--------------------+-----------+-----------+--------------+----------------+--------------------+
|  x|  y|  z|              source|      class|class_index|  class_onehot|        features|    feature_rescaled|
+---+---+---+--------------------+-----------+-----------+--------------+----------------+--------------------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|[0.34920634920634...|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|[0.34920634920634...|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|[0.34920634920634...|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|[0.34920634920634...|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[21.0,52.0,34.0]|[0.33333333333333...|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,51.0,34.0]|[0.34920634920

In [10]:
# Cluster the data with k-means, using one cluster per activity class (k=13)
# and a fixed seed so the run is repeatable.
from pyspark.ml.clustering import KMeans

# NOTE(review): KMeans is left on its default input column ('features'), i.e. the
# raw assembled x/y/z vector; 'feature_rescaled' and 'class_onehot' produced by the
# preprocessing pipeline are not consumed here — confirm that this is intended.
kmeans = KMeans(k=13, seed=2020)

# Chain preprocessing and clustering so both are fitted in one pass over df.
main_pipeline = Pipeline(stages=[pipeline, kmeans])
model = main_pipeline.fit(df)

In [12]:
# Mean k-means cost per row: computeCost returns the sum of squared distances of
# points to their nearest centre, which is then divided by the row count.
# model.stages is [fitted preprocessing PipelineModel, fitted KMeansModel]; reuse
# the already-fitted preprocessing stage instead of re-fitting `pipeline` on df,
# so scoring uses exactly the transformers the clustering was trained with and
# avoids a second fitting pass.
# NOTE(review): computeCost is deprecated since Spark 2.4 and removed in 3.0 —
# switch to ClusteringEvaluator or model.stages[1].summary.trainingCost when upgrading.
loss = model.stages[1].computeCost(model.stages[0].transform(df)) / df.count()
print(loss)

34.77824525276347


In [15]:
# Repeat the experiment on two activity classes only, with k=2.
df2 = spark.sql("select * from df where class = 'Brush_teeth' or class = 'Climb_stairs'")
kmeans = KMeans().setK(2).setSeed(2020)
model = Pipeline(stages=[pipeline, kmeans]).fit(df2)

# Mean k-means cost per row. model.stages[0] is the preprocessing stage that was
# just fitted on df2, so reuse it rather than re-fitting `pipeline` a second time.
# NOTE(review): computeCost is deprecated since Spark 2.4 and removed in 3.0.
loss = model.stages[1].computeCost(model.stages[0].transform(df2)) / df2.count()
print(loss)

134.4947776790293
