In [1]:
# get parquet

!wget https://github.com/IBM/coursera/raw/master/hmp.parquet
    
df = spark.read.parquet('hmp.parquet')

df.show(10)

Waiting for a Spark session to start...
Spark Initialization Done! ApplicationId = app-20200213024621-0000
KERNEL_ID = e301e143-1949-4927-a959-acec2e29b89c
--2020-02-13 02:46:24--  https://github.com/IBM/coursera/raw/master/hmp.parquet
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/IBM/coursera/master/hmp.parquet [following]
--2020-02-13 02:46:24--  https://raw.githubusercontent.com/IBM/coursera/master/hmp.parquet
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 199.232.8.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|199.232.8.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 932997 (911K) [application/octet-stream]
Saving to: 'hmp.parquet'


2020-02-13 02:46:24 (45.1 MB/s) - 'hmp.parquet' saved [932997/932997]

+---+---+---+--------------------+-----

In [2]:
# Expose the DataFrame to Spark SQL as a temporary view named 'df'
# (an existing view with that name is replaced).
df.createOrReplaceTempView(name='df')

In [3]:
# Data preprocessing pipeline: encode the 'class' label and rescale the x/y/z features.
from pyspark.ml import Pipeline
from pyspark.ml.feature import MinMaxScaler, OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.linalg import Vectors

# Feature branch: pack x, y, z into one vector column, then min-max rescale it.
vectorizer = VectorAssembler(
    inputCols=['x', 'y', 'z'],
    outputCol='features',
)
rescaler = MinMaxScaler(
    inputCol='features',
    outputCol='features_rescaled',
)

# Label branch: string 'class' -> numeric index -> one-hot vector.
indexer = StringIndexer(
    inputCol='class',
    outputCol='class_index',
)
encoder = OneHotEncoder(
    inputCol='class_index',
    outputCol='encoded_class',
)

# Stage order matters: each stage reads a column produced by an earlier one.
# Note that `pipeline` holds the *fitted* model returned by .fit(df).
pipeline = Pipeline(
    stages=[
        indexer,
        encoder,
        vectorizer,
        rescaler,
    ]
).fit(df)

# Apply the fitted stages to the same DataFrame and preview 10 rows.
preprocessed_df = pipeline.transform(df)
preprocessed_df.show(n=10)

+---+---+---+--------------------+-----------+-----------+--------------+----------------+--------------------+
|  x|  y|  z|              source|      class|class_index| encoded_class|        features|   features_rescaled|
+---+---+---+--------------------+-----------+-----------+--------------+----------------+--------------------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|[0.34920634920634...|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|[0.34920634920634...|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|[0.34920634920634...|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|[0.34920634920634...|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[21.0,52.0,34.0]|[0.33333333333333...|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,51.0,34.0]|[0.34920634920

In [8]:
# Rebuild the preprocessing pipeline using the estimator-based one-hot encoder.
#
# Spark 3.0 removed OneHotEncoderEstimator and renamed it to OneHotEncoder, so the
# plain import fails there. Fall back to an alias so the name OneHotEncoderEstimator
# stays usable (here and in later cells) on both Spark 2.x and 3.x.
try:
    from pyspark.ml.feature import OneHotEncoderEstimator
except ImportError:
    from pyspark.ml.feature import OneHotEncoder as OneHotEncoderEstimator

# StringIndexer, VectorAssembler, MinMaxScaler and Pipeline are not imported in this
# cell; they must already be in scope from an earlier cell.

# Label branch: string 'class' -> numeric index -> one-hot vector.
indexer = StringIndexer(inputCol='class', outputCol='class_index')
# The estimator variant takes lists of input/output columns.
encoder = OneHotEncoderEstimator(inputCols=['class_index'], outputCols=['encoded_class'])

# Feature branch: pack x, y, z into one vector column, then min-max rescale it.
vectorizer = VectorAssembler(inputCols=['x','y','z'], outputCol='features')
rescaler = MinMaxScaler(inputCol='features', outputCol='features_rescaled')

# `pipeline` holds the fitted model returned by .fit(df), not the unfitted Pipeline.
pipeline = Pipeline(stages=[indexer, encoder, vectorizer, rescaler]).fit(df)
preprocessed_df = pipeline.transform(df)
preprocessed_df.show(10)

+---+---+---+--------------------+-----------+-----------+--------------+----------------+--------------------+
|  x|  y|  z|              source|      class|class_index| encoded_class|        features|   features_rescaled|
+---+---+---+--------------------+-----------+-----------+--------------+----------------+--------------------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|[0.34920634920634...|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|[0.34920634920634...|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|[0.34920634920634...|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|[0.34920634920634...|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[21.0,52.0,34.0]|[0.33333333333333...|
| 22| 51| 34|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,51.0,34.0]|[0.34920634920

In [11]:
# Same preprocessing pipeline, with a Normalizer stage inserted before the min-max rescaling.
from pyspark.ml.feature import Normalizer

# Feature branch: assemble x/y/z -> normalize each row vector -> min-max rescale.
vectorizer = VectorAssembler(
    inputCols=['x', 'y', 'z'],
    outputCol='features',
)
normalizer = Normalizer(
    inputCol='features',
    outputCol='features_norm',
)
# The rescaler now reads the normalized column rather than the raw 'features'.
rescaler = MinMaxScaler(
    inputCol='features_norm',
    outputCol='features_rescaled',
)

# Label branch: string 'class' -> numeric index -> one-hot vector.
# OneHotEncoderEstimator is expected to be in scope from an earlier cell.
indexer = StringIndexer(
    inputCol='class',
    outputCol='class_index',
)
encoder = OneHotEncoderEstimator(
    inputCols=['class_index'],
    outputCols=['encoded_class'],
)

# Fit all stages on df; `pipeline` is the resulting fitted model.
pipeline = Pipeline(
    stages=[
        indexer,
        encoder,
        vectorizer,
        normalizer,
        rescaler,
    ]
).fit(df)

preprocessed_df = pipeline.transform(df)
preprocessed_df.show(n=10)

+---+---+---+--------------------+-----------+-----------+--------------+----------------+--------------------+--------------------+
|  x|  y|  z|              source|      class|class_index| encoded_class|        features|       features_norm|   features_rescaled|
+---+---+---+--------------------+-----------+-----------+--------------+----------------+--------------------+--------------------+
| 22| 49| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|[0.34316403829308...|[0.35689598697470...|
| 22| 49| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,49.0,35.0]|[0.34316403829308...|[0.35689598697470...|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|[0.33117360613218...|[0.34442574929594...|
| 22| 52| 35|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.0])|[22.0,52.0,35.0]|[0.33117360613218...|[0.34442574929594...|
| 21| 52| 34|Accelerometer-201...|Brush_teeth|        6.0|(13,[6],[1.