# Feature Extraction aus InfluxDB und Klassifikation

## Imports

In [37]:
import numpy as np
import pandas as pd
import seaborn as sns
from influxdb_client import InfluxDBClient
import matplotlib.pyplot as plt

## Daten Laden
Wier wollen die Daten direkt aus Influx Laden.
Dafür bauen wir zuerst eine Connection zur Influx Instanz auf

In [38]:
BUCKET = "extended_labels"
URL = "https://css21.teco.edu"
TOKEN = "***REMOVED***"
ORG = "css21"
client = InfluxDBClient(url=URL, token=TOKEN, org=ORG, verify_ssl=False)
query_api = client.query_api()

### Querying Influx
Der Query wird in Flux geschrieben

In [39]:
query = '''
    from(bucket: bucket)
      |> range(start: -60d, stop: now())
      |> filter(fn: (r) => r.label != "testing")
      |> pivot(rowKey: ["_time", "label", "subject"], columnKey: ["_field", "_measurement"], valueColumn: "_value")
'''
params = {
    "bucket": BUCKET
}
result = query_api.query_data_frame(query, params=params)
result = result.set_index("_time", drop=True)
result = result.drop(columns=["table", "result", "_start", "_stop", "browser", "mobile"])
result["label"] = pd.Categorical(result["label"])
result




Unnamed: 0_level_0,label,subject,alpha_devicemotion,beta_devicemotion,gamma_devicemotion,x_devicemotion,x0_devicemotion,y_devicemotion,y0_devicemotion,z_devicemotion,z0_devicemotion,alpha_deviceorientation,beta_deviceorientation,gamma_deviceorientation
_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2021-08-14 11:58:37.871000+00:00,driving,0c86078c3e,23.695926,-41.337541,-9.004711,0.774075,0.479989,-5.938367,0.633047,-9.519047,-2.245780,,,
2021-08-14 11:58:37.887000+00:00,driving,0c86078c3e,52.862054,-18.493067,-2.514026,0.685190,0.455864,-6.186018,0.483131,-9.409512,-2.223397,,,
2021-08-14 11:58:37.903000+00:00,driving,0c86078c3e,65.476283,4.852037,-1.505551,0.137966,-0.083518,-5.850979,0.967563,-8.309975,-1.265203,,,
2021-08-14 11:58:37.920000+00:00,driving,0c86078c3e,59.653137,3.318965,-8.634220,-0.124648,-0.371062,-6.084713,0.887691,-7.275830,-0.384150,,,
2021-08-14 11:58:37.937000+00:00,driving,0c86078c3e,51.026444,-1.898931,-12.700321,-0.046388,-0.306667,-6.291212,0.746649,-6.849212,-0.024907,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-08-21 11:42:30.367000+00:00,studying,hj44jjh740,,,,,,,,,,6.368002,52.924892,-9.198874
2021-08-21 11:42:30.383000+00:00,studying,hj44jjh740,,,,,,,,,,6.306407,53.026791,-9.471686
2021-08-21 11:42:30.400000+00:00,studying,hj44jjh740,,,,,,,,,,6.221469,52.874954,-9.370203
2021-08-21 11:42:30.417000+00:00,studying,hj44jjh740,,,,,,,,,,6.290653,52.814398,-9.351599


## Feature Extraction

In [40]:
import findspark
findspark.init("/opt/apache-spark/")

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [41]:
df = spark.createDataFrame(result.reset_index(
))

In [42]:
from pyspark.sql.functions import udf, col,  window, mean, sum as _sum, max as _max, min as _min, var_samp, to_timestamp
from pyspark.sql.types import IntegerType

df = df.withColumn("_time", to_timestamp("_time"))
w = window("_time", "1 seconds")

aggregate = ["alpha_devicemotion", "beta_devicemotion", "gamma_devicemotion", "x_devicemotion", "y_devicemotion", "z_devicemotion", "alpha_deviceorientation", "beta_deviceorientation", "gamma_deviceorientation"] 
funs = [mean, _sum, _max, var_samp, _min]

exprs = [f(col(c)) for f in funs for c in aggregate]

grouped = df.fillna(0).groupBy([w, "subject", "label"]).agg(*exprs)

## Time Based Features
hour = udf(lambda x: x.hour, IntegerType())
grouped = grouped.withColumn("hourOfDay", hour("window.start"))
                             
grouped = grouped.drop("window")
grouped

DataFrame[subject: string, label: string, avg(alpha_devicemotion): double, avg(beta_devicemotion): double, avg(gamma_devicemotion): double, avg(x_devicemotion): double, avg(y_devicemotion): double, avg(z_devicemotion): double, avg(alpha_deviceorientation): double, avg(beta_deviceorientation): double, avg(gamma_deviceorientation): double, sum(alpha_devicemotion): double, sum(beta_devicemotion): double, sum(gamma_devicemotion): double, sum(x_devicemotion): double, sum(y_devicemotion): double, sum(z_devicemotion): double, sum(alpha_deviceorientation): double, sum(beta_deviceorientation): double, sum(gamma_deviceorientation): double, max(alpha_devicemotion): double, max(beta_devicemotion): double, max(gamma_devicemotion): double, max(x_devicemotion): double, max(y_devicemotion): double, max(z_devicemotion): double, max(alpha_deviceorientation): double, max(beta_deviceorientation): double, max(gamma_deviceorientation): double, var_samp(alpha_devicemotion): double, var_samp(beta_devicemotion

## Trainings

Let's build a pipeline and traing it using spark.

In [43]:
from pyspark.sql.types import StringType, DoubleType

num_cols = [f.name for f in grouped.schema.fields if isinstance(f.dataType, DoubleType) or isinstance(f.dataType, IntegerType)]

num_cols

['avg(alpha_devicemotion)',
 'avg(beta_devicemotion)',
 'avg(gamma_devicemotion)',
 'avg(x_devicemotion)',
 'avg(y_devicemotion)',
 'avg(z_devicemotion)',
 'avg(alpha_deviceorientation)',
 'avg(beta_deviceorientation)',
 'avg(gamma_deviceorientation)',
 'sum(alpha_devicemotion)',
 'sum(beta_devicemotion)',
 'sum(gamma_devicemotion)',
 'sum(x_devicemotion)',
 'sum(y_devicemotion)',
 'sum(z_devicemotion)',
 'sum(alpha_deviceorientation)',
 'sum(beta_deviceorientation)',
 'sum(gamma_deviceorientation)',
 'max(alpha_devicemotion)',
 'max(beta_devicemotion)',
 'max(gamma_devicemotion)',
 'max(x_devicemotion)',
 'max(y_devicemotion)',
 'max(z_devicemotion)',
 'max(alpha_deviceorientation)',
 'max(beta_deviceorientation)',
 'max(gamma_deviceorientation)',
 'var_samp(alpha_devicemotion)',
 'var_samp(beta_devicemotion)',
 'var_samp(gamma_devicemotion)',
 'var_samp(x_devicemotion)',
 'var_samp(y_devicemotion)',
 'var_samp(z_devicemotion)',
 'var_samp(alpha_deviceorientation)',
 'var_samp(beta_de

### Train Test Split by Subjects

In [44]:
subjects = grouped.select("subject").distinct()
train_subjects, test_subjects = subjects.randomSplit([0.7, 0.3])

train_df = grouped.join(train_subjects, "subject").drop("subject")
test_df = grouped.join(test_subjects, "subject").drop("subject")

In [47]:

from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.ml.feature import StringIndexer,  VectorIndexer, VectorAssembler, IndexToString

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(grouped)
train_data = labelIndexer.transform(train_df)
test_data = labelIndexer.transform(test_df)

featureAssembler = VectorAssembler(inputCols = num_cols, outputCol = "features")

# Train a DecisionTree model.
dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="features", maxDepth=10)

# Chain indexers and tree in a Pipeline
indexRevert = IndexToString(inputCol="prediction", outputCol="predLabel", labels=labelIndexer.labels)
pipeline = Pipeline(stages=[featureAssembler, dt])

# Train model.  This also runs the indexers.
model = pipeline.fit(train_data)

21/08/24 13:43:36 WARN TaskSetManager: Stage 527 contains a task of very large size (8654 KiB). The maximum recommended task size is 1000 KiB.
21/08/24 13:43:39 WARN TaskSetManager: Stage 530 contains a task of very large size (8654 KiB). The maximum recommended task size is 1000 KiB.
21/08/24 13:43:40 WARN TaskSetManager: Stage 532 contains a task of very large size (8654 KiB). The maximum recommended task size is 1000 KiB.


In [48]:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Make predictions.
predictions = model.transform(test_data)

# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g " % (1.0 - accuracy))

21/08/24 13:46:17 WARN TaskSetManager: Stage 610 contains a task of very large size (8654 KiB). The maximum recommended task size is 1000 KiB.
21/08/24 13:46:18 WARN TaskSetManager: Stage 612 contains a task of very large size (8654 KiB). The maximum recommended task size is 1000 KiB.


+----------+------------+--------------------+
|prediction|indexedLabel|            features|
+----------+------------+--------------------+
|       0.0|         3.0|[-0.3790788402144...|
|       0.0|         3.0|[-0.0041126379926...|
|       0.0|         3.0|[-9.1602241620192...|
|       0.0|         3.0|[-0.0070602165788...|
|       0.0|         3.0|[0.02152862151651...|
+----------+------------+--------------------+
only showing top 5 rows



21/08/24 13:46:27 WARN TaskSetManager: Stage 626 contains a task of very large size (8654 KiB). The maximum recommended task size is 1000 KiB.
21/08/24 13:46:28 WARN TaskSetManager: Stage 625 contains a task of very large size (8654 KiB). The maximum recommended task size is 1000 KiB.


Test Error = 0.114238 




In [51]:
from onnxmltools import convert_sparkml, utils
from onnxmltools.convert.sparkml.utils import buildInitialTypesSimple

initial_types = buildInitialTypesSimple(train_data.drop("indexedLabel", "label"))
onnx_model = convert_sparkml(model, 'Context Activity Predictor', initial_types, spark_session = spark)
utils.save_model(onnx_model, 'ks_context.onnx')

The maximum opset needed by this model is only 4.


In [None]:
print(labelIndexer.labels)

['studying', 'driving', 'standing', 'sleeping', 'jogging']
