In [1]:
%pip install  edge-ml

Collecting edge-ml
  Downloading edge_ml-0.4.0-py3-none-any.whl.metadata (4.9 kB)
Downloading edge_ml-0.4.0-py3-none-any.whl (7.8 kB)
Installing collected packages: edge-ml
Successfully installed edge-ml-0.4.0
Note: you may need to restart the kernel to use updated packages.


In [45]:
%pip install imblearn

Note: you may need to restart the kernel to use updated packages.


In [47]:
from edgeml import DatasetReceiver
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt

In [5]:
# Load project object from the edge-ml backend.
import os

# NOTE(review): the access key below is committed in the notebook. Supply it through the
# EDGEML_KEY environment variable instead, and remove/rotate the embedded fallback
# before sharing this notebook.
edgeml_key = os.environ.get("EDGEML_KEY", "02a4fad735d3308b68672ddb7593f047")

project = DatasetReceiver("https://beta.edge-ml.org", edgeml_key)
project.loadData()

In [9]:
# Store project object as a local snapshot.
# The context manager guarantees the file is flushed and closed before it is read back.
with open("data_snapshot/project_css25.pkl", "wb") as snapshot_file:
    pickle.dump(project, snapshot_file)

# Reload the snapshot so the following cells work from the stored copy.
# pickle.load can execute arbitrary code: only load snapshots you created yourself.
with open("data_snapshot/project_css25.pkl", "rb") as snapshot_file:
    project = pickle.load(snapshot_file)

In [13]:
# Build one cleaned, metadata-enriched dataframe per usable recording and concatenate them.

# Minimum value of (timeSeries[0].end - timeSeries[0].start) for a recording to be kept;
# presumably milliseconds, i.e. 10 s — TODO confirm the unit.
MIN_RECORDING_SPAN = 10000
# Number of rows cut from the beginning and from the end of every recording.
EDGE_ROWS_TO_TRIM = 40

# Aggregation list
filtered_dataframes = []

for d in project.datasets:

    # Only keep devicemotion datasets that contain data and span a certain length.
    # `and` short-circuits, so timeSeries[0] is only accessed for datasets that
    # already passed the name and non-empty checks (the bitwise `&` evaluated all three).
    if (
        d.name == 'devicemotion'
        and d.data.shape[0] > 0
        and d.timeSeries[0].end - d.timeSeries[0].start > MIN_RECORDING_SPAN
    ):

        # Remove noisy data at beginning and ending of recording
        cleaned_data = d.data.iloc[EDGE_ROWS_TO_TRIM:-EDGE_ROWS_TO_TRIM].dropna()

        # Add metadata columns (one constant column per metadata key)
        enriched_data = cleaned_data.assign(**d.metaData)

        # Append to aggregation list
        filtered_dataframes.append(enriched_data)

# Concatenate individual dataframes
df_raw = pd.concat(filtered_dataframes)

In [15]:
# Discard every row that belongs to a "testing" recording
is_real_activity = df_raw['activity'] != 'testing'
df_filtered = df_raw[is_real_activity]

# Replace the browser column by dummy (one-hot) columns and drop the mobile column
browser_dummies = pd.get_dummies(df_filtered[["browser"]])
df_without_raw_metadata = df_filtered.drop(columns=["mobile", "browser"])
df_encoded = pd.concat([df_without_raw_metadata, browser_dummies], axis=1)

# Replace activity names by integer codes; `labels` keeps the names for decoding later
activity_codes, labels = df_encoded["activity"].factorize()
df_encoded["activity"] = activity_codes

# Use the time column as index
df_encoded_2 = df_encoded.set_index('time')

In [21]:
# Custom mode aggregation that returns a single value per window
def mode(x):
    """Return the most frequent value of the Series ``x``.

    ``Series.mode`` returns every tied value; only the first entry of that
    result is returned here. If there is no mode at all (``x`` is empty or
    contains only missing values) NaN is returned instead of raising.
    """
    modes = x.mode()
    if modes.empty:
        return np.nan
    return modes.iloc[0]

# Aggregation dictionary for the rolling groupby: the aggregations of a column depend on its dtype
non_float_columns = df_encoded_2.select_dtypes(exclude=[np.float32]).columns
float_columns = df_encoded_2.select_dtypes(include=[np.float32]).columns
summary_statistics = ["mean", "var", "min", "max", "median"]

# Non-float columns are reduced to their mode ...
aggs = {column: [mode] for column in non_float_columns}
# ... float32 columns to a set of summary statistics
aggs.update({column: summary_statistics for column in float_columns})

# Overwrite the earlier entry for activity -> special treatment (mode plus window size)
aggs["activity"] = [mode, 'count']

# participantId is the grouping key and must not be aggregated
del aggs["participantId"]
aggs

{'activity': [<function __main__.mode(x)>, 'count'],
 'browser_Chrome': [<function __main__.mode(x)>],
 'browser_Safari': [<function __main__.mode(x)>],
 'acceleration.x': ['mean', 'var', 'min', 'max', 'median'],
 'acceleration.y': ['mean', 'var', 'min', 'max', 'median'],
 'acceleration.z': ['mean', 'var', 'min', 'max', 'median'],
 'accelerationIncludingGravity.x': ['mean', 'var', 'min', 'max', 'median'],
 'accelerationIncludingGravity.y': ['mean', 'var', 'min', 'max', 'median'],
 'accelerationIncludingGravity.z': ['mean', 'var', 'min', 'max', 'median'],
 'rotationRate.alpha': ['mean', 'var', 'min', 'max', 'median'],
 'rotationRate.beta': ['mean', 'var', 'min', 'max', 'median'],
 'rotationRate.gamma': ['mean', 'var', 'min', 'max', 'median']}

In [27]:
# Aggregate a rolling 1 second time window separately for each participant
windows_per_participant = df_encoded_2.groupby("participantId").rolling("1s")
df_encoded_3 = windows_per_participant.agg(aggs)

# Flatten the (column, aggregation) column pairs: mode columns keep their plain name,
# all others become "<column>_<aggregation>"
df_encoded_3.columns = [
    column if aggregation == "mode" else f"{column}_{aggregation}"
    for column, aggregation in df_encoded_3.columns
]
df_encoded_3

Unnamed: 0_level_0,Unnamed: 1_level_0,activity,activity_count,browser_Chrome,browser_Safari,acceleration.x_mean,acceleration.x_var,acceleration.x_min,acceleration.x_max,acceleration.x_median,acceleration.y_mean,...,rotationRate.beta_mean,rotationRate.beta_var,rotationRate.beta_min,rotationRate.beta_max,rotationRate.beta_median,rotationRate.gamma_mean,rotationRate.gamma_var,rotationRate.gamma_min,rotationRate.gamma_max,rotationRate.gamma_median
participantId,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10d83,2025-06-03 12:01:23.600,0.0,1.0,1.0,0.0,0.000000,,0.0,0.0,0.00,0.000,...,0.100000,,0.1,0.1,0.10,-0.900000,,-0.9,-0.9,-0.9
10d83,2025-06-03 12:01:23.616,0.0,2.0,1.0,0.0,0.000000,0.000000,0.0,0.0,0.00,0.000,...,-0.150000,0.125000,-0.4,0.1,-0.15,-0.900000,0.000000,-0.9,-0.9,-0.9
10d83,2025-06-03 12:01:23.632,0.0,3.0,1.0,0.0,-0.033333,0.003333,-0.1,0.0,0.00,0.000,...,-0.466667,0.363333,-1.1,0.1,-0.40,-0.833333,0.013333,-0.9,-0.7,-0.9
10d83,2025-06-03 12:01:23.649,0.0,4.0,1.0,0.0,-0.050000,0.003333,-0.1,0.0,-0.05,0.025,...,-0.700000,0.460000,-1.4,0.1,-0.75,-0.525000,0.389167,-0.9,0.4,-0.8
10d83,2025-06-03 12:01:23.666,0.0,5.0,1.0,0.0,-0.060000,0.003000,-0.1,0.0,-0.10,0.020,...,-0.840000,0.443000,-1.4,0.1,-1.10,-0.340000,0.463000,-0.9,0.4,-0.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
uspgb,2025-06-23 20:26:23.914,0.0,60.0,1.0,0.0,0.000000,0.000000,0.0,0.0,0.00,0.000,...,0.185000,0.755534,-1.7,2.1,0.30,0.046667,0.002531,0.0,0.1,0.0
uspgb,2025-06-23 20:26:23.930,0.0,61.0,1.0,0.0,0.000000,0.000000,0.0,0.0,0.00,0.000,...,0.191803,0.745765,-1.7,2.1,0.30,0.045902,0.002525,0.0,0.1,0.0
uspgb,2025-06-23 20:26:23.947,0.0,60.0,1.0,0.0,0.000000,0.000000,0.0,0.0,0.00,0.000,...,0.160000,0.712271,-1.7,2.1,0.30,0.045000,0.002517,0.0,0.1,0.0
uspgb,2025-06-23 20:26:23.964,0.0,60.0,1.0,0.0,0.000000,0.000000,0.0,0.0,0.00,0.000,...,0.148333,0.691014,-1.7,2.1,0.30,0.043333,0.002497,0.0,0.1,0.0


In [34]:
# Keep only rolling time windows that contain at least 60 data points
has_enough_samples = df_encoded_3["activity_count"] >= 60

df_encoded_4 = (
    df_encoded_3[has_enough_samples]
    # Remove time (second index level) from the index
    .droplevel(1)
    # Turn the remaining participantId index into a regular column
    .reset_index()
)
df_encoded_4

Unnamed: 0,participantId,activity,activity_count,browser_Chrome,browser_Safari,acceleration.x_mean,acceleration.x_var,acceleration.x_min,acceleration.x_max,acceleration.x_median,...,rotationRate.beta_mean,rotationRate.beta_var,rotationRate.beta_min,rotationRate.beta_max,rotationRate.beta_median,rotationRate.gamma_mean,rotationRate.gamma_var,rotationRate.gamma_min,rotationRate.gamma_max,rotationRate.gamma_median
0,10d83,0.0,60.0,1.0,0.0,0.008333,0.011624,-0.2,0.3,0.0,...,-0.355000,1.314720,-2.7,3.6,-0.45,0.015000,0.912483,-1.5,3.0,0.05
1,10d83,0.0,61.0,1.0,0.0,0.008197,0.011432,-0.2,0.3,0.0,...,-0.324590,1.349219,-2.7,3.6,-0.40,0.011475,0.898033,-1.5,3.0,0.00
2,10d83,0.0,60.0,1.0,0.0,0.008333,0.011624,-0.2,0.3,0.0,...,-0.300000,1.424746,-2.7,3.6,-0.40,0.038333,0.885116,-1.5,3.0,0.05
3,10d83,0.0,60.0,1.0,0.0,0.011667,0.011556,-0.2,0.3,0.0,...,-0.255000,1.473025,-2.7,3.6,-0.30,0.040000,0.882780,-1.5,3.0,0.05
4,10d83,0.0,60.0,1.0,0.0,0.015000,0.011466,-0.2,0.3,0.0,...,-0.201667,1.519489,-2.7,3.6,-0.30,0.006667,0.925040,-1.6,3.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108191,uspgb,0.0,60.0,1.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.185000,0.755534,-1.7,2.1,0.30,0.046667,0.002531,0.0,0.1,0.00
108192,uspgb,0.0,61.0,1.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.191803,0.745765,-1.7,2.1,0.30,0.045902,0.002525,0.0,0.1,0.00
108193,uspgb,0.0,60.0,1.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.160000,0.712271,-1.7,2.1,0.30,0.045000,0.002517,0.0,0.1,0.00
108194,uspgb,0.0,60.0,1.0,0.0,0.000000,0.000000,0.0,0.0,0.0,...,0.148333,0.691014,-1.7,2.1,0.30,0.043333,0.002497,0.0,0.1,0.00


In [42]:
# Drop NA values (the original run noted 0 NA values) and convert the activity
# column back from integer codes to labels.
# assign() builds the decoded column on a new frame, which avoids attribute-style
# assignment on the frame returned by dropna() (SettingWithCopy hazard).
df_encoded_5 = df_encoded_4.dropna().assign(
    activity=lambda frame: frame["activity"].map(lambda code: labels[int(code)])
)

In [44]:
# Store features; the context manager makes sure the file is flushed and closed
with open("../features.pkl", "wb") as features_file:
    pickle.dump(df_encoded_5, features_file)

In [None]:
# ...

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# NOTE(review): `df4` is not defined in any cell visible in this notebook; it must hold the
# feature table (with `activity` and `participantId` columns) before this cell runs.

# Balance the activity classes by random undersampling; the fixed seed makes the
# selected subset reproducible across runs.
X, y = RandomUnderSampler(random_state=42).fit_resample(df4.drop(columns=["activity"]), df4.activity)

# Keep the participant of every sample as the grouping key for group-wise cross-validation,
# then remove it from the feature matrix (new frame instead of an in-place drop).
groups = X.participantId
X = X.drop(columns=["participantId"])

In [None]:
import seaborn as sns

# Select the features derived from acceleration.x and attach the target.
# assign() returns a new frame, so no column is written into a slice of X
# (which would trigger a SettingWithCopyWarning).
acceleration_x_columns = [col for col in X.columns if col.startswith("acceleration.x_")]
df5 = X[acceleration_x_columns].assign(activity=y)

sns.pairplot(df5, hue="activity")

In [None]:
import warnings
from sklearn.model_selection import cross_validate,LeaveOneGroupOut
from sklearn.tree import DecisionTreeClassifier

# NOTE(review): this silences every warning for the rest of the kernel session, not only
# those raised during cross-validation — consider narrowing it to the expected category.
warnings.filterwarnings('ignore')

classifier_metrics = [
    "accuracy", "balanced_accuracy", "precision_macro", "recall_macro",
    "f1_macro"
]

# Leave-one-group-out cross-validation with participantId as group: every fold tests on
# one held-out participant. The classifier seed is fixed so the scores are reproducible.
scores = pd.DataFrame(
    cross_validate(
        DecisionTreeClassifier(random_state=42),
        X,
        y,
        cv=LeaveOneGroupOut(),
        groups=groups,
        scoring=classifier_metrics,
        verbose=True,
    )
)

scores

In [None]:
# Average every column of the cross-validation result table over all folds
scores.mean()

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GroupKFold

# Recursive feature elimination with 5-fold group-wise cross-validation (groups keep all
# samples of a participant in the same fold), removing one feature per step and scoring
# by macro F1. The estimator seed is fixed so the selected feature set is reproducible.
rfecv = RFECV(
    estimator=DecisionTreeClassifier(random_state=42),
    cv=GroupKFold(n_splits=5),
    step=1,
    verbose=1,
    scoring='f1_macro',
).fit(X, y, groups=groups)


In [None]:
# Report the optimal number of features, the boolean selection mask and the ranking of all features
print(f"Optimal number of features : {rfecv.n_features_:d}")
print(f"Selected features : {rfecv.support_}")
print(f"Feature ranking : {rfecv.ranking_}")

In [None]:
# Wrap the cross-validation results recorded by RFECV in a DataFrame for display and plotting
cv_results = pd.DataFrame(rfecv.cv_results_)
cv_results


In [None]:
import matplotlib.pyplot as plt

# Mean cross-validated score (with its standard deviation) per number of selected features
fig, ax = plt.subplots()
ax.errorbar(
    # Row i of cv_results belongs to i + 1 features: RFECV was run with step=1 and the
    # default minimum of one feature, so the 0-based index has to be shifted by one.
    x=cv_results.index + 1,
    y=cv_results["mean_test_score"],
    yerr=cv_results["std_test_score"],
)
ax.set_xlabel("Number of features selected")
# RFECV was scored with 'f1_macro', not accuracy
ax.set_ylabel("Mean test F1 (macro)")
ax.set_title("Recursive Feature Elimination")
plt.show()

In [None]:
# Reduce the feature table to participantId, activity and the features selected by RFECV,
# then drop rows with missing values
selected_features = rfecv.get_feature_names_out(X.columns)
df6 = df4[["participantId", "activity", *selected_features]].dropna()
df6