# Process Data

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

from utilities.data_preprocessing import separate_per_column, sequencing_data_by_one, extract_features_from_windows

## Load cleaned dataset and return raw train and test data

Because I had to delete the dataset (due to its size), we need to start from the original, uncleaned dataset.

Our goal is to have every dataset look like the following image. This makes it easier to preprocess the data for all the models, since we then just need to call the functions in /utilities.

![Forma do dataset após limpeza e preparação para processamento](data/PAMAP2_cleaned_dataset.png)

Let's start by downloading the dataset from the link below and placing the data, separated per subject, in the folder used in the next cell (`data/PAMAP2_Dataset_original/`).

https://archive.ics.uci.edu/dataset/231/pamap2+physical+activity+monitoring

In [7]:
# Folder holding the original per-subject files, plus one file name per subject (101-108).
data_path_original = "data/PAMAP2_Dataset_original/"
(s_1, s_2, s_3, s_4, s_5, s_6, s_7, s_8) = (
    f"subject{subject_number}.dat" for subject_number in range(101, 109)
)

In [None]:
# Load each subject's raw file: space-separated values with no header row,
# so pandas labels the columns with integers (0, 1, 2, ...).
subject_files = [s_1, s_2, s_3, s_4, s_5, s_6, s_7, s_8]
array_of_subjects = [
    pd.read_csv(data_path_original + file_name, sep=" ", header=None)
    for file_name in subject_files
]

# Keep the individual per-subject names available as well.
s1_df, s2_df, s3_df, s4_df, s5_df, s6_df, s7_df, s8_df = array_of_subjects

In [11]:
def clean_data(df_Subject):
    """Reduce one raw subject recording to the named columns used downstream.

    The input is expected to carry integer column labels, as produced by
    ``pd.read_csv(..., header=None)``. Every column from position 13 onwards
    is discarded, missing values are forward-filled, the kept columns receive
    descriptive names, and the three columns left at positions 4-6 are dropped.

    Args:
        df_Subject: Raw data frame of a single subject. It is not modified.

    Returns:
        A new DataFrame with the columns ``timestamp``, ``activity``, ``bpm``,
        ``temp(C)``, ``x-accel``, ``y-accel``, ``z-accel``, ``x-gyro``,
        ``y-gyro`` and ``z-gyro``.
    """
    readable_names = {
        0: "timestamp",
        1: "activity",
        2: "bpm",
        3: "temp(C)",
        7: "x-accel",
        8: "y-accel",
        9: "z-accel",
        10: "x-gyro",
        11: "y-gyro",
        12: "z-gyro",
    }
    return (
        df_Subject
        .drop(columns=df_Subject.columns[13:])
        # Fill each gap with the last value seen in the same column.
        .ffill()
        .rename(columns=readable_names)
        # Labels 4, 5 and 6 were not renamed, so they still sit at positions 4-6.
        .pipe(lambda named: named.drop(columns=named.columns[[4, 5, 6]]))
    )




# Clean every subject and tag its rows with a 1-based userID (the subject's
# position in array_of_subjects), then stack all of them with ONE concat.
# Concatenating inside the loop re-copies the accumulated frame on every
# iteration; building the list first avoids that quadratic cost.
cleaned_subjects = [
    clean_data(subject_df).assign(userID=user_id)
    for user_id, subject_df in enumerate(array_of_subjects, start=1)
]

# pd.concat rejects an empty list, so fall back to an empty frame in that case.
# Each subject keeps its own row index, so index values repeat across subjects.
combined_df = pd.concat(cleaned_subjects, axis=0) if cleaned_subjects else pd.DataFrame()

In [None]:
# Sanity check: the columns should match the ones shown in the reference image.
combined_df.head(n=2)

Unnamed: 0,timestamp,activity,bpm,temp(C),x-accel,y-accel,z-accel,x-gyro,y-gyro,z-gyro,userID
0,8.38,0,104.0,30.0,2.43954,8.76165,3.35465,-0.092217,0.056812,-0.015845,1
1,8.39,0,104.0,30.0,2.39494,8.55081,3.64207,-0.024413,0.047759,0.006474,1


In [None]:
# Save the cleaned dataset (the row index is not written).
cleaned_dataset_path = "data/PAMAP2_cleaned_dataset.csv"
combined_df.to_csv(cleaned_dataset_path, index=False)

In [None]:
# Load the cleaned dataset from disk (the same file produced by the cleaning step).
data_path = "data/PAMAP2_cleaned_dataset.csv"

data = pd.read_csv(filepath_or_buffer=data_path)

### Preparing the original cleaned dataset
__Drop unnecessary columns and a few activities__

In [5]:
# NOTE: this cell overwrites `data`; re-running it without reloading the CSV
# raises a KeyError because the dropped columns are already gone.

# Columns that are not needed for the rest of the pipeline.
columns_2drop = ["timestamp", "bpm", "temp(C)"]
data = data.drop(columns=columns_2drop)

# Activity ids to exclude (presumably PAMAP2 activity ids; see the dataset
# readme for the id -> activity mapping). Rows of all other activities are kept.
activities_2drop = [0, 4, 6, 7, 10, 11, 16, 17, 18, 19, 20, 24]
is_dropped_activity = data["activity"].isin(activities_2drop)
data = data.loc[~is_dropped_activity].reset_index(drop=True)

print(data["activity"].unique())
data.head(2)

[ 1  2  3 12 13  5]


Unnamed: 0,activity,x-accel,y-accel,z-accel,x-gyro,y-gyro,z-gyro,userID
0,1,-9.46791,-1.68076,1.1548,-0.00824,-0.029004,0.002536,8
1,1,-9.46778,-1.7411,1.23043,0.054293,-0.001861,-0.023345,8


### Saving for training and test

Para treino, aqui, selecionei apenas os userID de 1 a 3 e deixei o 8 para teste (para não ocupar muito espaço e ser mais rápido no processamento).

Nos modelos que eu treinei, cujo os dados estão no ppt, usei para treino os user de 1 a 7 e deixei o 8 para teste


In [None]:
# Split by subject: users 1-3 go to training, user 8 is held out for testing.
train_user_ids = [1, 2, 3]
test_user_ids = [8]
data_train = data.loc[data["userID"].isin(train_user_ids)].reset_index(drop=True)
data_test = data.loc[data["userID"].isin(test_user_ids)].reset_index(drop=True)

print("data_train são os users:", data_train["userID"].unique())
print("data_test são os users:", data_test["userID"].unique())

# userID was only needed to make the split, so remove it from both sets.
data_train = data_train.drop(columns="userID")
data_test = data_test.drop(columns="userID")

data_train.head(2)

data_train são os users: [3 2 1]
data_test são os users: [8]


Unnamed: 0,activity,x-accel,y-accel,z-accel,x-gyro,y-gyro,z-gyro
0,1,-1.25869,5.42474,7.86577,-0.1497,0.075643,0.126064
1,1,-1.42397,5.42599,7.92634,-0.153853,0.061406,0.097976


In [None]:
# Write the raw train/test splits to disk (the row index is not written).
for split_frame, split_path in (
    (data_train, "data/data_raw_train.csv"),
    (data_test, "data/data_raw_test.csv"),
):
    split_frame.to_csv(split_path, index=False)

## Load Raw Train / Test Data, for further processing

ps: as atividades do dataset são 1,2,3,5,12,13

In [None]:
# Reload the raw train/test splits and preview the training set.
raw_train_path = "data/data_raw_train.csv"
raw_test_path = "data/data_raw_test.csv"
data_train = pd.read_csv(raw_train_path)
data_test = pd.read_csv(raw_test_path)
data_train.head(2)

### Separate per Activity

In [7]:
# Split the training data by activity label; the result is looked up by activity id.
columm_to_separate_by = "activity"
data_activity_dic = separate_per_column(data_train, columm_to_separate_by)

# Inspect the subset for activity 1.
activity_1_frame = data_activity_dic[1]
print("data_activity_dic[1].shape", activity_1_frame.shape)
activity_1_frame.head(2)

data_activity_dic[1].shape (72661, 6)


Unnamed: 0,x-accel,y-accel,z-accel,x-gyro,y-gyro,z-gyro
0,-1.25869,5.42474,7.86577,-0.1497,0.075643,0.126064
1,-1.42397,5.42599,7.92634,-0.153853,0.061406,0.097976


### Create windows of sequences of the data

_Having sequences of data (windows) is necessary to train some generative models._

_It's also necessary in order to extract time and frequency features from the raw data, which is beneficial for training ML classification models. DL models, on the other hand, can often be fed raw data directly._

Esta função vai criar arrays de 3d, o primeiro vai ter as primeiras 6*100 linhas, ou seja de 1 a 600, o segundo vai ter de 2 a 601 e por aí fora. É possivel ajustar o overlap usando a função ``sequencing_data`` (em vez de esta) e passando um overlap de 0 a 1

In [8]:
# >>>>> EDIT HERE
seconds = 6  # Got this value from articles, but I tested with others as well
sampling_rate = 100
normalize = False

print("The number of each sequence is", seconds * sampling_rate)

# Build the windows once per activity, using the same settings for all of them.
windows_by_activity = {
    activity_id: sequencing_data_by_one(data_activity_dic[activity_id], seconds, sampling_rate, normalize)
    for activity_id in (1, 2, 3, 5, 12, 13)
}

# Expose each result under the per-activity name used by the following cells.
data_act_1_windows = windows_by_activity[1]
data_act_2_windows = windows_by_activity[2]
data_act_3_windows = windows_by_activity[3]
data_act_5_windows = windows_by_activity[5]
data_act_12_windows = windows_by_activity[12]
data_act_13_windows = windows_by_activity[13]


window_shape = data_act_1_windows.shape
print("Number of windows/sequences:", window_shape[0], "\nNumber of samples per sequence:", window_shape[1], "\nNumber of features per sequence:", window_shape[2])
window_shape

The number of each sequence is 600


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x1075049d0>>
Traceback (most recent call last):
  File "/opt/anaconda3/envs/ml_env_v12/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


Number of windows/sequences: 72061 
Number of samples per sequence: 600 
Number of features per sequence: 6


(72061, 600, 6)

### Guardar os dados sequenciados

In [None]:
# np.save does not create missing parent folders, so make sure the target
# folder exists first; otherwise a fresh checkout fails with FileNotFoundError.
sequences_dir = Path("data/sequences")
sequences_dir.mkdir(parents=True, exist_ok=True)

# One .npy file per activity, named data_train_act_<activity id>_windows.npy.
windows_to_save = {
    1: data_act_1_windows,
    2: data_act_2_windows,
    3: data_act_3_windows,
    5: data_act_5_windows,
    12: data_act_12_windows,
    13: data_act_13_windows,
}
for activity_id, activity_windows in windows_to_save.items():
    np.save(sequences_dir / f"data_train_act_{activity_id}_windows.npy", activity_windows)

### Extract Features from windows

__Three options are provided to extract the features:__
- 1 - Extract only a selection of features that were studied and mentioned in the article:
Experience: A Comparative Analysis of Multivariate Time-Series Generative Models: A Case Study on Human Activity Data
- 2 - Extract only another selection of features, mentioned in the article: A study of human activity recognition using adaboost classifiers on WISDM dataset
- False - Extract almost all possible features (most likely it will need an extra feature selection step, like using PCA)





__Column_names:__

An argument of type `list[str]` must be passed, and it must include accelerometer and/or gyroscope data (at least one of them).
The column names must look like ["x-accel", "y-accel", "z-accel", "x-gyro", "y-gyro", "z-gyro"]; this simplifies the calculation of the features.

It will only return a df with the computed features. If the data contains other columns, they will be lost unless the extraction functions are modified.

__PreSelected features = 1__
- {X, Y, Z} AVG: Average sensor value over the window (per axis).
- {X, Y, Z} PEAK: Time in milliseconds between the peaks in the wave associated with most activities. Heuristically determined (per axis).
- {X, Y, Z} ABSOLDEV: Average absolute difference between each of the Sequenced readings and the mean of those values (per axis).
- {X, Y, Z} STANDDEV: Standard deviation of the Sequenced values (per axis).
- RESULTANT: Average resultant value, computed by squaring each matching x, y, and z value, summing them, taking the square root, and then averaging these values over the Sequenced readings.

In [None]:
# IMPORTANT: the column names must be ["x-accel", "y-accel", "z-accel", "x-gyro", "y-gyro", "z-gyro"], in the same order as in the array

column_names = ["x-accel", "y-accel", "z-accel", "x-gyro", "y-gyro", "z-gyro"]
preselected_features = 1
sampling_rate = 100
band = (0.1, 3)  # Per the original author this is also the default value (suggested by ChatGPT)


def _extract_activity_features(activity_windows):
    """Run the feature extraction on one activity's windows with the settings above."""
    return extract_features_from_windows(activity_windows, column_names, preselected_features, sampling_rate, band)


data_act_1_extracted_features = _extract_activity_features(data_act_1_windows)
data_act_2_extracted_features = _extract_activity_features(data_act_2_windows)
data_act_3_extracted_features = _extract_activity_features(data_act_3_windows)
data_act_5_extracted_features = _extract_activity_features(data_act_5_windows)
data_act_12_extracted_features = _extract_activity_features(data_act_12_windows)
data_act_13_extracted_features = _extract_activity_features(data_act_13_windows)

In [None]:
# Label each feature table with its activity id (added in place as an "activity"
# column), then stack all of them into a single training table.
features_by_activity = {
    1: data_act_1_extracted_features,
    2: data_act_2_extracted_features,
    3: data_act_3_extracted_features,
    5: data_act_5_extracted_features,
    12: data_act_12_extracted_features,
    13: data_act_13_extracted_features,
}
for activity_id, activity_features in features_by_activity.items():
    activity_features["activity"] = activity_id

data_train_features = pd.concat(list(features_by_activity.values()), axis=0, ignore_index=True)
data_train_features.head(2)

In [None]:
# DataFrame.to_csv does not create missing parent folders, so make sure the
# target folder exists before writing (the row index is not written).
features_csv_path = Path("data/features/data_train_features.csv")
features_csv_path.parent.mkdir(parents=True, exist_ok=True)
data_train_features.to_csv(features_csv_path, index=False)