# Synthetic data testing 2

Using data from timegan_tsgm

In [1]:
import pandas as pd
import numpy as np

## Load Original data

In [21]:
# Locations of the pre-computed feature tables for the original (real) data.
ori_data_train_features_path = "data/features/data_train_features.csv"
ori_data_test_features_path = "data/features/data_test_features.csv"

# Read the train and test splits into DataFrames.
ori_data_train_features = pd.read_csv(ori_data_train_features_path)
ori_data_test_features = pd.read_csv(ori_data_test_features_path)

# Preview the first two training rows.
ori_data_train_features.head(2)

Unnamed: 0,mean_x-accel,peak_x-accel,abs_dev_x-accel,std_x-accel,mean_y-accel,peak_y-accel,abs_dev_y-accel,std_y-accel,mean_z-accel,peak_z-accel,...,peak_y-gyro,abs_dev_y-gyro,std_y-gyro,mean_z-gyro,peak_z-gyro,abs_dev_z-gyro,std_z-gyro,resultant_accel,resultant_gyro,activity
0,7.019093,35.508982,0.029864,0.037623,1.25304,40.821918,0.081643,0.108398,6.930752,33.908046,...,31.481481,0.017265,0.0217,-0.006869,31.322751,0.013767,0.017877,7.019093,0.021392,1
1,7.075111,38.076923,0.053842,0.08688,2.528753,49.256198,0.101113,0.129301,6.577109,39.2,...,33.502825,0.018807,0.02453,0.001009,30.408163,0.018829,0.023044,7.075111,0.024693,1


## Load Syn data and process it

In [12]:
# Synthetic windows for activity 13, stored as a NumPy .npy file.
syn_data_act_13_windows_path = "data/synthetic/timegan_tsgm_100epochs_synthetic_data_act_13_windows.npy"

# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code if the file is untrusted. Confirm whether this file is a
# plain numeric array, in which case the flag could be dropped.
syn_data_act_13_windows = np.load(syn_data_act_13_windows_path, allow_pickle=True)

# Names of the six sensor channels passed to the feature extractor below.
column_names = ["x-accel", "y-accel", "z-accel", "x-gyro", "y-gyro", "z-gyro"]


## Process

In [13]:
from utilities.data_preprocessing import sequencing_data_by_one, extract_features_from_windows

In [14]:
# Settings handed to extract_features_from_windows.
sampling_rate = 100
band = (0.1, 3)  # presumably a (low, high) frequency band — TODO confirm units
preselected_features = 1

# NOTE(review): `seconds` and `normalize` are not referenced by any visible cell;
# kept in case later cells rely on them.
seconds = 6
normalize = False

# Turn the synthetic windows into one feature row per window.
syn_data_act_13_extracted_features = extract_features_from_windows(
    syn_data_act_13_windows,
    column_names,
    preselected_features,
    sampling_rate,
    band,
)

# All windows come from the act_13 file, so every row is labelled as activity 13.
syn_data_act_13_extracted_features["activity"] = 13


Extracting features from window 0 to 10000 of 100000
Extracting features from window 10000 to 20000 of 100000
Extracting features from window 20000 to 30000 of 100000
Extracting features from window 30000 to 40000 of 100000
Extracting features from window 40000 to 50000 of 100000
Extracting features from window 50000 to 60000 of 100000
Extracting features from window 60000 to 70000 of 100000
Extracting features from window 70000 to 80000 of 100000
Extracting features from window 80000 to 90000 of 100000
Extracting features from window 90000 to 100000 of 100000
Feature extraction completed! A dataframe of features was returned.


In [15]:
# Preview the first two rows of the extracted synthetic features (label column included).
syn_data_act_13_extracted_features.head(2)

Unnamed: 0,mean_x-accel,peak_x-accel,abs_dev_x-accel,std_x-accel,mean_y-accel,peak_y-accel,abs_dev_y-accel,std_y-accel,mean_z-accel,peak_z-accel,...,peak_y-gyro,abs_dev_y-gyro,std_y-gyro,mean_z-gyro,peak_z-gyro,abs_dev_z-gyro,std_z-gyro,resultant_accel,resultant_gyro,activity
0,0.043508,151.176471,0.002207,0.01619,0.922114,182.413793,0.002428,0.016715,0.8215,154.285714,...,160.9375,0.001615,0.010628,0.27978,138.684211,0.00158,0.009915,0.043508,0.39065,13
1,0.043508,138.421053,0.002207,0.016189,0.922114,161.818182,0.002427,0.016715,0.8215,205.384615,...,193.333333,0.001615,0.010627,0.279779,149.428571,0.00158,0.009914,0.043508,0.39065,13


# Testing synthetic data

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

from utilities.synthetic_data_evaluation import report_results, discriminative_score_metrics, predictive_score_metrics, tSNE_Analysis

# Display names of the six classes; the trailing comment on each entry is the
# numeric activity id it stands for.
activities = [
    "lying",        # id 1
    "sitting",      # id 2
    "standing",     # id 3
    "running",      # id 5
    "asc stairs",   # id 12
    "desc stairs",  # id 13
]

# Only the two stair classes (ids 12 and 13), taken from the tail of the full list.
activities_12_13 = activities[-2:]

## 1. Build a classifier with original data

__Train on Real__

In [None]:
# Baseline classifier trained only on the real training features.
rf_model_balanced = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight="balanced",
    random_state=42,
)

# Features are every column except the label; the label is the "activity" column.
rf_model_balanced.fit(
    ori_data_train_features.drop(columns="activity"),
    ori_data_train_features["activity"],
)

__Test on Real__

In [82]:
# Path handed to report_results for the real-on-real evaluation.
save_path = "data/synthetic/real_on_real_balanced"

# Predict on the real test features with the label column removed.
y_hat_real = rf_model_balanced.predict(ori_data_test_features.drop(columns="activity"))

# NOTE(review): cmatrix / creport / acc are reassigned by the later combined-model
# evaluation, so which result they hold depends on the order cells were run.
cmatrix, creport, acc = report_results(
    ori_data_test_features["activity"],
    y_hat_real,
    activities,
    save_path,
)

In [83]:
# Call the report object returned by the most recent report_results(...) call to
# display the per-class precision / recall / f1-score table.
creport()

Unnamed: 0,precision,recall,f1-score,support
lying,0.821,0.972,0.89,23565.0
sitting,0.88,0.667,0.759,22323.0
standing,0.781,0.71,0.744,24560.0
running,1.0,0.916,0.956,15932.0
asc stairs,0.914,0.883,0.898,11083.0
desc stairs,0.602,0.934,0.732,9055.0
Accuracy,0.827,0.827,0.827,0.827
Macro avg,0.833,0.847,0.83,106518.0
Weighted avg,0.842,0.827,0.826,106518.0


__Train on real + synthetic__

In [17]:
# Combine datasets
combined_data = pd.concat([ori_data_train_features, syn_data_act_13_extracted_features]).reset_index(drop=True)
combined_data = combined_data.sample(frac = 1)
combined_X_train = combined_data.drop("activity", axis=1)
combined_y_train = combined_data["activity"]

rf_model_combined = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, class_weight="balanced")

rf_model_combined.fit(combined_X_train, combined_y_train)



__Test on real__

In [19]:
# Path handed to report_results for the combined-model evaluation.
# NOTE(review): "tsgmn" differs from the "tsgm" spelling used in the synthetic data
# file name — confirm whether this is intentional before renaming.
save_path = "data/synthetic/timegan_tsgmn_100epochs_13_combined_on_real_balanced"

# Predict on the real test features (label column removed) with the real + synthetic model.
y_hat_combined_real = rf_model_combined.predict(ori_data_test_features.drop(columns="activity"))

cmatrix, creport, acc = report_results(
    ori_data_test_features["activity"],
    y_hat_combined_real,
    activities,
    save_path,
)

In [20]:
# Call the report object returned by the most recent report_results(...) call to
# display the per-class precision / recall / f1-score table.
creport()

Unnamed: 0,precision,recall,f1-score,support
lying,0.763,0.967,0.853,23565.0
sitting,0.827,0.612,0.703,22323.0
standing,0.804,0.843,0.823,24560.0
running,1.0,0.918,0.957,15932.0
asc stairs,0.924,0.852,0.887,11083.0
desc stairs,0.834,0.881,0.857,9055.0
Accuracy,0.837,0.837,0.837,0.837
Macro avg,0.859,0.845,0.847,106518.0
Weighted avg,0.844,0.837,0.834,106518.0


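The "desc stairs" activity improved after adding the synthetic data: its F1-score rose from 0.732 to 0.857 (+12.5 percentage points). Precision rose from 0.602 to 0.834, while recall fell from 0.934 to 0.881.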