# Extracting Data from H-MOG Dataset

In [None]:
import os
import pandas as pd
from torch.utils.data import Dataset
import torch
import numpy as np
import pickle
    
directory = "C:/Users/rahul/PycharmProjects/siamese-triplet/hmog_dataset/public_dataset_test"

## Function to extract X, Y, Z readings from each CSV file of the H-MOG dataset

In [None]:
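# Extracting acc, gyro, and mag readings from CSV files. Columns 7, 8, 9 are described as containing the X, Y, Z readings.
# NOTE(review): in get_activity these columns are read from the Activity frame, whose column 8 is
# used elsewhere in this notebook as a session/task code - confirm what columns 7, 8, 9 hold.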
def get_activity(concatenated_rows, df_activity, df_TouchEvent, row_te, allin):
    """Append one merged row for a touch event to ``concatenated_rows``.

    The activity frame is scanned from its last row to its first. The first
    row whose column 3 is strictly smaller than column 0 of the touch event
    is paired with it (presumably "latest activity that started before the
    touch" - confirm against the dataset description). The merged row is laid
    out as: every touch-event column, then columns 7, 8, 9 of the matched
    activity row, then the values of the first row of ``allin``.

    If no activity row matches, nothing is appended.

    Args:
        concatenated_rows: List accumulating merged rows; mutated in place.
        df_activity: Activity frame with integer column labels.
        df_TouchEvent: Touch-event frame; only its column labels are used.
        row_te: One row (Series) of ``df_TouchEvent``.
        allin: Single-row DataFrame of sensor features.

    Returns:
        The same ``concatenated_rows`` list, for call-site convenience.
    """
    act_cols = [7, 8, 9]

    for _, row_act in df_activity[::-1].iterrows():
        if row_te[0] > row_act[3]:
            merged_row = [row_te[col] for col in df_TouchEvent.columns]
            merged_row.extend(row_act[col] for col in act_cols)
            # Store plain scalars. Indexing ``allin[col]`` would store
            # one-element Series objects, which later forces pandas to call
            # float() on a Series (deprecated) when the table is cast.
            merged_row.extend(allin.iloc[0].tolist())
            concatenated_rows.append(merged_row)
            break

    return concatenated_rows

## Function to extract 144 features for each touch event

In [None]:
# X_mean, X_std, X_min, X_max, Y_mean, Y_std, Y_min, Y_max, Z_mean, Z_std, Z_min, Z_max, M_mean, M_std, M_min, M_max
# 0 - 47 - acc; 48-95 - gyro; 96 - 143 - mag
# 0-15 before
# 16-31 - after
# 32 - 47 - difference

def _window_stats(samples):
    """Return mean/std/min/max of each axis column plus their per-sample RMS, flattened."""
    # 'mean_sqrt' is the root of the mean of the squared axis values, per sample.
    rms = (samples ** 2).mean(axis=1) ** 0.5
    # assign() builds a new frame, so no column is written onto a filtered slice.
    stats = samples.assign(mean_sqrt=rms).agg(["mean", "std", "min", "max"])
    # One row per column, one column per statistic, flattened row by row.
    return stats.transpose().to_numpy().flatten()


def get_sensor_data(row_te, df_acc, window=100):
    """Compute the 48 window statistics of one sensor around a touch event.

    Two windows are cut from ``df_acc`` using its column 0: the samples in
    ``[t - window, t]`` ("before") and those in ``[t, t + window]`` ("after"),
    where ``t`` is column 0 of ``row_te``. For each window, columns 3, 4, 5 and
    their per-sample root-mean-square are summarised by mean, std, min and max
    (16 values per window).

    Args:
        row_te: Touch-event row; only column 0 is read.
        df_acc: Sensor frame with integer column labels (any of the three
            sensors, despite the parameter name).
        window: Half-width of the window, in the same units as column 0.
            Defaults to 100, the value that was previously hard-coded.

    Returns:
        A list of 48 floats laid out as before (0-15), after (16-31) and
        before-minus-after (32-47), or ``None`` when either window has no
        samples. A window with a single sample yields NaN for its std values.
    """
    axis_cols = [3, 4, 5]
    t = row_te[0]
    timestamps = df_acc[0]

    before = df_acc.loc[(timestamps >= t - window) & (timestamps <= t), axis_cols]
    after = df_acc.loc[(timestamps >= t) & (timestamps <= t + window), axis_cols]

    if before.empty or after.empty:
        return None

    stats_before = _window_stats(before)
    stats_after = _window_stats(after)
    diff = stats_before - stats_after
    return stats_before.tolist() + stats_after.tolist() + diff.tolist()

## This cell goes through every folder and CSV file in the H-MOG dataset, extracts features for all touch events, and stores them as .pkl files

## We limit the number of touch events per CSV file to 300

In [None]:
import os
import pandas as pd
import time

# A folder is skipped when it already holds an X_300.pkl of at least this many bytes.
MIN_EXISTING_PKL_BYTES = 2048
# Maximum number of merged touch-event rows kept per session folder.
MAX_TOUCH_EVENTS = 300
# Values of Activity.csv column 8 (first row) selected for extraction:
# sessions related to 2, 8, 14, 20 (walking and reading).
SELECTED_SESSION_CODES = [2, 8, 14, 20]

for root, _, files in os.walk(directory):
    # Exact filename match: a substring match would also accept e.g.
    # "OldActivity.csv" and then fail when opening "Activity.csv".
    if "Activity.csv" not in files:
        continue

    # Only (re)process folders whose X_300.pkl is missing or very small.
    # A missing file used to raise FileNotFoundError from os.path.getsize.
    # NOTE(review): the check reads X_300.pkl but the result is written to
    # X_300_W_R.pkl below - confirm which filename is intended.
    existing_pkl = os.path.join(root, "X_300.pkl")
    if os.path.exists(existing_pkl) and os.path.getsize(existing_pkl) >= MIN_EXISTING_PKL_BYTES:
        continue

    filepath_act = os.path.join(root, "Activity.csv")
    df_activity = pd.read_csv(filepath_act, header=None)
    if df_activity.iloc[0, 8] not in SELECTED_SESSION_CODES:
        continue

    print(filepath_act)
    df_TouchEvent = pd.read_csv(os.path.join(root, "TouchEvent.csv"), header=None)
    df_acc = pd.read_csv(os.path.join(root, "Accelerometer.csv"), header=None)
    df_gyro = pd.read_csv(os.path.join(root, "Gyroscope.csv"), header=None)
    df_mag = pd.read_csv(os.path.join(root, "Magnetometer.csv"), header=None)

    concatenated_rows = []
    for _, row_te in df_TouchEvent.iterrows():
        acc = get_sensor_data(row_te, df_acc)
        gyro = get_sensor_data(row_te, df_gyro)
        mag = get_sensor_data(row_te, df_mag)

        # Skip events for which any sensor has an empty window.
        if acc is None or gyro is None or mag is None:
            continue

        # One row holding the 3 x 48 = 144 sensor features.
        allin = pd.DataFrame(acc + gyro + mag).transpose()
        # NaN appears e.g. when a window holds a single sample (std undefined).
        if allin.isnull().values.any():
            continue

        concatenated_rows = get_activity(concatenated_rows, df_activity, df_TouchEvent, row_te, allin)
        # Stop at exactly MAX_TOUCH_EVENTS rows (the old "> 300" test kept 301).
        if len(concatenated_rows) >= MAX_TOUCH_EVENTS:
            break

    df_C = pd.DataFrame(concatenated_rows)
    save_path = os.path.join(root, "X_300_W_R.pkl")
    df_C.to_pickle(save_path)
    print(df_C.shape)
    print(root)

    # Clearing memory before the next session folder is loaded.
    del df_activity
    del df_TouchEvent
    del df_acc
    del df_gyro
    del df_mag
    del df_C
    del concatenated_rows


## This code goes through all the folders, reads the .pkl files, and creates a large data frame X and the corresponding labels.

## For example, the first row of X contains the 144 features for user 0's first touch event.

## The second row of X contains the 144 features for user 0's second touch event.

## The 301st row of X contains the 144 features for user 1's first touch event.

## The outputs are stored as .npy files.

In [None]:
import pandas as pd
import os


def min_max_normalize(df):
    """Rescale every column of ``df`` to the range [0, 1].

    Each column is mapped with ``(value - column_min) / (column_max - column_min)``.
    A constant column (max == min) is mapped to 0 instead of producing
    NaN from a 0/0 division.

    Args:
        df: Numeric table with observations in rows and features in columns.

    Returns:
        A table of the same shape holding the rescaled values.
    """
    min_values = df.min(axis=0)  # Minimum along each column
    max_values = df.max(axis=0)  # Maximum along each column
    value_range = max_values - min_values
    # Adding the boolean mask turns a zero range into 1 and leaves every
    # other range untouched, so constant columns divide 0 by 1.
    value_range = value_range + (value_range == 0)
    return (df - min_values) / value_range

def standard_normalize(df):
    """Standardise every column of ``df`` to zero mean and unit spread.

    Each column is mapped with ``(value - column_mean) / column_std``. A
    column whose standard deviation is exactly 0 is mapped to 0 instead of
    producing NaN from a 0/0 division.

    Args:
        df: Numeric table with observations in rows and features in columns.

    Returns:
        A tuple ``(normalized, mean, std)``. ``mean`` and ``std`` are the
        per-column statistics exactly as computed from ``df``; ``std`` still
        contains 0 for constant columns.
    """
    mean = df.mean(axis=0)  # Mean along each column
    std = df.std(axis=0)  # Standard deviation along each column
    # Adding the boolean mask turns a zero std into 1 for the division only;
    # the std returned to the caller is left unmodified.
    safe_std = std + (std == 0)
    normalized = (df - mean) / safe_std
    return normalized, mean, std

def get_folder_names(directory):
    """Return the names of the immediate sub-directories of ``directory``.

    Only the top level is inspected; nested folders are not listed. An
    unreadable or missing ``directory`` yields an empty list.
    """
    # The first entry produced by os.walk describes the top-level directory.
    top_level = next(os.walk(directory), None)
    if top_level is None:
        return []
    return list(top_level[1])

def split_folders(folder_names, split_ratio):
    """Cut ``folder_names`` into a leading and a trailing part.

    The cut position is ``len(folder_names) * split_ratio`` truncated to an
    integer; the original order of the names is kept in both parts.

    Returns:
        A tuple ``(first_part, second_part)``.
    """
    cut = int(len(folder_names) * split_ratio)
    return folder_names[:cut], folder_names[cut:]


##############################################################################################
# Update this folder #########################################################################
directory = "C:/Users/rahul/PycharmProjects/siamese-triplet/hmog_dataset/public_dataset_test/"
###############################################################################################
folder_names = get_folder_names(directory)

# Split folders into two arrays (can adjust split_ratio as needed).
# With split_ratio=1 every folder lands in `training` and `testing` is empty.
training, testing = split_folders(folder_names, split_ratio=1)

# Column layout of the per-session tables: the sensor features start at
# column 14 and there are 3 sensors x 48 statistics = 144 of them.
FIRST_FEATURE_COL = 14
N_FEATURES = 144

# Collect every per-session feature table found below the training folders.
# NOTE(review): this cell reads "X_300.pkl" while the extraction cell writes
# "X_300_W_R.pkl" - confirm which filename is intended.
concatenated_rows = []
for folder in training:
    filepath = os.path.join(directory, folder)
    for root, _, files in os.walk(filepath):
        if "X_300.pkl" in files:
            filepath_act = os.path.join(root, "X_300.pkl")
            # read_pickle executes pickled code; only load files produced by
            # this notebook.
            X = pd.read_pickle(filepath_act)
            concatenated_rows.append(X)
            print(filepath)

# Stop with a clear error when nothing was loaded; previously the message was
# printed and the cell then crashed with a NameError on df_C.
if not concatenated_rows:
    raise FileNotFoundError("No 'X_300.pkl' files found in the directory.")

df_C = pd.concat(concatenated_rows, ignore_index=True)  # Concatenate DataFrames
print(df_C.shape)

df_C = df_C.astype(dtype='float64')

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Labels: column 2 divided by 10**9 and truncated to an integer, then encoded
# as consecutive class indices (presumably one class per user - confirm
# against the ID format of the dataset).
y = df_C.iloc[:, 2]
y = (y/10**9).astype(int)

encoder = LabelEncoder()
y = encoder.fit_transform(y)
y = pd.DataFrame(y)

# Exactly the 144 feature columns (the old slice 14:159 spanned 145 columns
# and only produced 144 because the table ends at column 157).
X = df_C.iloc[:, FIRST_FEATURE_COL:FIRST_FEATURE_COL + N_FEATURES]

X, mean, std = standard_normalize(X)

X_ori = X.to_numpy()
np.save('X_100_all.npy', X_ori)
y_ori = y.to_numpy()
np.save('y_100_all.npy', y_ori)