In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

LOADING DATA

In [None]:
def load_files(relative_folder_path: str, sep=' ', header=None,
               base_path: str = "/content/drive/MyDrive"):
    """Recursively read every file below a folder into pandas DataFrames.

    Args:
        relative_folder_path: Folder to scan, relative to ``base_path``.
        sep: Column separator forwarded to ``pd.read_csv``.
        header: ``header`` argument forwarded to ``pd.read_csv``.
        base_path: Root directory that ``relative_folder_path`` is resolved
            against. Dictionary keys are file paths relative to this root.

    Returns:
        dict mapping each file's path (relative to ``base_path``) to its
        DataFrame. Files are visited in sorted order so the dictionary order,
        and anything built by iterating over it, is the same on every run.
        Files that fail to parse are reported and skipped.
    """
    target_path = os.path.join(base_path, relative_folder_path)
    data_dict = {}

    # os.walk yields nothing for a missing folder; say so explicitly instead of
    # quietly returning an empty dict (e.g. when Drive is not mounted).
    if not os.path.isdir(target_path):
        print(f'Folder not found: {target_path}')
        return data_dict

    for root, dirs, files in os.walk(target_path):
        dirs.sort()  # in-place sort fixes the order in which sub-folders are descended
        for file in sorted(files):
            full_path = os.path.join(root, file)
            rel_key = os.path.relpath(full_path, base_path)
            try:
                df = pd.read_csv(full_path, sep=sep, header=header)
            except Exception as e:
                # Best effort: report the unreadable file and keep loading the rest.
                print(f'Error loading {rel_key}: {e}')
                continue
            data_dict[rel_key] = df
            print(f'Loaded: {rel_key}')

    return data_dict

In [None]:
# Load every file under the PAMAP2 "Protocol" folder; keys are file paths
# relative to the Drive root used by load_files.
pamap2_data = load_files('PAMAP2_Dataset/Protocol')

Loaded: PAMAP2_Dataset/Protocol/subject101.dat
Loaded: PAMAP2_Dataset/Protocol/subject102.dat
Loaded: PAMAP2_Dataset/Protocol/subject103.dat
Loaded: PAMAP2_Dataset/Protocol/subject104.dat
Loaded: PAMAP2_Dataset/Protocol/subject105.dat
Loaded: PAMAP2_Dataset/Protocol/subject106.dat
Loaded: PAMAP2_Dataset/Protocol/subject107.dat
Loaded: PAMAP2_Dataset/Protocol/subject108.dat
Loaded: PAMAP2_Dataset/Protocol/subject109.dat


RENAMING COLUMNS

In [None]:
def rename_pamap2_columns(df):
    """Assign descriptive names to the 54 positional columns of a PAMAP2 frame.

    The layout is three general columns followed by one 17-column group per IMU
    (hand, chest, ankle): temperature, two 3-axis accelerometers, a 3-axis
    gyroscope, a 3-axis magnetometer and a 4-component orientation.

    The frame is modified in place and also returned. pandas raises a
    ValueError if the frame does not have exactly 54 columns.
    """
    axes = ("x", "y", "z")
    triaxial_sensors = ("acc_16g", "acc_6g", "gyro", "mag")

    names = ["timestamp", "activity_id", "heart_rate"]
    for location in ("hand", "chest", "ankle"):
        names.append(f"{location}_temp")
        for sensor in triaxial_sensors:
            names.extend(f"{location}_{sensor}_{axis}" for axis in axes)
        # Orientation has a leading "w" component in addition to x/y/z.
        names.extend(f"{location}_ori_{part}" for part in ("w",) + axes)

    df.columns = names
    return df

In [None]:
# Apply the descriptive column names to every loaded frame.
# Only existing keys are reassigned, so iterating over items() is safe here.
for file, frame in pamap2_data.items():
    pamap2_data[file] = rename_pamap2_columns(frame)

DROPPING UNWANTED COLUMNS

In [None]:
# Columns removed from every frame. What remains is activity_id plus the
# acc_6g and gyro channels of each IMU. Each label appears exactly once.
drop_cols = [
    # General
    'timestamp', 'heart_rate',
    # Temperatures
    "hand_temp", "chest_temp", "ankle_temp",
    # acc_16g accelerometer channels
    "hand_acc_16g_x", "hand_acc_16g_y", "hand_acc_16g_z",
    "chest_acc_16g_x", "chest_acc_16g_y", "chest_acc_16g_z",
    "ankle_acc_16g_x", "ankle_acc_16g_y", "ankle_acc_16g_z",
    # Magnetometers
    "hand_mag_x", "hand_mag_y", "hand_mag_z",
    "chest_mag_x", "chest_mag_y", "chest_mag_z",
    "ankle_mag_x", "ankle_mag_y", "ankle_mag_z",
    # Orientation
    "hand_ori_w", "hand_ori_x", "hand_ori_y", "hand_ori_z",
    "chest_ori_w", "chest_ori_x", "chest_ori_y", "chest_ori_z",
    "ankle_ori_w", "ankle_ori_x", "ankle_ori_y", "ankle_ori_z",
]
assert len(drop_cols) == len(set(drop_cols)), "drop_cols contains duplicate labels"

# drop() raises KeyError if a label is missing, so re-running this cell on
# already-trimmed frames fails loudly instead of silently doing nothing.
for filename in pamap2_data:
    pamap2_data[filename] = pamap2_data[filename].drop(columns=drop_cols)

In [None]:
# Use subject 101 as the sample frame for the exploratory checks that follow.
df=pamap2_data['PAMAP2_Dataset/Protocol/subject101.dat']

In [None]:
# Preview the first rows of the sample frame after renaming and column removal.
df.head()

Unnamed: 0,activity_id,hand_acc_6g_x,hand_acc_6g_y,hand_acc_6g_z,hand_gyro_x,hand_gyro_y,hand_gyro_z,chest_acc_6g_x,chest_acc_6g_y,chest_acc_6g_z,chest_gyro_x,chest_gyro_y,chest_gyro_z,ankle_acc_6g_x,ankle_acc_6g_y,ankle_acc_6g_z,ankle_gyro_x,ankle_gyro_y,ankle_gyro_z
0,0,2.43954,8.76165,3.35465,-0.092217,0.056812,-0.015845,0.265304,9.81549,-1.41344,-0.005065,-0.006781,-0.005663,9.64689,-1.55576,0.310404,0.0083,0.00925,-0.01758
1,0,2.39494,8.55081,3.64207,-0.024413,0.047759,0.006474,0.234939,9.78539,-1.42846,0.013685,0.001486,-0.041522,9.6167,-1.6163,0.280488,-0.006577,-0.004638,0.000368
2,0,2.30514,8.53644,3.7328,-0.057976,0.032574,-0.006988,0.17385,9.72528,-1.51894,-0.039923,0.034056,-0.002113,9.63173,-1.58605,0.280311,0.003014,0.000148,0.022495
3,0,2.33528,8.53622,3.73277,-0.002352,0.03281,-0.003747,0.157969,9.64994,-1.57952,0.007513,-0.010498,-0.020684,9.63197,-1.63135,0.340997,0.003175,-0.020301,0.011275
4,0,2.23055,8.59741,3.76295,0.012269,0.018305,-0.053325,0.233506,9.57411,-1.44418,-0.003822,-0.011217,-0.025975,9.64699,-1.64647,0.340965,0.012698,-0.014303,-0.002823


In [None]:
# Number of missing values in each column of the sample frame.
print(df.isna().sum())

activity_id          0
hand_acc_6g_x     1454
hand_acc_6g_y     1454
hand_acc_6g_z     1454
hand_gyro_x       1454
hand_gyro_y       1454
hand_gyro_z       1454
chest_acc_6g_x     509
chest_acc_6g_y     509
chest_acc_6g_z     509
chest_gyro_x       509
chest_gyro_y       509
chest_gyro_z       509
ankle_acc_6g_x    1327
ankle_acc_6g_y    1327
ankle_acc_6g_z    1327
ankle_gyro_x      1327
ankle_gyro_y      1327
ankle_gyro_z      1327
dtype: int64


In [None]:
# Share of missing values per column, expressed as a percentage of all rows.
missing_percentage = (df.isna().sum() / len(df)) * 100
print(missing_percentage)

activity_id       0.000000
hand_acc_6g_x     0.386274
hand_acc_6g_y     0.386274
hand_acc_6g_z     0.386274
hand_gyro_x       0.386274
hand_gyro_y       0.386274
hand_gyro_z       0.386274
chest_acc_6g_x    0.135222
chest_acc_6g_y    0.135222
chest_acc_6g_z    0.135222
chest_gyro_x      0.135222
chest_gyro_y      0.135222
chest_gyro_z      0.135222
ankle_acc_6g_x    0.352535
ankle_acc_6g_y    0.352535
ankle_acc_6g_z    0.352535
ankle_gyro_x      0.352535
ankle_gyro_y      0.352535
ankle_gyro_z      0.352535
dtype: float64


In [None]:
# Trial run on the sample frame: drop every row that has at least one NaN and
# renumber the index, then compare shapes to see how many rows are lost.
df_cleaned = df.dropna().reset_index(drop=True)
print(f"Original shape: {df.shape}, After dropping NaNs: {df_cleaned.shape}")

Original shape: (376417, 19), After dropping NaNs: (373161, 19)


In [None]:
# Report missing-value statistics for every loaded subject file
for file_name, df in pamap2_data.items():
    # Count NaNs per column once and derive both statistics from it
    nan_counts = df.isna().sum()
    total_missing = nan_counts.sum()
    missing_percentage = (nan_counts / len(df)) * 100
    nonzero_missing = missing_percentage[missing_percentage > 0]

    print(f"\n {file_name}")
    print(f"➤ Total rows: {len(df)} | Total NaNs: {total_missing}")
    print("➤ Missing percentage per column (only showing > 0%):")
    print(nonzero_missing.round(3))  # Rounded to 3 decimals for readability



 PAMAP2_Dataset/Protocol/subject101.dat
➤ Total rows: 376417 | Total NaNs: 19740
➤ Missing percentage per column (only showing > 0%):
hand_acc_6g_x     0.386
hand_acc_6g_y     0.386
hand_acc_6g_z     0.386
hand_gyro_x       0.386
hand_gyro_y       0.386
hand_gyro_z       0.386
chest_acc_6g_x    0.135
chest_acc_6g_y    0.135
chest_acc_6g_z    0.135
chest_gyro_x      0.135
chest_gyro_y      0.135
chest_gyro_z      0.135
ankle_acc_6g_x    0.353
ankle_acc_6g_y    0.353
ankle_acc_6g_z    0.353
ankle_gyro_x      0.353
ankle_gyro_y      0.353
ankle_gyro_z      0.353
dtype: float64

 PAMAP2_Dataset/Protocol/subject102.dat
➤ Total rows: 447000 | Total NaNs: 33366
➤ Missing percentage per column (only showing > 0%):
hand_acc_6g_x     0.611
hand_acc_6g_y     0.611
hand_acc_6g_z     0.611
hand_gyro_x       0.611
hand_gyro_y       0.611
hand_gyro_z       0.611
chest_acc_6g_x    0.087
chest_acc_6g_y    0.087
chest_acc_6g_z    0.087
chest_gyro_x      0.087
chest_gyro_y      0.087
chest_gyro_z      0

In [None]:
# Drop every row that contains a NaN, one subject at a time, and report how
# many rows were removed.
# NOTE(review): removed rows leave gaps in the recording, and the later
# fixed-length windows are cut by row position, so a window may span a gap.
# Confirm this is acceptable.
for file_name, df in pamap2_data.items():
    before = len(df)
    cleaned = df.dropna().reset_index(drop=True)
    after = len(cleaned)
    pamap2_data[file_name] = cleaned  # existing key only, so the dict size is unchanged
    print(f"Cleaned {file_name}: {before - after} rows with NaNs removed.")


Cleaned PAMAP2_Dataset/Protocol/subject101.dat: 3256 rows with NaNs removed.
Cleaned PAMAP2_Dataset/Protocol/subject102.dat: 5432 rows with NaNs removed.
Cleaned PAMAP2_Dataset/Protocol/subject103.dat: 1203 rows with NaNs removed.
Cleaned PAMAP2_Dataset/Protocol/subject104.dat: 3456 rows with NaNs removed.
Cleaned PAMAP2_Dataset/Protocol/subject105.dat: 3687 rows with NaNs removed.
Cleaned PAMAP2_Dataset/Protocol/subject106.dat: 2647 rows with NaNs removed.
Cleaned PAMAP2_Dataset/Protocol/subject107.dat: 2728 rows with NaNs removed.
Cleaned PAMAP2_Dataset/Protocol/subject108.dat: 5193 rows with NaNs removed.
Cleaned PAMAP2_Dataset/Protocol/subject109.dat: 63 rows with NaNs removed.


In [None]:
# Confirm no NaNs exist now: sum the per-column NaN counts of every frame and
# print the total, which should be 0 for each file.
for file_name, df in pamap2_data.items():
    nan_count = df.isna().sum().sum()
    print(f"{file_name}: Remaining NaNs = {nan_count}")

PAMAP2_Dataset/Protocol/subject101.dat: Remaining NaNs = 0
PAMAP2_Dataset/Protocol/subject102.dat: Remaining NaNs = 0
PAMAP2_Dataset/Protocol/subject103.dat: Remaining NaNs = 0
PAMAP2_Dataset/Protocol/subject104.dat: Remaining NaNs = 0
PAMAP2_Dataset/Protocol/subject105.dat: Remaining NaNs = 0
PAMAP2_Dataset/Protocol/subject106.dat: Remaining NaNs = 0
PAMAP2_Dataset/Protocol/subject107.dat: Remaining NaNs = 0
PAMAP2_Dataset/Protocol/subject108.dat: Remaining NaNs = 0
PAMAP2_Dataset/Protocol/subject109.dat: Remaining NaNs = 0


WINDOWING AND DOWNSAMPLING [100 Hz to 50 Hz]

In [None]:
def process_pamap2_windows_kmax(df, cols, window_size=256, step_size=128 , k=128):
    """Cut a recording into fixed-length windows and k-max pool each channel.

    Windows are taken every ``step_size`` rows. A window is kept only when all
    of its ``activity_id`` values are identical. For each kept window the ``k``
    largest values of every channel are retained.

    Note: the retained values are sorted in ascending order of value, so the
    temporal order inside a window is NOT preserved. This is a pooling
    operation, not a resampling of the signal.

    Args:
        df: Frame with an ``activity_id`` column and the columns in ``cols``.
        cols: Channel column names; their order becomes the channel axis.
        window_size: Number of consecutive rows per window.
        step_size: Row offset between the starts of successive windows.
        k: Number of largest values kept per channel (1 <= k <= window_size).

    Returns:
        Tuple ``(X, y)``: ``X`` has shape ``(n_windows, len(cols), k)`` and
        ``y`` has shape ``(n_windows,)`` holding each window's activity id.
        With no valid window, ``X`` still has shape ``(0, len(cols), k)`` so
        results from several recordings can be concatenated.

    Raises:
        ValueError: If ``k`` is not between 1 and ``window_size``.
    """
    if not 1 <= k <= window_size:
        raise ValueError(
            f"k must be between 1 and window_size ({window_size}), got {k}"
        )

    # Extract the channel matrix once instead of re-slicing the frame per window.
    features = df[cols].to_numpy()
    labels = df['activity_id']

    X, y = [], []

    # "+ 1" keeps the last full window, which starts at len(df) - window_size.
    for start in range(0, len(df) - window_size + 1, step_size):
        stop = start + window_size
        label_window = labels.iloc[start:stop]

        # Skip windows that straddle an activity transition.
        if label_window.nunique() != 1:
            continue

        window_array = features[start:stop].T  # (channels, window_size)

        # K-Max Pooling across time dimension (axis=1): partition moves the k
        # largest values to the end, then they are sorted ascending.
        top_k = np.sort(np.partition(window_array, -k, axis=1)[:, -k:], axis=1)
        X.append(top_k)
        y.append(label_window.iloc[0])

    if not X:
        empty_X = np.empty((0, len(cols), k))
        empty_y = np.empty((0,), dtype=labels.to_numpy().dtype)
        return empty_X, empty_y

    return np.array(X), np.array(y)

In [None]:
# acc_6g accelerometer and gyroscope channels (x, y, z) of all three IMUs,
# ordered hand -> chest -> ankle, accelerometer before gyroscope.
sensor_cols = [
    f"{location}_{sensor}_{axis}"
    for location in ("hand", "chest", "ankle")
    for sensor in ("acc_6g", "gyro")
    for axis in ("x", "y", "z")
]


In [None]:
# X_pamap, y_pamap = [], []

# for file, df in pamap2_data.items():
#     X, y = process_pamap2_windows_kmax(
#         df=df,
#         cols=sensor_cols,
#         window_size=256,
#         step_size=128,
#         k=128
#     )
#     X_pamap.append(X)
#     y_pamap.append(y)


In [None]:
# pamap2_data['PAMAP2_Dataset/Protocol/subject103.dat'].columns.tolist()

In [None]:
# print(f"Total files processed: {len(X_pamap)}")


In [None]:
# for i, (x, y) in enumerate(zip(X_pamap, y_pamap)):
#     print(f"Subject {i+1}: {x.shape[0]} windows | {y.shape[0]} labels")


In [None]:
# for i, (X, y) in enumerate(zip(X_pamap, y_pamap), start=101):
#     print(f"Subject {i}: X shape = {X.shape}, y shape = {y.shape}")


Getting only HAND SENSOR DATA:

In [None]:
# Hand IMU only: acc_6g accelerometer x/y/z followed by gyroscope x/y/z.
hand_sensor_cols = [
    f"hand_{sensor}_{axis}"
    for sensor in ("acc_6g", "gyro")
    for axis in ("x", "y", "z")
]


In [None]:
# Window every subject's hand-IMU channels; one (X, y) pair is collected per file.
X_hand, y_hand = [], []

for file, df in pamap2_data.items():
    X, y = process_pamap2_windows_kmax(
        df=df,
        cols=hand_sensor_cols,
        window_size=256,   # 256 rows per window (2.56 seconds at 100 Hz)
        step_size=128,     # 50% overlap
        k=128              # keep the 128 largest values per channel, sorted by value (not a time-ordered 50 Hz resampling)
    )
    X_hand.append(X)
    y_hand.append(y)


In [None]:
# Stack the per-subject arrays along the window axis into one dataset.
X_hand_all = np.concatenate(X_hand, axis=0)
y_hand_all = np.concatenate(y_hand, axis=0)

In [None]:
# Shape is (windows, channels, values kept per channel).
X_hand_all.shape

(21841, 6, 128)

EXTRACTING COMMON ACTIVITIES AS IN UCI HAR DATASET

In [None]:
# PAMAP2 activity ids to keep.
# NOTE(review): presumably lying, sitting, standing, walking, ascending stairs
# and descending stairs per the PAMAP2 readme - verify against the dataset docs.
common_pamap_activities = [1, 2, 3, 4, 12, 13]

# Boolean mask selecting the windows whose label is one of the kept ids.
mask = np.isin(y_hand_all , common_pamap_activities)

X_hand_common = X_hand_all[mask]
y_hand_common = y_hand_all[mask]

print(f"Filtered X shape: {X_hand_common.shape}")
print(f"Filtered y shape: {y_hand_common.shape}")
print("Unique activities in filtered labels:", np.unique(y_hand_common))


Filtered X shape: (7823, 6, 128)
Filtered y shape: (7823,)
Unique activities in filtered labels: [ 1  2  3  4 12 13]


In [None]:
# Persist the filtered hand-IMU windows and labels; the relative path means the
# file is written to the current working directory.
np.savez_compressed('pamap2_hand_common.npz', X=X_hand_common, y=y_hand_common)

UCI DATASET

In [None]:
import numpy as np
import os

# Base UCI HAR dataset directory
base_dir = "/content/drive/MyDrive/UCI HAR Dataset"

# 6 relevant inertial signal files (file-name stems; load_uci_split appends
# "_<split>.txt"). The order - accelerometer x/y/z then gyroscope x/y/z -
# matches the channel order of hand_sensor_cols.
sensor_files = [
    "body_acc_x", "body_acc_y", "body_acc_z",
    "body_gyro_x", "body_gyro_y", "body_gyro_z"
]

def load_uci_split(split="train"):
    """Load the inertial-signal windows and labels of one UCI HAR split.

    Reads one text file per entry of the module-level ``sensor_files`` from
    ``<base_dir>/<split>/Inertial Signals`` and stacks them on a new axis 1,
    so channels follow the order of ``sensor_files``. Labels come from
    ``<base_dir>/<split>/y_<split>.txt`` and are cast to int.

    Args:
        split: Name of the split sub-folder, e.g. "train" or "test".

    Returns:
        Tuple ``(X_split, y_split)`` of the stacked signals and integer labels.
    """
    signals_dir = os.path.join(base_dir, split, "Inertial Signals")

    # One 2-D array per sensor file (rows are windows, columns are samples).
    per_channel = [
        np.loadtxt(os.path.join(signals_dir, f"{sensor}_{split}.txt"))
        for sensor in sensor_files
    ]
    X_split = np.stack(per_channel, axis=1)

    labels_file = os.path.join(base_dir, split, f"y_{split}.txt")
    y_split = np.loadtxt(labels_file).astype(int)

    return X_split, y_split

# Load separately, so the test split stays apart from the training data
X_uci_train, y_uci_train = load_uci_split("train")
X_uci_test, y_uci_test = load_uci_split("test")

# Check shapes: X is (windows, channels, samples), y is (windows,)
print("Loaded UCI HAR train and test separately:")
print(f"X_uci_train: {X_uci_train.shape}, y_uci_train: {y_uci_train.shape}")
print(f"X_uci_test: {X_uci_test.shape}, y_uci_test: {y_uci_test.shape}")


✅ Loaded UCI HAR train and test separately:
X_uci_train: (7352, 6, 128), y_uci_train: (7352,)
X_uci_test: (2947, 6, 128), y_uci_test: (2947,)


In [None]:
# PAMAP2 and UCI HAR number their activities differently, so the PAMAP2 ids
# must be translated into the UCI HAR label space (1..6) before merging.
# Otherwise the same integer would denote different activities in the two
# halves of the combined label vector.
PAMAP2_TO_UCI_LABEL = {
    1: 6,   # lying             -> LAYING
    2: 4,   # sitting           -> SITTING
    3: 5,   # standing          -> STANDING
    4: 1,   # walking           -> WALKING
    12: 2,  # ascending stairs  -> WALKING_UPSTAIRS
    13: 3,  # descending stairs -> WALKING_DOWNSTAIRS
}
# A KeyError here means y_hand_common holds an id outside the common set.
y_hand_common_uci = np.array(
    [PAMAP2_TO_UCI_LABEL[int(label)] for label in y_hand_common],
    dtype=y_uci_train.dtype,
)

# NOTE(review): the PAMAP2 accelerometer preview shows values near 9.8, which
# looks like m/s^2 with gravity included, while the UCI "body_acc" signals are
# presumably in g with gravity removed. Confirm whether the channels need
# rescaling before the two sources are mixed.

# Combine only training-related data
X_train_combined = np.concatenate([X_hand_common, X_uci_train], axis=0)
y_train_combined = np.concatenate([y_hand_common_uci, y_uci_train], axis=0)

print(f"Training X: {X_train_combined.shape}, y: {y_train_combined.shape}")

Training X: (15175, 6, 128), y: (15175,)


In [None]:
# Display the combined array (numpy abbreviates large arrays in the repr).
X_train_combined

array([[[-8.925340e-01, -8.232470e-01, -8.219690e-01, ...,
          3.123890e+00,  3.136310e+00,  3.195610e+00],
        [ 7.831890e+00,  7.831960e+00,  7.871280e+00, ...,
          1.137660e+01,  1.161770e+01,  1.204210e+01],
        [ 5.245460e+00,  5.274990e+00,  5.279320e+00, ...,
          8.131870e+00,  8.670690e+00,  8.779140e+00],
        [ 1.353280e-01,  1.379200e-01,  1.552890e-01, ...,
          1.382760e+00,  1.515830e+00,  1.564260e+00],
        [ 4.837670e-01,  4.946680e-01,  4.955100e-01, ...,
          1.255800e+00,  1.265290e+00,  1.287300e+00],
        [ 9.688590e-02,  9.792640e-02,  1.006640e-01, ...,
          6.478730e-01,  6.620720e-01,  7.072560e-01]],

       [[-1.468400e+00, -1.442720e+00, -1.436830e+00, ...,
          1.337900e-01,  5.242550e-01,  1.226890e+00],
        [ 9.229330e+00,  9.241510e+00,  9.278290e+00, ...,
          1.237800e+01,  1.242410e+01,  1.253970e+01],
        [ 2.858190e+00,  2.859190e+00,  2.873260e+00, ...,
          6.309580e+00,  6.

In [None]:
# Persist the combined (not yet standardized) training set to the current
# working directory.
np.savez_compressed('uci_pamap2_combined_dataset.npz', X=X_train_combined, y=y_train_combined)

In [None]:
# Per-channel summary statistics of the combined data, reducing over the
# window axis (0) and the within-window axis (2).
for stat_label, stat_fn in (
    ("Mean per channel:", np.mean),
    ("Std deviation per channel:", np.std),
    ("Min per channel:", np.min),
    ("Max per channel:", np.max),
):
    print(stat_label, stat_fn(X_train_combined, axis=(0, 2)))


🧪 Mean per channel: [-1.74520835  2.04312193  2.42211643  0.30033552  0.20917903  0.36060468]
📈 Std deviation per channel: [4.287851   3.32645084 3.13031334 0.8413845  0.59277173 0.91471134]
🔍 Min per channel: [-15.1982    -9.70226   -8.43529   -4.733656  -5.97433   -2.763014]
🔍 Max per channel: [28.6948 54.4933 61.9234 21.8452 16.9171 10.5557]


In [None]:
# Standardize your training data: per-channel mean and std are computed over
# all windows and all within-window positions; keepdims=True keeps them
# broadcastable against the 3-D array.
mean = np.mean(X_train_combined, axis=(0, 2), keepdims=True)
std = np.std(X_train_combined, axis=(0, 2), keepdims=True)

# NOTE(review): a channel with zero variance would make this divide by zero.
X_train_standardized = (X_train_combined - mean) / std


In [None]:
# Same per-channel summary on the standardized data; mean should be ~0 and
# std ~1 for every channel.
for stat_label, stat_fn in (
    (" Mean per channel:", np.mean),
    (" Std deviation per channel:", np.std),
    (" Min per channel:", np.min),
    (" Max per channel:", np.max),
):
    print(stat_label, stat_fn(X_train_standardized, axis=(0, 2)))


🧪 Mean per channel: [-7.32491132e-16 -4.96598552e-15  2.28738866e-15 -3.01819708e-16
 -1.12593076e-15 -5.08445565e-16]
📈 Std deviation per channel: [1. 1. 1. 1. 1. 1.]
🔍 Min per channel: [ -3.13746714  -3.53090501  -3.46847273  -5.98298583 -10.43151806
  -3.41486822]
🔍 Max per channel: [ 7.09912922 15.76760955 19.0080919  25.60644341 28.18609601 11.14569691]


In [None]:
# Store the per-channel mean and std so the same scaling can be re-applied to
# other data later.
np.savez("/content/drive/MyDrive/normalization_stats.npz", mean=mean, std=std)


In [None]:
# Save the standardized combined training set with its labels. The arrays must
# match the file name: the standardized data, not the raw PAMAP2-only windows.
# numpy appends ".npz"; the file is written to the current working directory.
np.savez_compressed('X_train_standardized', X=X_train_standardized, y=y_train_combined)

In [None]:
import os

# Create the directory if it doesn't exist. It is derived from the save path
# so the folder that is created is always the folder that is written to.
processed_path = "/content/drive/MyDrive/processed/X_train_standardized.npz"
os.makedirs(os.path.dirname(processed_path), exist_ok=True)

# Now save the standardized data together with the statistics used to scale it
np.savez_compressed(
    processed_path,
    X=X_train_standardized,
    y=y_train_combined,
    mean=mean,
    std=std
)
