In [1]:
# Cell 1: Imports and Setup (Updated channel names)
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import re

# Base paths configuration
# The dataset root defaults to the original location but can be overridden with
# the NEWHANDPD_ROOT environment variable, so the notebook is not tied to one
# machine's directory layout. With the default root on Windows the resulting
# paths are identical to the previously hard-coded ones.
data_root = os.environ.get("NEWHANDPD_ROOT", r"C:\NewHandPD")

# Category name -> folder holding that category's signal files
base_paths = {
    "healthy": os.path.join(data_root, "Healthy Signals", "Signal"),
    "patient": os.path.join(data_root, "Patient Signals", "Signal")
}

# Category name -> integer class label attached to every record of the category
category_labels = {
    "healthy": 0,
    "patient": 1
}

# Updated channel names (excluding Microphone)
# Order matters: position i names column i of the signal arrays kept by the parser.
channel_names = [
    "Fingergrip", "Axial_Pressure",
    "Tilt_X", "Tilt_Y", "Tilt_Z"
]

In [2]:
# Cell 2: Enhanced File Parsing Function with Metadata Extraction
def _split_meta_line(line):
    """Split a metadata line of the form ``#<Key>value...`` into (key, raw value)."""
    tag_end = line.find(">")
    key = line[2:tag_end]  # text between "#<" and the first ">"
    rest = line[tag_end + 1:]

    closing = rest.find("<")
    if closing != -1:
        # "#<Key>value</Key>" style: the value stops where the closing tag starts
        return key, rest[:closing]
    # "#<Key>value>" style: only the trailing ">" has to be dropped
    return key, rest[:-1]


def _coerce_meta_value(value):
    """Convert a raw metadata string to int, float, bool or None where it clearly is one."""
    try:
        if value.isdigit():
            return int(value)
        if value.replace('.', '', 1).isdigit():
            return float(value)
    except ValueError:
        # str.isdigit() accepts characters (e.g. superscript digits) that
        # int()/float() reject; keep the raw text rather than dropping the field.
        return value

    lowered = value.lower()
    if lowered == 'true':
        return True
    if lowered == 'false':
        return False
    if value == '':
        return None
    return value


def parse_signal_file(file_path, label):
    """Read one signal text file and return its samples, label and metadata.

    The file may start with a metadata section delimited by the lines
    ``#<meta>`` and ``#</meta>``. Inside it, every line that starts with ``#<``
    and ends with ``>`` is read as one key/value pair. All lines after
    ``#</meta>`` (or the whole file when no such line exists) are parsed as
    tab-separated numbers.

    Parameters
    ----------
    file_path : str
        Path of the UTF-8 encoded text file.
    label
        Class label of the file; returned unchanged.

    Returns
    -------
    signal_array : numpy.ndarray
        2-D array containing columns 1-5 of the numeric data; column 0
        (Microphone) is dropped. A file with a single data row yields one row.
    label
        The ``label`` argument.
    meta_info : dict
        Metadata key -> value, with values converted to ``int``, ``float``,
        ``bool`` or ``None`` (empty value) where possible, otherwise ``str``.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    meta_info = {}
    signal_start_idx = 0
    in_meta_section = False

    # Extract metadata
    for i, raw_line in enumerate(lines):
        line = raw_line.strip()

        if line == "#<meta>":
            in_meta_section = True
            continue
        if line == "#</meta>":
            # Signal samples begin on the line right after the closing marker
            signal_start_idx = i + 1
            break

        if in_meta_section and line.startswith("#<") and line.endswith(">"):
            key, raw_value = _split_meta_line(line)
            meta_info[key] = _coerce_meta_value(raw_value)

    # Load signal data; np.loadtxt returns a 1-D array for a single row, so
    # promote to 2-D before slicing to treat both cases uniformly.
    signal_array = np.atleast_2d(np.loadtxt(lines[signal_start_idx:], delimiter="\t"))

    # Keep only channels 1-5 (drop channel 0 - Microphone)
    signal_array = signal_array[:, 1:6]

    return signal_array, label, meta_info

In [3]:
# Cell 3: Main Processing (Now includes metadata in records)
# Parses every .txt file of each category into a record dict and collects the
# records in three lists: all files, files named "sigMea*", files named "sigSp*".
all_records = []
sigMea_records = []
sigSp_records = []

for category, folder_path in base_paths.items():
    label = category_labels[category]
    print(f"Processing {category} files...")

    # os.listdir returns entries in arbitrary order; sorting makes the record
    # order (and every DataFrame built from it) reproducible across runs.
    for file_name in tqdm(sorted(os.listdir(folder_path))):
        if not file_name.endswith(".txt"):
            continue

        file_path = os.path.join(folder_path, file_name)
        try:
            signal_data, label_val, meta_info = parse_signal_file(file_path, label)

            record = {
                "file_name": file_name,
                "label": label_val,
                "signal": signal_data,
                **meta_info  # Unpack all metadata into the record
            }

            all_records.append(record)

            # Filter for specific file prefixes
            if file_name.startswith("sigMea"):
                sigMea_records.append(record)
            elif file_name.startswith("sigSp"):
                sigSp_records.append(record)

        except Exception as e:
            # Best effort: a file that cannot be parsed is reported and skipped
            # so one bad file does not abort the whole run.
            print(f"Error parsing {file_name}: {e}")

Processing healthy files...


100%|██████████| 421/421 [00:14<00:00, 29.44it/s]


Processing patient files...


100%|██████████| 373/373 [00:20<00:00, 18.44it/s]


In [4]:
# Cell 4: Create DataFrames (Now includes metadata columns)
# One DataFrame per record list; each record dict becomes one row.
df_all, df_sigMea, df_sigSp = (
    pd.DataFrame(record_list)
    for record_list in (all_records, sigMea_records, sigSp_records)
)

print("\nSummary:")
print(f"Total files processed: {len(all_records)}")
print(f"Files starting with 'sigMea': {len(sigMea_records)}")
print(f"Files starting with 'sigSp': {len(sigSp_records)}")

# Show metadata columns we've extracted: everything except the fixed record keys
fixed_record_keys = ('file_name', 'label', 'signal')
print("\nMetadata columns found:")
print([col for col in df_all.columns if col not in fixed_record_keys])


Summary:
Total files processed: 792
Files starting with 'sigMea': 264
Files starting with 'sigSp': 264

Metadata columns found:
['Person_ID_Number', 'Age', 'Gender', 'Writing_Hand', 'Weight', 'Height', 'Smoker', 'Notice', 'Object', 'Object_Index', 'Pen', 'Samplerate', 'Time', 'Date', 'Comment', 'Surename', 'Forename']


In [5]:
# Cell 5: Combine sigMea and sigSp DataFrames
# Rows of df_sigMea come first, followed by df_sigSp, under a fresh 0..n-1 index.
df_combined = pd.concat(
    (df_sigMea, df_sigSp),
    ignore_index=True,
)
print(f"Combined DataFrame shape: {df_combined.shape}")

Combined DataFrame shape: (528, 20)


In [6]:
# Cell 6: Simplified Feature extraction helpers
def compute_basic_stats(signal):
    """Return descriptive statistics computed over all values of ``signal``.

    The result is a dict with the keys ``mean``, ``std``, ``min``, ``max``,
    ``range`` (max minus min) and ``median``, in that order. No entropy or
    zero-crossing features are included.
    """
    lowest = np.min(signal)
    highest = np.max(signal)

    stats = {}
    stats["mean"] = np.mean(signal)
    stats["std"] = np.std(signal)
    stats["min"] = lowest
    stats["max"] = highest
    stats["range"] = highest - lowest
    stats["median"] = np.median(signal)
    return stats

def compute_derivatives(signal, order=1, sampling_rate=1000):
    """Numerically differentiate ``signal`` ``order`` times.

    Each pass applies ``np.gradient`` with a sample spacing of
    ``1 / sampling_rate``. The input array is not modified; with ``order=0``
    a copy of it is returned.
    """
    sample_spacing = 1 / sampling_rate
    result = signal.copy()
    passes_done = 0
    for passes_done in range(order):
        result = np.gradient(result, sample_spacing)
    return result

def compute_mass(signal):
    """Return the "motion mass" of ``signal``: the sum of its absolute values."""
    magnitudes = np.absolute(signal)
    return np.sum(magnitudes)

def extract_features_from_window(window_data, sampling_rate=1000):
    """Build the feature dict for one window of multi-channel samples.

    For every entry of the module-level ``channel_names`` list, column ``i`` of
    ``window_data`` is summarised with ``compute_basic_stats`` and with the
    "mass" (sum of absolute values) of its first and second time derivatives.
    No entropy or zero-crossing features are produced.

    Parameters
    ----------
    window_data : numpy.ndarray
        2-D array indexed as ``window_data[:, i]``; it needs at least as many
        columns as there are channel names.
    sampling_rate : int, optional
        Samples per second, used as the derivative spacing ``1 / sampling_rate``.

    Returns
    -------
    dict
        Keys ``"<channel>_<stat>"`` for each basic statistic plus
        ``"<channel>_jerk_mass"`` (first derivative) and
        ``"<channel>_snap_mass"`` (second derivative). The "jerk"/"snap"
        names are kept because they are the output column names.
    """
    features = {}

    for i, channel in enumerate(channel_names):
        signal = window_data[:, i]

        for stat_name, stat_val in compute_basic_stats(signal).items():
            features[f"{channel}_{stat_name}"] = stat_val

        # First derivative of the raw channel
        jerk = compute_derivatives(signal, order=1, sampling_rate=sampling_rate)
        # The second derivative is the derivative of the first one; reusing
        # `jerk` avoids recomputing the first pass and yields identical values.
        snap = compute_derivatives(jerk, order=1, sampling_rate=sampling_rate)

        # Only mass features
        features[f"{channel}_jerk_mass"] = compute_mass(jerk)
        features[f"{channel}_snap_mass"] = compute_mass(snap)

    return features

In [7]:
# Cell 7: Enhanced Sliding window feature extraction with metadata
# Cuts every signal in df_combined into fixed-length, overlapping windows and
# stores one feature row per window, tagged with its source file and metadata.
window_size = 1000  # samples per window (1 second)
step_size = 500     # hop between consecutive windows (50% overlap)
sampling_rate = 1000

all_features = []

for idx, row in tqdm(df_combined.iterrows(), total=len(df_combined)):
    signal, label, file_name = row["signal"], row["label"], row["file_name"]

    # Every remaining column of the row is per-file metadata
    metadata = {k: v for k, v in row.items()
                if k not in ['signal', 'label', 'file_name']}

    n_samples = signal.shape[0]
    if n_samples >= window_size:  # signals shorter than one window are skipped
        for start in range(0, n_samples - window_size + 1, step_size):
            end = start + window_size
            window_data = signal[start:end, :]

            # Key order: extracted features, bookkeeping fields, then metadata
            window_features = {
                **extract_features_from_window(window_data, sampling_rate),
                "label": label,
                "file_name": file_name,
                "start_index": start,
                "end_index": end,
                **metadata,
            }
            all_features.append(window_features)

df_features = pd.DataFrame(all_features)
print(f"Extracted features shape: {df_features.shape}")

100%|██████████| 528/528 [00:24<00:00, 21.57it/s]


Extracted features shape: (16922, 61)


In [8]:
# Cell 8: Add delta features (Now preserves metadata columns)
# For every file, each feature gets a "delta_<feature>" column holding the
# change relative to the previous window of the same file (0 for the first one).

# Identify feature columns (excluding metadata, label, and index columns).
# The metadata column names come from df_combined's schema instead of a loop
# variable left over from the windowing cell, so this cell does not depend on
# hidden state. These lists are the same for every file and are built once.
metadata_cols = [col for col in df_combined.columns
                 if col not in ('signal', 'label', 'file_name')]
non_feature_cols = ['label', 'file_name', 'start_index', 'end_index'] + \
                   [col for col in df_features.columns if col in metadata_cols]
feature_cols = [col for col in df_features.columns if col not in non_feature_cols]
delta_cols = [f"delta_{col}" for col in feature_cols]

delta_features = []
grouped = df_features.groupby("file_name")

for file_name, group in grouped:
    # Windows must be in temporal order before differencing
    group = group.sort_values("start_index").reset_index(drop=True)

    # Compute deltas only for feature columns, all at once
    deltas = group[feature_cols].diff()
    deltas.columns = delta_cols

    # Fill first row deltas with 0 (the first window has no predecessor)
    deltas.iloc[0] = 0

    group_delta = pd.concat([group, deltas], axis=1)
    delta_features.append(group_delta)

# Combine all processed groups
df_delta_features = pd.concat(delta_features, ignore_index=True)

print(f"Final shape with delta features: {df_delta_features.shape}")
print("Any NaNs left?", df_delta_features.isna().any().any())
# The check above also counts empty metadata fields (stored as None by the
# parser), so report the computed delta columns on their own as well.
print("Any NaNs in delta columns?", df_delta_features[delta_cols].isna().any().any())

# Show the structure of the final dataframe
print("\nFinal DataFrame columns:")
print(df_delta_features.columns.tolist())
df_delta_features

Final shape with delta features: (16922, 101)
Any NaNs left? True

Final DataFrame columns:
['Fingergrip_mean', 'Fingergrip_std', 'Fingergrip_min', 'Fingergrip_max', 'Fingergrip_range', 'Fingergrip_median', 'Fingergrip_jerk_mass', 'Fingergrip_snap_mass', 'Axial_Pressure_mean', 'Axial_Pressure_std', 'Axial_Pressure_min', 'Axial_Pressure_max', 'Axial_Pressure_range', 'Axial_Pressure_median', 'Axial_Pressure_jerk_mass', 'Axial_Pressure_snap_mass', 'Tilt_X_mean', 'Tilt_X_std', 'Tilt_X_min', 'Tilt_X_max', 'Tilt_X_range', 'Tilt_X_median', 'Tilt_X_jerk_mass', 'Tilt_X_snap_mass', 'Tilt_Y_mean', 'Tilt_Y_std', 'Tilt_Y_min', 'Tilt_Y_max', 'Tilt_Y_range', 'Tilt_Y_median', 'Tilt_Y_jerk_mass', 'Tilt_Y_snap_mass', 'Tilt_Z_mean', 'Tilt_Z_std', 'Tilt_Z_min', 'Tilt_Z_max', 'Tilt_Z_range', 'Tilt_Z_median', 'Tilt_Z_jerk_mass', 'Tilt_Z_snap_mass', 'label', 'file_name', 'start_index', 'end_index', 'Person_ID_Number', 'Age', 'Gender', 'Writing_Hand', 'Weight', 'Height', 'Smoker', 'Notice', 'Object', 'Object_

Unnamed: 0,Fingergrip_mean,Fingergrip_std,Fingergrip_min,Fingergrip_max,Fingergrip_range,Fingergrip_median,Fingergrip_jerk_mass,Fingergrip_snap_mass,Axial_Pressure_mean,Axial_Pressure_std,...,delta_Tilt_Y_jerk_mass,delta_Tilt_Y_snap_mass,delta_Tilt_Z_mean,delta_Tilt_Z_std,delta_Tilt_Z_min,delta_Tilt_Z_max,delta_Tilt_Z_range,delta_Tilt_Z_median,delta_Tilt_Z_jerk_mass,delta_Tilt_Z_snap_mass
0,5.462305,0.260225,5.083403,6.512203,1.428800,5.361213,17722.403263,8.507645e+06,5.949351,0.004271,...,0.000,0.0,0.000000,0.000000,0.00000,0.00000,0.00000,0.00000,0.00,0.0
1,5.433828,0.221120,5.140265,6.064690,0.924425,5.363130,19508.365037,9.544334e+06,5.949649,0.013002,...,-2.740,8520.0,-0.011304,0.006764,-0.02089,0.01870,0.03959,-0.00880,449.28,393465.0
2,5.723913,0.348393,4.985950,6.224496,1.238546,5.793650,17715.685612,8.368615e+06,5.947940,0.015945,...,353.515,310922.5,-0.052200,0.006008,-0.01980,0.00000,0.01980,-0.07368,-135.80,-116560.0
3,5.892087,0.276969,4.985950,6.224496,1.238546,5.988191,15240.896436,6.791810e+06,5.878483,0.140184,...,-785.700,-641915.0,-0.042398,-0.022727,-0.01759,-0.12317,-0.10558,-0.03354,-366.12,-308110.0
4,5.857324,0.309971,4.888707,6.597887,1.709180,5.943806,20612.432189,7.664715e+06,5.670894,0.455755,...,3916.635,3660362.5,-0.020489,0.007584,-0.10448,-0.01320,0.09128,-0.01265,1719.95,1539857.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16917,6.016277,0.187881,5.556223,6.572509,1.016285,5.959106,16378.589132,7.584063e+06,5.967094,0.085594,...,-274.900,-244100.0,0.026156,0.012476,-0.11217,0.08138,0.19355,0.02749,575.16,488270.0
16918,5.975543,0.091234,5.721879,6.226205,0.504326,5.970079,12204.307754,6.629756e+06,5.955774,0.075249,...,1981.690,2025110.0,0.021500,0.010821,0.00000,0.06048,0.06048,0.02859,2643.66,2482600.0
16919,5.968480,0.076950,5.721879,6.135714,0.413835,5.972797,11549.225402,6.413268e+06,5.960218,0.082892,...,468.460,410980.0,0.007537,0.000269,0.05388,0.00000,-0.05388,0.00440,409.05,578945.0
16920,5.944587,0.097590,5.614600,6.437226,0.822626,5.959259,12316.767977,6.318515e+06,5.977937,0.166939,...,-1120.135,-1254290.0,-0.041027,0.064827,-0.27602,-0.01979,0.25623,-0.01100,-1040.83,-1031245.0
