In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV directory read by the
# feature-extraction cells below lives under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import pandas as pd
import numpy as np
from scipy.signal import find_peaks
from scipy.stats import linregress

# GSR Feature Extraction Function

def extract_gsr_features_updated(gsr_signal, sampling_rate=2000):
    """Summarise one GSR segment as a fixed set of scalar features.

    Parameters
    ----------
    gsr_signal : array-like
        One-dimensional sequence of GSR samples.
    sampling_rate : float, default 2000
        Samples per second. Only used to turn the segment length into a
        duration for ``GSR_PeakTimeRatio``.

    Returns
    -------
    pandas.Series
        Twelve float features, always in the same order: mean, std, variance,
        min, max, range, peak count, valley count, mean peak amplitude, AUC,
        linear-trend slope and peaks per second. An empty signal yields an
        all-NaN Series; a single-sample signal yields NaN for the slope only.

    Raises
    ------
    ValueError
        If ``sampling_rate`` is not positive.
    """
    if sampling_rate <= 0:
        # The original expression divided by the rate before any guard ran.
        raise ValueError("sampling_rate must be positive")

    feature_names = [
        'GSR_Mean', 'GSR_Std', 'GSR_Variance', 'GSR_Min', 'GSR_Max',
        'GSR_Range', 'GSR_PeakCount', 'GSR_ValleyCount', 'GSR_PeakMeanAmp',
        'GSR_AUC', 'GSR_Slope', 'GSR_PeakTimeRatio',
    ]

    gsr = np.asarray(gsr_signal, dtype=float)
    if gsr.size == 0:
        # np.min / linregress raise on empty input; report "no data" instead.
        return pd.Series(np.nan, index=feature_names)

    mean_level = np.mean(gsr)
    lowest, highest = np.min(gsr), np.max(gsr)

    # Peaks are local maxima at or above the segment mean; valleys are local
    # minima, found as the maxima of the inverted signal (no height filter).
    peaks, properties = find_peaks(gsr, height=mean_level)
    peak_heights = properties["peak_heights"]
    valleys, _ = find_peaks(-gsr)

    # A regression line needs at least two distinct x positions.
    if gsr.size >= 2:
        slope = linregress(np.arange(gsr.size), gsr).slope
    else:
        slope = np.nan

    # np.trapezoid only exists in NumPy >= 2.0; older releases call it trapz.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz

    duration = gsr.size / sampling_rate  # seconds

    features = {
        'GSR_Mean': mean_level,
        'GSR_Std': np.std(gsr),
        'GSR_Variance': np.var(gsr),
        'GSR_Min': lowest,
        'GSR_Max': highest,
        'GSR_Range': highest - lowest,
        'GSR_PeakCount': len(peaks),
        'GSR_ValleyCount': len(valleys),
        'GSR_PeakMeanAmp': np.mean(peak_heights) if len(peak_heights) > 0 else 0,
        # Integrated with unit spacing, i.e. in sample units rather than seconds.
        'GSR_AUC': trapezoid(gsr),
        'GSR_Slope': slope,
        'GSR_PeakTimeRatio': len(peaks) / duration,
    }

    return pd.Series(features, index=feature_names, dtype=float)

# List of 26 included participant IDs
included_participants = [
    100, 102, 107, 108, 109, 110, 111, 112, 113,
    114, 115, 116, 117, 119, 120, 121, 122, 123,
    124, 125, 126, 127, 128, 129, 132, 133
]

# Imported here so this cell does not depend on another cell's imports.
import ast

# Path to all participant GSR CSVs
base_path = '/content/drive/MyDrive/IT/MScThesis/potentialDS/healthyVRskinDS/05 ECG-GSR Data/01 ECG-GSR Data (Pre-Processed)/Cleaned_GSR_CSVs'
all_data = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the final table reproducible between runs.
for filename in sorted(os.listdir(base_path)):
    if not filename.endswith('.csv'):
        continue

    path = os.path.join(base_path, filename)
    df = pd.read_csv(path)
    if df.empty:
        # Without this, reading the participant ID from row 0 raises KeyError.
        print(f"Skipping empty file: {filename}")
        continue

    # Check if this participant should be included
    participant_id = int(df.loc[0, 'Participant_ID'])
    if participant_id not in included_participants:
        continue

    # Filter out baseline BEFORE feature extraction (and before parsing, so
    # baseline signals are never decoded at all).
    df = df[df['Label'] != 4].reset_index(drop=True)
    if df.empty:
        continue

    # Signals may be stored as stringified lists. literal_eval only accepts
    # Python literals, so a malformed or malicious cell cannot execute code
    # the way eval() would. The check is per row, not just on the first row.
    signals = df['GSR_Signal'].apply(
        lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
    )

    # Extract features
    features = signals.apply(extract_gsr_features_updated)
    features['label'] = df['Label']
    features['participant'] = df['Participant_ID']
    features['video'] = df['Video_Index']

    all_data.append(features)

if not all_data:
    raise ValueError(f"No included participant data found in {base_path}")

# Combine all participant data into one DataFrame
df_final = pd.concat(all_data, ignore_index=True)
print("Final dataset shape:", df_final.shape)
df_final.head()


Final dataset shape: (300, 15)


Unnamed: 0,GSR_Mean,GSR_Std,GSR_Variance,GSR_Min,GSR_Max,GSR_Range,GSR_PeakCount,GSR_ValleyCount,GSR_PeakMeanAmp,GSR_AUC,GSR_Slope,GSR_PeakTimeRatio,label,participant,video
0,1.808212,0.107222,0.011496,1.652888,2.007352,0.354464,2.0,5.0,2.005259,657366.3,-1.005752e-06,0.011003,1,102,2
1,3.602151,0.190428,0.036263,2.848227,3.998326,1.150099,8.0,11.0,3.803034,1238052.0,-1.070788e-06,0.046552,2,102,3
2,1.601157,0.060926,0.003712,1.471214,1.716384,0.24517,2.0,7.0,1.709457,308900.0,-5.334046e-07,0.020734,0,102,4
3,1.700477,0.074889,0.005608,1.567081,1.828967,0.261886,3.0,8.0,1.808965,536094.0,-8.076004e-07,0.019032,1,102,5
4,1.613899,0.036161,0.001308,1.557908,1.689144,0.131236,1.0,3.0,1.689144,235567.9,-7.614943e-07,0.013702,0,102,6


In [None]:
# Imported here so this cell does not depend on another cell's imports.
import ast

# Start from an empty list: appending to the list left over from the previous
# cell would add every participant's rows a second time.
all_data = []

# Sorted so the processing order (and any error messages) is reproducible.
for filename in sorted(os.listdir(base_path)):
    if not filename.endswith('.csv'):
        continue

    # Best-effort per file: a problem with one CSV is reported and the loop
    # moves on to the next file.
    try:
        path = os.path.join(base_path, filename)
        df = pd.read_csv(path)

        participant_id = int(df.loc[0, 'Participant_ID'])
        if participant_id not in included_participants:
            continue

        # Drop baseline rows (label 4) before doing any per-signal work.
        df = df[df['Label'] != 4].reset_index(drop=True)
        if df.empty:
            print(f"All baseline for {participant_id} in {filename}")
            continue

        # literal_eval parses stringified lists without executing arbitrary
        # code from the CSV, unlike eval(); checked per row.
        signals = df['GSR_Signal'].apply(
            lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
        )

        features = signals.apply(extract_gsr_features_updated)
        features['label'] = df['Label']
        features['participant'] = df['Participant_ID']
        features['video'] = df['Video_Index']
        all_data.append(features)

    except Exception as e:
        print(f"⚠️ Error with participant {filename}: {e}")


In [None]:
import os
import pandas as pd
import numpy as np
from scipy.signal import find_peaks
from scipy.stats import linregress

# Feature extraction function
def extract_gsr_features_updated(gsr_signal, sampling_rate=2000):
    """Summarise one GSR segment as a fixed set of scalar features.

    Parameters
    ----------
    gsr_signal : array-like
        One-dimensional sequence of GSR samples.
    sampling_rate : float, default 2000
        Samples per second. Only used to turn the segment length into a
        duration for ``GSR_PeakTimeRatio``.

    Returns
    -------
    pandas.Series
        Twelve float features, always in the same order: mean, std, variance,
        min, max, range, peak count, valley count, mean peak amplitude, AUC,
        linear-trend slope and peaks per second. An empty signal yields an
        all-NaN Series; a single-sample signal yields NaN for the slope only.

    Raises
    ------
    ValueError
        If ``sampling_rate`` is not positive.
    """
    if sampling_rate <= 0:
        # The original expression divided by the rate before any guard ran.
        raise ValueError("sampling_rate must be positive")

    feature_names = [
        'GSR_Mean', 'GSR_Std', 'GSR_Variance', 'GSR_Min', 'GSR_Max',
        'GSR_Range', 'GSR_PeakCount', 'GSR_ValleyCount', 'GSR_PeakMeanAmp',
        'GSR_AUC', 'GSR_Slope', 'GSR_PeakTimeRatio',
    ]

    gsr = np.asarray(gsr_signal, dtype=float)
    if gsr.size == 0:
        # np.min / linregress raise on empty input; report "no data" instead.
        return pd.Series(np.nan, index=feature_names)

    mean_level = np.mean(gsr)
    lowest, highest = np.min(gsr), np.max(gsr)

    # Peaks are local maxima at or above the segment mean; valleys are local
    # minima, found as the maxima of the inverted signal (no height filter).
    peaks, properties = find_peaks(gsr, height=mean_level)
    peak_heights = properties["peak_heights"]
    valleys, _ = find_peaks(-gsr)

    # A regression line needs at least two distinct x positions.
    if gsr.size >= 2:
        slope = linregress(np.arange(gsr.size), gsr).slope
    else:
        slope = np.nan

    # np.trapezoid only exists in NumPy >= 2.0; older releases call it trapz.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz

    duration = gsr.size / sampling_rate  # seconds

    features = {
        'GSR_Mean': mean_level,
        'GSR_Std': np.std(gsr),
        'GSR_Variance': np.var(gsr),
        'GSR_Min': lowest,
        'GSR_Max': highest,
        'GSR_Range': highest - lowest,
        'GSR_PeakCount': len(peaks),
        'GSR_ValleyCount': len(valleys),
        'GSR_PeakMeanAmp': np.mean(peak_heights) if len(peak_heights) > 0 else 0,
        # Integrated with unit spacing, i.e. in sample units rather than seconds.
        'GSR_AUC': trapezoid(gsr),
        'GSR_Slope': slope,
        'GSR_PeakTimeRatio': len(peaks) / duration,
    }

    return pd.Series(features, index=feature_names, dtype=float)

# Included participant IDs
included_participants = [
    100, 102, 107, 108, 109, 110, 111, 112, 113,
    114, 115, 116, 117, 119, 120, 121, 122, 123,
    124, 125, 126, 127, 128, 129, 132, 133
]

# Imported here so this cell does not depend on another cell's imports.
import ast

# Path to cleaned GSR data
base_path = '/content/drive/MyDrive/IT/MScThesis/potentialDS/healthyVRskinDS/05 ECG-GSR Data/01 ECG-GSR Data (Pre-Processed)/Cleaned_GSR_CSVs'
all_data = []

# os.listdir returns entries in arbitrary order; sorting keeps the log and the
# row order of the final table reproducible between runs.
for filename in sorted(os.listdir(base_path)):
    if not filename.lower().endswith('.csv'):
        continue

    print(f"📄 Checking file: {filename}")
    # Best-effort per file: a failure is logged and the remaining files are
    # still processed.
    try:
        path = os.path.join(base_path, filename)
        df = pd.read_csv(path)

        participant_id = int(df.loc[0, 'Participant_ID'])

        if participant_id not in included_participants:
            print(f"⏭️ Skipping participant {participant_id} (not in included list)")
            continue

        # Drop baseline rows (label 4) before doing any per-signal work.
        df = df[df['Label'] != 4].reset_index(drop=True)
        if df.empty:
            print(f"⚠️ All baseline data for participant {participant_id} in {filename}")
            continue

        # literal_eval parses stringified lists without executing arbitrary
        # code from the CSV, unlike eval(); checked per row.
        signals = df['GSR_Signal'].apply(
            lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
        )

        features = signals.apply(extract_gsr_features_updated)
        features['label'] = df['Label']
        features['participant'] = df['Participant_ID']
        features['video'] = df['Video_Index']

        all_data.append(features)
        print(f"✅ Processed participant {participant_id} from {filename}")

    except Exception as e:
        print(f"❌ Error with file {filename}: {e}")

if not all_data:
    # pd.concat([]) would fail with a message that does not name the cause.
    raise ValueError(f"No included participant data found in {base_path}")

# Combine into final DataFrame
df_final = pd.concat(all_data, ignore_index=True)
print("📊 Final dataset shape:", df_final.shape)


📄 Checking file: GSR_Participant_101.csv
⏭️ Skipping participant 101 (not in included list)
📄 Checking file: GSR_Participant_102.csv
✅ Processed participant 102 from GSR_Participant_102.csv
📄 Checking file: GSR_Participant_103.csv
⏭️ Skipping participant 103 (not in included list)
📄 Checking file: GSR_Participant_104.csv
⏭️ Skipping participant 104 (not in included list)
📄 Checking file: GSR_Participant_105.csv
⏭️ Skipping participant 105 (not in included list)
📄 Checking file: GSR_Participant_106.csv
⏭️ Skipping participant 106 (not in included list)
📄 Checking file: GSR_Participant_107.csv
✅ Processed participant 107 from GSR_Participant_107.csv
📄 Checking file: GSR_Participant_108.csv
✅ Processed participant 108 from GSR_Participant_108.csv
📄 Checking file: GSR_Participant_109.csv
✅ Processed participant 109 from GSR_Participant_109.csv
📄 Checking file: GSR_Participant_110.csv
✅ Processed participant 110 from GSR_Participant_110.csv
📄 Checking file: GSR_Participant_111.csv
✅ Process

In [None]:
# Write the time-domain feature table (df_final) to CSV without the index column.
# NOTE(review): the file name says 26 participants, but the 25-ID list used by
# the wavelet cell yields the same 300 rows as this table -- confirm how many
# participants actually contributed data.
df_final.to_csv("/content/GSR_Features_26Participants.csv", index=False)

In [None]:
#@title install pywt
!pip install PyWavelets

Collecting PyWavelets
  Downloading pywavelets-1.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)
Downloading pywavelets-1.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.5 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/4.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m4.5/4.5 MB[0m [31m156.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m80.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyWavelets
Successfully installed PyWavelets-1.8.0


In [None]:
# @title WAVELET
import os
import numpy as np
import pandas as pd
import pywt

# --- Wavelet Feature Extraction Function ---
def extract_wavelet_band_energy(signal, fs=2000):
    """Relative wavelet-packet energy in eight equal-width frequency bands.

    The signal is decomposed with a level-6 ``db4`` wavelet packet, giving 64
    frequency-ordered leaf nodes between 0 and ``fs / 2``. Consecutive groups
    of eight nodes are summed into the bands ``freq0`` ... ``freq7``; band
    ``k`` covers ``k/16 * fs`` to ``(k+1)/16 * fs``.

    Parameters
    ----------
    signal : array-like
        One-dimensional sequence of samples.
    fs : float, default 2000
        Sampling rate in Hz. Kept for backward compatibility: because the
        bands are fixed fractions of ``fs``, the node-to-band assignment (and
        therefore the result) does not depend on its value.

    Returns
    -------
    pandas.Series
        Energy of each band divided by the total energy of all leaf nodes,
        indexed ``freq0`` ... ``freq7``. If the total energy is zero the band
        energies are returned unscaled (all zero).

    Notes
    -----
    NOTE(review): in the sample output of this notebook ``freq0`` is 1.0 and
    the other bands are ~1e-10 or smaller, presumably because the signal's
    mean level and slow components all fall in the lowest band -- confirm that
    these bands are the intended resolution for GSR.
    """
    max_level = 6
    band_names = [f'freq{k}' for k in range(8)]

    wp = pywt.WaveletPacket(data=signal, wavelet='db4', mode='symmetric', maxlevel=max_level)
    level_nodes = wp.get_level(max_level, order='freq')

    # 64 leaf nodes / 8 bands = 8 nodes per band. Assigning by node index is
    # exact, whereas comparing floating-point band edges could silently leave
    # a node out of every band if a product of fs were rounded.
    nodes_per_band = len(level_nodes) // len(band_names)

    band_energy = {name: 0 for name in band_names}
    total_energy = 0

    for node_index, node in enumerate(level_nodes):
        energy = np.sum(np.square(node.data))
        total_energy += energy
        band_energy[band_names[node_index // nodes_per_band]] += energy

    # Avoid dividing by zero for an all-zero signal.
    scale = total_energy if total_energy != 0 else 1

    return pd.Series({name: value / scale for name, value in band_energy.items()})

# --- List of Included Participant IDs (clean, fixed) ---
# Unlike the list used for the time-domain features, this one omits ID 100.
included_participants = [
    102, 107, 108, 109, 110, 111, 112, 113,
    114, 115, 116, 117, 119, 120, 121, 122,
    123, 124, 125, 126, 127, 128, 129, 132, 133
]

# Imported here so this cell does not depend on another cell's imports.
import ast

# --- Wavelet Feature Extraction Loop ---
base_path = '/content/drive/MyDrive/IT/MScThesis/potentialDS/healthyVRskinDS/05 ECG-GSR Data/01 ECG-GSR Data (Pre-Processed)/Cleaned_GSR_CSVs'
all_data = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the final table reproducible between runs.
for filename in sorted(os.listdir(base_path)):
    if not filename.lower().endswith('.csv'):
        continue

    path = os.path.join(base_path, filename)
    df = pd.read_csv(path)
    if df.empty:
        # Without this, reading the participant ID from row 0 raises KeyError.
        print(f"Skipping empty file: {filename}")
        continue

    # Participant ID check
    participant_id = int(df.loc[0, 'Participant_ID'])
    if participant_id not in included_participants:
        continue

    # Remove baseline label (before parsing, so baseline signals are never decoded)
    df = df[df['Label'] != 4].reset_index(drop=True)
    if df.empty:
        continue

    # Convert GSR signal if stored as string. literal_eval only accepts Python
    # literals, so CSV content cannot execute code the way eval() would.
    signals = df['GSR_Signal'].apply(
        lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
    )

    # Extract wavelet features
    wavelet_features = signals.apply(extract_wavelet_band_energy)
    wavelet_features['label'] = df['Label']
    wavelet_features['participant'] = df['Participant_ID']
    wavelet_features['video'] = df['Video_Index']

    all_data.append(wavelet_features)

if not all_data:
    # pd.concat([]) would fail with a message that does not name the cause.
    raise ValueError(f"No included participant data found in {base_path}")

# Combine and inspect final dataset
df_wavelet_final = pd.concat(all_data, ignore_index=True)
print("✅ Final wavelet dataset shape:", df_wavelet_final.shape)
df_wavelet_final.head()


✅ Final wavelet dataset shape: (300, 11)


Unnamed: 0,freq0,freq1,freq2,freq3,freq4,freq5,freq6,freq7,label,participant,video
0,1.0,1.358571e-11,2.870167e-12,3.73531e-12,3.872229e-14,1.052761e-12,2.025291e-13,1.419451e-12,1,102,2
1,1.0,8.950634e-10,6.577798e-11,1.383071e-11,1.022438e-10,8.098477e-11,8.374152e-11,2.77394e-11,2,102,3
2,1.0,1.177976e-10,8.232625e-11,5.249016e-12,3.497714e-11,4.394342e-11,1.042154e-11,1.045918e-11,0,102,4
3,1.0,3.616409e-11,2.657689e-12,5.588256e-13,4.130993e-12,3.272235e-12,3.383518e-12,1.120732e-12,1,102,5
4,1.0,6.110535e-12,1.602008e-11,1.38812e-11,1.222137e-13,2.517771e-13,2.091656e-12,2.334411e-12,0,102,6


In [None]:
# Each (participant, video, label) triple is expected to identify one row in
# each feature table, so these three columns form the join key.
key_columns = ['participant', 'video', 'label']

# Ensure data types match before merging
for frame in (df_wavelet_final, df_final):
    for column in key_columns:
        frame[column] = frame[column].astype(int)

# Perform the merge.
# validate='one_to_one' makes pandas raise MergeError if a key is duplicated
# on either side, instead of silently multiplying rows in the result.
df_merged = pd.merge(
    df_final,
    df_wavelet_final,
    on=key_columns,
    how='inner',
    validate='one_to_one'
)

# An inner join drops rows whose key is missing from the other table; report
# that rather than letting it pass unnoticed.
unmatched_rows = max(len(df_final), len(df_wavelet_final)) - len(df_merged)
if unmatched_rows:
    print(f"Warning: {unmatched_rows} rows had no match in the other feature table and were dropped")

# Check final shape and sample
print("✅ Merged dataset shape:", df_merged.shape)
display(df_merged.head())

# Optional: save the merged dataset
df_merged.to_csv("GSR_with_Wavelet_Merged.csv", index=False)


✅ Merged dataset shape: (300, 23)


Unnamed: 0,GSR_Mean,GSR_Std,GSR_Variance,GSR_Min,GSR_Max,GSR_Range,GSR_PeakCount,GSR_ValleyCount,GSR_PeakMeanAmp,GSR_AUC,...,participant,video,freq0,freq1,freq2,freq3,freq4,freq5,freq6,freq7
0,1.808212,0.107222,0.011496,1.652888,2.007352,0.354464,2.0,5.0,2.005259,657366.3,...,102,2,1.0,1.358571e-11,2.870167e-12,3.73531e-12,3.872229e-14,1.052761e-12,2.025291e-13,1.419451e-12
1,3.602151,0.190428,0.036263,2.848227,3.998326,1.150099,8.0,11.0,3.803034,1238052.0,...,102,3,1.0,8.950634e-10,6.577798e-11,1.383071e-11,1.022438e-10,8.098477e-11,8.374152e-11,2.77394e-11
2,1.601157,0.060926,0.003712,1.471214,1.716384,0.24517,2.0,7.0,1.709457,308900.0,...,102,4,1.0,1.177976e-10,8.232625e-11,5.249016e-12,3.497714e-11,4.394342e-11,1.042154e-11,1.045918e-11
3,1.700477,0.074889,0.005608,1.567081,1.828967,0.261886,3.0,8.0,1.808965,536094.0,...,102,5,1.0,3.616409e-11,2.657689e-12,5.588256e-13,4.130993e-12,3.272235e-12,3.383518e-12,1.120732e-12
4,1.613899,0.036161,0.001308,1.557908,1.689144,0.131236,1.0,3.0,1.689144,235567.9,...,102,6,1.0,6.110535e-12,1.602008e-11,1.38812e-11,1.222137e-13,2.517771e-13,2.091656e-12,2.334411e-12
