# Character Identity 

In [1]:
import sys
import os
# Add the parent directory to the path to import bundle
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

import numpy as np
import warnings
from sklearn.preprocessing import scale
from scipy.io import loadmat
from scipy import signal
from bundle.DataCraft import * 


# NOTE(review): this silences every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# --- Configuration ---
contributors_to_process = ["II"]  # List of contributors to process
characters_file_path = "../../data/characters.txt"  # Updated path
channels = list(range(64))
initial_sampling_frequency = 240  # Hz
down_sampling_frequency = 120  # Hz
WINDOW_DURATIONS = [650, 300]  # ms - 78 and 36 samples at the down-sampled rate

# --- File Checks & Character Loading ---
try:
    with open(characters_file_path, "r") as f:
        characters = f.read().strip()
except FileNotFoundError:
    print(f"Error: Characters file not found at {characters_file_path}")
    # Re-raise instead of calling exit(): exit() is a helper from the site module
    # meant for the interactive shell, it reports a successful (zero) exit status,
    # and it throws away the traceback that tells the reader what went wrong.
    raise
else:
    print(f"Loaded characters from {characters_file_path}: {characters}")

# --- Initialize Combined Samples Dictionary for both window sizes ---
# Resulting structure: {window_size_in_samples: {char: [windows...]}},
# e.g. {78: {char: []}, 36: {char: []}} for the durations configured above.
samples_by_window_size = {}
for duration in WINDOW_DURATIONS:
    # Convert the duration from milliseconds to a sample count at the down-sampled rate.
    window_size = round(down_sampling_frequency * (duration / 1000))
    samples_by_window_size[window_size] = {char: [] for char in characters}
    print(f"Initialized samples dictionary for window size {window_size} (duration {duration}ms)")


Loaded characters from ../../data/characters.txt: ABCDEFGHIJKLMNOPQRSTUVWXYZ123456789_
Initialized samples dictionary for window size 78 (duration 650ms)
Initialized samples dictionary for window size 36 (duration 300ms)


# Loop Through Contributors

In [2]:
def _flash_onsets(flash_row, stimulus_row, window_samples, samples_per_trial):
    """Find the samples where a flash begins and a full window still fits.

    Args:
        flash_row: 1-D array of flashing flags for one trial.
        stimulus_row: 1-D array of stimulus-type flags for the same trial.
            Samples whose flag is 0 are ignored (presumably non-target flashes
            -- TODO confirm against the dataset description).
        window_samples: Number of samples every extracted window must contain.
        samples_per_trial: Number of signal samples available in the trial.

    Returns:
        Ascending integer array of sample indices where the flashing flag goes
        from 0 to 1 (or is already 1 at sample 0), the stimulus flag is
        non-zero, and ``index + window_samples <= samples_per_trial``.
    """
    # Only look at samples present in all three arrays, so that a shorter
    # flashing/stimulus row cannot trigger an out-of-bounds access.
    usable = min(samples_per_trial, flash_row.shape[0], stimulus_row.shape[0])
    flash_row = flash_row[:usable]
    lit = flash_row == 1
    # Sample 0 has no predecessor, so it counts as an onset whenever it is lit.
    preceded_by_dark = np.concatenate(([True], flash_row[:-1] == 0))[:usable]
    selected = stimulus_row[:usable] != 0
    onsets = np.flatnonzero(lit & preceded_by_dark & selected)
    # Drop onsets whose window would run past the end of the trial.
    return onsets[onsets + window_samples <= samples_per_trial]


# Start from empty accumulators so that re-running this cell does not append a
# second copy of every window to the lists created in the configuration cell.
for per_char_samples in samples_by_window_size.values():
    for collected_windows in per_char_samples.values():
        collected_windows.clear()

for contributor_selected in contributors_to_process:
    print(f"\n{'=' * 20} Processing Contributor: {contributor_selected} {'=' * 20}")
    contributor_data_file_path = f"../../data/contributor_{contributor_selected}.mat"

    # Check if data file exists for the current contributor
    if not os.path.exists(contributor_data_file_path):
        print(
            f"Warning: data file not found for contributor {contributor_selected} at {contributor_data_file_path}. Skipping this contributor.")
        continue  # Skip to the next contributor

    # --- Load Data ---
    print(f"Loading data from: {contributor_data_file_path}")
    try:
        data = loadmat(contributor_data_file_path)
        signals = data["Signal"]  # Shape: (Trials, Samples, Channels)
        flashing = data["Flashing"]  # Indexed below as (Trials, Samples)
        stimulus = data["StimulusType"]  # Indexed below as (Trials, Samples)
        word_raw = data["TargetChar"]  # Expected: String of target chars
    except Exception as e:
        print(f"Error loading data for contributor {contributor_selected}: {e}. Skipping this contributor.")
        continue

    # --- Data Consistency Checks & Target Character Extraction ---
    num_signal_trials = signals.shape[0]
    print(f"Number of trials based on Signal array: {num_signal_trials}")

    # TargetChar may arrive as a one-element array wrapping the string, as an
    # array of separate entries, or as a plain string; normalise to one str.
    target_char_string = None
    if isinstance(word_raw, np.ndarray):
        if word_raw.size == 1:
            target_char_string = str(word_raw.item()).strip()
        else:
            print(f"Warning: Unexpected shape for TargetChar: {word_raw.shape}. Trying to flatten.")
            try:
                target_char_string = "".join(map(str, word_raw.flatten()))
            except Exception as e:
                print(f"Error processing TargetChar array: {e}")
    elif isinstance(word_raw, str):
        target_char_string = word_raw.strip()
    else:
        print(f"Error: Unexpected type for TargetChar: {type(word_raw)}")

    if target_char_string is None:
        print("Error: Could not extract target character string. Skipping this contributor.")
        continue

    print(f"Extracted Target Character String (length {len(target_char_string)}): {target_char_string[:50]}...")

    # One target character is expected per trial; on a mismatch only the trials
    # that have both a signal and a label are processed.
    if len(target_char_string) != num_signal_trials:
        print(
            f"Error: Length of TargetChar string ({len(target_char_string)}) does not match number of trials in Signal ({num_signal_trials}). Please check data integrity.")
        print("Warning: Proceeding with minimum of the two lengths for processing this contributor.")
        trials_to_process = min(len(target_char_string), num_signal_trials)
    else:
        trials_to_process = num_signal_trials
        print("TargetChar string length matches number of signal trials.")

    print(f"\n Data Info (Initial):")
    print_data(signals, target_char_string, contributor_selected, initial_sampling_frequency)

    # --- Butterworth Filter ---
    print("Applying Butterworth filter...")
    sampling_frequency = initial_sampling_frequency
    # NOTE(review): without the fs argument scipy normalises the critical
    # frequencies to the Nyquist rate (fs / 2), not to fs. Dividing by the full
    # sampling rate therefore places the band edges at roughly 0.05 Hz and 10 Hz
    # rather than 0.1 Hz and 20 Hz. Confirm which band is intended before changing it.
    b, a = signal.butter(4, [0.1 / sampling_frequency, 20 / sampling_frequency], "bandpass")
    for trial in range(num_signal_trials):
        try:
            # Zero-phase filtering along the sample axis, written back in place.
            signals[trial, :, :] = signal.filtfilt(b, a, signals[trial, :, :], axis=0)
        except IndexError:
            print(f"Error: IndexError during filtering trial {trial}. Check signal dimensions.")
            continue
    print("Filtering complete.")

    # --- Down-sampling ---
    print(f"Downsampling signals from {initial_sampling_frequency}Hz to {down_sampling_frequency}Hz...")
    SCALE_FACTOR = round(initial_sampling_frequency / down_sampling_frequency)
    sampling_frequency = down_sampling_frequency

    print(f"# Samples of EEG signals before downsampling: {signals.shape[1]}")
    try:
        # Keep every SCALE_FACTOR-th sample; the marker arrays are decimated the
        # same way so they stay aligned with the signal.
        signals = signals[:, ::SCALE_FACTOR, :]
        flashing = flashing[:, ::SCALE_FACTOR]
        stimulus = stimulus[:, ::SCALE_FACTOR]
    except IndexError as e:
        print(f"Error during downsampling: {e}. Check array dimensions after filtering.")
        continue  # Skip feature extraction for this contributor if downsampling fails
    print(f"# Samples of EEG signals after downsampling: {signals.shape[1]}")
    print("Downsampling complete.")

    # --- Feature Extraction & Grouping for EACH WINDOW SIZE ---
    SAMPLES_PER_TRIAL = signals.shape[1]

    for duration in WINDOW_DURATIONS:
        WINDOW_SAMPLES = round(sampling_frequency * (duration / 1000))
        print(f"\n--- Extracting features with window size {WINDOW_SAMPLES} (duration {duration}ms) ---")
        window_store = samples_by_window_size[WINDOW_SAMPLES]

        contributor_samples_collected = 0
        for trial in range(trials_to_process):
            target_char = target_char_string[trial]

            if target_char not in window_store:
                print(
                    f"Warning: Target character '{target_char}' from trial {trial} not found in characters.txt. Skipping this trial.")
                continue

            if trial >= flashing.shape[0] or trial >= stimulus.shape[0]:
                print(
                    f"Warning: Trial index {trial} out of bounds for flashing/stimulus arrays. Stopping processing for this contributor.")
                break

            onsets = _flash_onsets(flashing[trial], stimulus[trial], WINDOW_SAMPLES, SAMPLES_PER_TRIAL)
            for sample_idx in onsets:
                lower_sample = sample_idx
                upper_sample = sample_idx + WINDOW_SAMPLES
                window = signals[trial, lower_sample:upper_sample, :]

                try:
                    # Standardise each channel over the window; a window whose
                    # channels are all constant is stored unscaled.
                    if window.size == 0 or np.all(np.std(window, axis=0) == 0):
                        normalized_window = window
                    else:
                        normalized_window = scale(window, axis=0)
                except ValueError as e:
                    print(
                        f"Warning: ValueError during scaling window at trial {trial}, sample {sample_idx}: {e}. Skipping window.")
                    continue

                # Append to the dictionary for this window size
                window_store[target_char].append(normalized_window)
                contributor_samples_collected += 1

        print(
            f"Feature extraction complete for window size {WINDOW_SAMPLES}. Added {contributor_samples_collected} samples.")

# --- End of Contributor Loop ---
print(f"\n{'=' * 20} Finished Processing All Contributors {'=' * 20}")



Loading data from: ../../data/contributor_II.mat
Number of trials based on Signal array: 85
Extracted Target Character String (length 85): VGREAAH8TVRHBYN_UGCOLO4EUERDOOHCIFOMDNU6LQCPKEIREK...
TargetChar string length matches number of signal trials.

 Data Info (Initial):
Contributor     Sampling Freq. (Hz)  Recording (min)      Chars      Spelled Word                  
II              240.00               46.01                85         VGREAAH8TVRHBYN_UGCOLO4EUERDOO
                                                                     HCIFOMDNU6LQCPKEIREKOYRQIDJXPB
                                                                     KOJDWZEUEWWFOEBHXTQTTZUMO     

Applying Butterworth filter...
Filtering complete.
Downsampling signals from 240Hz to 120Hz...
# Samples of EEG signals before downsampling: 7794
# Samples of EEG signals after downsampling: 3897
Downsampling complete.

--- Extracting features with window size 78 (duration 650ms) ---
Feature extraction complete for window 

# Verification

In [3]:
# Report, for each window size, how many windows were collected per character,
# the overall total, and the shape of the first window stored for 'A'.
for window_size, samples in samples_by_window_size.items():
    banner = '=' * 20
    print(f"\n{banner} Window Size: {window_size} {banner}")
    print("Total number of windows per character:")

    # Count once, then reuse the counts for both the listing and the total.
    windows_per_char = {char: len(char_samples) for char, char_samples in samples.items()}
    for char, count in windows_per_char.items():
        if count:  # characters without any window are left out of the listing
            print(f"Character '{char}': {count} windows")
    total_win_collected = sum(windows_per_char.values())
    print(f"\nTotal windows collected: {total_win_collected}")

    # Print First Sample for Character 'A'
    char_to_print = 'A'
    windows_for_char = samples.get(char_to_print)
    if windows_for_char:  # key present and at least one window stored
        sample_data = windows_for_char[0]
        print(f"\nFirst sample for '{char_to_print}' - Shape: {sample_data.shape}")
    else:
        print(f"No samples found for character '{char_to_print}'")



Total number of windows per character:
Character 'A': 60 windows
Character 'B': 90 windows
Character 'C': 90 windows
Character 'D': 120 windows
Character 'E': 240 windows
Character 'F': 60 windows
Character 'G': 60 windows
Character 'H': 120 windows
Character 'I': 90 windows
Character 'J': 60 windows
Character 'K': 90 windows
Character 'L': 60 windows
Character 'M': 60 windows
Character 'N': 60 windows
Character 'O': 270 windows
Character 'P': 60 windows
Character 'Q': 90 windows
Character 'R': 150 windows
Character 'T': 120 windows
Character 'U': 150 windows
Character 'V': 60 windows
Character 'W': 90 windows
Character 'X': 60 windows
Character 'Y': 60 windows
Character 'Z': 60 windows
Character '4': 30 windows
Character '6': 30 windows
Character '8': 30 windows
Character '_': 30 windows

Total windows collected: 2550

First sample for 'A' - Shape: (78, 64)

Total number of windows per character:
Character 'A': 60 windows
Character 'B': 90 windows
Character 'C': 90 windows
Character 

# Group samples according to three strategies

In [4]:
import pickle
import random
import math

# Get contributor name from the configuration
contributor_name = "_".join(contributors_to_process)  # Join all contributors if multiple

# Grouping strategies for both window sizes.
# Key: number of repetitions; "samples_per_chunk": how many windows to keep from
# each chunk (presumably two flashes per repetition -- TODO confirm).
repetition_strategies = {
    5: {"name": "5_rep", "samples_per_chunk": 10},
    10: {"name": "10_rep", "samples_per_chunk": 20},
    15: {"name": "15_rep", "samples_per_chunk": 30}
}

# Divide total windows into chunks of 30. NOTE(review): the per-character counts
# printed by the verification cell are all multiples of 30, so a chunk appears
# to correspond to one spelled occurrence of a character -- confirm.
CHUNK_SIZE = 30

print("\nStarting sample grouping process...")
print(f"Contributor(s): {contributor_name}")

# One file is expected per (window size, strategy) pair; count the successful
# writes so the closing message reflects what actually happened.
expected_files = len(samples_by_window_size) * len(repetition_strategies)
files_written = 0

# Process each window size
for window_size, samples in samples_by_window_size.items():
    print(f"\n{'=' * 50}")
    print(f"Processing Window Size: {window_size}")
    print(f"{'=' * 50}")

    # Process each repetition strategy
    for num_reps, strategy_info in repetition_strategies.items():
        strategy_name = strategy_info["name"]
        samples_per_chunk = strategy_info["samples_per_chunk"]

        print(f"\n--- Strategy: {num_reps} repetitions ({samples_per_chunk} samples per chunk) ---")

        sample_groups = {}  # Dictionary for this strategy: {char: [selected windows]}

        # Process each character
        for char, char_samples_list in samples.items():
            num_samples = len(char_samples_list)

            if num_samples == 0:
                sample_groups[char] = []
                continue

            # Walk the windows in consecutive chunks of CHUNK_SIZE and keep the
            # first `samples_per_chunk` of each; a short final chunk simply
            # contributes whatever it has.
            char_grouped_samples = []
            for start_idx in range(0, num_samples, CHUNK_SIZE):
                chunk = char_samples_list[start_idx:start_idx + CHUNK_SIZE]
                char_grouped_samples.extend(chunk[:samples_per_chunk])

            sample_groups[char] = char_grouped_samples
            print(f"  Character '{char}': {num_samples} windows -> {len(char_grouped_samples)} images")

        # Save to pickle file with dynamic contributor name
        output_filepath = f"../../data/characters_eeg_{contributor_name}_window{window_size}_{strategy_name}.pkl"
        print(f"\nSaving to: {output_filepath}")

        try:
            with open(output_filepath, "wb") as f:
                pickle.dump(sample_groups, f)
            print(f"Successfully saved!")
            files_written += 1

            # Verification
            total_images = sum(len(group) for group in sample_groups.values())
            print(f"Total images in this file: {total_images}")

        except Exception as e:
            # Best effort: report the failure and carry on with the remaining files.
            print(f"Error saving: {e}")

print("\n" + "=" * 50)
if files_written == expected_files:
    print(f"Finished creating all {files_written} pickle files!")
else:
    print(f"Finished with errors: created {files_written} of {expected_files} pickle files.")
print("=" * 50)



Starting sample grouping process...
Contributor(s): II

Processing Window Size: 78

--- Strategy: 5 repetitions (10 samples per chunk) ---
  Character 'A': 60 windows -> 20 images
  Character 'B': 90 windows -> 30 images
  Character 'C': 90 windows -> 30 images
  Character 'D': 120 windows -> 40 images
  Character 'E': 240 windows -> 80 images
  Character 'F': 60 windows -> 20 images
  Character 'G': 60 windows -> 20 images
  Character 'H': 120 windows -> 40 images
  Character 'I': 90 windows -> 30 images
  Character 'J': 60 windows -> 20 images
  Character 'K': 90 windows -> 30 images
  Character 'L': 60 windows -> 20 images
  Character 'M': 60 windows -> 20 images
  Character 'N': 60 windows -> 20 images
  Character 'O': 270 windows -> 90 images
  Character 'P': 60 windows -> 20 images
  Character 'Q': 90 windows -> 30 images
  Character 'R': 150 windows -> 50 images
  Character 'T': 120 windows -> 40 images
  Character 'U': 150 windows -> 50 images
  Character 'V': 60 windows -> 20