# Character Identity 

In [84]:
import numpy as np
import warnings
from sklearn.preprocessing import scale
from scipy.io import loadmat
from scipy import signal
import os


# Define a dummy print_data if DataCraft is not available/needed
def print_data(signals, words_string, contributor, freq):
    """Print a short, indented summary of one contributor's recording.

    Args:
        signals: Object exposing a ``shape`` attribute (the signal array).
        words_string: Target characters. The length line is omitted when this
            is ``None`` and reads ``N/A`` when it is not a ``str``.
        contributor: Contributor label echoed in the summary.
        freq: Sampling frequency echoed in the summary.
    """
    report = [
        f"  Contributor: {contributor}",
        f"  Signals shape: {signals.shape}",
    ]
    if words_string is not None:
        if isinstance(words_string, str):
            length_text = len(words_string)
        else:
            length_text = 'N/A'
        report.append(f"  Target Characters String Length: {length_text}")
    report.append(f"  Sampling Frequency (initial): {freq}")
    print("\n".join(report))


# NOTE(review): this silences every warning for the whole session, including
# numerical warnings from SciPy/scikit-learn; consider narrowing the filter.
warnings.filterwarnings("ignore")

# --- Configuration ---
contributors_to_process = ["I", "II"]  # Contributor labels; each maps to one .mat file
characters_file_path = "../../data/characters.txt"  # File holding the character set
channels = list(range(64))  # NOTE(review): not referenced in the visible cells
initial_sampling_frequency = 240  # Hz, rate of the recordings as loaded
down_sampling_frequency = 120  # Hz, rate after decimation
WINDOW_DURATION = 650  # ms

# --- File Checks & Character Loading ---
# Only the read is guarded, so nothing else can be mistaken for a missing file.
try:
    with open(characters_file_path, "r", encoding="utf-8") as f:
        characters = f.read().strip()
except FileNotFoundError:
    print(f"Error: Characters file not found at {characters_file_path}")
    # exit() is an interactive helper installed by the site module and returns
    # status 0; raising SystemExit works everywhere and reports the failure.
    raise SystemExit(1) from None
print(f"Loaded characters from {characters_file_path}: {characters}")

# --- Initialize Combined Samples Dictionary ---
# One shared list per character, created before the contributor loop so that
# samples from every contributor accumulate in the same place.
samples = {char: [] for char in characters}
print(f"\nInitialized combined samples dictionary for characters: {', '.join(samples.keys())}")

Loaded characters from ../data/characters.txt: ABCDEFGHIJKLMNOPQRSTUVWXYZ123456789_

Initialized combined samples dictionary for characters: A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, U, V, W, X, Y, Z, 1, 2, 3, 4, 5, 6, 7, 8, 9, _


# Loop Through Contributors

In [85]:
# For every contributor: load the recording, band-pass filter it, decimate it,
# and append one normalised window per qualifying flash onset to
# samples[<target character of the trial>].
for contributor_selected in contributors_to_process:
    print(f"\n{'=' * 20} Processing Contributor: {contributor_selected} {'=' * 20}")
    contributor_data_file_path = f"../../data/Contributor_{contributor_selected}.mat"

    # A missing recording skips only this contributor; the others still run.
    if not os.path.exists(contributor_data_file_path):
        print(
            f"Warning: data file not found for contributor {contributor_selected} at {contributor_data_file_path}. Skipping this contributor.")
        continue  # Skip to the next contributor

    # --- Load Data ---
    print(f"Loading data from: {contributor_data_file_path}")
    try:
        data = loadmat(contributor_data_file_path)
        signals = data["Signal"]  # Indexed below as (trial, sample, channel)
        flashing = data["Flashing"]  # Indexed below as (trial, sample); a 0 -> 1 step marks a flash onset
        stimulus = data["StimulusType"]  # Indexed below as (trial, sample); onsets where this is 0 are ignored
        word_raw = data["TargetChar"]  # Expected: String of target chars
    except Exception as e:
        # Broad on purpose: any load/key failure should only skip this contributor.
        print(f"Error loading data for contributor {contributor_selected}: {e}. Skipping this contributor.")
        continue

    # Validate dimensionality once, so the indexing below cannot fail part-way.
    if np.ndim(signals) != 3 or np.ndim(flashing) != 2 or np.ndim(stimulus) != 2:
        print(
            f"Error: unexpected array dimensions for contributor {contributor_selected} "
            f"(Signal: {np.shape(signals)}, Flashing: {np.shape(flashing)}, "
            f"StimulusType: {np.shape(stimulus)}). Skipping this contributor.")
        continue

    # --- Data Consistency Checks & Target Character Extraction ---
    num_signal_trials = signals.shape[0]
    print(f"Number of trials based on Signal array: {num_signal_trials}")

    target_char_string = None
    if isinstance(word_raw, np.ndarray):
        if word_raw.size == 1:
            # Single-element array wrapping the whole string.
            target_char_string = str(word_raw.item()).strip()
        else:
            print(f"Warning: Unexpected shape for TargetChar: {word_raw.shape}. Trying to flatten.")
            try:
                target_char_string = "".join(map(str, word_raw.flatten()))
            except Exception as e:
                print(f"Error processing TargetChar array: {e}")
    elif isinstance(word_raw, str):
        target_char_string = word_raw.strip()
    else:
        print(f"Error: Unexpected type for TargetChar: {type(word_raw)}")

    if target_char_string is None:
        print("Error: Could not extract target character string. Skipping this contributor.")
        continue

    print(f"Extracted Target Character String (length {len(target_char_string)}): {target_char_string[:50]}...")

    if len(target_char_string) != num_signal_trials:
        print(
            f"Error: Length of TargetChar string ({len(target_char_string)}) does not match number of trials in Signal ({num_signal_trials}). Please check data integrity.")
        print("Warning: Proceeding with minimum of the two lengths for processing this contributor.")
        trials_to_process = min(len(target_char_string), num_signal_trials)
    else:
        trials_to_process = num_signal_trials
        print("TargetChar string length matches number of signal trials.")

    print(f"\n Data Info (Initial):")
    print_data(signals, target_char_string, contributor_selected, initial_sampling_frequency)

    # --- Butterworth Filter ---
    print("Applying Butterworth filter...")
    sampling_frequency = initial_sampling_frequency
    # Band edges are given in Hz together with fs. Without fs, butter() expects
    # edges normalised to the Nyquist frequency (fs / 2), so the former
    # "edge / fs" form produced a 0.05-10 Hz band instead of 0.1-20 Hz.
    # NOTE(review): for such a low lower edge SciPy recommends output="sos"
    # with sosfiltfilt for numerical robustness; kept as (b, a) to limit change.
    b, a = signal.butter(4, [0.1, 20], btype="bandpass", fs=sampling_frequency)
    for trial in range(num_signal_trials):
        # Forward-backward filtering along the sample axis, written back in place.
        signals[trial, :, :] = signal.filtfilt(b, a, signals[trial, :, :], axis=0)
    print("Filtering complete.")

    # --- Down-sampling ---
    print(f"Downsampling signals from {initial_sampling_frequency}Hz to {down_sampling_frequency}Hz...")
    SCALE_FACTOR = round(initial_sampling_frequency / down_sampling_frequency)
    sampling_frequency = down_sampling_frequency

    print(f"# Samples of EEG signals before downsampling: {signals.shape[1]}")
    # Keep every SCALE_FACTOR-th sample; the marker arrays use the same stride
    # so they stay aligned with the signal.
    signals = signals[:, ::SCALE_FACTOR, :]
    flashing = flashing[:, ::SCALE_FACTOR]
    stimulus = stimulus[:, ::SCALE_FACTOR]
    print(f"# Samples of EEG signals after downsampling: {signals.shape[1]}")
    print("Downsampling complete.")

    # --- Feature Extraction & Grouping ---
    print("Extracting features and grouping by character...")
    N_CHANNELS = signals.shape[2]
    WINDOW_SAMPLES = round(sampling_frequency * (WINDOW_DURATION / 1000))
    SAMPLES_PER_TRIAL = signals.shape[1]
    # Only sample positions present in all three arrays can be inspected.
    usable_samples = min(SAMPLES_PER_TRIAL, flashing.shape[1], stimulus.shape[1])

    contributor_samples_collected = 0
    for trial in range(trials_to_process):
        target_char = target_char_string[trial]

        if target_char not in samples:
            print(
                f"Warning: Target character '{target_char}' from trial {trial} not found in characters.txt. Skipping this trial.")
            continue

        if trial >= flashing.shape[0] or trial >= stimulus.shape[0]:
            print(
                f"Warning: Trial index {trial} out of bounds for flashing/stimulus arrays. Stopping processing for this contributor.")
            break

        flash_row = flashing[trial, :usable_samples]
        stimulus_row = stimulus[trial, :usable_samples]

        # A flash starts where the marker is 1 and the previous sample was 0.
        # Sample 0 has no predecessor and counts as a start whenever it is 1.
        previous_is_off = np.ones(usable_samples, dtype=bool)
        previous_is_off[1:] = flash_row[:-1] == 0
        onsets = np.flatnonzero(previous_is_off & (flash_row == 1) & (stimulus_row != 0))

        # Drop onsets whose window would run past the end of the trial.
        onsets = onsets[onsets + WINDOW_SAMPLES <= SAMPLES_PER_TRIAL]

        for sample_idx in onsets:
            window = signals[trial, sample_idx:sample_idx + WINDOW_SAMPLES, :]

            try:
                if window.size == 0 or np.all(np.std(window, axis=0) == 0):
                    # Nothing to standardise; copy so the stored sample does not
                    # alias (and keep alive) the full signal array.
                    normalized_window = window.copy()
                else:
                    # Per-channel standardisation over the window's samples.
                    normalized_window = scale(window, axis=0)
            except ValueError as e:
                print(
                    f"Warning: ValueError during scaling window at trial {trial}, sample {sample_idx}: {e}. Skipping window.")
                continue

            # Append to the main combined dictionary
            samples[target_char].append(normalized_window)
            contributor_samples_collected += 1

    print(
        f"Feature extraction and grouping complete for contributor {contributor_selected}. Added {contributor_samples_collected} samples.")

# --- End of Contributor Loop ---
print(f"\n{'=' * 20} Finished Processing All Contributors {'=' * 20}")


Loading data from: ../data/Contributor_I.mat
Number of trials based on Signal array: 85
Extracted Target Character String (length 85): EAEVQTDOJG8RBRGONCEDHCTUIDBPUHMEM6OUXOCFOUKWA4VJEF...
TargetChar string length matches number of signal trials.

 Data Info (Initial):
  Contributor: I
  Signals shape: (85, 7794, 64)
  Target Characters String Length: 85
  Sampling Frequency (initial): 240
Applying Butterworth filter...
Filtering complete.
Downsampling signals from 240Hz to 120Hz...
# Samples of EEG signals before downsampling: 7794
# Samples of EEG signals after downsampling: 3897
Downsampling complete.
Extracting features and grouping by character...
Feature extraction and grouping complete for contributor I. Added 2550 samples.

Loading data from: ../data/Contributor_II.mat
Number of trials based on Signal array: 85
Extracted Target Character String (length 85): VGREAAH8TVRHBYN_UGCOLO4EUERDOOHCIFOMDNU6LQCPKEIREK...
TargetChar string length matches number of signal trials.

 Data In

# Verification

In [86]:
# Verification: report how many samples each character collected, then preview
# the first few stored windows of one character.
print("\n--- Verification (Combined Data) --- ")
print("Total number of samples per character across all contributors:")
for char, char_samples in samples.items():
    count = len(char_samples)
    print(f"Character '{char}': {count} samples")
# Grand total over every character's list.
total_samples_collected = sum(map(len, samples.values()))

print(f"\nTotal samples collected across all characters and contributors: {total_samples_collected}")

# --- Print First 5 Samples for Character 'A' (Combined Data) ---
print("\n--- Samples for Character 'A' (First 5 from Combined Data) ---")
char_to_print = 'A'
num_samples_to_print = 5

# Missing key and empty list are reported the same way.
preview_source = samples.get(char_to_print)
if not preview_source:
    print(f"No samples found for character '{char_to_print}' in the combined data or the list is empty.")
else:
    print(
        f"Printing the first {min(num_samples_to_print, len(samples[char_to_print]))} samples for character '{char_to_print}':")
    for i, sample_data in enumerate(preview_source[:num_samples_to_print]):
        print(f"\n--- Sample {i + 1} for '{char_to_print}' (Shape: {sample_data.shape}) ---")
        # np.set_printoptions(threshold=np.inf) # Uncomment to print full array
        print(sample_data)
        # np.set_printoptions(threshold=1000) # Reset if needed


--- Verification (Combined Data) --- 
Total number of samples per character across all contributors:
Character 'A': 120 samples
Character 'B': 180 samples
Character 'C': 180 samples
Character 'D': 240 samples
Character 'E': 480 samples
Character 'F': 120 samples
Character 'G': 120 samples
Character 'H': 240 samples
Character 'I': 180 samples
Character 'J': 120 samples
Character 'K': 180 samples
Character 'L': 120 samples
Character 'M': 120 samples
Character 'N': 120 samples
Character 'O': 540 samples
Character 'P': 120 samples
Character 'Q': 180 samples
Character 'R': 300 samples
Character 'S': 0 samples
Character 'T': 240 samples
Character 'U': 300 samples
Character 'V': 120 samples
Character 'W': 180 samples
Character 'X': 120 samples
Character 'Y': 120 samples
Character 'Z': 120 samples
Character '1': 0 samples
Character '2': 0 samples
Character '3': 0 samples
Character '4': 60 samples
Character '5': 0 samples
Character '6': 60 samples
Character '7': 0 samples
Character '8': 60 sam

# Group samples according to the configured strategies

In [87]:
# 180 360 540 720 900 1080 1260 1440 1620 1800
# 30 60 90 120 150 180 210 240 270 300 330 360

import pickle
import random
import math

# Destination of the pickled {character: {set name: [chunk, ...]}} dictionary.
output_filepath = "../../data/sample_groups.pkl"

# Each strategy fixes a chunk length and whether samples are shuffled first.
strategies = {
    "set1": {"chunk_size": 30, "random": False},
    "set2": {"chunk_size": 20, "random": True},
}

print("\nStarting sample grouping process...")
sample_groups = {}  # character -> {set name -> list of chunks}

for char, char_samples_list in samples.items():
    print(f"\nProcessing character: 		'{char}' ({len(char_samples_list)} samples)")
    # Every character gets an entry, even when it has nothing to group.
    sample_groups[char] = {}
    if not char_samples_list:
        print(f"  Skipping character 		'{char}' as it has no samples.")
        continue

    for set_name, params in strategies.items():
        print(f"  Grouping for {set_name} (chunk size: {params['chunk_size']}, random: {params['random']})...")

        step = params['chunk_size']
        # Work on a copy so shuffling never reorders the shared sample list.
        pool = list(char_samples_list)
        if params['random']:
            random.shuffle(pool)

        # Consecutive slices of `step` samples; the last chunk may be shorter.
        set_chunks = [pool[offset:offset + step] for offset in range(0, len(pool), step)]

        sample_groups[char][set_name] = set_chunks
        print(f"    Created {len(set_chunks)} chunks for {set_name}.")

print("\nFinished sample grouping.")

# --- Save Sample Groups Dictionary ---
print(f"\nSaving sample groups dictionary to: {output_filepath}")
try:
    with open(output_filepath, "wb") as f:
        pickle.dump(sample_groups, f)
    print(f"Successfully saved sample groups to {output_filepath}.")
except Exception as e:
    # A failed save is reported but does not abort the run.
    print(f"Error saving sample groups dictionary: {e}")

print("\nScript finished.")  # Final finished print


Starting sample grouping process...

Processing character: 		'A' (120 samples)
  Grouping for set1 (chunk size: 30, random: False)...
    Created 4 chunks for set1.
  Grouping for set2 (chunk size: 15, random: False)...
    Created 8 chunks for set2.
  Grouping for set3 (chunk size: 20, random: True)...
    Created 6 chunks for set3.

Processing character: 		'B' (180 samples)
  Grouping for set1 (chunk size: 30, random: False)...
    Created 6 chunks for set1.
  Grouping for set2 (chunk size: 15, random: False)...
    Created 12 chunks for set2.
  Grouping for set3 (chunk size: 20, random: True)...
    Created 9 chunks for set3.

Processing character: 		'C' (180 samples)
  Grouping for set1 (chunk size: 30, random: False)...
    Created 6 chunks for set1.
  Grouping for set2 (chunk size: 15, random: False)...
    Created 12 chunks for set2.
  Grouping for set3 (chunk size: 20, random: True)...
    Created 9 chunks for set3.

Processing character: 		'D' (240 samples)
  Grouping for set