In [5]:
import numpy as np
import warnings
from sklearn.preprocessing import scale
from scipy.io import loadmat
from scipy import signal
import os

# Define a dummy print_data if DataCraft is not available/needed
def print_data(signals, words_string, contributor, freq):
    """Print a short, indented summary of a loaded recording.

    Stand-in used when the DataCraft helper is not available.

    Args:
        signals: Object exposing a ``shape`` attribute (e.g. a NumPy array).
        words_string: Target characters; its length is reported when it is a
            ``str``, otherwise ``N/A`` is printed.
        contributor: Label of the contributor the data belongs to.
        freq: Sampling frequency to report.
    """
    if isinstance(words_string, str):
        target_length = len(words_string)
    else:
        target_length = 'N/A'

    summary_lines = (
        f"  Signals shape: {signals.shape}",
        f"  Target Characters String Length: {target_length}",
        f"  Contributor: {contributor}",
        f"  Sampling Frequency (initial): {freq}",
    )
    for line in summary_lines:
        print(line)

# Suppress every warning category globally.
warnings.filterwarnings(action="ignore")

# --- Configuration ---
# Which contributor's recording to load, and where the input files live
# (paths are relative to the current working directory).
contributor_selected = "I"
contributor_train_file_path = "../data/Contributor_{}_Train.mat".format(contributor_selected)
characters_file_path = "../data/characters.txt"

# Channel indices 0..63.
channels = [*range(64)]

# Sampling rates in Hz: as recorded, and the target rate after decimation.
initial_sampling_frequency = 240
down_sampling_frequency = 120

# Length of the window cut after each flash onset, in milliseconds.
WINDOW_DURATION = 650

# --- File Checks ---
# Abort early, with a non-zero exit status, when an input file is missing.
# ``raise SystemExit(1)`` replaces the ``exit()`` helper: that helper is added
# by the ``site`` module for interactive sessions, is not guaranteed to exist,
# and called without an argument it reports success even on this error path.
if not os.path.exists(contributor_train_file_path):
    print(f"Error: Train data file not found at {contributor_train_file_path}")
    raise SystemExit(1)

try:
    # Explicit encoding so the result does not depend on the platform default.
    with open(characters_file_path, "r", encoding="utf-8") as f:
        characters = f.read().strip()
except FileNotFoundError:
    print(f"Error: Characters file not found at {characters_file_path}")
    raise SystemExit(1) from None
else:
    print(f"Loaded characters: {characters}")

# --- Load Data ---
print(f"\nLoading training data from: {contributor_train_file_path}")
data_train = loadmat(contributor_train_file_path)

# Pull the four variables used below out of the loaded MAT dictionary.
# "Signal" is indexed later as [trial, sample, channel]; "TargetChar" is
# expected to hold the string of target characters.
signals_train, flashing_train, stimulus_train, word_train_raw = (
    data_train[key] for key in ("Signal", "Flashing", "StimulusType", "TargetChar")
)

# --- Data Consistency Checks & Target Character Extraction ---
num_signal_trials = signals_train.shape[0]
print(f"Number of trials based on Signal array: {num_signal_trials}")

# Normalise TargetChar to a plain Python string. It may arrive as a
# one-element array wrapping the string, as a multi-element array (joined
# element by element), or already as a str. Any other type is an error.
target_char_string = None
if isinstance(word_train_raw, str):
    target_char_string = word_train_raw.strip()
elif not isinstance(word_train_raw, np.ndarray):
    print(f"Error: Unexpected type for TargetChar: {type(word_train_raw)}")
elif word_train_raw.size == 1:
    target_char_string = str(word_train_raw.item()).strip()
else:
    print(f"Warning: Unexpected shape for TargetChar: {word_train_raw.shape}. Trying to flatten.")
    try:
        target_char_string = "".join(map(str, word_train_raw.flatten()))
    except Exception as e:
        print(f"Error processing TargetChar array: {e}")

if target_char_string is None:
    print("Error: Could not extract target character string. Exiting.")
    # Non-zero status: ``exit()`` (a ``site`` helper meant for interactive
    # use) would have reported success here and may not even be defined.
    raise SystemExit(1)

# Only the first 50 characters are echoed to keep the log short.
print(f"Extracted Target Character String (length {len(target_char_string)}): {target_char_string[:50]}...")

# One target character is expected per trial. On a mismatch, keep going but
# cap the number of processed trials so neither sequence is over-indexed.
if len(target_char_string) == num_signal_trials:
    trials_to_process = num_signal_trials
    print("TargetChar string length matches number of signal trials.")
else:
    print(f"Error: Length of TargetChar string ({len(target_char_string)}) does not match number of trials in Signal ({num_signal_trials}). Please check data integrity.")
    print("Warning: Proceeding with minimum of the two lengths for processing.")
    trials_to_process = min(len(target_char_string), num_signal_trials)

print("\nTrain Data Info (Initial):")
print_data(signals_train, target_char_string, contributor_selected, initial_sampling_frequency)

# --- Butterworth Filter ---
print("\nApplying Butterworth filter...")
sampling_frequency = initial_sampling_frequency
# 4th-order band-pass with band edges at 0.1 Hz and 20 Hz.
# The edges are passed in Hz together with ``fs`` so that SciPy normalises
# them by the Nyquist frequency (fs / 2). Dividing the edges by the full
# sampling rate, as was done previously, halves both of them (0.05-10 Hz at
# fs = 240 Hz). The ``fs`` keyword requires SciPy >= 1.2.
b, a = signal.butter(4, [0.1, 20], btype="bandpass", fs=sampling_frequency)
for trial in range(num_signal_trials):  # Filter all signal trials
    try:
        # Zero-phase (forward-backward) filtering along the sample axis,
        # written back in place into the trial's slice.
        signals_train[trial, :, :] = signal.filtfilt(b, a, signals_train[trial, :, :], axis=0)
    except IndexError:
        print(f"Error: IndexError during filtering trial {trial}. Check signal dimensions.")
        continue
print("Filtering complete.")

# --- Down-sampling ---
print(f"\nDownsampling signals from {initial_sampling_frequency}Hz to {down_sampling_frequency}Hz...")
# Decimation keeps every SCALE_FACTOR-th sample.
# NOTE(review): this assumes the two rates have an integer ratio; otherwise
# the effective rate differs from ``down_sampling_frequency``.
SCALE_FACTOR = round(initial_sampling_frequency / down_sampling_frequency)
if SCALE_FACTOR < 1:
    # A step of 0 would make the slices below raise ValueError.
    print(f"Error: Invalid decimation factor {SCALE_FACTOR} for {initial_sampling_frequency}Hz -> {down_sampling_frequency}Hz.")
    raise SystemExit(1)
sampling_frequency = down_sampling_frequency

print(f"# Samples of EEG signals before downsampling: {signals_train.shape[1]}")
try:
    # The signal and both per-sample marker arrays are decimated with the
    # same step so their sample indices stay aligned.
    signals_train = signals_train[:, ::SCALE_FACTOR, :]
    flashing_train = flashing_train[:, ::SCALE_FACTOR]
    stimulus_train = stimulus_train[:, ::SCALE_FACTOR]
except IndexError as e:
    print(f"Error during downsampling: {e}. Check array dimensions after filtering.")
    # Non-zero status instead of ``exit()``, which reports success and is
    # only provided by ``site`` for interactive sessions.
    raise SystemExit(1) from None
print(f"# Samples of EEG signals after downsampling: {signals_train.shape[1]}")
print("Downsampling complete.")

# --- Feature Extraction & Grouping ---
# For every processed trial, find each flash onset (a sample where Flashing
# is 1 and the previous sample is 0, or sample 0 itself being 1), cut a
# WINDOW_SAMPLES-long window of all channels starting there, standardise it
# per channel, and append it to the list of the trial's target character.
print("\nExtracting features and grouping by character...")
N_CHANNELS = signals_train.shape[2]
WINDOW_SAMPLES = round(sampling_frequency * (WINDOW_DURATION / 1000))
SAMPLES_PER_TRIAL = signals_train.shape[1]

samples = {char: [] for char in characters}

# Only sample indices present in both the signal and the flashing array can
# be inspected.
usable_samples = min(SAMPLES_PER_TRIAL, flashing_train.shape[1])

for trial in range(trials_to_process):
    target_char = target_char_string[trial]

    if target_char not in samples:
        # This case should ideally not happen if characters.txt is correct
        print(f"Warning: Target character \t'{target_char}'\t from trial {trial} not found in characters.txt. Skipping this trial.")
        continue

    # Defensive: the marker arrays might hold fewer trials than the signal.
    if trial >= flashing_train.shape[0] or trial >= stimulus_train.shape[0]:
        print(f"Warning: Trial index {trial} out of bounds for flashing/stimulus arrays. Stopping processing.")
        break

    # Vectorised rising-edge detection over the whole trial.
    flash_row = flashing_train[trial, :usable_samples]
    flash_on = flash_row == 1
    previous_off = np.ones(flash_row.shape, dtype=bool)  # sample 0 has no predecessor
    previous_off[1:] = flash_row[:-1] == 0
    onsets = np.flatnonzero(flash_on & previous_off)

    # Drop onsets whose window would run past the end of the trial.
    onsets = onsets[onsets + WINDOW_SAMPLES <= SAMPLES_PER_TRIAL]

    for sample_idx in onsets.tolist():
        window = signals_train[trial, sample_idx:sample_idx + WINDOW_SAMPLES, :]

        try:
            # Leave empty or completely flat windows untouched; otherwise
            # standardise each channel (column) over the window.
            if window.size == 0 or np.all(np.std(window, axis=0) == 0):
                normalized_window = window
            else:
                normalized_window = scale(window, axis=0)
        except ValueError as e:
            print(f"Warning: ValueError during scaling window at trial {trial}, sample {sample_idx}: {e}. Skipping window.")
            continue

        samples[target_char].append(normalized_window)

print("\nFeature extraction and grouping complete.")

# --- Verification ---
# Report how many windows were collected for each character, then the total.
print("\n--- Verification --- ")
print("Number of samples per character:")
counts_by_char = {char: len(char_samples) for char, char_samples in samples.items()}
for char, count in counts_by_char.items():
    print(f"Character \t'{char}': {count} samples")
total_samples_collected = sum(counts_by_char.values())

print(f"\nTotal samples collected across all characters: {total_samples_collected}")


# --- Print the first few samples for one character ---
char_to_print = 'E'
num_samples_to_print = 2
# The header is built from the settings above so it cannot drift out of sync
# with what is actually printed (it used to be hard-coded to 'A' / 5).
print(f"\n--- Samples for Character '{char_to_print}' (First {num_samples_to_print}) ---")

selected_samples = samples.get(char_to_print)
if selected_samples:
    print(f"Printing the first {min(num_samples_to_print, len(selected_samples))} samples for character \t'{char_to_print}\t':")
    # NumPy truncates large arrays when printing; adjust
    # np.set_printoptions(threshold=...) beforehand if the full window is needed.
    for i, sample_data in enumerate(selected_samples[:num_samples_to_print]):
        print(f"\n--- Sample {i+1} for \t'{char_to_print}\t' (Shape: {sample_data.shape}) ---")
        print(sample_data)
else:
    print(f"No samples found for character \t'{char_to_print}\t' or the list is empty.")

# List, on a single line, every character that collected at least one sample.
print("\n--- Non-zero Samples Check ---")
for char in (c for c, windows in samples.items() if len(windows) > 0):
    # Suppress the newline so all characters stay on one line.
    print(f"{char} ", end=' ')


print("\nScript finished.")


Loaded characters: ABCDEFGHIJKLMNOPQRSTUVWXYZ123456789_

Loading training data from: ../data/Contributor_I_Train.mat
Number of trials based on Signal array: 85
Extracted Target Character String (length 85): EAEVQTDOJG8RBRGONCEDHCTUIDBPUHMEM6OUXOCFOUKWA4VJEF...
TargetChar string length matches number of signal trials.

Train Data Info (Initial):
  Signals shape: (85, 7794, 64)
  Target Characters String Length: 85
  Contributor: I
  Sampling Frequency (initial): 240

Applying Butterworth filter...
Filtering complete.

Downsampling signals from 240Hz to 120Hz...
# Samples of EEG signals before downsampling: 7794
# Samples of EEG signals after downsampling: 3897
Downsampling complete.

Extracting features and grouping by character...

Feature extraction and grouping complete.

--- Verification --- 
Number of samples per character:
Character 	'A': 360 samples
Character 	'B': 540 samples
Character 	'C': 540 samples
Character 	'D': 720 samples
Character 	'E': 1440 samples
Character 	'F': 360