# Create EEG Sentences


In [1]:
import numpy as np                
import warnings
from sklearn.preprocessing import scale

# Identifier of the contributor whose recordings are processed; it selects the
# train/test .mat files below.
contributor_selected = "I"

# Train and test recordings share the same path prefix and differ only in the suffix.
_contributor_prefix = '../data/Contributor_' + contributor_selected
contributor_train_file_path = _contributor_prefix + '_Train.mat'
contributor_test_file_path = _contributor_prefix + '_Test.mat'

# CSV file holding the channel names.
channel_name_file_path = '../data/channels.csv'

# Channel indices 0..63.
channels = list(range(64))

# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')

In [2]:
from scipy.io import loadmat
from scipy import signal
from bundle.DataCraft import * 


# Read the selected contributor's training recording from its MATLAB file.
data_train = loadmat(contributor_train_file_path)

# Arrays used by the rest of the notebook, taken from the loaded file by key.
signals_train = data_train['Signal']
flashing_train = data_train['Flashing']
stimulus_train = data_train['StimulusType']
word_train = data_train['TargetChar']

# Acquisition settings.
sampling_frequency = 240  # Hz
repetitions = 15          # presumably flash repetitions per character -- not used in this cell

# Number of trials = length of the first TargetChar entry (the spelled word).
trials_train = len(word_train[0])

# Total recording time in minutes: (first axis length * second axis length)
# samples, divided by the number of samples per minute.
recording_duration_train = (
    signals_train.shape[0] * signals_train.shape[1] / (sampling_frequency * 60)
)

print("Train Data:")
print_data(signals_train, word_train, contributor_selected, sampling_frequency)


Train Data:
Contributor     Sampling Freq. (Hz)  Recording (min)      Trials     Spelled Word                  
I               240.00               46.01                85         EAEVQTDOJG8RBRGONCEDHCTUIDBPUH
                                                                     MEM6OUXOCFOUKWA4VJEFRZROLHYNQD
                                                                     W_EKTLBWXEPOUIKZERYOOTHQI     


In [3]:
# Application of butterworth filter (4th order band-pass, applied forward and
# backward with filtfilt so that no phase shift is introduced).
# The cut-offs are passed in Hz together with `fs`. Without `fs`, SciPy expects
# them normalised by the Nyquist frequency (fs / 2), so dividing them by the
# sampling frequency would silently halve the band (0.05-10 Hz instead of 0.1-20 Hz).
BANDPASS_LOW_HZ = 0.1
BANDPASS_HIGH_HZ = 20
b, a = signal.butter(4, [BANDPASS_LOW_HZ, BANDPASS_HIGH_HZ], 'bandpass', fs=sampling_frequency)
for trial in range(trials_train):
    # Each trial slice is (samples, channels); axis=0 filters along time.
    signals_train[trial, :, :] = signal.filtfilt(b, a, signals_train[trial, :, :], axis=0)

# Down-sampling of the signals from 240Hz to 120Hz
# NOTE: this cell overwrites `signals_train`, `flashing_train`, `stimulus_train`
# and `sampling_frequency`. Reload the data before running it a second time,
# otherwise the signals are filtered twice and SCALE_FACTOR becomes 1.
down_sampling_frequency = 120
SCALE_FACTOR = round(sampling_frequency / down_sampling_frequency)
sampling_frequency = down_sampling_frequency

print("# Samples of EEG signals before downsampling: {}".format(len(signals_train[0])))

# Keep every SCALE_FACTOR-th sample along the time axis (axis 1); the same
# stride is applied to the flashing and stimulus-type signals to keep them aligned.
signals_train = signals_train[:, 0:-1:SCALE_FACTOR, :]
flashing_train = flashing_train[:, 0:-1:SCALE_FACTOR]
stimulus_train = stimulus_train[:, 0:-1:SCALE_FACTOR]

print("# Samples of EEG signals after downsampling: {}".format(len(signals_train[0])))

# Samples of EEG signals before downsampling: 7794
# Samples of EEG signals after downsampling: 3897


In [4]:
# Number of EEG channels
N_CHANNELS = len(channels)
# Window duration after each flashing [ms]
WINDOW_DURATION = 650
# Number of samples of each window
WINDOW_SAMPLES = round(sampling_frequency * (WINDOW_DURATION / 1000))
# Number of samples for each character in trials
SAMPLES_PER_TRIAL = len(signals_train[0])

train_features = []
train_labels = []

count_positive = 0
count_negative = 0
# Flash onsets too close to the end of a trial to yield a full window
count_truncated = 0

for trial in range(trials_train):
    for sample in range(SAMPLES_PER_TRIAL):
        # A window starts at every flash onset (0 -> 1 transition of the flashing
        # signal); the first sample of a trial is always treated as an onset.
        is_onset = (sample == 0) or (
            flashing_train[trial, sample - 1] == 0 and flashing_train[trial, sample] == 1
        )
        if not is_onset:
            continue

        lower_sample = sample
        upper_sample = sample + WINDOW_SAMPLES
        if upper_sample > SAMPLES_PER_TRIAL:
            # Slicing past the end would silently return a shorter window and
            # make the feature list ragged, so incomplete windows are dropped.
            count_truncated += 1
            continue

        window = signals_train[trial, lower_sample:upper_sample, :]
        # Features extraction
        train_features.append(window)
        # Labels extraction
        if stimulus_train[trial, sample] == 1:
            count_positive += 1
            train_labels.append(1) # Class P300
        else:
            count_negative += 1
            train_labels.append(0) # Class no-P300

if count_truncated:
    print("Skipped {} incomplete windows at the end of trials".format(count_truncated))

# Get negative-positive classes ratio
if count_positive == 0:
    # Fail with an explicit message instead of a bare ZeroDivisionError.
    raise ValueError("No P300 windows were found: cannot compute the class ratio")
train_ratio = count_negative / count_positive

# Convert lists to numpy arrays
train_features = np.array(train_features)
train_labels = np.array(train_labels)

# 3D Tensor shape (WINDOWS, WINDOW_SAMPLES, CHANNELS), e.g. (15300, 78, 64)
dim_train = train_features.shape
print("Features tensor shape: {}".format(dim_train))

# Data normalization Zi = (Xi - mu) / sigma
# Each window is standardised on its own; axis=0 standardises every channel
# (column) over the time samples of that window.
for pattern in range(len(train_features)):
    train_features[pattern] = scale(train_features[pattern], axis=0)

Features tensor shape: (15300, 78, 64)


# =======================================================
# =======================================================
# =======================================================

In [5]:
# Windowed features and labels
# `train_features`, `train_labels`, `train_ratio` and the WINDOW_* constants come
# from the window-extraction cell above (In [4]). They are deliberately not
# recomputed here: repeating the extraction would only rebuild identical arrays
# from the same `signals_train`, and two copies of the code can drift apart.
dim_train = train_features.shape
print("Features tensor shape: {}".format(dim_train))

# Generate synthetic sentences from collected data
import random
import string

def generate_sentences(num_sentences=50, mean_length=10, std_length=3, seed=None):
    """Generate random but somewhat plausible upper-case sentences.

    Every sentence has the form ``"<SUBJECT> <VERB> <OBJECT>"``. The verb is
    taken from the third-person list (``WRITES``, ``LIKES``, ...) when the
    subject is ``HE`` or ``SHE`` and from the base-form list otherwise. A
    trailing period is appended whenever a uniform draw exceeds 0.3.

    Args:
        num_sentences: Number of sentences to produce. Values <= 0 yield an
            empty list.
        mean_length: Currently unused; kept so existing callers keep working.
        std_length: Currently unused; kept so existing callers keep working.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used, so the result is reproducible and the global ``random``
            state is left untouched. When ``None`` (default), the module-level
            generator is used.

    Returns:
        list[str]: The generated sentences, in generation order.
    """
    # The `random` module exposes the same choice()/random() API as a Random instance.
    rng = random if seed is None else random.Random(seed)

    subjects = ['I', 'YOU', 'HE', 'SHE', 'THEY', 'WE']
    verbs = ['WRITE', 'LIKE', 'EAT', 'READ', 'SEE', 'USE', 'HELP', 'MAKE', 'PLAY', 'WANT']
    verbs_s = ['WRITES', 'LIKES', 'EATS', 'READS', 'SEES', 'USES', 'HELPS', 'MAKES', 'PLAYS', 'WANTS']
    objects = ['FOOD', 'MUSIC', 'BOOKS', 'GAMES', 'POEMS', 'PIZZA', 'WATER', 'PAPER', 'PHONES', 'EMAILS']
    # Subjects that take the base verb form; the remaining ones take the "-S" form.
    base_form_subjects = {'I', 'YOU', 'THEY', 'WE'}

    sentences = []
    for _ in range(num_sentences):
        # Draw order (subject, verb, object, period) is fixed so that a given
        # generator state always produces the same sentence.
        subject = rng.choice(subjects)
        verb = rng.choice(verbs if subject in base_form_subjects else verbs_s)
        obj = rng.choice(objects)

        sentence = f"{subject} {verb} {obj}"
        # Add a period sometimes
        if rng.random() > 0.3:
            sentence += "."

        sentences.append(sentence)

    return sentences

# Generate sentences
# Seed Python's global generator first so that the same sentences (and
# therefore the same datasets derived from them) are produced on every run.
SENTENCE_SEED = 42
random.seed(SENTENCE_SEED)
num_sentences = 50
sentences = generate_sentences(num_sentences)

# Flatten sentences into a character stream
all_chars = [char for sentence in sentences for char in sentence]
# (sentence_idx, position_in_sentence) for each char, in the same order as all_chars
char_to_sentence_map = [
    (sent_idx, pos_idx)
    for sent_idx, sentence in enumerate(sentences)
    for pos_idx in range(len(sentence))
]

# Print the first few sentences
print(f"Generated {len(sentences)} sentences from {len(all_chars)} characters.")
for i in range(min(5, len(sentences))):
    print(f"Sentence {i}: {sentences[i]}")

# Print the character to sentence mapping for the first 10 characters
print("Character to sentence mapping (first 10 characters):")
for i in range(min(10, len(all_chars))):
    sent_idx, pos_idx = char_to_sentence_map[i]
    print(f"Char {i}: '{all_chars[i]}', Sentence {sent_idx}, Position {pos_idx}")

# Prepare the data structure used to train the ECD model.
# Variable names follow the Wadsworth BCI dataset documentation.
# In the dataset, StimulusCode indicates which row/column was flashed:
# 1-6 for columns, 7-12 for rows.

# First, map the extracted P300 responses to characters for the ECD training.
# This is a simplified version, since we don't have access to the exact StimulusCode values.

# The mapping is arbitrary and for demonstration only: the i-th P300 response is
# paired with the i-th character of the generated sentences.
# In a real implementation, we'd use the actual target characters from the dataset.
import numpy as np

# Positions of the windows labelled as P300 (train_labels == 1)
p300_indices = np.flatnonzero(train_labels == 1)

# Only as many pairs as both sides can supply: one P300 window per sentence character
num_chars_to_map = min(len(p300_indices), len(all_chars))
print(f"\nMapping {num_chars_to_map} P300 responses to characters in sentences")

# Pair the k-th P300 window with the k-th character of the flattened stream and
# remember which sentence/position that character came from.
ecd_train_data = [
    {
        'feature': train_features[window_idx],
        'target_char': char,
        'sentence_idx': origin[0],
        'position_idx': origin[1],
    }
    for window_idx, char, origin in zip(
        p300_indices[:num_chars_to_map], all_chars, char_to_sentence_map
    )
]

print(f"Created ECD training dataset with {len(ecd_train_data)} samples")

# Context samples for the NLP model: for every character that has at least one
# preceding character, store the preceding text as context and the character
# itself as target. Position 0 is skipped because its context would be empty.
sentence_contexts = [
    {
        'context': text[:pos],
        'target': text[pos],
        'sentence_idx': sent_no,
        'position_idx': pos,
    }
    for sent_no, text in enumerate(sentences)
    for pos in range(1, len(text))
]

print(f"Created {len(sentence_contexts)} context samples for NLP model training")

# Finally, create the combined dataset for the Decision Fusion Model

# Index the NLP contexts by (sentence_idx, position_idx) once, so each ECD sample
# is matched with a dictionary lookup instead of a scan over every context.
# setdefault keeps the first context seen for a key.
context_by_position = {}
for nlp_context in sentence_contexts:
    key = (nlp_context['sentence_idx'], nlp_context['position_idx'])
    context_by_position.setdefault(key, nlp_context)

dfm_train_data = []
for ecd_sample in ecd_train_data:
    sent_idx = ecd_sample['sentence_idx']
    pos_idx = ecd_sample['position_idx']

    # Find matching NLP context
    nlp_sample = context_by_position.get((sent_idx, pos_idx))
    if nlp_sample is None:
        # No context exists for this position (e.g. the first character of a
        # sentence has no preceding text), so there is nothing to fuse.
        continue

    dfm_train_data.append({
        'ecd_feature': ecd_sample['feature'],
        'nlp_context': nlp_sample['context'],
        'target_char': ecd_sample['target_char'],
        'sentence_idx': sent_idx,
        'position_idx': pos_idx
    })

print(f"Created {len(dfm_train_data)} combined samples for DFM training")

# Show the first entry of each generated dataset (nothing is written to disk here).
# Each section prints its header unconditionally and its details only when the
# corresponding dataset is non-empty.
print("\nExample of ECD training data:")
if ecd_train_data:
    example = ecd_train_data[0]
    print("\n".join([
        f"Target character: '{example['target_char']}'",
        f"From sentence {example['sentence_idx']}: '{sentences[example['sentence_idx']]}'",
        f"At position {example['position_idx']}",
        f"Feature shape: {example['feature'].shape}",
    ]))

print("\nExample of NLP context data:")
if sentence_contexts:
    example = sentence_contexts[0]
    print("\n".join([
        f"Context: '{example['context']}'",
        f"Target: '{example['target']}'",
        f"From sentence {example['sentence_idx']}: '{sentences[example['sentence_idx']]}'",
    ]))

print("\nExample of combined DFM data:")
if dfm_train_data:
    example = dfm_train_data[0]
    print("\n".join([
        f"Context: '{example['nlp_context']}'",
        f"Target: '{example['target_char']}'",
        f"From sentence {example['sentence_idx']}: '{sentences[example['sentence_idx']]}'",
        f"Feature shape: {example['ecd_feature'].shape}",
    ]))

Features tensor shape: (15300, 78, 64)
Generated 50 sentences from 728 characters.
Sentence 0: THEY MAKE FOOD.
Sentence 1: YOU SEE POEMS.
Sentence 2: THEY PLAY WATER.
Sentence 3: SHE SEES EMAILS
Sentence 4: HE EATS GAMES.
Character to sentence mapping (first 10 characters):
Char 0: 'T', Sentence 0, Position 0
Char 1: 'H', Sentence 0, Position 1
Char 2: 'E', Sentence 0, Position 2
Char 3: 'Y', Sentence 0, Position 3
Char 4: ' ', Sentence 0, Position 4
Char 5: 'M', Sentence 0, Position 5
Char 6: 'A', Sentence 0, Position 6
Char 7: 'K', Sentence 0, Position 7
Char 8: 'E', Sentence 0, Position 8
Char 9: ' ', Sentence 0, Position 9

Mapping 728 P300 responses to characters in sentences
Created ECD training dataset with 728 samples
Created 678 context samples for NLP model training
Created 678 combined samples for DFM training

Example of ECD training data:
Target character: 'T'
From sentence 0: 'THEY MAKE FOOD.'
At position 0
Feature shape: (78, 64)

Example of NLP context data:
Context: 'T