# Create EEG Sentences with Next-Char Probabilities


In [1]:
import sys
import os
# Add the parent directory to the path to import bundle
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

import random
import torch.nn.functional as F
from bundle.DataCraft import * 
from bundle.ApiCraft  import * 


# --- Configuration ---
# Shared inputs for every run: validation sentences and the trained
# next-character model (both given as relative paths).
sentences_filepath = "../../data/sentences_val.txt"
model_path = "../../model/api/char_predictor.pth"

# Number of probability rows stacked underneath each EEG image
PROB_WINDOW_SIZE = 36

# Processing grid: one input file per (contributor, window size, repetitions)
repetitions_list = [5, 10, 15]
window_sizes = [78, 36]
contributors = ["I", "II"]

# Label/value pairs in the order they are reported below
_grid_axes = (
    ("Contributors", contributors),
    ("Window sizes", window_sizes),
    ("Repetitions", repetitions_list),
)

# The number of files is the product of the three axis lengths
_n_combinations = 1
for _axis_label, _axis_values in _grid_axes:
    _n_combinations *= len(_axis_values)

print(f"Will process {_n_combinations} file combinations")
for _axis_label, _axis_values in _grid_axes:
    print(f"{_axis_label}: {_axis_values}")


Will process 12 file combinations
Contributors: ['I', 'II']
Window sizes: [78, 36]
Repetitions: [5, 10, 15]


# NLP Predictor: Vocabulary and Next-Char Prediction Helper


In [2]:
# Build the character vocabulary and its two lookup tables
all_chars = load_characters()
vocab_size = len(all_chars)

# Forward table: character -> position in the vocabulary
char2idx = dict(zip(all_chars, range(vocab_size)))
# Inverse table, derived from the forward one: position -> character
idx2char = {position: symbol for symbol, position in char2idx.items()}

print("Vocabulary size:", vocab_size)
print("Characters in vocabulary:", all_chars)


def predict_next_chars(model, sentence, top_k=None):
    """Return next-character probabilities for the text typed so far.

    Characters of ``sentence`` that are not keys of ``char2idx`` are dropped;
    the remaining ones are converted to indices and passed to ``model`` as a
    single batch.

    Args:
        model: Network exposing ``eval()`` and callable on an integer index
            tensor of shape ``(1, seq_len)``. Its output is softmaxed over the
            last axis and the leading axis is squeezed; this is assumed to
            leave one score per vocabulary entry (TODO confirm against the
            model definition).
        sentence: Prefix text to condition the prediction on.
        top_k: If given, keep only the ``top_k`` most probable characters
            (capped at ``vocab_size``). ``None`` returns every character.

    Returns:
        dict mapping character -> probability rounded to 4 decimals. If no
        character of ``sentence`` is in the vocabulary, a warning is printed
        and an unrounded uniform distribution over ``all_chars`` is returned
        (``top_k`` is not applied in that case).
    """
    model.eval()
    with torch.no_grad():
        # Convert to indices, skipping unknown characters
        input_seq = [char2idx[ch] for ch in sentence if ch in char2idx]
        if not input_seq:
            # The message now states what is actually returned.
            print("Warning: Input sentence contains no known characters. Returning uniform distribution.")
            return {ch: 1.0/vocab_size for ch in all_chars}

        input_tensor = torch.tensor(input_seq).unsqueeze(0)

        # Feed the input on the same device as the model's weights so the
        # call also works when the model lives on an accelerator.
        first_param = next(iter(model.parameters()), None) if hasattr(model, "parameters") else None
        if first_param is not None:
            input_tensor = input_tensor.to(first_param.device)

        probs = F.softmax(model(input_tensor), dim=-1).squeeze(0)

        if top_k is not None:
            k = min(top_k, vocab_size)  # Ensure k doesn't exceed vocab size
            top_probs, top_indices = torch.topk(probs, k)
            scored = zip(top_indices.tolist(), top_probs.tolist())
        else:
            # Every vocabulary position, in index order
            scored = enumerate(probs.tolist())

        return {idx2char[idx]: round(prob, 4) for idx, prob in scored}

Loaded 37 characters from ../../data/characters.txt (including added space)
Vocabulary size: 37
Characters in vocabulary: ABCDEFGHIJKLMNOPQRSTUVWXYZ123456789_ 


# Add Next-Char Prediction Probabilities

In [3]:
# Build one output pickle per (contributor, window size, repetitions):
# for every non-space character of every sentence, stack a randomly chosen
# EEG image of that character on top of a matrix holding the model's
# next-character probabilities for the text preceding it.

# Load NLP Model
nlp_model = load_nlp_model(vocab_size, model_path)
if not nlp_model:
    # Raise instead of calling exit(): inside a Jupyter kernel exit() requests
    # a kernel shutdown rather than raising, so it does not reliably stop the
    # cell and the code below would run without a model.
    raise RuntimeError(f"Could not load NLP model from {model_path}")

all_chars = list(load_characters())
char_vector_len = len(all_chars)

# Column count of the probability matrix; it is vstack-ed under the EEG
# image, so the images must have this many columns as well.
IMAGE_WIDTH = 64

# Seed for the image selection so that re-running the cell reproduces the
# same files. Set to None to draw a different sample on every run.
RANDOM_SEED = 42
image_rng = random.Random(RANDOM_SEED)

# Read Sentences
print(f"\nReading sentences from: {sentences_filepath}")
try:
    with open(sentences_filepath, "r") as f:
        sentences = [line.strip() for line in f if line.strip()]
except FileNotFoundError:
    print(f"Error: Sentences file not found at {sentences_filepath}")
    raise
except Exception as e:
    print(f"Error reading sentences file: {e}")
    raise
print(f"Read {len(sentences)} sentences.\n")

if not sentences:
    raise ValueError(f"No sentences found in {sentences_filepath}")

# The probabilities depend only on the sentence prefix, not on the EEG file
# being processed, so they are computed once and reused for every combination.
prefix_prob_cache = {}


def _prob_column_for_prefix(prefix):
    """Return the PROB_WINDOW_SIZE-long probability column for ``prefix`` (memoised)."""
    column = prefix_prob_cache.get(prefix)
    if column is None:
        if len(prefix) != 0:
            next_char_probs = predict_next_chars(nlp_model, prefix)
        else:
            # Nothing typed yet: uniform distribution over the vocabulary
            next_char_probs = {ch: 1.0/char_vector_len for ch in all_chars}

        # Probabilities in vocabulary order
        prob_vector = np.array([next_char_probs.get(c, 0.0) for c in all_chars])

        # Repeat and truncate to exactly PROB_WINDOW_SIZE values. When the
        # vocabulary is longer than PROB_WINDOW_SIZE the trailing entries
        # are cut off.
        repeats = (PROB_WINDOW_SIZE + char_vector_len - 1) // char_vector_len
        column = np.tile(prob_vector, repeats)[:PROB_WINDOW_SIZE]
        prefix_prob_cache[prefix] = column
    return column


# Every file combination, in the same order as the original nested loops
combinations = [
    (contributor, window_size, repetitions)
    for contributor in contributors
    for window_size in window_sizes
    for repetitions in repetitions_list
]
total_files = len(combinations)
saved_outputs = []
failed_combinations = []

print(f"\n{'='*70}")
print("Starting processing of all file combinations")
print(f"{'='*70}\n")

file_counter = 0
for file_counter, (contributor, window_size, repetitions) in enumerate(combinations, start=1):
    # Construct file paths
    characters_eeg_filepath = f"../../data/characters_eeg_{contributor}_window{window_size}_{repetitions}_rep.pkl"
    output_filepath = f"../../data/sentences_eeg_val_{contributor}_window{window_size}_{repetitions}_rep.pkl"

    print(f"\n{'#'*70}")
    print(f"Processing file {file_counter}/{total_files}")
    print(f"{'#'*70}")
    print(f"Contributor: {contributor}")
    print(f"Window Size: {window_size}")
    print(f"Repetitions: {repetitions}")
    print(f"Input: {characters_eeg_filepath}")
    print(f"Output: {output_filepath}")
    print(f"Final shape: ({window_size + PROB_WINDOW_SIZE}, {IMAGE_WIDTH})")

    # Load character EEG data.
    # NOTE: pickle can execute arbitrary code while loading; only open files
    # produced by this project.
    try:
        with open(characters_eeg_filepath, "rb") as f:
            characters_eeg_data = pickle.load(f)
        total_images = sum(len(images) for images in characters_eeg_data.values() if images)
        print(f"Loaded {total_images} total images")
    except FileNotFoundError:
        print(f"ERROR: File not found, skipping this combination")
        failed_combinations.append(characters_eeg_filepath)
        continue
    except Exception as e:
        print(f"ERROR loading file: {e}, skipping")
        failed_combinations.append(characters_eeg_filepath)
        continue

    # Process Sentences
    processed_data = []
    chars_without_eeg = set()

    for sentence_idx, sentence in enumerate(sentences):
        if sentence_idx % 50 == 0:
            print(f"  Processing sentence {sentence_idx + 1}/{len(sentences)}...")

        for char_idx, char in enumerate(sentence):
            # Spaces are context only; no sample is produced for them
            if char == ' ':
                continue

            # Characters without EEG images cannot produce a sample;
            # remember them so the omission is reported.
            if not (char in characters_eeg_data and characters_eeg_data[char]):
                chars_without_eeg.add(char)
                continue

            # Randomly select one image
            selected_image = image_rng.choice(characters_eeg_data[char])

            # Probabilities of the next character given the text before it,
            # expanded to a PROB_WINDOW_SIZE x IMAGE_WIDTH matrix
            prob_column = _prob_column_for_prefix(sentence[:char_idx])
            prob_matrix = np.tile(prob_column[:, np.newaxis], (1, IMAGE_WIDTH))

            # Concatenate EEG image with probability matrix
            combined_data = np.vstack([selected_image, prob_matrix])

            # Store the combined information
            processed_data.append({
                "character": char,
                "sentence": sentence,
                "char_idx_in_sentence": char_idx,
                "eeg_with_prob": combined_data
            })

    print(f"  Collected {len(processed_data)} character instances")
    if chars_without_eeg:
        print(f"  Skipped characters with no EEG images: {sorted(chars_without_eeg)}")

    # Save processed data
    try:
        with open(output_filepath, "wb") as f:
            pickle.dump(processed_data, f)
        print(f"  ✓ Successfully saved to {output_filepath}")
        saved_outputs.append(output_filepath)
    except Exception as e:
        print(f"  ERROR saving file: {e}")
        failed_combinations.append(characters_eeg_filepath)

print(f"\n{'='*70}")
# Report what was actually written, not just how many combinations were visited
print(f"COMPLETED: Saved {len(saved_outputs)} of {file_counter} file combinations!")
if failed_combinations:
    print("Combinations that were skipped or failed:")
    for failed_input in failed_combinations:
        print(f"  - {failed_input}")
print(f"{'='*70}")


Model loaded from ../../model/api/char_predictor.pth.
Loaded 37 characters from ../../data/characters.txt (including added space)

Reading sentences from: ../../data/sentences_val.txt
Read 200 sentences.


Starting processing of all file combinations


######################################################################
Processing file 1/12
######################################################################
Contributor: I
Window Size: 78
Repetitions: 5
Input: ../../data/characters_eeg_I_window78_5_rep.pkl
Output: ../../data/sentences_eeg_val_I_window78_5_rep.pkl
Final shape: (114, 64)
Loaded 850 total images
  Processing sentence 1/200...
  Processing sentence 51/200...
  Processing sentence 101/200...
  Processing sentence 151/200...
  Collected 4264 character instances
  ✓ Successfully saved to ../../data/sentences_eeg_val_I_window78_5_rep.pkl

######################################################################
Processing file 2/12
############################################

# Verify One Output File (Example)


In [4]:
# Load one generated file, print a detailed view of its first sample and
# check every sample against the expected (window + probability rows, 64) shape.
example_contributor = "I"
example_window = 78
example_reps = 15

example_filepath = f"../../data/sentences_eeg_val_{example_contributor}_window{example_window}_{example_reps}_rep.pkl"

print(f"Loading example file: {example_filepath}\n")
# NOTE: pickle can execute arbitrary code while loading; only open files
# produced by this project.
try:
    with open(example_filepath, "rb") as f:
        final_data = pickle.load(f)
    print(f"Successfully loaded {len(final_data)} items.")
except FileNotFoundError:
    print(f"Error: File not found at {example_filepath}")
    final_data = None
except Exception as e:
    print(f"Error loading data: {e}")
    final_data = None

if final_data is None:
    print("Failed to load data. Please run the processing cells first.")
elif len(final_data) == 0:
    # A readable but empty file is not a load failure; report it as such.
    print("File loaded but contains no samples; nothing to verify.")
else:
    # EEG rows come first, followed by PROB_WINDOW_SIZE probability rows
    expected_shape = (example_window + PROB_WINDOW_SIZE, 64)

    print(f"\n{'='*70}")
    print("DETAILED EXAMPLE - First Item:")
    print(f"{'='*70}")
    example = final_data[0]
    combined = example['eeg_with_prob']
    print(f"Character: '{example['character']}'")
    print(f"Sentence: {example['sentence']}")
    print(f"Character index in sentence: {example['char_idx_in_sentence']}")
    print(f"\n--- Shape Information ---")
    print(f"Combined data shape: {combined.shape}")
    print(f"Expected shape: {expected_shape}")

    print(f"\n--- Data Preview ---")
    print(f"First 3 rows of EEG data:")
    print(combined[:3, :5])
    print(f"...")
    print(f"First 3 rows of probability section (starting at row {example_window}):")
    print(combined[example_window:example_window+3, :5])

    print(f"\n--- Sample Values ---")
    print(f"EEG value at [0, 0]: {combined[0, 0]:.6f}")
    print(f"Probability value at [{example_window}, 0]: {combined[example_window, 0]:.6f}")
    print(f"Probability value at [{example_window}, 10]: {combined[example_window, 10]:.6f}")

    # Check every sample, not just the first one shown above
    mismatched = [
        item_idx for item_idx, item in enumerate(final_data)
        if item['eeg_with_prob'].shape != expected_shape
    ]

    print(f"\n{'='*70}")
    print(f"Summary Statistics:")
    print(f"  Total samples: {len(final_data)}")
    print(f"  Sample shape: {combined.shape}")
    print(f"  EEG section: rows 0-{example_window-1}")
    print(f"  Probability section: rows {example_window}-{example_window + PROB_WINDOW_SIZE - 1}")
    if mismatched:
        print(f"  WARNING: {len(mismatched)} samples do not have the expected shape (first index: {mismatched[0]})")
    else:
        print(f"  All samples have the expected shape {expected_shape}")
    print(f"{'='*70}")


Loading example file: ../../data/sentences_eeg_val_I_window78_15_rep.pkl

Successfully loaded 4264 items.

DETAILED EXAMPLE - First Item:
Character: 'A'
Sentence: A FINAL WAVE HIT LAND
Character index in sentence: 0

--- Shape Information ---
Combined data shape: (114, 64)
Expected shape: (114, 64)

--- Data Preview ---
First 3 rows of EEG data:
[[-0.33338854 -1.08783925 -0.53964508  0.2766158  -0.27607933]
 [-0.56028312 -1.19927239 -0.76699919  0.10025527 -0.46214241]
 [-0.7340014  -1.22635043 -0.8627038   0.01301341 -0.53187197]]
...
First 3 rows of probability section (starting at row 78):
[[0.02702703 0.02702703 0.02702703 0.02702703 0.02702703]
 [0.02702703 0.02702703 0.02702703 0.02702703 0.02702703]
 [0.02702703 0.02702703 0.02702703 0.02702703 0.02702703]]

--- Sample Values ---
EEG value at [0, 0]: -0.333389
Probability value at [78, 0]: 0.027027
Probability value at [78, 10]: 0.027027

Summary Statistics:
  Total samples: 4264
  Sample shape: (114, 64)
  EEG section: rows 0-7