# Create EEG Sentences with Next-Char Probabilities


In [1]:
import random
import torch.nn.functional as F
from bundle.DataCraft import * 
from bundle.ApiCraft  import * 


# --- Configuration ---
model_path = "../../model/api/char_predictor.pth"           # Input: Pretrained NLP model path
characters_eeg_filepath = "../../data/characters_eeg.pkl"
sentences_filepath = "../../data/sentences.txt"             # Input: Sentences file
output_filepath = "../../data/sentences_eeg.pkl"            # Output: Combined data

# NL Predictor Model Definition 


In [2]:
# Load characters from file
all_chars = load_characters()
char2idx = {ch: idx for idx, ch in enumerate(all_chars)}
idx2char = {idx: ch for ch, idx in char2idx.items()}
vocab_size = len(all_chars)

print("Vocabulary size:", vocab_size)
print("Characters in vocabulary:", all_chars)


def predict_next_chars(model, sentence, top_k=None):
 
    model.eval()
    with torch.no_grad():
        # Convert to indices, skipping unknown characters
        input_seq = [char2idx[ch] for ch in sentence if ch in char2idx]
        if not input_seq:
            print("Warning: Input sentence contains no known characters. Using empty sequence.")
            # Return uniform distribution if no valid input
            result = {ch: 1.0/vocab_size for ch in all_chars}
            return result

        input_seq = torch.tensor(input_seq).unsqueeze(0)
        output = model(input_seq)
        probs = F.softmax(output, dim=-1).squeeze(0)
        
        # If top_k is specified, get only top k predictions
        if top_k is not None:
            top_k = min(top_k, vocab_size)  # Ensure top_k doesn't exceed vocab size
            top_probs, top_indices = torch.topk(probs, top_k)
            
            result = {}
            for prob, idx in zip(top_probs, top_indices):
                result[idx2char[idx.item()]] = round(prob.item(), 4)
        else:
            # Return all probabilities
            result = {}
            for idx, prob in enumerate(probs):
                result[idx2char[idx]] = round(prob.item(), 4)
        
        return result

Loaded 37 characters from ../../data/characters.txt (including added space)
Vocabulary size: 37
Characters in vocabulary: ABCDEFGHIJKLMNOPQRSTUVWXYZ123456789_ 


# Add next-char prediction probabilities

In [3]:
# Load Sample Groups for characters
sample_groups_data = load_characters_eeg(characters_eeg_filepath)
if not sample_groups_data:
    exit()

# Load NLP Model
nlp_model = load_nlp_model(vocab_size,model_path)
if not nlp_model:
    exit()
    
all_chars = list(load_characters())
char_vector_len = len(all_chars)


# Read Sentences
sentences = []
print(f"Reading sentences from: {sentences_filepath}")
try:
    with open(sentences_filepath, "r") as f:
        sentences = [line.strip() for line in f if line.strip()] # Read non-empty lines
    print(f"Read {len(sentences)} sentences.")
except FileNotFoundError:
    print(f"Error: Sentences file not found at {sentences_filepath}")
    exit()
except Exception as e:
    print(f"Error reading sentences file: {e}")
    exit()

if not sentences:
    print("No sentences found in the file. Exiting.")
    exit()

# Process Sentences and Combine Data
processed_data = []
print("\nProcessing sentences to combine EEG chunks and probabilities...")

for sentence_idx, sentence in enumerate(sentences):
    print(f"Processing sentence {sentence_idx + 1}/{len(sentences)}:{sentence}")
    for char_idx, char in enumerate(sentence):
        prefix = sentence[:char_idx]
        
        if char==' ':
            continue
        
        # Check if character is valid and has data in set1
        if char in sample_groups_data and "set1" in sample_groups_data[char] and sample_groups_data[char]["set1"]:
            
            # Randomly select a chunk from set1 for this character
            available_chunks = sample_groups_data[char]["set1"]
            selected_chunk = random.choice(available_chunks)
            # Predict next character probabilities based on the prefix
            if len(prefix)!=0:
                next_char_probs = predict_next_chars(nlp_model, prefix)
            else:
                next_char_probs= {ch: 0 for ch in all_chars}  # Uniform distribution if no prefix
         
         
            prob_vector = np.array([next_char_probs.get(c, 0.0) for c in all_chars])  # shape (37,)
            # Repeat and truncate to get 78 values
            repeats = (78 + char_vector_len - 1) // char_vector_len  # ceiling division
            prob_vector_repeated = np.tile(prob_vector, repeats)[:78]  # shape (78,)
            # Expand to 78Ã—64 matrix
            prob_matrix = np.tile(prob_vector_repeated[:, np.newaxis], (1, 64))  # shape (78, 64)
            
            combined_chunk = selected_chunk + [prob_matrix] 
            
            # Store the combined information
            char_data = {
                	"character": char,
                    "sentence": sentence,
                	"char_idx_in_sentence": char_idx,
                	"eeg_chunk": combined_chunk, 
                    "prob_matrix_78x64": prob_matrix
            }
            processed_data.append(char_data)
            
        else:
            # Handle cases where character is not in groups or set1 is missing/empty
            # Also handles characters not in our defined vocabulary (like punctuation if any)
            if char not in sample_groups_data:
                 print(f"  - Warning: Character {char} not found in sample groups. Skipping.")
            elif "set1" not in sample_groups_data[char] or not sample_groups_data[char]["set1"]:
                 print(f"  - Warning: No chunks found in set1 for character {char}. Skipping.")
            # else: # Character might be punctuation or not in NLP vocab
            #     print(f"  - Info: Skipping character 	"{char}	" (possibly not in vocab or no EEG data).")
            pass # Silently skip characters without EEG data or not in vocab


for i in range(0, len(processed_data)):
    if len(processed_data[i]["eeg_chunk"]) != 31:
        print(f"Warning: EEG chunk for character '{processed_data[i]['character']}' in sentence '{processed_data[i]['sentence']}' has unexpected length {len(processed_data[i]['eeg_chunk'])}. Expected 31.")
    

print(f"\nFinished processing. Collected data for {len(processed_data)} character instances.")

Attempting to load sample groups from: ../../data/characters_eeg.pkl
Successfully loaded sample groups dictionary.
Model loaded from ../../model/api/char_predictor.pth.
Loaded 37 characters from ../../data/characters.txt (including added space)
Reading sentences from: ../../data/800.txt
Read 200 sentences.

Processing sentences to combine EEG chunks and probabilities...
Processing sentence 1/200:ONE EYE WATCHED THEM MOVE
Processing sentence 2/200:NEW FLAME LIT THE HOLE
Processing sentence 3/200:A THICK FACE LOOKED BACK
Processing sentence 4/200:ORANGE FLAME TORE THROUGH FIELD
Processing sentence 5/200:A THIN LINE GLOWED BRIGHT
Processing sentence 6/200:WIND LIFTED HER THICK HAIR
Processing sentence 7/200:THE CROW FLEW OVER PEAK
Processing sentence 8/200:ALL WENT QUIET AFTER NIGHT
Processing sentence 9/200:OLD MARK FELL OFF FENCE
Processing sentence 10/200:THEY DREW A YELLOW LINE
Processing sentence 11/200:HIDE THE MAP UNDER ROCK
Processing sentence 12/200:THE NEW DAY CAME QUICKLY
Proce

# Save Processed Data

In [4]:
print(f"\nSaving combined data to: {output_filepath}")
try:
    with open(output_filepath, "wb") as f:
        pickle.dump(processed_data, f)
    print(f"Successfully saved combined data to {output_filepath}.")
except Exception as e:
    print(f"Error saving combined data: {e}")


Saving combined data to: ../../data/sentences_eeg_test.pkl
Successfully saved combined data to ../../data/sentences_eeg_test.pkl.


# Load Processed Data

In [5]:
final_data = load_sentence_eeg_prob_data(output_filepath)

if final_data:
    # Print example of converted item
    print(f"Loaded {len(final_data)} items.")
    if final_data:
        print("Example of first item:", final_data[1]["character"])
        print("Example of first item:", final_data[1]["char_idx_in_sentence"])
        print("Example of first item:", final_data[1]["sentence"])
        print("Example of first item:", final_data[1]["eeg_chunk"][0:1])
        print("Example of first item:", final_data[1]["prob_matrix_78x64"].shape)
else:
    print("Failed to load data. Please check the file path.")

Attempting to load processed data from: ../../data/sentences_eeg_test.pkl
Successfully loaded processed data.
Loaded 4269 items.
Example of first item: N
Example of first item: 1
Example of first item: ONE EYE WATCHED THEM MOVE
Example of first item: [array([[-2.234351  , -2.003461  , -2.0083575 , ..., -0.7633322 ,
        -1.218751  , -0.7329582 ],
       [-2.1980574 , -1.9654995 , -1.9032848 , ..., -0.83712494,
        -1.4114181 , -0.90939796],
       [-2.1304502 , -1.8494486 , -1.7174835 , ..., -0.8985499 ,
        -1.5590546 , -1.0670564 ],
       ...,
       [ 0.61637515,  1.8428282 ,  1.5125121 , ..., -1.0767108 ,
        -0.33654448, -0.6102961 ],
       [ 0.6458686 ,  2.0412307 ,  1.8207182 , ..., -0.9355174 ,
        -0.1309225 , -0.43053272],
       [ 0.66025656,  2.1966252 ,  2.0514784 , ..., -0.7522709 ,
         0.14229628, -0.15255098]], dtype=float32)]
Example of first item: (78, 64)
