# Create EEG Sentences with Next-Char Probabilities


In [1]:
import random
import torch.nn.functional as F
from bundle.DataCraft import * 
from bundle.ApiCraft  import * 


# --- Configuration ---
model_path = "../../model/api/char_predictor.pth"           # Input: Pretrained NLP model path
characters_eeg_filepath = "../../data/characters_eeg.pkl"
sentences_filepath = "../../data/sentences.txt"             # Input: Sentences file
output_filepath = "../../data/sentences_eeg.pkl"            # Output: Combined data

# NL Predictor Model Definition 


In [2]:
# Load characters from file
all_chars = load_characters()
char2idx = {ch: idx for idx, ch in enumerate(all_chars)}
idx2char = {idx: ch for ch, idx in char2idx.items()}
vocab_size = len(all_chars)

print("Vocabulary size:", vocab_size)
print("Characters in vocabulary:", all_chars)


def predict_next_chars(model, sentence, top_k=None):
 
    model.eval()
    with torch.no_grad():
        # Convert to indices, skipping unknown characters
        input_seq = [char2idx[ch] for ch in sentence if ch in char2idx]
        if not input_seq:
            print("Warning: Input sentence contains no known characters. Using empty sequence.")
            # Return uniform distribution if no valid input
            result = {ch: 1.0/vocab_size for ch in all_chars}
            return result

        input_seq = torch.tensor(input_seq).unsqueeze(0)
        output = model(input_seq)
        probs = F.softmax(output, dim=-1).squeeze(0)
        
        # If top_k is specified, get only top k predictions
        if top_k is not None:
            top_k = min(top_k, vocab_size)  # Ensure top_k doesn't exceed vocab size
            top_probs, top_indices = torch.topk(probs, top_k)
            
            result = {}
            for prob, idx in zip(top_probs, top_indices):
                result[idx2char[idx.item()]] = round(prob.item(), 4)
        else:
            # Return all probabilities
            result = {}
            for idx, prob in enumerate(probs):
                result[idx2char[idx]] = round(prob.item(), 4)
        
        return result

Loaded 37 characters from ../../data/characters.txt (including added space)
Vocabulary size: 37
Characters in vocabulary: ABCDEFGHIJKLMNOPQRSTUVWXYZ123456789_ 


# Add next-char prediction probabilities

In [3]:
# Load Sample Groups for characters
sample_groups_data = load_characters_eeg(characters_eeg_filepath)
if not sample_groups_data:
    exit()

# Load NLP Model
nlp_model = load_nlp_model(vocab_size,model_path)
if not nlp_model:
    exit()

# Read Sentences
sentences = []
print(f"Reading sentences from: {sentences_filepath}")
try:
    with open(sentences_filepath, "r") as f:
        sentences = [line.strip() for line in f if line.strip()] # Read non-empty lines
    print(f"Read {len(sentences)} sentences.")
except FileNotFoundError:
    print(f"Error: Sentences file not found at {sentences_filepath}")
    exit()
except Exception as e:
    print(f"Error reading sentences file: {e}")
    exit()

if not sentences:
    print("No sentences found in the file. Exiting.")
    exit()

# Process Sentences and Combine Data
processed_data = []
print("\nProcessing sentences to combine EEG chunks and probabilities...")

for sentence_idx, sentence in enumerate(sentences):
    print(f"Processing sentence {sentence_idx + 1}/{len(sentences)}:{sentence}")
    for char_idx, char in enumerate(sentence):
        prefix = sentence[:char_idx]
        
        if char==' ':
            continue
        
        # Check if character is valid and has data in set1
        if char in sample_groups_data and "set1" in sample_groups_data[char] and sample_groups_data[char]["set1"]:
            
            # Randomly select a chunk from set1 for this character
            available_chunks = sample_groups_data[char]["set1"]
            selected_chunk = random.choice(available_chunks)
            
            # Predict next character probabilities based on the prefix
            if len(prefix)!=0:
                next_char_probs = predict_next_chars(nlp_model, prefix)
            else:
                next_char_probs= {ch: 0 for ch in all_chars}  # Uniform distribution if no prefix
            
            # Store the combined information
            char_data = {
                	"character": char,
                    "sentence": sentence,
                	"char_idx_in_sentence": char_idx,
                	"eeg_chunk": selected_chunk, # List of numpy arrays
                	"next_char_probabilities": next_char_probs
            }
            processed_data.append(char_data)
            
        else:
            # Handle cases where character is not in groups or set1 is missing/empty
            # Also handles characters not in our defined vocabulary (like punctuation if any)
            if char not in sample_groups_data:
                 print(f"  - Warning: Character {char} not found in sample groups. Skipping.")
            elif "set1" not in sample_groups_data[char] or not sample_groups_data[char]["set1"]:
                 print(f"  - Warning: No chunks found in set1 for character {char}. Skipping.")
            # else: # Character might be punctuation or not in NLP vocab
            #     print(f"  - Info: Skipping character 	"{char}	" (possibly not in vocab or no EEG data).")
            pass # Silently skip characters without EEG data or not in vocab



processed_data = convert_probabilities_to_78x2(processed_data)

print(f"\nFinished processing. Collected data for {len(processed_data)} character instances.")

Attempting to load sample groups from: ../../data/characters_eeg.pkl
Successfully loaded sample groups dictionary.
Model loaded from ../../model/api/char_predictor.pth.
Reading sentences from: ../../data/sentences.txt
Read 800 sentences.

Processing sentences to combine EEG chunks and probabilities...
Processing sentence 1/800:THE QUICK DOG JUMPED OVER
Processing sentence 2/800:BLUE WATER FLOWED DOWN GENTLY
Processing sentence 3/800:EIGHT CHILDREN RAN THROUGH MEADOW
Processing sentence 4/800:FIND THE DARK PATH AHEAD
Processing sentence 5/800:FOX JUMPED OVER THE LOG
Processing sentence 6/800:GREEN VINE GREW UP WALL
Processing sentence 7/800:WARM FIRE GLOWED IN HOME
Processing sentence 8/800:THE WIND BLEW QUITE HARD
Processing sentence 9/800:BRIGHT LIGHT CAME FROM DOOR
Processing sentence 10/800:THE KING HELD GOLD RING
Processing sentence 11/800:COLD RAIN FELL ON GROUND
Processing sentence 12/800:THE CAT HID UNDER BED
Processing sentence 13/800:WALK QUIETLY INTO DARK ROOM
Processing sent

# Save Processed Data

In [4]:
print(f"\nSaving combined data to: {output_filepath}")
try:
    with open(output_filepath, "wb") as f:
        pickle.dump(processed_data, f)
    print(f"Successfully saved combined data to {output_filepath}.")
except Exception as e:
    print(f"Error saving combined data: {e}")


Saving combined data to: ../../data/sentences_eeg.pkl
Successfully saved combined data to ../../data/sentences_eeg.pkl.


# Load Processed Data

In [5]:
final_data = load_sentence_eeg_prob_data(output_filepath)

if final_data:
    # Print example of converted item
    print(f"Loaded {len(final_data)} items.")
    if final_data:
        print("Example of first item:", final_data[1]["character"])
        print("Example of first item:", final_data[1]["char_idx_in_sentence"])
        print("Example of first item:", final_data[1]["sentence"])
        print("Example of first item:", final_data[1]["eeg_chunk"][0:1])
        print("Example of first item:", final_data[1]["next_char_probabilities"])
        print("Example of first item:", final_data[1]["prob_chunk"])
else:
    print("Failed to load data. Please check the file path.")

Attempting to load processed data from: ../../data/sentences_eeg.pkl
Successfully loaded processed data.
Loaded 16270 items.
Example of first item: H
Example of first item: 1
Example of first item: THE QUICK DOG JUMPED OVER
Example of first item: [array([[ 0.71168303,  0.93274367,  1.0181258 , ...,  0.53064907,
         1.3929416 ,  1.2498274 ],
       [ 0.83918434,  1.0260444 ,  1.0319407 , ...,  0.79336935,
         1.4869034 ,  1.4126298 ],
       [ 0.96911293,  1.103266  ,  1.0132477 , ...,  1.0485287 ,
         1.6188169 ,  1.5175755 ],
       ...,
       [-0.5430504 , -0.5097524 , -0.69078404, ..., -1.1742845 ,
        -1.5947932 , -1.685561  ],
       [-0.6588568 , -0.6368154 , -0.6984955 , ..., -1.3331498 ,
        -1.7142816 , -1.9098184 ],
       [-0.77863264, -0.8102392 , -0.7087084 , ..., -1.5143999 ,
        -1.823878  , -2.159508  ]], dtype=float32)]
Example of first item: {'A': 0.0045, 'B': 0.0002, 'C': 0.0006, 'D': 0.0, 'E': 0.0017, 'F': 0.0, 'G': 0.0, 'H': 0.9564, 'I':