# Model Training 

In [1]:
from bundle.DataCraft import * 

data = load_sentence_eeg_prob_data()

if data:
    # Print example of converted item
    print(f"Loaded {len(data)} items.")
    if data:
        print("Example of first item:", data[1]["character"])
        print("Example of first item:", data[1]["char_idx_in_sentence"])
        print("Example of first item:", data[1]["sentence"])
        print("Example of first item:", data[1]["eeg_chunk"][0:1])
        print("Example of first item:", data[1]["prob_chunk"])

Attempting to load processed data from: ../../data/sentences_eeg.pkl
Successfully loaded processed data.
Loaded 16270 items.
Example of first item: H
Example of first item: 1
Example of first item: THE QUICK DOG JUMPED OVER
Example of first item: [array([[-1.2835948 , -0.8696827 , -0.63357687, ..., -0.8235423 ,
        -0.7878795 , -0.60256517],
       [-1.289872  , -0.85476553, -0.64955145, ..., -0.6701227 ,
        -0.5584078 , -0.4237045 ],
       [-1.2481786 , -0.7888243 , -0.64355093, ..., -0.5354535 ,
        -0.35739008, -0.2570909 ],
       ...,
       [ 0.6414426 ,  0.82679033,  0.8949905 , ...,  0.7397481 ,
         0.8634642 ,  0.8789248 ],
       [ 0.5591661 ,  0.81037956,  0.9656031 , ...,  0.7616999 ,
         0.9122697 ,  0.9680821 ],
       [ 0.43155485,  0.71449965,  0.9222388 , ...,  0.7240543 ,
         0.8748701 ,  1.0227684 ]], dtype=float32)]
Example of first item: [[1.490e-02 1.490e-02]
 [1.490e-02 1.490e-02]
 [4.000e-04 4.000e-04]
 [4.000e-04 4.000e-04]
 [4.000e

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Dense, concatenate, Dropout
from tensorflow.keras.models import Model


def create_cnn_model(eeg_input_shape, prob_input_shape, num_classes):
    """
    Creates a Convolutional Neural Network (CNN) model for character prediction
    using EEG and probability chunk data.

    The model consists of two branches:
    1. EEG Branch: Processes EEG time-series data using 1D convolutional layers.
    2. Probability Branch: Processes probability chunk data using 1D convolutional layers.

    The outputs of both branches are concatenated and fed into dense layers
    for final character classification.

    Args:
        eeg_input_shape (tuple): Shape of the EEG input data (e.g., (78, 64)).
        prob_input_shape (tuple): Shape of the probability input data (e.g., (78, 2)).
        num_classes (int): Number of unique characters to classify (e.g., 26 for A-Z).

    Returns:
        tf.keras.Model: The compiled CNN model.
    """
    # EEG branch: Designed to capture temporal features from multi-channel EEG data
    eeg_input = Input(shape=eeg_input_shape, name='eeg_input')
    # Conv1D layer with 64 filters, kernel size 3, ReLU activation, and same padding
    eeg_conv1 = Conv1D(filters=64, kernel_size=3, activation='relu', padding='same')(eeg_input)
    # MaxPooling1D to reduce dimensionality and provide translational invariance
    eeg_pool1 = MaxPooling1D(pool_size=2)(eeg_conv1)
    # Dropout layer to prevent overfitting by randomly setting a fraction of input units to 0 at each update during training
    eeg_dropout1 = Dropout(0.25)(eeg_pool1) 
    eeg_conv2 = Conv1D(filters=128, kernel_size=3, activation='relu', padding='same')(eeg_dropout1)
    eeg_pool2 = MaxPooling1D(pool_size=2)(eeg_conv2)
    eeg_dropout2 = Dropout(0.25)(eeg_pool2) 
    # Flatten the output to feed into dense layers
    eeg_flatten = Flatten()(eeg_dropout2)

    # Probability branch: Processes the probability chunk data
    prob_input = Input(shape=prob_input_shape, name='prob_input')
    prob_conv1 = Conv1D(filters=32, kernel_size=3, activation='relu', padding='same')(prob_input)
    prob_pool1 = MaxPooling1D(pool_size=2)(prob_conv1)
    prob_dropout1 = Dropout(0.25)(prob_pool1) 
    prob_flatten = Flatten()(prob_dropout1)

    # Concatenate branches: Merges the features extracted from both EEG and probability data
    merged = concatenate([eeg_flatten, prob_flatten])

    # Dense layers for classification
    dense1 = Dense(256, activation='relu')(merged)
    # Another dropout layer before the final output to further prevent overfitting
    dropout_dense = Dropout(0.5)(dense1) 
    # Output layer with softmax activation for multi-class classification
    output = Dense(num_classes, activation='softmax')(dropout_dense)

    # Create the model with two inputs and one output
    model = Model(inputs=[eeg_input, prob_input], outputs=output)
    return model

def preprocess_data(data):
    """
    Preprocesses the raw data loaded from load_sentence_eeg_prob_data.

    Args:
        data (list): A list of dictionaries, each containing \'character\', \'eeg_chunk\', and \'prob_chunk\'.

    Returns:
        tuple: A tuple containing:
            - eeg_chunks (np.array): Preprocessed EEG data.
            - prob_chunks (np.array): Preprocessed probability data.
            - one_hot_characters (np.array): One-hot encoded character labels.
            - classes (list): List of unique characters corresponding to the one-hot encoding.
    """
    eeg_chunks = []
    prob_chunks = []
    characters = []

    for item in data:
        # Convert eeg_chunk to numpy array if it's a list
        if isinstance(item["eeg_chunk"], list):
            eeg_chunks.append(np.array(item["eeg_chunk"])) # Convert inner list to numpy array
        else:
            eeg_chunks.append(item["eeg_chunk"])
        prob_chunks.append(item["prob_chunk"])
        characters.append(item["character"])

    eeg_chunks = np.array(eeg_chunks)
    prob_chunks = np.array(prob_chunks)

    # Normalize EEG data: Standard scaling is applied to EEG data to ensure zero mean and unit variance.
    # This helps in stabilizing the training process and improving model performance.
    eeg_chunks = (eeg_chunks - np.mean(eeg_chunks)) / np.std(eeg_chunks)

    # One-hot encode characters: Converts categorical character labels into a numerical format
    # that can be used by machine learning algorithms. Each character becomes a binary vector.
    label_binarizer = LabelBinarizer()
    one_hot_characters = label_binarizer.fit_transform(characters)

    return eeg_chunks, prob_chunks, one_hot_characters, label_binarizer.classes_

print("Starting data loading and preprocessing...")
# 1. Load data: This step loads the actual dataset from the provided pickle file.
data = load_sentence_eeg_prob_data()

if data is None:
    print("Failed to load data. Exiting.")
else:
    # 2. Preprocess data: Transforms raw data into a format suitable for model training.
    eeg_chunks, prob_chunks, one_hot_characters, classes = preprocess_data(data)
    print("Data preprocessing complete.")

    # 3. Split data into training and validation sets: Divides the dataset into two subsets
    # to train the model on one and evaluate its performance on unseen data with the other.
    eeg_train, eeg_val, prob_train, prob_val, char_train, char_val = train_test_split(
        eeg_chunks, prob_chunks, one_hot_characters, test_size=0.2, random_state=42
    )

    print(f"EEG train shape: {eeg_train.shape}")
    print(f"Prob train shape: {prob_train.shape}")
    print(f"Character train shape: {char_train.shape}")

    # 4. Create and compile the model: Initializes the CNN model and configures it for training.
    # \'adam\' optimizer is chosen for its efficiency, and \'categorical_crossentropy\' is used
    # as the loss function for multi-class classification.
    eeg_input_shape = eeg_train.shape[1:]
    prob_input_shape = prob_train.shape[1:]
    num_classes = char_train.shape[1]

    model = create_cnn_model(eeg_input_shape, prob_input_shape, num_classes)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    print("Model summary:")
    model.summary()

    # 5. Train the model: The core training loop where the model learns from the training data.
    # \'epochs\' determines the number of complete passes through the training dataset.
    # \'batch_size\' defines the number of samples per gradient update.
    # \'validation_data\' is used to monitor the model\'s performance on unseen data during training.
    print("\nStarting model training...")
    history = model.fit(
        {'eeg_input': eeg_train, 'prob_input': prob_train},
        char_train,
        epochs=10,  # Number of epochs can be adjusted for better convergence
        batch_size=32, # Batch size can be adjusted based on memory and performance
        validation_data=(
            {'eeg_input': eeg_val, 'prob_input': prob_val},
            char_val
        )
    )

    print("\nModel training complete.")
    print("Training History:")
    for key, value in history.history.items():
        print(f"  {key}: {value[-1]:.4f}")




Starting data loading and preprocessing...
Attempting to load processed data from: ../../data/sentences_eeg.pkl
Successfully loaded processed data.


MemoryError: Unable to allocate 9.08 GiB for an array with shape (16270, 30, 78, 64) and data type float32