In [2]:
!pip3 install fastapi pandas torch transformers uvicorn google-cloud-storage scikit-learn


Collecting fastapi
  Obtaining dependency information for fastapi from https://files.pythonhosted.org/packages/4d/d2/3ad038a2365fefbac19d9a046cab7ce45f4c7bfa81d877cbece9707de9ce/fastapi-0.103.2-py3-none-any.whl.metadata
  Downloading fastapi-0.103.2-py3-none-any.whl.metadata (24 kB)
Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/1a/d1/3bba59606141ae808017f6fde91453882f931957f125009417b87a281067/transformers-4.34.0-py3-none-any.whl.metadata
  Downloading transformers-4.34.0-py3-none-any.whl.metadata (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.5/121.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn
  Obtaining dependency information for uvicorn from https://files.pythonhosted.org/packages/79/96/b0882a1c3f7ef3dd86879e041212ae5b62b4bd352320889231cc735a8e8f/uvicorn-0.23.2-py3-none-any.whl.metadata
  Using cached uvicorn-0.23.2-py3-none-any.whl.metadata (6.2 kB)
Co

In [3]:
from fastapi import FastAPI, UploadFile, HTTPException
import pandas as pd
import torch
import json
import io
from transformers import BartTokenizer


  from .autonotebook import tqdm as notebook_tqdm


In [35]:
# Load the new JSON files and inspect their structure

with open("./datasets/input_old_model/us-central1_eeg-test_dodadqada_dodadqada_saved_data_input_embeddings_1.json", 'r') as file:
    input_embeddings_data = json.load(file)

with open("./datasets/input_old_model/us-central1_eeg-test_dodadqada_dodadqada_saved_data_input_mask_invert_0.json", 'r') as file:
    input_mask_invert_data = json.load(file)

with open("./datasets/input_old_model/us-central1_eeg-test_dodadqada_dodadqada_saved_data_input_masks_1.json", 'r') as file:
    input_masks_data = json.load(file)

input_embeddings_data, input_mask_invert_data, input_masks_data

([[[-0.348483444134148,
    -0.6535227600847071,
    -0.5522270146668123,
    -0.7520958423500038,
    -0.4815909692807001,
    -0.7724416064238558,
    -1.0537253001915838,
    -0.9287407371292281,
    -0.43562928763401515,
    -0.16534712290061296,
    0.060918176455104917,
    -1.150487533572633,
    -0.2155153406165425,
    -0.44945583647657045,
    0.11818200755878136,
    0.5500977926908195,
    -0.6170510652147195,
    -0.16228366901592398,
    0.7847533917202186,
    -0.3774539209459981,
    0.7662849227609717,
    0.4468961671413056,
    0.17597310880916495,
    -0.4669668268018468,
    -0.2911148278427539,
    0.46897271816456454,
    0.5462435132100726,
    0.39009304051797455,
    -0.023314622901409673,
    -0.3977621821145195,
    -0.5250592252789202,
    0.5690815881963498,
    0.40861724627919904,
    0.16061443490425106,
    0.19661451680550615,
    -0.6649594741064728,
    -0.10302923784162257,
    0.5028794850315504,
    0.5160499320908277,
    0.15087020868653347,
  

In [4]:
import torch.nn as nn
import torch.nn.functional as F
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig

class BrainTranslator(nn.Module):
    def __init__(self, pretrained_layers, in_feature=840, decoder_embedding_size=1024, additional_encoder_nhead=8, additional_encoder_dim_feedforward=2048):
        super(BrainTranslator, self).__init__()
        
        self.pretrained = pretrained_layers
        self.additional_encoder_layer = nn.TransformerEncoderLayer(d_model=in_feature, nhead=additional_encoder_nhead, dim_feedforward=additional_encoder_dim_feedforward, batch_first=True)
        self.additional_encoder = nn.TransformerEncoder(self.additional_encoder_layer, num_layers=6)
        self.fc1 = nn.Linear(in_feature, decoder_embedding_size)

    def forward(self, input_embeddings_batch, input_masks_batch, input_masks_invert):
        encoded_embedding = self.additional_encoder(input_embeddings_batch, src_key_padding_mask=input_masks_invert)
        encoded_embedding = F.relu(self.fc1(encoded_embedding))
        out = self.pretrained(inputs_embeds=encoded_embedding, attention_mask=input_masks_batch, return_dict=True)
        
        return out


In [7]:
from transformers import BartForConditionalGeneration

_MODEL = None

def get_model():
    global _MODEL

    if _MODEL is None:
        pretrained_bart = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
        checkpoint_path = '/Users/michaelholborn/Documents/SoftwareLocal/monotropism/thoughtx/task1_task2_taskNRv2_finetune_BrainTranslator_skipstep1_b1_20_30_5e-05_5e-07_unique_sent.pt'  # Change to the path of your model
        _MODEL = BrainTranslator(pretrained_bart)
        model_weights = torch.load(checkpoint_path, map_location=torch.device('cpu'))
        _MODEL.load_state_dict(model_weights)
        _MODEL.eval()

    return _MODEL


In [32]:
import numpy as np
from sklearn.preprocessing import StandardScaler

def preprocess_eeg_data_for_inference(raw_eeg_data: np.ndarray, segment: bool = False, segment_length: int = 128) -> np.ndarray:
    eeg_data_df = pd.DataFrame(raw_eeg_data)
    eeg_data_filled = eeg_data_df.fillna(eeg_data_df.mean())
    scaler = StandardScaler()
    normalized_data = scaler.fit_transform(eeg_data_filled)
    if segment:
        segmented_data = segment_eeg_data(normalized_data, segment_length)
        return segmented_data
    print("Segment Data: ",segment)
    print("Normalised Data: ",normalized_data)

    return normalized_data

def segment_eeg_data(eeg_data: np.ndarray, segment_length: int = 128) -> np.ndarray:
    num_segments = eeg_data.shape[1] // segment_length
    segmented_data = []
    for i in range(num_segments):
        start_idx = i * segment_length
        end_idx = (i + 1) * segment_length
        segment = eeg_data[:, start_idx:end_idx]
        segmented_data.append(segment)
    return np.array(segmented_data)




In [31]:
# Load data
with open("datasets/results.json", 'r') as file:
    data = json.load(file)

raw_eeg_data = pd.DataFrame(data)

In [20]:

eeg_tensor = preprocess_eeg_data_for_inference(raw_eeg_data)



Segment Data:  False
Normalised Data:  [[-0.12980619 -0.25275585 -0.15944155 ...  0.92220916  0.12201108
  -0.43434176]
 [ 0.02935555 -0.49875623 -0.50513756 ...  1.62593771  0.4048283
  -0.4977017 ]
 [-0.68281424 -1.04184985 -0.89878684 ...  1.64638032 -0.21750464
  -0.85330254]
 ...
 [-0.5933969  -0.83465343 -0.66745245 ...  1.21945275 -0.01502839
  -0.20512466]
 [ 0.20163897 -0.38502061 -0.3968381  ...  1.36628486 -0.14704242
  -0.5807963 ]
 [-0.55645603 -0.13538309  0.14307366 ...  0.37879432  0.09646755
  -0.3338418 ]]


In [24]:
model =get_model()
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')



In [26]:

# def generate_text_from_eeg(input_sample: dict, model, tokenizer, device="cpu") -> str:
#     """
#     Generate text from preprocessed EEG data using a trained model.
    
#     Parameters:
#     - input_sample: The prepared input sample.
#     - model: The trained EEG-to-text model.
#     - tokenizer: The BART tokenizer.
#     - device: The device to run the model on (e.g., "cpu", "cuda").
    
#     Returns:
#     - Generated text.
#     """
    
#     # Move the sample and model to the specified device
#     input_sample["sent_level_EEG"] = input_sample["sent_level_EEG"].to(device)
#     input_sample["target_ids"] = input_sample["target_ids"].to(device)
#     model = model.to(device)
    
#     # Set the model to evaluation mode
#     model.eval()
    
#     print("input_sample: ",input_sample)
    
#     # Perform inference
#     with torch.no_grad():
#         outputs = model(input_ids=None, encoder_outputs=(input_sample["sent_level_EEG"], None), decoder_input_ids=input_sample["target_ids"])
    
#     # Extract the generated token IDs from the model's outputs
#     generated_ids = outputs.logits.argmax(dim=-1)
    
#     # Decode the token IDs to text
#     generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    
#     return generated_text

In [28]:
def generate_text_from_eeg(input_sample: dict, model, tokenizer, device="cpu") -> str:
    """
    Generate text from preprocessed EEG data using a trained model.
    
    Parameters:
    - input_sample: The prepared input sample.
    - model: The trained EEG-to-text model.
    - tokenizer: The BART tokenizer.
    - device: The device to run the model on (e.g., "cpu", "cuda").
    
    Returns:
    - Generated text.
    """
    
    # Move the sample and model to the specified device
    input_sample["sent_level_EEG"] = input_sample["sent_level_EEG"].to(device)
    input_sample["target_ids"] = input_sample["target_ids"].to(device)
    model = model.to(device)
    
    # Set the model to evaluation mode
    model.eval()
    
    # Perform inference
    with torch.no_grad():
        outputs = model(input_sample["sent_level_EEG"], input_sample["target_ids"], None)  # Adjusted this line to match the model's forward method
    
    # Extract the generated token IDs from the model's outputs
    generated_ids = outputs.logits.argmax(dim=-1)
    
    # Decode the token IDs to text
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    
    return generated_text


### Step 1: Ensure input_sample is structured correctly
Before calling generate_text_from_eeg, the input_sample should be structured as a dictionary with keys sent_level_EEG and target_ids. We need to check if this is the case.

In [36]:
import torch

# Convert loaded data to PyTorch tensors
input_embeddings_tensor = torch.tensor(input_embeddings_data)
input_masks_tensor = torch.tensor(input_masks_data)
input_mask_invert_tensor = torch.tensor(input_mask_invert_data)

input_embeddings_tensor.shape, input_masks_tensor.shape, input_mask_invert_tensor.shape

(torch.Size([1, 56, 840]), torch.Size([1, 56]), torch.Size([1, 56]))

In [47]:
input_embeddings_tensor

tensor([[[-0.3485, -0.6535, -0.5522,  ...,  1.0953,  1.0049, -1.5333],
         [-0.4472, -0.3577, -0.2128,  ...,  1.1794,  1.0307, -1.3763],
         [ 1.0523,  0.4421,  0.1681,  ...,  0.2377,  0.3593, -1.8879],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]]])

In [48]:
input_masks_tensor

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]])

TypeError: BrainTranslator.forward() got an unexpected keyword argument 'decoder_input_ids'

In [37]:
# Model-related imports and definitions
from transformers import BartTokenizer, BartForConditionalGeneration
import torch.nn as nn

# Define the BrainTranslator model as provided in the user's code
class BrainTranslator(nn.Module):
    def __init__(self, pretrained_layers, in_feature=840, decoder_embedding_size=1024, additional_encoder_nhead=8, additional_encoder_dim_feedforward=2048):
        super(BrainTranslator, self).__init__()
        
        self.pretrained = pretrained_layers
        self.additional_encoder_layer = nn.TransformerEncoderLayer(d_model=in_feature, nhead=additional_encoder_nhead, dim_feedforward=additional_encoder_dim_feedforward, batch_first=True)
        self.additional_encoder = nn.TransformerEncoder(self.additional_encoder_layer, num_layers=6)
        self.fc1 = nn.Linear(in_feature, decoder_embedding_size)

    def forward(self, input_embeddings_batch, input_masks_batch, input_masks_invert):
        encoded_embedding = self.additional_encoder(input_embeddings_batch, src_key_padding_mask=input_masks_invert)
        encoded_embedding = F.relu(self.fc1(encoded_embedding))
        out = self.pretrained(inputs_embeds=encoded_embedding, attention_mask=input_masks_batch, return_dict=True)
        
        return out

# Initialize model
pretrained_bart = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
model = BrainTranslator(pretrained_bart)

# Attempt a forward pass
with torch.no_grad():
    try:
        outputs = model(input_embeddings_tensor, input_masks_tensor, input_mask_invert_tensor)
        result = "Forward pass successful!"
    except Exception as e:
        result = str(e)

result


'If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.'

In [42]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch.nn as nn
import torch.nn.functional as F

# Define the BrainTranslator model
class BrainTranslator(nn.Module):
    def __init__(self, pretrained_layers, in_feature=840, decoder_embedding_size=1024, additional_encoder_nhead=8, additional_encoder_dim_feedforward=2048):
        super(BrainTranslator, self).__init__()
        
        self.pretrained = pretrained_layers
        self.additional_encoder_layer = nn.TransformerEncoderLayer(d_model=in_feature, nhead=additional_encoder_nhead, dim_feedforward=additional_encoder_dim_feedforward, batch_first=True)
        self.additional_encoder = nn.TransformerEncoder(self.additional_encoder_layer, num_layers=6)
        self.fc1 = nn.Linear(in_feature, decoder_embedding_size)

    def forward(self, input_embeddings_batch, input_masks_batch, input_mask_invert, decoder_input_ids):
        encoded_embedding = self.additional_encoder(input_embeddings_batch, src_key_padding_mask=input_mask_invert)
        encoded_embedding = F.relu(self.fc1(encoded_embedding))
        out = self.pretrained(inputs_embeds=encoded_embedding, attention_mask=input_masks_batch, decoder_input_ids=decoder_input_ids, return_dict=True)
        return out

# Initialize model
pretrained_bart = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
model = BrainTranslator(pretrained_bart)

# Create a placeholder token
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
placeholder_token = tokenizer("<s>", return_tensors="pt")

# Attempt a forward pass
with torch.no_grad():
    try:
        outputs = model(input_embeddings_tensor, input_masks_tensor, input_mask_invert_tensor, placeholder_token["input_ids"])
        result = "Forward pass successful!"
    except Exception as e:
        result = str(e)

result



'Forward pass successful!'

In [43]:
outputs

Seq2SeqLMOutput(loss=None, logits=tensor([[[19.3021, -1.3920,  9.5053,  ..., -3.4870, -3.3493,  7.0294],
         [19.3021, -1.3920,  9.5054,  ..., -3.4870, -3.3493,  7.0294],
         [19.3021, -1.3920,  9.5053,  ..., -3.4870, -3.3493,  7.0294]]]), past_key_values=((tensor([[[[ 1.1687,  2.8813,  4.4058,  ..., -1.6669,  3.0377, -2.8006],
          [-0.0745,  0.6451,  1.7084,  ...,  0.9115,  2.5264, -1.6802],
          [-2.5676, -2.6669, -4.6881,  ..., -1.6383, -2.6659, -2.1031]],

         [[-1.3328, -4.4495,  1.5480,  ..., -1.8666,  0.8618, -0.2465],
          [-1.8701, -2.2585, -0.1146,  ..., -1.3593, -0.1351,  0.1502],
          [-3.3394, -3.4319,  1.0716,  ..., -4.8966, -1.3012,  4.3711]],

         [[ 0.1951,  0.9602,  0.9443,  ..., -0.1951, -1.0370,  2.6584],
          [-0.0807,  0.2310,  0.5215,  ..., -0.2660, -0.5006,  3.1165],
          [ 0.7811,  1.6962,  2.5301,  ..., -1.8475, -0.5803, -0.0753]],

         ...,

         [[ 0.9762,  1.0852,  0.8219,  ...,  1.1969,  1.7232,  

In [46]:
# Extract the generated token IDs from the model's outputs
generated_ids = outputs.logits.argmax(dim=-1)
generated_ids


tensor([[0, 0, 0]])

In [None]:

# Decode the token IDs to text
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

In [51]:
for x in 1000:
    print(x)
    generated_text = tokenizer.decode(generated_ids[x], skip_special_tokens=True)
    print(generated_text)
    



TypeError: 'int' object is not iterable

In [45]:
generated_text

''

In [29]:

# Preprocess the data
placeholder_token = tokenizer("<s>", return_tensors="pt")
input_sample = {
    "sent_level_EEG": torch.tensor(eeg_tensor),
    "target_ids": placeholder_token["input_ids"]
}

results_json = generate_text_from_eeg(input_sample,model,tokenizer)


AssertionError: was expecting embedding dimension of 840, but got 1140