In [63]:
import sys
sys.path.append("..")  # add parent directory to system path

import pandas as pd
import json
from model.model_loader import get_model
from transformers import BartTokenizer

from model.brain_translator_model import BrainTranslator
from handler.inference import infer
from handler.generate_masks import generate_masks_from_embeddings
from handler.handler import process_uploaded_file
from transformers import BartForConditionalGeneration
from model.model_loader import get_model
import os

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")


In [57]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [58]:
import torch

def load_embeddings_from_file(filepath: str) -> torch.Tensor:
    """
    Load embeddings from a given JSON file.

    Parameters:
    - filepath (str): The path to the JSON file containing embeddings.

    Returns:
    - torch.Tensor: A tensor containing the loaded embeddings.
    """
    with open(filepath, 'r') as file:
        embeddings_data = json.load(file)
    return torch.tensor(embeddings_data)

In [59]:
def generate_masks_from_embeddings(embeddings: torch.Tensor) -> (torch.Tensor, torch.Tensor):
    """
    Generate attention masks and their inverse for a given embeddings tensor.

    Parameters:
    - embeddings (torch.Tensor): The embeddings tensor.

    Returns:
    - tuple: A tuple containing the attention mask and its inverse.
    """
    # Assuming non-zero embeddings represent valid tokens and zeros represent padding
    attn_mask = (embeddings.sum(dim=-1) != 0).float()
    attn_mask_invert = 1.0 - attn_mask
    return attn_mask, attn_mask_invert

In [60]:
results = []
running_loss = 0.0
pretrained_bart = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

# Iterate over data.
sample_count = 0

# Create a placeholder token
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
placeholder_token = tokenizer("<s>", return_tensors="pt")
target_tokens_list = []
target_string_list = []
pred_tokens_list = []
pred_string_list = []
max_file_number=50

model = get_model()

In [61]:
if bool(model.state_dict()):
    print("Model has weights loaded.")
else:
    print("Model does not have weights loaded.")

if not model.training:
    print("Model is in evaluation mode.")
else:
    print("Model is in training mode.")


Model has weights loaded.
Model is in evaluation mode.


In [65]:
for i in range(1, max_file_number + 1):
    # Step 3: Dynamically build the file paths based on the loop index
    file = f"../datasets/saved_data/input_embeddings_{i}.json"# Process the uploaded EEG data file

    if os.path.exists(file):
        print(f"File {file} exists!")
    else:
        print(f"File {file} does NOT exist!")


    input_embeddings_data =load_embeddings_from_file(file)

    # Generate the necessary masks
    attn_mask, attn_mask_invert = generate_masks_from_embeddings(
    input_embeddings_data
    )

    model = model.to(device)
    input_embeddings_tensor = input_embeddings_tensor
    input_masks_tensor = input_masks_tensor
    input_mask_invert_tensor = input_mask_invert_tensor

    # Acquire the model and generate text
    # model = BrainTranslator(pretrained_bart)
    
    # Step 5: Process the data with the model
    with torch.no_grad():
        try:
            outputs = model(input_embeddings_tensor, input_masks_tensor, input_mask_invert_tensor, placeholder_token["input_ids"])
            # Extract the generated token IDs from the model's outputs
            logits=outputs.logits
            probs = logits[0].softmax(dim = 1)
            values, predictions = probs.topk(1)
            predictions = torch.squeeze(predictions)
            predicted_string = tokenizer.decode(predictions).split('</s></s>')[0].replace('<s>','')
            predictions = predictions.tolist()
            truncated_prediction = []
            for t in predictions:
                if t != tokenizer.eos_token_id:
                    truncated_prediction.append(t)
                else:
                    break
            pred_tokens = tokenizer.convert_ids_to_tokens(truncated_prediction, skip_special_tokens = True)
            # print('predicted tokens:',pred_tokens)
            pred_tokens_list.append(pred_tokens)
            pred_string_list.append(predicted_string)
            print('predicted string:',predicted_string)
            # results.append(generated_text)
        except Exception as e:
            logging.error(f"Error during inference: {str(e)}")
            results.append(str(e))


pred_string_list

File ../datasets/saved_data/input_embeddings_1.json exists!
predicted string: 
File ../datasets/saved_data/input_embeddings_2.json exists!
predicted string: 
File ../datasets/saved_data/input_embeddings_3.json exists!
predicted string: 
File ../datasets/saved_data/input_embeddings_4.json exists!
predicted string: 
File ../datasets/saved_data/input_embeddings_5.json exists!
predicted string: 
File ../datasets/saved_data/input_embeddings_6.json exists!
predicted string: 
File ../datasets/saved_data/input_embeddings_7.json exists!
predicted string: 
File ../datasets/saved_data/input_embeddings_8.json exists!
predicted string: 
File ../datasets/saved_data/input_embeddings_9.json exists!
predicted string: 
File ../datasets/saved_data/input_embeddings_10.json exists!
predicted string: 
File ../datasets/saved_data/input_embeddings_11.json exists!
predicted string: 
File ../datasets/saved_data/input_embeddings_12.json exists!
predicted string: 
File ../datasets/saved_data/input_embeddings_13.j

['',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '']

In [2]:
import pandas as pd

In [4]:
pickle_file = "/Users/michaelholborn/Documents/SoftwareLocal/monotropism/thoughtx/datasets/datasets_eeg_text/zuco/task1-SR/pickle/task1-SR-dataset.pickle"



In [8]:
import pandas as pd

# Read the pickled data into a DataFrame
df = pd.read_pickle(pickle_file)

# Get the first 100 rows
# df = df.head(100)


# # Print the first 100 rows
# print(df.head())



In [9]:
df

In [None]:
df.type

1. data properties
    1. Open the pickle, see the initial properties, consider re-running how the eval_decoding script is done.
2. the conversion
    1. Is the conversion to device important?
3. in the model itself.
    Is our model trash?