<a href="https://colab.research.google.com/github/rprimi/colB5BERT/blob/main/python/b5_contextualreps_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
__author__ = "Ricardo Primi adapted from modules from Christopher Potts, CS224u, Stanford, Spring 2021"

### General set-up



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip3 install transformers

In [4]:
!git clone https://github.com/rprimi/colB5BERT.git

%cd /content/colB5BERT
!git pull



Cloning into 'colB5BERT'...
remote: Enumerating objects: 123, done.[K
remote: Counting objects: 100% (123/123), done.[K
remote: Compressing objects: 100% (97/97), done.[K
remote: Total 123 (delta 69), reused 48 (delta 19), pack-reused 0[K
Receiving objects: 100% (123/123), 11.94 MiB | 16.84 MiB/s, done.
Resolving deltas: 100% (69/69), done.
/content/colB5BERT
Already up to date.


In [5]:
import os
import pandas as pd
import torch
from transformers import BertModel, BertTokenizer
from transformers import RobertaModel, RobertaTokenizer


Modules `vsm`, `utils` and `sst` are from Stanford's CS224u https://github.com/cgpotts/cs224u

In [6]:
import sys
sys.path.append('/content/colB5BERT/python/')

import utils
import vsm
import sst


In [None]:
# Prefer the first CUDA device when one is visible; otherwise fall back to the CPU.
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(dev)
print('Using {}'.format(device))

In [8]:
!nvidia-smi

Wed Jun 14 14:43:32 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    45W / 400W |      3MiB / 40960MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Capture the output of `nvidia-smi` through IPython's `!` shell syntax,
# then join the captured lines into a single string.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# The code treats the word 'failed' anywhere in that output as the sign
# that no GPU is attached to the runtime.
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

### Data

In [9]:
# Load the pre-split post texts from the cloned repository; the file is ';'-separated.
b5_data = pd.read_csv('/content/colB5BERT/data/db_textos.splitted.csv', sep=';')
# Print column names, non-null counts and dtypes as a quick sanity check.
b5_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11537 entries, 0 to 11536
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              11537 non-null  int64 
 1   id_divisao      11537 non-null  int64 
 2   texto_dividido  11537 non-null  object
dtypes: int64(2), object(1)
memory usage: 270.5+ KB


In [10]:
b5_data

Unnamed: 0,id,id_divisao,texto_dividido
0,100,1,ajudando porque a zuzu é um amor e tem a voz f...
1,100,2,vai ter share sim e se reclamar dou share mais...
2,100,3,"quanto parece , A , $NUMBER$ . MvC $NUMBER$ CL..."
3,100,4,$NUMBER$ jeitos de dar entry então é só sucess...
4,100,5,quiser se vira Esse livro é de co-autoria de $...
...,...,...,...
11532,999,6,"amo muito ! < $NUMBER$ < $NUMBER$ "" Fique por ..."
11533,999,7,pai ! Feliz aniversário ! < $NUMBER$ < $NUMBER...
11534,999,8,rei do $NAME$ Club de $NAME$ Oeste : $NAME$ $N...
11535,999,9,todo tipo de público . A realização do projeto...


In [11]:
import pandas as pd
# Load the questionnaire item bank from the cloned repository.
base_itens_b5 = pd.read_excel('/content/colB5BERT/data/base_itens.xlsx')


# NOTE(review): a bare expression that is not the last statement of a cell is not displayed.
base_itens_b5
# Print column names, non-null counts and dtypes.
base_itens_b5.info()
# base_itens_b5['item_pt_text'].tolist()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 415 entries, 0 to 414
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   ord0_index    415 non-null    int64  
 1   test          415 non-null    object 
 2   coditem       415 non-null    object 
 3   item_pt_text  415 non-null    object 
 4   item_en_text  415 non-null    object 
 5   domain        413 non-null    object 
 6   facet         413 non-null    object 
 7   pole          415 non-null    int64  
 8   seman_pairs   273 non-null    float64
dtypes: float64(1), int64(2), object(6)
memory usage: 29.3+ KB


### Loading Transformer models
Specify the name of the pretrained weights, then load the matching tokenizer and model:

In [None]:
# Checkpoint name shared by the tokenizer and the model below.
bert_weights_name = 'neuralmind/bert-base-portuguese-cased'
# Load the tokenizer and the encoder from the same checkpoint name.
bert_tokenizer = BertTokenizer.from_pretrained(bert_weights_name)
bert_model = BertModel.from_pretrained(bert_weights_name)

### The basics of tokenizing


In [None]:
import textwrap

def print_cell(df, row, column, wrap_length=80):
    """Print one DataFrame cell, wrapped to at most ``wrap_length`` characters per line.

    Args:
        df: DataFrame to read from.
        row: Index *label* of the row (looked up with ``df.loc``).
        column: Column label.
        wrap_length: Maximum line width passed to ``textwrap.wrap``.

    Prints ``"Invalid row or column"`` when either label is absent.
    """
    # `.loc` is label-based, so validate against the index labels rather than
    # the row count: a positional bound check lets negative rows and
    # non-default indexes through and they then fail with a KeyError.
    if row in df.index and column in df.columns:
        text = df.loc[row, column]
        # str() keeps non-string cells (numbers, NaN) from crashing textwrap.
        print('\n'.join(textwrap.wrap(str(text), width=wrap_length)))
    else:
        print("Invalid row or column")

# Show the text stored in row 3 of the posts, wrapped for readability.
print_cell(b5_data, 3, 'texto_dividido')

# Encode the same text to token ids (special tokens included), then map the
# ids back to token strings to inspect how the tokenizer split the text.
ex_ids = bert_tokenizer.encode(b5_data.loc[3, 'texto_dividido'], add_special_tokens=True)
bert_tokenizer.convert_ids_to_tokens(ex_ids)


### Getting BERT embeddings

To obtain the representations for a batch of examples, we use the `forward` method of the model, as follows:

In [None]:
# Inference only: no gradient tracking is needed.
with torch.no_grad():
    # Wrapping ex_ids in a list adds the batch dimension;
    # output_hidden_states=True requests the hidden states as well as the final output.
    reps = bert_model(torch.tensor([ex_ids]), output_hidden_states=True)

In [13]:
def tokenize_texts(bert_tokenizer, texts, max_length=512):
    """Encode each text to a list of token ids, capped at ``max_length`` ids.

    Args:
        bert_tokenizer: Object exposing ``encode(text, add_special_tokens=True)``
            that returns a list of ids.
        texts: Iterable of strings.
        max_length: Maximum number of ids kept per text (default 512).
            Pass ``None` `` to disable truncation.

    Returns:
        A list with one list of ids per input text, in input order.

    Raises:
        ValueError: If ``max_length`` is smaller than 2.
    """
    if max_length is not None and max_length < 2:
        raise ValueError("max_length must be at least 2 to keep the leading and trailing special tokens")

    tokenized_texts = []
    for text in texts:
        encoded_text = bert_tokenizer.encode(text, add_special_tokens=True)
        if max_length is not None and len(encoded_text) > max_length:
            # Plain slicing would cut off the closing special token that
            # `encode` appended; keep it so the truncated sequence is still
            # terminated the way the model expects.
            encoded_text = encoded_text[:max_length - 1] + encoded_text[-1:]
        tokenized_texts.append(encoded_text)
    return tokenized_texts



In [None]:
# Tokenize the first four posts and count the token ids of each one.
ex_of_texts = b5_data.iloc[[0, 1, 2, 3], b5_data.columns.get_loc('texto_dividido')].tolist()
lengths = [len(sublist) for sublist in tokenize_texts(bert_tokenizer, ex_of_texts)]

print(lengths)  # one count per text; tokenize_texts caps each at 512 ids

[512, 512, 477, 512]


In [14]:
import torch
def get_bert_embeddings(bert_model, examples, layers):
    """Collect per-token hidden states of selected layers for every example.

    Args:
        bert_model: Model that is called as ``model(ids, output_hidden_states=True)``
            and whose output exposes ``hidden_states``.
        examples: Iterable of token-id lists; each one is run as a batch of size 1.
        layers: Indices into ``hidden_states`` to keep.

    Returns:
        Dict mapping each requested layer to a list with one CPU tensor per
        example (the layer's hidden states with the batch dimension squeezed
        out). An out-of-range layer is reported with a printed message for
        every example and its list stays empty.
    """
    # Run on the GPU when there is one.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    bert_model = bert_model.to(device)

    embeddings = {}
    for layer in layers:
        embeddings[layer] = []

    for token_ids in examples:
        # The extra list level adds the batch dimension (batch size 1).
        batch = torch.tensor([token_ids]).to(device)
        with torch.no_grad():
            hidden_states = bert_model(batch, output_hidden_states=True).hidden_states
            n_states = len(hidden_states)
            for layer in layers:
                if not (0 <= layer < n_states):
                    print(f"Invalid layer {layer}")
                    continue
                # Drop the batch dimension and move the result off the GPU for storage.
                per_token = hidden_states[layer].squeeze(0)
                embeddings[layer].append(per_token.to('cpu'))
    return embeddings

In [None]:
layers = [6, 9, 11, 12]  # hidden-state indices to extract
# Embeddings for the four example texts: {layer: [one tensor per text]}.
embeddings = get_bert_embeddings(bert_model, tokenize_texts(bert_tokenizer, ex_of_texts), layers)

import pprint
pprint.pprint(embeddings )


# Generic introspection helpers tried on the result dict.
dir(embeddings)
# NOTE(review): vars() needs an object with a __dict__; a plain dict has none, so this line raises TypeError.
vars(embeddings)
import inspect
inspect.getmembers(embeddings)

# Drill down: layer 9 -> list over texts -> tensor for the third text.
pprint.pprint(embeddings[9])
len(embeddings[9])
len(embeddings[9][2])
pprint.pprint(embeddings[9][2])
x = embeddings[9][2]
x.shape

# Length of every row of the first text's layer-9 tensor.
dimensions = [len(inner_list) for inner_list in embeddings[9][0]]



torch.Size([477, 768])

### Finally getting the embeddings

In [15]:
# Hidden-state indices to extract.
layers = [ 6, 9, 11, 12]
# Number of post texts (not displayed: it is not the last expression of the cell).
len(b5_data['texto_dividido'].tolist())

# Per-token embeddings for every post text and for every Portuguese item text:
# {layer: [one tensor per text]}.
embeddings_posts = get_bert_embeddings(bert_model, tokenize_texts(bert_tokenizer, b5_data['texto_dividido'].tolist()), layers)
embeddings_itens = get_bert_embeddings(bert_model, tokenize_texts(bert_tokenizer, base_itens_b5['item_pt_text'].tolist()), layers)



In [None]:
import numpy as np



# Suffix of the companion entry that stores, per layer, the row count of every example.
_LENGTHS_SUFFIX = "__lengths"


def _to_numpy(tensor):
    """Return ``tensor`` as a numpy array (accepts torch tensors and array-likes)."""
    if hasattr(tensor, "detach"):
        return tensor.detach().cpu().numpy()
    return np.asarray(tensor)


def save_embeddings_to_disk(embeddings, filename):
    """Write ``{layer: [matrix per example]}`` to a single ``.npz`` archive.

    The matrices of one layer usually have different numbers of rows (one row
    per token), so they cannot be stored as one regular array. Each layer is
    therefore written as two entries:

    * ``"<layer>"``: all matrices concatenated along axis 0;
    * ``"<layer>__lengths"``: the row count of every matrix, in order.

    Args:
        embeddings: Dict mapping a layer id to a list of 2-D tensors/arrays
            that share the same number of columns.
        filename: Target path. ``np.savez`` appends ``.npz`` when it is missing.
    """
    arrays = {}
    for layer, tensors in embeddings.items():
        matrices = [_to_numpy(t) for t in tensors]
        # Keys must be strings because they are passed as keyword arguments.
        key = str(layer)
        arrays[key + _LENGTHS_SUFFIX] = np.array([m.shape[0] for m in matrices], dtype=np.int64)
        if matrices:
            arrays[key] = np.concatenate(matrices, axis=0)
        else:
            arrays[key] = np.zeros((0, 0), dtype=np.float32)

    np.savez(filename, **arrays)


def load_embeddings_from_disk(filename):
    """Read an archive written by ``save_embeddings_to_disk``.

    Args:
        filename: Path given to ``save_embeddings_to_disk``, with or without
            the ``.npz`` suffix.

    Returns:
        Dict mapping each layer key (a string) to a list of 2-D numpy arrays,
        one per example. Entries without a companion lengths array (archives
        produced by other code) are returned as stored.
    """
    import os

    path = filename
    if isinstance(filename, (str, os.PathLike)):
        path = os.fspath(filename)
        # Mirror np.savez, which silently adds the suffix on write.
        if not path.endswith(".npz") and not os.path.exists(path):
            path += ".npz"

    embeddings = {}
    with np.load(path) as data:
        for key in data.files:
            if key.endswith(_LENGTHS_SUFFIX):
                continue
            lengths_key = key + _LENGTHS_SUFFIX
            if lengths_key not in data.files:
                embeddings[key] = data[key]
                continue
            lengths = data[lengths_key]
            if lengths.size == 0:
                embeddings[key] = []
            else:
                # Split points are the cumulative row counts, excluding the final total.
                embeddings[key] = np.split(data[key], np.cumsum(lengths)[:-1], axis=0)
    return embeddings

# Persist the item embeddings on the mounted Drive.
# np.savez appends ".npz" to a file name that does not already end with it.
filename="/content/drive/MyDrive/colB5BERT/embeddings_itens"

save_embeddings_to_disk(embeddings=embeddings_itens, filename=filename)

# Persist the post embeddings in the same folder under a second name.
filename="/content/drive/MyDrive/colB5BERT/embeddings_posts"
save_embeddings_to_disk(embeddings=embeddings_posts, filename=filename)


Saving embeddings to disk is a common task when working with models like BERT, because it allows you to avoid recomputing the embeddings for the same input multiple times. There are several ways to store embeddings, but two popular methods are:

1. Saving as numpy arrays using np.save or np.savez (for multiple arrays at once).
2. Saving as a pickle file, which is a Python-specific binary format.

In your case, since you have a dictionary where each entry is a list of PyTorch tensors, you can first convert your tensors to numpy arrays, and then use one of these methods to store them. Here is an example of how you can do this with numpy:

```python
import numpy as np

def save_embeddings_to_disk(embeddings, filename):
    # Convert tensors to numpy arrays and store them in the same structure
    numpy_embeddings = {layer: [t.numpy() for t in tensors] for layer, tensors in embeddings.items()}

    # Use numpy's savez function to store the dictionary
    # We use ** to unpack the dictionary into keyword arguments
    np.savez(filename, **numpy_embeddings)
```

Then, you can load your embeddings back with:

```python
def load_embeddings_from_disk(filename):
    with np.load(filename) as data:
        embeddings = {layer: data[layer] for layer in data.files}
    return embeddings
```

This will give you a dictionary where the keys are the layers, and the values are lists of numpy arrays.

Please note that you may have to handle large files depending on the size of your embeddings and the number of examples. This can be managed by saving in chunks or using compressed formats if necessary.

`embeddings` is a dict keyed by layer. Each value is a list with one element per example (`batch size` elements), and each element is a tensor of size `num_of_tokens X embedding_dim`.

The final structure is `layers X batch size X num_of_tokens X embedding_dim`

In [None]:
# Number of posts with layer-6 embeddings
len(embeddings_posts[6])
embeddings_posts[6]

# Type of the entry for the first post
type(embeddings_posts[6][0])

# Shape of the first post's embedding tensor
embeddings_posts[6][0].shape



# Number of items with layer-6 embeddings
len(embeddings_itens[6])

# Type of the entry for the first item
type(embeddings_itens[6][0])

# Shape of the first item's embedding tensor
embeddings_itens[6][0].shape

# Type of the per-layer container
type(embeddings_itens[6])

I have a list A of i elements. Each element is a torch tensor of n_tokens1 X 768 (the embedding dimension). Then I have a list B of p elements. Each element is a torch tensor of n_tokens2 X 768 (the embedding dimension). I want to calculate the cosine similarity between the n_tokens1 embedding vectors and the n_tokens2 embedding vectors. I want the result to be a list of i by p matrices of size n_tokens1 X n_tokens2 containing the cosine similarities. Please create code in Python using efficient vectorized operations.

In [36]:
import torch
from torch.nn.functional import cosine_similarity
from tqdm import tqdm

def calculate_cosine_similarity(list_A, list_B, topk=5, batch_size=10, show_progress=True):
    """Top-k cosine similarities between every token of ``list_A`` and all tokens of ``list_B``.

    This single definition replaces the earlier drafts that were shadowed in
    the same cell.

    Args:
        list_A: List of 2-D tensors ``[n_tokens_A, dim]`` (the queries).
        list_B: Non-empty list of 2-D tensors ``[n_tokens_B, dim]`` (the candidates).
        topk: Number of best matches kept per query token. It is clamped to
            the total number of candidate tokens.
        batch_size: Number of ``list_A`` tensors processed together.
        show_progress: Wrap the batch loop in ``tqdm`` when true.

    Returns:
        A list with one ``(values, indices)`` tuple per tensor of ``list_A``.
        Both tensors live on the CPU and have shape ``[n_tokens_A, k]``.
        ``indices`` point into the concatenation of all ``list_B`` tokens
        (tensor after tensor, in list order). Rows that only exist because of
        batch padding are dropped, so every returned row belongs to a real token.

    Raises:
        ValueError: If ``list_B`` is empty, or ``topk``/``batch_size`` is below 1.
    """
    if len(list_B) == 0:
        raise ValueError("list_B must contain at least one tensor")
    if topk < 1 or batch_size < 1:
        raise ValueError("topk and batch_size must both be at least 1")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    normalize = torch.nn.functional.normalize

    batch_starts = range(0, len(list_A), batch_size)
    if show_progress:
        batch_starts = tqdm(batch_starts, desc='Processing', dynamic_ncols=True)

    result = []
    for start in batch_starts:
        chunk = list_A[start:start + batch_size]
        # Remember the true lengths so padding rows can be removed afterwards.
        n_tokens = [tensor.shape[0] for tensor in chunk]
        batch_A = torch.nn.utils.rnn.pad_sequence(chunk, batch_first=True).to(device)  # [batch, max_n_tokens_A, dim]
        # Unit-length rows turn cosine similarity into a plain matrix product,
        # which avoids the [batch, n_A, n_B, dim] intermediate of a broadcast.
        batch_A = normalize(batch_A, dim=-1, eps=1e-8)

        similarities = []
        for tensor_B in list_B:
            unit_B = normalize(tensor_B.to(device), dim=-1, eps=1e-8)  # [n_tokens_B, dim]
            # Keep only the small similarity matrix and move it off the device right away.
            similarities.append(torch.matmul(batch_A, unit_B.T).cpu())  # [batch, max_n_tokens_A, n_tokens_B]

        similarities = torch.cat(similarities, dim=-1)  # [batch, max_n_tokens_A, total_tokens_B]
        k = min(topk, similarities.shape[-1])
        topk_values, topk_indices = torch.topk(similarities, k, dim=-1)

        for values, indices, length in zip(topk_values, topk_indices, n_tokens):
            result.append((values[:length], indices[:length]))

    return result



In [39]:
# Earlier experiments, kept for reference (full run and a 10 x 10 subset):
# temp = calculate_cosine_similarity(embeddings_itens[6], embeddings_posts[6])
# temp = calculate_cosine_similarity(embeddings_itens[6][0:10], embeddings_posts[6][0:10])

# Top-10 layer-6 matches in the posts for every token of every item.
embeddings_L6 = calculate_cosine_similarity(embeddings_itens[6], embeddings_posts[6], topk = 10, batch_size=10)


len(embeddings_L6)

# Top-k similarity values for the first item.
embeddings_L6[0][0]
# NOTE(review): `temp` is only assigned in the commented-out lines above and in a later cell,
# so on a fresh kernel the next two lines raise NameError.
temp[1][0]
temp

tensor([[1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
         1.0000],
        [0.8899, 0.8798, 0.8748, 0.8673, 0.8559, 0.8541, 0.8522, 0.8483, 0.8465,
         0.8451],
        [0.8344, 0.8274, 0.8268, 0.8252, 0.8208, 0.8167, 0.8166, 0.8138, 0.8114,
         0.8090],
        [0.8738, 0.8692, 0.8630, 0.8606, 0.8597, 0.8495, 0.8476, 0.8421, 0.8406,
         0.8387],
        [0.8716, 0.8700, 0.8577, 0.8485, 0.8432, 0.8425, 0.8385, 0.8280, 0.8277,
         0.8240],
        [0.8677, 0.8470, 0.8469, 0.8409, 0.8336, 0.8275, 0.8259, 0.8203, 0.8199,
         0.8158],
        [0.8503, 0.8463, 0.8411, 0.8386, 0.8379, 0.8369, 0.8365, 0.8361, 0.8349,
         0.8340],
        [0.8795, 0.8573, 0.8426, 0.8331, 0.8322, 0.8272, 0.8189, 0.8187, 0.8172,
         0.8170],
        [0.8749, 0.8705, 0.8654, 0.8611, 0.8589, 0.8553, 0.8551, 0.8523, 0.8508,
         0.8484],
        [0.8484, 0.8440, 0.8328, 0.8124, 0.8084, 0.8067, 0.8028, 0.8028, 0.7981,
         0.7966],
        [0

In [None]:
# Full item-vs-post comparison at layer 6 with the function's default arguments.
temp = calculate_cosine_similarity(embeddings_itens[6], embeddings_posts[6])
# Token counts of the first two items and of post 100.
len(embeddings_itens[6][0])
len(embeddings_itens[6][1])

len(embeddings_posts[6][100])


# NOTE(review): these two lookups look written for an earlier version of calculate_cosine_similarity.
# The last definition in the similarity cell returns one (values, indices) tuple per item, so
# index 511 depends on how many items there are and a tuple has no .numpy() - confirm before relying on this.
len(temp[511][0])
import numpy as np
temp[511].numpy()

tensor([[-0.0060, -0.0287,  0.0209,  ..., -0.0638,  0.0099,  0.0068],
        [ 0.8962, -0.1241, -0.5605,  ...,  0.0854, -0.2038, -0.5854],
        [ 0.8592, -0.0043, -0.6851,  ..., -0.1660,  0.3144,  0.0245],
        ...,
        [-0.5750, -0.5947, -1.1486,  ..., -0.0437,  0.1247,  0.2317],
        [ 0.1908, -0.5221,  0.1416,  ...,  0.0479, -0.7863, -0.3726],
        [-0.3147,  0.5036,  0.3528,  ...,  0.1980,  0.3203,  0.0838]])

In [42]:
import numpy as np

# Folder on the mounted Drive used for both writing and reading the results.
OUTPUT_DIR = '/content/drive/MyDrive/colB5BERT'

# Save each tuple of (topk_values, topk_indices) in the result, one archive per item.
for i, (topk_values, topk_indices) in enumerate(embeddings_L6):
    np.savez(f'{OUTPUT_DIR}/output_{i}.npz', values=topk_values, indices=topk_indices)

# Read the first archive back from the same folder it was written to
# (a bare 'output_0.npz' resolves against the working directory instead).
# The context manager closes the underlying file once the arrays are read.
with np.load(f'{OUTPUT_DIR}/output_0.npz') as data:
    values = data['values']
    indices = data['indices']


In [None]:
import numpy as np
import pandas as pd

def save_to_csv(tensor_list, filename):
    """Write one CSV row per tensor: its flattened values and its original shape.

    Args:
        tensor_list: Iterable of tensors exposing ``.numpy()`` and ``.shape``.
        filename: Destination passed to ``DataFrame.to_csv`` (written without the index).
    """
    flat_values = []
    shapes = []
    for tensor in tensor_list:
        # Flattened copy as a plain Python list, stored in a single CSV cell.
        flat_values.append(tensor.numpy().flatten().tolist())
        # The shape is stored alongside so the flat values can be interpreted later.
        shapes.append(tensor.shape)

    table = pd.DataFrame({'tensor': flat_values, 'shape': shapes})
    table.to_csv(filename, index=False)

# Export the layer-6 item embeddings; the relative path resolves against the working directory.
save_to_csv(embeddings_itens[6], 'embeddings_itens6.csv')



First, to compute cosine similarity between tensors, you can use the `torch.nn.functional.cosine_similarity` function provided by PyTorch.

Here's a function that computes cosine similarity between every pair of tokens in `A` and `B`, assuming that `A` and `B` are PyTorch tensors.

```python
import torch
import torch.nn.functional as F

def compute_cosine_similarity(A, B):
    batch_size1, num_of_tokens1, embedding_dim1 = A.shape
    batch_size2, num_of_tokens2, embedding_dim2 = B.shape

    if embedding_dim1 != embedding_dim2:
        raise ValueError("Embedding dimensions must match!")

    if batch_size1 != batch_size2:
        raise ValueError("Batch sizes must match!")

    # We will compute cosine similarity for each example in the batch separately
    cosine_similarities = []
    for i in range(batch_size1):
        a = A[i]
        b = B[i]

        # Compute cosine similarity between all pairs of tokens.
        # The resulting matrix will have shape (num_of_tokens1, num_of_tokens2)
        similarity_matrix = torch.zeros((num_of_tokens1, num_of_tokens2))

        for j in range(num_of_tokens1):
            for k in range(num_of_tokens2):
                similarity_matrix[j, k] = F.cosine_similarity(a[j], b[k], dim=0)

        cosine_similarities.append(similarity_matrix)

    return cosine_similarities
```

This function will return a list of 2D tensors, each with shape `(num_of_tokens1, num_of_tokens2)`. Each tensor in the list corresponds to an example in the batch. The values in the 2D tensor represent the cosine similarity between the corresponding tokens in `A` and `B`. Note that this is quite a computationally expensive way to compute these similarities due to the nested for loops, and it might be worth looking into more efficient methods if this becomes a bottleneck in your code.

Can you do this using vector operations ?

```python
import torch
import torch.nn.functional as F

def compute_cosine_similarity_vectorized(A, B):
    # Check for matching dimensions
    batch_size1, num_of_tokens1, embedding_dim1 = A.shape
    batch_size2, num_of_tokens2, embedding_dim2 = B.shape

    if embedding_dim1 != embedding_dim2:
        raise ValueError("Embedding dimensions must match!")

    if batch_size1 != batch_size2:
        raise ValueError("Batch sizes must match!")

    A_unsqueezed = A.unsqueeze(2)  # Shape becomes [batch_size, num_of_tokens1, 1, embedding_dim]
    B_unsqueezed = B.unsqueeze(1)  # Shape becomes [batch_size, 1, num_of_tokens2, embedding_dim]

    # Calculate cosine similarity. The result has shape [batch_size, num_of_tokens1, num_of_tokens2]
    similarity_matrix = F.cosine_similarity(A_unsqueezed, B_unsqueezed, dim=-1)

    return similarity_matrix

def get_bert_embeddings(bert_model, examples, layers):
    # Move model to GPU if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    bert_model = bert_model.to(device)

    embeddings = {layer: [] for layer in layers}
    for ex_ids in examples:
        # Convert data to tensor and move to GPU
        ex_ids_tensor = torch.tensor([ex_ids]).to(device)
        with torch.no_grad():
            # Output includes 'last_hidden_state', 'pooler_output', 'hidden_states'
            output = bert_model(ex_ids_tensor, output_hidden_states=True)
            hidden_states = output.hidden_states
            for layer in layers:
                # Verify layer index is valid
                if layer < 0 or layer >= len(hidden_states):
                    print(f"Invalid layer {layer}")
                else:
                    # Hidden states is a tuple. Indexing into it gives a tensor of shape
                    # (batch_size, sequence_length, hidden_size). Since batch_size is 1,
                    # we remove the batch dimension.
                    layer_output = hidden_states[layer].squeeze(0)
                    # Convert back to CPU for further processing or storage
                    embeddings[layer].append(layer_output.to('cpu'))
    return embeddings
```


### Miscellaneous

### Python equivalent of R str

Yes, there is a similar command in Python called `dir()` which returns a list of all the attributes and methods of any object passed to it¹. Another similar command is `vars()` which returns the __dict__ attribute of an object¹. There is also a function called `inspect.getmembers()` which returns all the members of an object in a list of (name, value) pairs sorted by name¹. I hope this helps!

Origem: conversa com o Bing, 05/06/2023
(1) Is there a Python equivalent of R's str (), returning only the .... https://stackoverflow.com/questions/27749573/is-there-a-python-equivalent-of-rs-str-returning-only-the-structure-of-an-ob.
(2) What are Python pandas equivalents for R functions like str(), summary .... https://stackoverflow.com/questions/27637281/what-are-python-pandas-equivalents-for-r-functions-like-str-summary-and-he.
(3) Qual é a diferença entre 'string' e r'string' em Python?. https://pt.stackoverflow.com/questions/80545/qual-%c3%a9-a-diferen%c3%a7a-entre-string-e-rstring-em-python.

Understanding complex data structures in Python code often requires carefully examining the code and using built-in Python functions that give insights about these structures. Here are a few steps that might help:

1. **Print Statements**: Use `print()` statements liberally to output variables and their types. This can give you an idea of what data structures are being used at various points in the code.

2. **Type Checking**: Use the `type()` function to check the type of data structures. For instance, `type(my_var)` would return the type of `my_var`.

3. **Introspection**: Use dir() to view the attributes and methods of an object. For example, `dir(my_var)` would list all the methods that can be used with `my_var`.

4. **Length and Structure**: Use `len()` to find the length of a data structure. For dictionaries, lists, tuples, etc., you can also print individual elements.

5. **Variable Explorer**: If you're using an Integrated Development Environment (IDE) like PyCharm or Jupyter notebook, you can make use of the variable explorer to inspect your variables and data structures.

6. **Debugger**: A debugger can help you step through the code one line at a time and examine the changes in your data structures as the code executes. Python's built-in debugger is pdb.

7. **Visualization Tools**: For complex data structures like nested dictionaries or dataframes, consider using data visualization tools or libraries like pandas, matplotlib, or seaborn to visualize the data.

Remember, understanding complex data structures can be challenging, but it is often a matter of breaking down the structure into smaller, more manageable parts and understanding those individually.

In Python, lists and dictionaries don't have dimensions in the way that arrays in NumPy or dataframes in pandas do. Instead, they have lengths, and those lengths can be nested. You can use the built-in `len()` function to find out the number of elements in a list or dictionary.

For a list:

```python
my_list = [1, 2, 3, 4, 5]
print(len(my_list))  # Output: 5
```

For a dictionary:

```python
my_dict = {'one': 1, 'two': 2, 'three': 3}
print(len(my_dict))  # Output: 3
```

For nested structures, you'd need to use additional `len()` calls or use a loop or comprehension to iterate over the elements.

For example, for a list of lists (a 2D list), you could use a list comprehension:

```python
my_list_of_lists = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
dimensions = [len(inner_list) for inner_list in my_list_of_lists]
print(dimensions)  # Output: [3, 3, 3]
```

This tells you that you have a "3x3" list. Note that this only works for regularly-shaped data; if your lists have differing lengths, you'll get a variety of numbers.

For a nested dictionary, things can get more complex, and you may need a recursive function to fully explore the structure if the nesting can be more than one level deep.

Old functions

In [None]:

def calculate_cosine_similarity(list_A, list_B):
    """Legacy variant: full token-by-token cosine similarity for every (A, B) pair.

    WARNING: executing this cell rebinds ``calculate_cosine_similarity`` and so
    replaces the top-k implementation defined earlier in the notebook. It is
    kept only as a reference.

    The two drafts that used to live here were merged into this one working
    definition. The first compared ``[1, n_A, dim]`` with ``[1, n_B, dim]``,
    which only broadcasts when the token counts agree. The second called
    ``repeat`` with three factors on a 4-D tensor, which raises.

    Args:
        list_A: List of 2-D tensors ``[n_tokens_A, dim]``.
        list_B: List of 2-D tensors ``[n_tokens_B, dim]``.

    Returns:
        A flat list of ``len(list_A) * len(list_B)`` matrices of shape
        ``[n_tokens_A, n_tokens_B]``, ordered with ``list_A`` as the outer loop.
    """
    result = []
    for tensor_A in list_A:
        # [n_tokens_A, 1, dim] broadcasts against [1, n_tokens_B, dim].
        tensor_A_exp = tensor_A.unsqueeze(1)
        for tensor_B in list_B:
            similarity = cosine_similarity(tensor_A_exp, tensor_B.unsqueeze(0), dim=-1)
            result.append(similarity)
    return result
