In [1]:
from transformers import AutoTokenizer  # Or BertTokenizer
from transformers import AutoTokenizer  # Or BertTokenizer
from transformers import BertModel
import torch
import tqdm
import pandas as pd
import numpy as np
import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Number of tokens each article is truncated/padded to by the tokenizer call below.
max_length = 100

# Input: pre-processed articles (parquet). Output: embedding array for this max_length.
# NOTE(review): "max_lenght" is misspelled in the output name; it is kept unchanged
# because anything that already reads this file depends on the exact name.
data_input_path = '../../dataset/processed/artigos_tratados/artigos_tratados.parquet'
data_output_path = (
    '../../dataset/processed/artigos_tratados/bertimbau/bertimbau_full__max_lenght='
    + str(max_length)
)

In [2]:
import datetime
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, BertModel
from tqdm import tqdm



# Auxiliary Functions
def bert_text_preparation_batch(texts, tokenizer, max_len=None):
    """Tokenize a batch of raw texts into fixed-length BERT inputs.

    Args:
        texts: Iterable of raw strings (no manual "[CLS]"/"[SEP]" markers).
        tokenizer: Hugging Face tokenizer; it is invoked as a callable.
        max_len: Optional sequence length. When omitted, the notebook-level
            ``max_length`` is used, which matches the previous behaviour.

    Returns:
        Whatever the tokenizer returns for the batch; the callers in this
        notebook read its ``'input_ids'`` and ``'attention_mask'`` entries.
    """
    if max_len is None:
        max_len = max_length  # notebook-level default

    # The tokenizer inserts [CLS]/[SEP] itself (add_special_tokens=True).
    # The previous version also prepended/appended them to the text, so every
    # sequence was encoded with duplicated special tokens.
    return tokenizer(
        list(texts),
        add_special_tokens=True,
        truncation=True,
        max_length=max_len,
        padding='max_length',
        return_tensors='pt',
        return_attention_mask=True,
    )

def get_bert_embeddings_batch(tokens_tensors, attention_mask, model):
    """Run the model on one batch and return its last-layer token embeddings.

    Args:
        tokens_tensors: Tensor of token ids, passed to the model as ``input_ids``.
        attention_mask: Tensor passed to the model as ``attention_mask``.
        model: Callable model whose output exposes the per-layer hidden states
            at index 2 (the notebook loads BertModel with
            ``output_hidden_states=True``).

    Returns:
        Nested Python lists with one entry per text in the batch. The batch
        axis is always kept, including for a batch of a single text.
    """
    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(input_ids=tokens_tensors, attention_mask=attention_mask)

    # Drop the first entry of the hidden-state tuple (presumably the embedding
    # layer output) and keep the last remaining layer.
    token_embeddings = outputs[2][1:][-1]

    # No squeeze here: squeezing dim 0 removed the batch axis whenever a batch
    # held exactly one text, which made the result incompatible with the
    # batches it is concatenated with.
    return token_embeddings.tolist()

# Import Models
# The checkpoint is the *cased* BERTimbau model, so the tokenizer must not
# lower-case the text (do_lower_case=True would feed it text that does not
# match its cased vocabulary).
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)
model = BertModel.from_pretrained('neuralmind/bert-base-portuguese-cased', output_hidden_states=True)

# Get Data
data = pd.read_parquet(data_input_path)
data_bert = data.copy()

# Parameters for Batch Processing
batch_size = 64
num_batches = (len(data_bert) + batch_size - 1) // batch_size  # ceiling division

# Per-batch arrays are collected here and joined once after the loop.
# Concatenating inside the loop recopied the whole accumulated array on
# every iteration.
batch_embeddings = []

# Processing in Batches with tqdm
print(f'Start of Embedding. Datetime: {datetime.datetime.today()}')
for batch_num in tqdm(range(num_batches), desc="Processing Batches"):
    start_idx = batch_num * batch_size
    end_idx = min(start_idx + batch_size, len(data_bert))

    texts = data_bert.iloc[start_idx:end_idx]['Conteudo'].tolist()

    tokenized_texts = bert_text_preparation_batch(texts, tokenizer)
    list_token_embeddings = get_bert_embeddings_batch(
        tokenized_texts['input_ids'], tokenized_texts['attention_mask'], model
    )

    batch_array = np.array(list_token_embeddings)
    # Force (texts, tokens, hidden) so a one-text batch whose batch axis was
    # squeezed away by the helper still concatenates correctly.
    batch_array = batch_array.reshape(len(texts), -1, batch_array.shape[-1])
    batch_embeddings.append(batch_array)

    del tokenized_texts, list_token_embeddings, batch_array

# Same result as before: one array stacked along the first axis, or the
# empty list when there was no data.
emb_vector = np.concatenate(batch_embeddings) if batch_embeddings else []
del batch_embeddings

print(f'End of Embedding. Datetime: {datetime.datetime.today()}')


Start of Embedding. Datetime: 2023-12-03 00:20:52.810226


Processing Batches: 100%|██████████| 176/176 [17:53<00:00,  6.10s/it]

End of Embedding. Datetime: 2023-12-03 00:38:46.381177





In [6]:
# Display the sequence length currently set in the kernel.
max_length

100

In [10]:
# Persist the full embedding array to data_output_path.
# NOTE(review): the path has no extension; np.save is expected to append
# ".npy" — confirm the resulting file name and that the target directory exists.
np.save(data_output_path, emb_vector)

In [5]:
# Sanity check on the accumulated embeddings; the recorded output below has
# three axes, with the middle one equal to max_length (100).
emb_vector.shape

(11241, 100, 768)

In [22]:
# Debug leftover: inspects the embeddings of a single batch.
# NOTE(review): the batch loop deletes list_token_embeddings at the end of
# every iteration, so this cell only works mid-run or after an interrupted loop.
list_token_embeddings.shape

(8, 100, 768)

In [16]:
# Debug leftover: the recorded NameError is consistent with the batch loop
# deleting list_token_embeddings before it finishes each iteration.
list_token_embeddings

NameError: name 'list_token_embeddings' is not defined

In [15]:
# Debug leftover.
# NOTE(review): the recorded 4-axis output differs from the 3-axis shape shown
# by the earlier cell, so it looks like it came from a different run/accumulation
# approach — confirm and consider removing this cell.
emb_vector.shape

(9, 8, 100, 768)

96

In [None]:
# ---------------- Data paths and sequence length ----------------
# Tokens per article used by the preparation helpers in this notebook.
max_length = 150

# Pre-processed articles (input) and directory for the embedding parts (output).
data_input_path = '../../dataset/processed/artigos_tratados/artigos_tratados.parquet'
data_output_path = '../../dataset/processed/artigos_tratados/bertimbau/bertimbau_full_parts/'
# ----------------------------------------------------------------

In [None]:
################ Aux Functions #############################
"""
ref: https://towardsdatascience.com/3-types-of-contextualized-word-embeddings-from-bert-using-transfer-learning-81fcefe3fe6d
"""
def bert_text_preparation(text, tokenizer, max_len=None):
    """Encode a single text into fixed-length BERT inputs.

    Args:
        text: Raw string (no manual "[CLS]"/"[SEP]" markers).
        tokenizer: Hugging Face tokenizer; it is invoked as a callable and its
            ``convert_ids_to_tokens`` method is used.
        max_len: Optional sequence length. When omitted, the notebook-level
            ``max_length`` is used, which matches the previous behaviour.

    Returns:
        Tuple ``(tokenized_text, tokens_tensor, segments_tensors)``:
        the token strings, a ``(1, n)`` tensor of token ids and a ``(1, n)``
        tensor of ones.
    """
    if max_len is None:
        max_len = max_length  # notebook-level default

    # Encode through the tokenizer's __call__, which handles truncation and
    # padding and adds [CLS]/[SEP] itself. The earlier
    # tokenize(..., truncation=..., padding=...) call relied on keyword
    # forwarding that slow tokenizers do not perform, and where truncation did
    # apply it cut off the manually appended "[SEP]".
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=max_len,
        padding='max_length',
    )
    indexed_tokens = list(encoding['input_ids'])
    tokenized_text = tokenizer.convert_ids_to_tokens(indexed_tokens)

    # One "1" per token, as before.
    # NOTE(review): get_bert_embeddings passes this tensor positionally as the
    # model's second argument — confirm which model input it is meant to be.
    segments_ids = [1] * len(indexed_tokens)

    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    return tokenized_text, tokens_tensor, segments_tensors

def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    """Return the last-layer embeddings of one encoded text as nested lists.

    Args:
        tokens_tensor: First positional input handed to ``model``.
        segments_tensors: Second positional input handed to ``model``.
        model: Callable whose output exposes per-layer hidden states at index 2.

    Returns:
        A list with one list of floats per token.
    """
    # NOTE(review): Hugging Face's BertModel.forward takes attention_mask as its
    # second positional parameter, so segments_tensors is presumably bound to
    # attention_mask rather than token_type_ids — confirm this is intended.
    with torch.no_grad():
        model_output = model(tokens_tensor, segments_tensors)
        per_layer_states = model_output[2][1:]

    # Last layer, with the leading size-1 batch axis removed.
    final_layer = per_layer_states[-1].squeeze(dim=0)
    return [vector.tolist() for vector in final_layer]
############################################################


# import of models
# The checkpoint is the *cased* BERTimbau model, so the tokenizer must not
# lower-case the text (do_lower_case=True would feed it text that does not
# match its cased vocabulary).
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)
# output_hidden_states=True makes the per-layer hidden states available to
# the embedding helpers, which read them from the model output.
model = BertModel.from_pretrained('neuralmind/bert-base-portuguese-cased', output_hidden_states = True)

In [None]:
import datetime
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, BertModel
from torch.utils.data import DataLoader

# Sequence length the tokenizer truncates/pads every article to.
max_length = 150

# Data Paths: pre-processed articles in, embedding parts out.
data_input_path = '../../dataset/processed/artigos_tratados/artigos_tratados.parquet'
data_output_path = '../../dataset/processed/artigos_tratados/bertimbau/bertimbau_full_parts/'

# Auxiliary Functions
def bert_text_preparation_batch(texts, tokenizer, max_len=None):
    """Tokenize a batch of raw texts into fixed-length BERT inputs.

    Args:
        texts: Iterable of raw strings (no manual "[CLS]"/"[SEP]" markers).
        tokenizer: Hugging Face tokenizer; it is invoked as a callable.
        max_len: Optional sequence length. When omitted, the notebook-level
            ``max_length`` is used, which matches the previous behaviour.

    Returns:
        Whatever the tokenizer returns for the batch; the caller in this
        notebook reads its ``'input_ids'`` entry.
    """
    if max_len is None:
        max_len = max_length  # notebook-level default

    # The tokenizer inserts [CLS]/[SEP] itself (add_special_tokens=True).
    # The previous version also prepended/appended them to the text, so every
    # sequence was encoded with duplicated special tokens.
    return tokenizer(
        list(texts),
        add_special_tokens=True,
        truncation=True,
        max_length=max_len,
        padding='max_length',
        return_tensors='pt',
    )

def get_bert_embeddings_batch(tokens_tensors, model):
    """Run the model on one batch and return its last-layer token embeddings.

    Args:
        tokens_tensors: Mapping of keyword arguments for the model call
            (the caller in this notebook supplies ``'input_ids'``).
        model: Callable model whose output exposes the per-layer hidden states
            at index 2 (the notebook loads BertModel with
            ``output_hidden_states=True``).

    Returns:
        Nested Python lists with one entry per text in the batch. The batch
        axis is always kept, including for a batch of a single text.
    """
    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**tokens_tensors)

    # Drop the first entry of the hidden-state tuple (presumably the embedding
    # layer output) and keep the last remaining layer.
    token_embeddings = outputs[2][1:][-1]

    # No squeeze here: squeezing dim 0 removed the batch axis whenever a batch
    # held exactly one text, so the caller's extend() would have appended
    # per-token rows instead of one per-text entry.
    return token_embeddings.tolist()

# Import Models
# The checkpoint is the *cased* BERTimbau model, so the tokenizer must not
# lower-case the text (do_lower_case=True would feed it text that does not
# match its cased vocabulary).
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased', do_lower_case=False)
model = BertModel.from_pretrained('neuralmind/bert-base-portuguese-cased', output_hidden_states=True)

# Get Data
data = pd.read_parquet(data_input_path)
data_bert = data.copy()
emb_vector = []  # one nested-list embedding per article, in input order

# Parameters for Batch Processing
batch_size = 256
num_batches = (len(data_bert) + batch_size - 1) // batch_size  # ceiling division

# Processing in Batches
print(f'Start of Embedding. Datetime: {datetime.datetime.today()}')
for batch_num in range(num_batches):
    start_idx = batch_num * batch_size
    end_idx = min(start_idx + batch_size, len(data_bert))

    texts = data_bert.iloc[start_idx:end_idx]['Conteudo'].tolist()

    tokenized_texts = bert_text_preparation_batch(texts, tokenizer)
    # Forward the attention mask together with the ids. Every sequence is
    # padded to max_length, and without the mask the model attends to the
    # padding tokens as if they were text.
    model_inputs = {
        'input_ids': tokenized_texts['input_ids'],
        'attention_mask': tokenized_texts['attention_mask'],
    }
    del tokenized_texts

    list_token_embeddings = get_bert_embeddings_batch(model_inputs, model)

    # Guard for a one-text batch whose batch axis was squeezed away by the
    # helper: extend() must add one entry per text, not one per token.
    if len(list_token_embeddings) != len(texts):
        list_token_embeddings = [list_token_embeddings]

    emb_vector.extend(list_token_embeddings)

    del list_token_embeddings, model_inputs

    print(f'Progress: {end_idx}/{len(data_bert)}. Datetime: {datetime.datetime.today()}')

print(f'End of Embedding. Datetime: {datetime.datetime.today()}')


In [None]:


# get data
data = pd.read_parquet(data_input_path)
data_bert = data.copy()
len_df = data_bert.shape[0]

# Per-article arrays are collected here and stacked once after the loop.
# Concatenating inside the loop recopied the whole accumulated array for
# every single article.
row_embeddings = []

print(f'Start of Embeddding. Datetime: {datetime.datetime.today()}')
# Only the 'Conteudo' column is needed, so iterate over it directly instead of
# materialising every row with iterrows().
for j, text in enumerate(data_bert['Conteudo']):

    tokenized_text, tokens_tensor, segments_tensors = bert_text_preparation(text, tokenizer)

    list_token_embeddings = get_bert_embeddings(tokens_tensor, segments_tensors, model)

    row_embeddings.append(np.array(list_token_embeddings))

    del tokenized_text, tokens_tensor, segments_tensors, list_token_embeddings

    if j % 100 == 0:
        print(f'Progress: {j}/{len_df - 1}. Datetime: {datetime.datetime.today()}')

# Same layout as before: a new leading axis with one entry per article.
# Stays None when the dataset is empty.
emb_vector = np.stack(row_embeddings) if row_embeddings else None
del row_embeddings

print(f'End of Embedding. Datetime: {datetime.datetime.today()}')

In [None]:
import h5py
import numpy as np

# Path to the previously generated HDF5 file
output_file = 'merged_arrays.h5'

# Open the file read-only and load the whole 'emb_vector' dataset into memory;
# the slice copies the data, so it stays usable after the file is closed.
with h5py.File(output_file, mode='r') as h5f:
    emb_vector = h5f['emb_vector'][:]
