In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset

In [3]:
# Read the labelled training split from the working directory, then pull the
# raw 'text' column out as a plain Python list.
train_df = pd.read_csv(filepath_or_buffer='train.csv')
texts = train_df.loc[:, 'text'].to_list()

In [4]:
# Read the test split from the working directory and extract its 'text'
# column as a plain Python list.
test_df = pd.read_csv(filepath_or_buffer='test.csv')
texts_test = test_df.loc[:, 'text'].to_list()

In [6]:
# Pick the GPU when one is visible to torch, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenizer and encoder come from the same pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Switch to evaluation mode (disables dropout), then move the weights to the
# chosen device. Both calls return the module itself, so the cell still ends
# on the model and displays its repr.
model.eval().to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [7]:
class TextDataset(Dataset):
    """Map-style dataset that serves raw texts by position.

    ``tokenizer`` and ``max_length`` are stored on the instance but no method
    of this class uses them; items are returned untokenized.
    """

    def __init__(self, texts, tokenizer, max_length=128):
        """Keep references to the texts, the tokenizer and the length limit."""
        self.texts, self.tokenizer, self.max_length = texts, tokenizer, max_length

    def __len__(self):
        """Return how many texts the dataset holds."""
        n_texts = len(self.texts)
        return n_texts

    def __getitem__(self, idx):
        """Return the text stored at position ``idx``, unchanged."""
        sample = self.texts[idx]
        return sample

In [8]:
def get_bert_embeddings_batched(texts, batch_size=16, max_length=128):
    """Embed each text as the model's hidden state at sequence position 0.

    Relies on the module-level ``tokenizer``, ``model`` and ``device`` objects.
    The model is expected to already be in evaluation mode and on ``device``.

    Args:
        texts: Sequence of strings to embed; order is preserved in the output.
        batch_size: Number of texts tokenized and encoded per forward pass.
        max_length: Token limit per text; longer inputs are truncated.

    Returns:
        A 2-D ``np.ndarray`` of shape ``(len(texts), hidden_size)``. For empty
        input the result is an empty ``(0, hidden_size)`` array, so downstream
        code always sees a matrix with the expected number of columns.
    """
    dataset = TextDataset(texts, tokenizer, max_length)
    # shuffle=False keeps row i of the result aligned with texts[i].
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # One (batch, hidden_size) array per batch; joined once at the end.
    batch_embeddings = []

    for batch_texts in tqdm(dataloader, desc="Vectorizing texts"):
        # padding=True pads only up to the longest text in this batch.
        inputs = tokenizer(
            batch_texts,
            return_tensors='pt',
            max_length=max_length,
            truncation=True,
            padding=True
        )
        inputs = {key: val.to(device) for key, val in inputs.items()}

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            outputs = model(**inputs)

        # Position 0 of the last layer is the [CLS] token for BERT tokenizers.
        cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        batch_embeddings.append(cls_embeddings)

    if not batch_embeddings:
        # No texts: return an empty matrix with the right width instead of a
        # 1-D empty array.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    return np.concatenate(batch_embeddings, axis=0)


batch_size = 32

# Embed the training texts first, then the test texts, with the same batch size.
embeddings_train, embeddings_test = (
    get_bert_embeddings_batched(split_texts, batch_size=batch_size)
    for split_texts in (texts, texts_test)
)

Vectorizing texts: 100%|██████████████████████| 238/238 [00:24<00:00,  9.67it/s]
Vectorizing texts: 100%|██████████████████████| 102/102 [00:10<00:00,  9.59it/s]


In [12]:
# Wrap the embedding matrices in DataFrames: one row per text, one
# integer-named column per embedding dimension.
embeddings_train_df = pd.DataFrame(embeddings_train)
embeddings_test_df = pd.DataFrame(embeddings_test)

# pd.concat(axis=1) aligns rows by index label, not by position. If the
# embeddings and train_df ever disagree (e.g. stale kernel state), it would
# silently pad with NaN rows, so check the lengths and align positionally.
assert len(embeddings_train_df) == len(train_df), (
    f"embedding rows ({len(embeddings_train_df)}) != train rows ({len(train_df)})"
)
preprocess_train_df = pd.concat(
    [embeddings_train_df, train_df[['target']].reset_index(drop=True)],
    axis=1,
)

# Persist features (+ target for train) without the index column.
preprocess_train_df.to_csv('train_embeddings_and_targets.csv', index=False)
embeddings_test_df.to_csv('test_embeddings.csv', index=False)