In [None]:
import os
import shutil
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, AutoModel, pipeline
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# https://drive.google.com/file/d/1pV7MIW_r3DExQzMNdkcCMZZr_Ww69Ykz/view?usp=sharing

# Download and extract the dataset
# NOTE(review): the download is saved under a .csv name and then handed to unzip —
# confirm whether the remote file is a zip archive or already a plain CSV.
# NOTE(review): presumably the data/ directory must exist before curl writes into it — verify.
!curl -L -o data/bloom_dataset_ptbr.csv "https://drive.google.com/uc?export=download&id=1pV7MIW_r3DExQzMNdkcCMZZr_Ww69Ykz"
!unzip data/bloom_dataset_ptbr.csv -d data

In [None]:
# Load the dataset; every missing cell is replaced with 0.
df = pd.read_csv('data/bloom_dataset_ptbr.csv').fillna(0)

# Resolve the label columns ONCE, while the frame still has its original layout:
# every column except the first and the last one of the CSV.
# Slicing by position again after `one_hot` has been appended would shift the
# window by one column and leave LABELS out of step with the stored vectors.
LABELS = list(df.columns[1:-1])

# One vector per row holding that row's values for the label columns
# (the right-hand side is evaluated before the new column is attached).
df['one_hot'] = list(df.iloc[:, 1:-1].values)

print(LABELS)

df.head()

In [None]:
class BloomDataset(Dataset):
    """Map-style dataset over a DataFrame with `text` and `one_hot` columns.

    Items are addressed by position (0 .. len - 1) within the frame passed to
    the constructor, so a shuffled subset such as one half of a train/test
    split is served correctly whatever its index labels are.
    """

    def __init__(self, df):
        """Keep a reference to the frame that backs this dataset.

        Args:
            df: DataFrame providing a `text` column and a `one_hot` column
                (one numeric vector per row).
        """
        self.df = df

    def __getitem__(self, index):
        """Return the `(text, target)` pair stored at position `index`.

        Args:
            index: Zero-based position of the row inside `self.df`.

        Returns:
            Tuple of the row's text and its `one_hot` vector as a float32 tensor.

        Raises:
            IndexError: If `index` is outside the frame.
        """
        # Positional lookup on the dataset's OWN frame: label-based lookup on a
        # shared outer frame would ignore the split this dataset represents.
        row = self.df.iloc[index]
        text = row['text']
        # Force a numeric dtype so vectors that came out of an object column
        # still convert cleanly.
        target = torch.from_numpy(np.asarray(row['one_hot'], dtype=np.float32))
        return text, target

    def __len__(self):
        """Number of rows in the backing frame."""
        return len(self.df)

In [None]:
# Hold out 20% of the rows for evaluation. The split is seeded so that a
# Restart & Run All reproduces the same train/test partition.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

train_set = BloomDataset(df_train)
test_set = BloomDataset(df_test)

# Only the training batches are shuffled; evaluation gains nothing from a
# random order, so the validation loader iterates deterministically.
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
val_loader = DataLoader(test_set, batch_size=32, shuffle=False)

In [None]:
# Sanity check: display the first item of the training set (calls BloomDataset.__getitem__(0)).
train_set[0]

In [None]:
class BloomClassifier(nn.Module):
    """BERT encoder followed by a linear head with one sigmoid score per class.

    Args:
        num_classes: Width of the output layer.
        bert_model: Hugging Face model id (or path) used to load both the
            tokenizer and the encoder.
        freeze_bert: When True, all encoder parameters are frozen except those
            of the last two transformer layers.
        labels: Names reported by `predict`, one per output position.
    """

    def __init__(
        self,
        num_classes=6,
        bert_model='neuralmind/bert-base-portuguese-cased',
        freeze_bert=True,
        labels=LABELS
    ):
        super().__init__()
        self.num_classes = num_classes
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model)
        self.bert = AutoModel.from_pretrained(bert_model)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.labels = labels

        if freeze_bert:
            # Freeze every encoder parameter...
            for param in self.bert.parameters():
                param.requires_grad = False

            # ... except those of the last two layers.
            for param in self.bert.encoder.layer[-2:].parameters():
                param.requires_grad = True

    def forward(self, **kwargs):
        """Score each class for a batch of tokenized inputs.

        Args:
            **kwargs: Tokenizer output, forwarded unchanged to the encoder.

        Returns:
            Tensor of sigmoid scores with `num_classes` values per input.
        """
        hidden_states = self.bert(**kwargs).last_hidden_state
        # Use the hidden state of the first token as the sequence summary.
        first_token = hidden_states[:, 0, :]
        return torch.sigmoid(self.fc(first_token))

    def predict(self, text, return_labels=True, threshold=0.5):
        """Predict the active classes for one text or for a sequence of texts.

        Args:
            text: A single string, or a list/tuple of strings.
            return_labels: When True, return label names; otherwise return the
                0/1 indicator tensor.
            threshold: A class counts as active when its score exceeds this value.

        Returns:
            For a single string: a list of label names, or a 1-D int tensor when
            `return_labels` is False. For a sequence of strings: one such result
            per text (a list of lists, or a 2-D int tensor).
        """
        single = isinstance(text, str)
        batch = [text] if single else list(text)

        # The head always exists, so its weight tells us where the model lives.
        model_device = self.fc.weight.device
        # Truncate like the training loop does, so over-long inputs do not
        # exceed the encoder's maximum sequence length.
        inputs = self.tokenizer(
            batch,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=512,
        ).to(model_device)

        # Inference only: no autograd graph and no dropout. The previous
        # train/eval mode is restored afterwards.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                scores = self.forward(**inputs)
        finally:
            self.train(was_training)

        active = (scores > threshold).int()

        if return_labels:
            # Pair names with flags through the instance's own label list;
            # zip stops at the shorter of the two instead of indexing out of range.
            results = [
                [name for name, flag in zip(self.labels, row) if flag]
                for row in active
            ]
        else:
            results = active

        return results[0] if single else results


# Build the classifier on the Portuguese BERT checkpoint with the encoder
# freezing option enabled, then move it onto the selected device.
model = BloomClassifier(
    freeze_bert=True,
    bert_model='neuralmind/bert-base-portuguese-cased',
)
model = model.to(device)

In [None]:
# Binary cross-entropy applied to the probabilities the model outputs.
criterion = nn.BCELoss()

# Adam over the model's parameters with a learning rate of 2e-5.
optimizer = torch.optim.Adam(params=model.parameters(), lr=2e-5)

In [None]:
# Training Loop
num_epochs = 5

# Per-epoch history, consumed by the plotting cell.
all_train_losses = []
all_train_accs = []
all_val_losses = []
all_val_accs = []


def _run_epoch(loader, train):
    """Run one full pass over `loader` and return `(mean_loss, mean_accuracy)`.

    Args:
        loader: DataLoader yielding `(text, target)` batches.
        train: When True the model is put in training mode and the optimizer
            steps on every batch; when False the model is put in eval mode and
            gradients are disabled.

    Returns:
        Loss and element-wise accuracy averaged over every prediction element
        seen in the pass, so a smaller final batch is weighted by its real size.
        Both values are NaN when the loader yields no batches.
    """
    model.train(train)
    loss_sum = 0.0
    correct = 0
    seen = 0

    with torch.set_grad_enabled(train):
        for text, target in tqdm(loader):
            inputs = model.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
            target = target.to(device)

            if train:
                optimizer.zero_grad()
            pred = model(**inputs)
            loss = criterion(pred, target)
            if train:
                loss.backward()
                optimizer.step()

            # Assumes `criterion` averages over all elements (the nn.BCELoss
            # default), so scaling by numel recovers the batch's summed loss.
            n_elements = pred.numel()
            loss_sum += loss.item() * n_elements
            correct += ((pred > 0.5) == target).sum().item()
            seen += n_elements

    if seen == 0:
        return float('nan'), float('nan')
    return loss_sum / seen, correct / seen


for epoch in range(num_epochs):
    train_loss, train_acc = _run_epoch(train_loader, train=True)

    all_train_losses.append(train_loss)
    all_train_accs.append(train_acc)

    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.4f}')

    # Validation Loop
    val_loss, val_acc = _run_epoch(val_loader, train=False)

    all_val_losses.append(val_loss)
    all_val_accs.append(val_acc)

    print(f'Epoch {epoch+1}/{num_epochs}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}')

In [None]:
# Plot the per-epoch curves: loss on the left panel, accuracy on the right.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 4))

loss_ax.plot(all_train_losses, label='Training Loss')
loss_ax.plot(all_val_losses, label='Validation Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

acc_ax.plot(all_train_accs, label='Training Accuracy')
acc_ax.plot(all_val_accs, label='Validation Accuracy')
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend()

plt.show()

In [None]:
# Collect the encoder's first-token embedding and a single class id for every
# validation sample, for the t-SNE projection below.
embeddings = []
labels = []

# Switch to inference mode explicitly, so this cell does not depend on an
# earlier cell having left the model in eval mode (dropout would otherwise
# add noise to the embeddings).
model.eval()

with torch.no_grad():
    for text, target in tqdm(val_loader):
        inputs = model.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
        cls_vectors = model.bert(**inputs).last_hidden_state[:, 0, :]
        embeddings.extend(cls_vectors.detach().cpu().numpy())
        # argmax reduces each target vector to one class id.
        # NOTE(review): a row with several active labels presumably collapses
        # to the first of them — confirm this is acceptable for the plot.
        labels.extend(target.argmax(dim=-1).detach().cpu().numpy())

embeddings = np.array(embeddings)
labels = np.array(labels)

In [None]:
from sklearn.manifold import TSNE

# Project the collected embeddings onto two dimensions; the seed is fixed so
# the layout is repeatable.
tsne = TSNE(random_state=42, n_components=2)
embeddings_2d = tsne.fit_transform(embeddings)

In [None]:
# Scatter the 2-D projection, one point per sample, coloured by class id.
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(
    embeddings_2d[:, 0],
    embeddings_2d[:, 1],
    c=labels,
    cmap='rainbow',
    alpha=0.7,
)

# Legend built from the scatter's own colour entries.
legend = ax.legend(*scatter.legend_elements(), title="Classes", loc="upper right")
ax.add_artist(legend)

# Title and axis labels.
ax.set_title("t-SNE of Embeddings")
ax.set_xlabel("t-SNE component 1")
ax.set_ylabel("t-SNE component 2")

# Render the figure.
plt.show()