In [5]:
import pandas as pd

# Load the merged conversation export; the file uses ';' as field separator.
df = pd.read_csv('merged_result.csv', delimiter=';')

# Keep only the rows whose Role is 'User' and the three columns used downstream.
# A single .loc selection followed by .copy() yields an independent frame, so
# adding columns to `df` later does not write into a slice of the loaded frame
# (which is what triggers pandas' SettingWithCopyWarning).
df = df.loc[df['Role'] == 'User', ['Role', 'Message', 'Classification']].copy()

In [6]:
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer

# Load the tokenizer for the 'bert-base-uncased' checkpoint.
# NOTE(review): the sample messages printed by this cell look Spanish, while
# 'bert-base-uncased' is presumably the English checkpoint -- confirm whether a
# multilingual checkpoint would be a better fit.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Learn the set of class names first, then map every row to its integer id.
label_encoder = LabelEncoder()
label_encoder.fit(df['Classification'])
df['encoded_classification'] = label_encoder.transform(df['Classification'])

# Print the first rows to sanity-check the new column.
print(df.head())


    Role                                            Message  \
0   User                                que es un bytecode?   
3   User  a . Los bytecodes son la representación interm...   
6   User                               Cual es la correcta?   
9   User  a. Una clase abstracta no puede ser instanciad...   
12  User  public class A {\n  public static int varA = 0...   

                Classification  encoded_classification  
0         Conceptual Questions                       2  
3   Multiple Question Exercise                       6  
6   Multiple Question Exercise                       6  
9   Multiple Question Exercise                       6  
12              Debugging Help                       4  




In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical across runs.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Report how many rows ended up in each partition.
print(f'Tamanho do conjunto de treino: {len(train_df)}')
print(f'Tamanho do conjunto de teste: {len(test_df)}')


Tamanho do conjunto de treino: 112
Tamanho do conjunto de teste: 29


In [8]:
import torch
from torch.utils.data import Dataset, DataLoader

class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Indexable, sized collection of raw texts (``texts[idx]``).
        labels: Indexable, sized collection of integer class ids, aligned
            position-by-position with ``texts``.
        tokenizer: Callable tokenizer. It is invoked as ``tokenizer(text, ...)``
            and must return a mapping holding ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_len: Length every sequence is padded / truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Fail early: a length mismatch would otherwise only surface as an
        # IndexError deep inside a DataLoader worker, or be silently ignored.
        if len(texts) != len(labels):
            raise ValueError(
                f'texts and labels must have the same length, '
                f'got {len(texts)} and {len(labels)}'
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` flattened to 1-D,
            plus ``'labels'`` as a ``torch.long`` scalar tensor.
        """
        text = self.texts[idx]
        label = self.labels[idx]

        # Call the tokenizer directly: `encode_plus` is documented as
        # deprecated in favour of `__call__`, which takes the same arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' adds a leading batch axis; flatten removes it
            # so that the DataLoader can stack items into a batch itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Tokenization / batching hyperparameters
MAX_LEN = 128
BATCH_SIZE = 16

# Wrap each split in a dataset: message texts paired with their encoded labels.
train_dataset = TextClassificationDataset(
    train_df['Message'].to_numpy(),
    train_df['encoded_classification'].to_numpy(),
    tokenizer,
    MAX_LEN,
)

test_dataset = TextClassificationDataset(
    test_df['Message'].to_numpy(),
    test_df['encoded_classification'].to_numpy(),
    tokenizer,
    MAX_LEN,
)

# Batch the datasets; only the training batches are shuffled.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)


In [9]:
from transformers import BertModel

class TextClassifier(torch.nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        n_classes: Number of output logits produced by the head.
        pretrained_model_name: Checkpoint identifier handed to
            ``BertModel.from_pretrained``. Defaults to ``'bert-base-uncased'``.
        dropout: Drop probability applied to the pooled encoder output.
            Defaults to ``0.3``.
    """

    def __init__(self, n_classes, pretrained_model_name='bert-base-uncased', dropout=0.3):
        super().__init__()
        # Attribute names (bert / drop / out) are part of the state-dict keys,
        # so they must stay stable for saved checkpoints to keep loading.
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.drop = torch.nn.Dropout(p=dropout)
        self.out = torch.nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Encode the inputs and return the un-normalised class logits.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            The output of the linear head (last dimension is ``n_classes``);
            no softmax is applied here.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = outputs.pooler_output
        output = self.drop(pooled_output)
        return self.out(output)

# Build the model with one output logit per class known to the label encoder.
num_labels = len(label_encoder.classes_)
model = TextClassifier(n_classes=num_labels)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

## Treinamento

In [18]:
import torch
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from tqdm import tqdm

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Move the model's parameters onto the selected device.
model = model.to(device)

# Optimizer over all model parameters, and the classification loss.
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = CrossEntropyLoss()
loss_fn = loss_fn.to(device)

def train_epoch(model, data_loader, loss_fn, optimizer, device, n_examples):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning per-class scores.
        data_loader: Iterable of batches; each batch is a mapping with the
            keys ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar loss.
        optimizer: Optimizer whose ``step`` / ``zero_grad`` are driven here.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        tuple: ``(accuracy, mean_loss)``. ``accuracy`` is a float64 tensor
        (correct predictions / ``n_examples``); ``mean_loss`` is the mean of
        the per-batch losses (NaN when the loader yields no batch).
    """
    model.train()
    losses = []
    # Tensor accumulator rather than the int 0: `.double()` below then also
    # works when the loader yields no batch at all.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    # Discard gradients left behind by earlier work (e.g. a run interrupted
    # between backward() and zero_grad()); otherwise they would be added to
    # the first update of this epoch.
    optimizer.zero_grad()

    for d in tqdm(data_loader):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Predicted class = index of the highest score along dim 1.
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, torch.tensor(losses).mean()

# Number of passes over the training loader
EPOCHS = 6

for epoch in range(EPOCHS):
    # Per-epoch banner
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    # One optimisation pass; accuracy is normalised by the training-set size.
    epoch_metrics = train_epoch(
        model=model,
        data_loader=train_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
        n_examples=len(train_dataset),
    )
    train_acc, train_loss = epoch_metrics

    print(f'Train loss {train_loss:.4f}, accuracy {train_acc:.4f}')

Epoch 1/6
----------


100%|██████████| 7/7 [00:56<00:00,  8.14s/it]


Train loss 0.4952, accuracy 0.9464
Epoch 2/6
----------


100%|██████████| 7/7 [00:50<00:00,  7.24s/it]


Train loss 0.4483, accuracy 0.9554
Epoch 3/6
----------


100%|██████████| 7/7 [01:05<00:00,  9.35s/it]


Train loss 0.3713, accuracy 0.9643
Epoch 4/6
----------


100%|██████████| 7/7 [00:52<00:00,  7.57s/it]


Train loss 0.3434, accuracy 0.9286
Epoch 5/6
----------


100%|██████████| 7/7 [00:45<00:00,  6.45s/it]


Train loss 0.2979, accuracy 0.9643
Epoch 6/6
----------


100%|██████████| 7/7 [00:48<00:00,  6.95s/it]

Train loss 0.2600, accuracy 0.9643





## Teste de modelo

In [20]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    The model is switched to eval mode and is left in that mode on return.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning per-class scores.
        data_loader: Iterable of batches; each batch is a mapping with the
            keys ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar loss.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        tuple: ``(accuracy, mean_loss)``. ``accuracy`` is a float64 tensor
        (correct predictions / ``n_examples``); ``mean_loss`` is the mean of
        the per-batch losses (NaN when the loader yields no batch).
    """
    model.eval()
    losses = []
    # Tensor accumulator rather than the int 0: `.double()` below then also
    # works when the loader yields no batch at all.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for d in tqdm(data_loader):
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            # Predicted class = index of the highest score along dim 1.
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    return correct_predictions.double() / n_examples, torch.tensor(losses).mean()

# Score the model on the held-out loader; accuracy is normalised by the
# number of items in the test dataset.
test_metrics = eval_model(
    model=model,
    data_loader=test_loader,
    loss_fn=loss_fn,
    device=device,
    n_examples=len(test_dataset),
)
test_acc, test_loss = test_metrics

print(f'Test loss {test_loss:.4f}, accuracy {test_acc:.4f}')


100%|██████████| 2/2 [00:04<00:00,  2.06s/it]

Test loss 1.7485, accuracy 0.5172





## Salvar o modelo

In [21]:
# Persist only the model's state dict (its parameters and buffers), not the
# module object itself.
trained_weights = model.state_dict()
torch.save(trained_weights, 'interaction_model.bin')

## Carregar o modelo

In [24]:
import torch
from transformers import BertTokenizer

# Rebuild the architecture, then restore the saved weights from disk.
model = TextClassifier(n_classes=len(label_encoder.classes_))
# map_location remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU machine can still be loaded on a CPU-only one.
model.load_state_dict(torch.load('interaction_model.bin', map_location=device))
model = model.to(device)
# Switch to eval mode for inference.
model.eval()

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [25]:
def predict_class(model, tokenizer, sentence, max_len=128):
    """Return the class name predicted by ``model`` for a single sentence.

    The prediction is made with the model in eval mode; the mode the model
    was in before the call is restored afterwards.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            and returning per-class scores.
        tokenizer: Callable tokenizer returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        sentence: Text to classify.
        max_len: Length the sequence is padded / truncated to.

    Returns:
        The class name obtained by passing the predicted index through the
        module-level ``label_encoder.inverse_transform``.
    """
    # Tokenize the sentence. The tokenizer is called directly: `encode_plus`
    # is documented as deprecated in favour of `__call__`.
    encoding = tokenizer(
        sentence,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Send the inputs to the device the model's parameters live on, instead
    # of relying on a global `device` that may disagree with the model.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    input_ids = encoding['input_ids'].to(target_device)
    attention_mask = encoding['attention_mask'].to(target_device)

    # Actually put the model in eval mode for the forward pass, then restore
    # the caller's mode so that an ongoing training loop is not affected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, predicted_class = torch.max(outputs, dim=1)
    finally:
        model.train(was_training)

    # Convert the numeric prediction back to the original class name
    predicted_class_name = label_encoder.inverse_transform([predicted_class.cpu().item()])[0]

    return predicted_class_name


In [37]:
# Sample sentence to classify
frase = "Gracias"

# Run the classifier on it and report the predicted label
classe_prevista = predict_class(model=model, tokenizer=tokenizer, sentence=frase)
print(f'Classification is: {classe_prevista}')


Classification is: Debugging Help
