In [1]:
import pandas as pd

df = pd.read_csv('merged_result.csv', delimiter=';')

# Filter
df = df[df['Role'] == 'User']
df = df[['Role', 'Message', 'Classification']]

# Convert the Message column to string
df['Message'] = df['Message'].astype(str)

# Save the filtered data
df.to_csv('filtered_result.csv', sep=';', index=False)

In [43]:
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Code the labels into integers
label_encoder = LabelEncoder()
df['encoded_classification'] = label_encoder.fit_transform(df['Classification'])

# Show the first lines to check
print(df.head())


    Role                                            Message  \
0   User  que funcion hace el metodo stack push y stack pop   
3   User              como saber la altura de un arbol dado   
6   User  como saber la altura de un arbol dado unos val...   
9   User                                   que es una deque   
12  User                     que es un NullPointerException   

         Classification  encoded_classification  
0   Conceptual Question                       2  
3   Conceptual Question                       2  
6   Conceptual Question                       2  
9   Conceptual Question                       2  
12  Conceptual Question                       2  




In [44]:
from sklearn.model_selection import train_test_split

# 80% of the data will be used for training and 20% for testing
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Show the size of the sets
print(f'Tamanho do conjunto de treino: {len(train_df)}')
print(f'Tamanho do conjunto de teste: {len(test_df)}')


Tamanho do conjunto de treino: 270
Tamanho do conjunto de teste: 68


In [45]:
import torch
from torch.utils.data import Dataset, DataLoader

class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenization of the text
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Some parameters
MAX_LEN = 128
BATCH_SIZE = 16

# Create the datasets
train_dataset = TextClassificationDataset(
    texts=train_df['Message'].to_numpy(),
    labels=train_df['encoded_classification'].to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

test_dataset = TextClassificationDataset(
    texts=test_df['Message'].to_numpy(),
    labels=test_df['encoded_classification'].to_numpy(),
    tokenizer=tokenizer,
    max_len=MAX_LEN
)

# Create the DataLoaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [46]:
from transformers import BertModel

class TextClassifier(torch.nn.Module):
    def __init__(self, n_classes):
        super(TextClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = torch.nn.Dropout(p=0.3)
        self.out = torch.nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = outputs.pooler_output
        output = self.drop(pooled_output)
        return self.out(output)

# Initialize the model
model = TextClassifier(n_classes=len(label_encoder.classes_))


### Training

In [47]:
import torch
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Moving the model to the GPU if available
model = model.to(device)

# AdamW and loss function CrossEntropyLoss
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = CrossEntropyLoss().to(device)

def train_epoch(model, data_loader, loss_fn, optimizer, device, n_examples):
    model.train()
    losses = []
    correct_predictions = 0

    for d in tqdm(data_loader):
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, torch.tensor(losses).mean()

# Train the model
EPOCHS = 8

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_loader,
        loss_fn,
        optimizer,
        device,
        len(train_dataset)
    )

    print(f'Train loss {train_loss:.4f}, accuracy {train_acc:.4f}')

Epoch 1/8
----------


100%|██████████| 17/17 [01:19<00:00,  4.65s/it]


Train loss 1.9429, accuracy 0.3037
Epoch 2/8
----------


100%|██████████| 17/17 [01:19<00:00,  4.68s/it]


Train loss 1.5023, accuracy 0.4963
Epoch 3/8
----------


100%|██████████| 17/17 [01:19<00:00,  4.70s/it]


Train loss 1.2846, accuracy 0.6259
Epoch 4/8
----------


100%|██████████| 17/17 [01:18<00:00,  4.63s/it]


Train loss 1.0716, accuracy 0.7185
Epoch 5/8
----------


100%|██████████| 17/17 [01:18<00:00,  4.64s/it]


Train loss 0.9158, accuracy 0.7741
Epoch 6/8
----------


100%|██████████| 17/17 [01:20<00:00,  4.71s/it]


Train loss 0.7614, accuracy 0.8148
Epoch 7/8
----------


100%|██████████| 17/17 [01:15<00:00,  4.42s/it]


Train loss 0.6318, accuracy 0.8296
Epoch 8/8
----------


100%|██████████| 17/17 [01:15<00:00,  4.47s/it]

Train loss 0.5228, accuracy 0.8556





### Model testing

In [48]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in tqdm(data_loader):
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    return correct_predictions.double() / n_examples, torch.tensor(losses).mean()

# Test the model on the test set
test_acc, test_loss = eval_model(
    model,
    test_loader,
    loss_fn,
    device,
    len(test_dataset)
)

print(f'Test loss {test_loss:.4f}, accuracy {test_acc:.4f}')


100%|██████████| 5/5 [00:03<00:00,  1.26it/s]

Test loss 0.8600, accuracy 0.7500





### Saving the model

In [49]:
torch.save(model.state_dict(), 'interaction_model.bin')

### Loading the model

In [50]:
import torch
from transformers import BertTokenizer

# Load the saved model
model = TextClassifier(n_classes=len(label_encoder.classes_))
model.load_state_dict(torch.load('interaction_model.bin'))
model = model.to(device)
model.eval()

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


  model.load_state_dict(torch.load('interaction_model.bin'))


In [51]:
def predict_class(model, tokenizer, sentence, max_len=128):
    # Tokenize the sentence
    encoding = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Put the model in evaluation mode and make the prediction
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, predicted_class = torch.max(outputs, dim=1)

    # Convert the numerical prediction back to the original class name
    predicted_class_name = label_encoder.inverse_transform([predicted_class.cpu().item()])[0]

    return predicted_class_name


In [56]:
# Example of sentence
sentence = "por favor en español"

# Make the prediction
prediction = predict_class(model, tokenizer, sentence)

# Show the prediction
print(f'Classification: {prediction}')


Classification: Conceptual Question
