In [18]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification

Set up the device for GPU usage

In [19]:
from torch import cuda

# Run on the GPU whenever PyTorch reports one; otherwise fall back to the CPU.
if cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

Importing our data

In [20]:
def load_excel(file):
    """Read a labelled spreadsheet and split it into sentences and label vectors.

    The sheet is read with ``pd.read_excel``. The ``"Sentence"`` column supplies
    the text; every remaining column is treated as a label column, and the
    values of each row are gathered into one list.

    Args:
        file: Anything ``pd.read_excel`` accepts (path, file-like object, ...).

    Returns:
        A 3-tuple ``(sentences, labels, frame)``:
        ``sentences`` is the list of values from the ``"Sentence"`` column,
        ``labels`` holds one list of label values per row, and ``frame`` is a
        two-column DataFrame (``"Sentence"``, ``"Labels"``) built from both.
    """
    raw = pd.read_excel(file)

    sentences = raw["Sentence"].tolist()
    label_rows = raw.drop(columns=["Sentence"]).values.tolist()

    paired = pd.DataFrame({"Sentence": sentences, "Labels": label_rows})
    return sentences, label_rows, paired

In [21]:
# Read the labelled spreadsheet once: `s` holds the sentences, `l` the label
# vectors, and `df` pairs both in a two-column DataFrame.
(s, l, df) = load_excel(
    "../data/adjusted-labels-multiclass.xlsx"
)

In [22]:
# Preview the combined frame: one sentence and its label vector per row.
df.head(n=5)

Unnamed: 0,Sentence,Labels
0,this is Charlie,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,Roger over,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,Bravo I didn't find anything relevant just abo...,"[1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0]"
3,nothing really relevant just saying its open I...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,Charlie I've got advertisement feature for the...,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


Creating a class for preprocessing and creating our custom dataset

In [23]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item.

    Args:
        data: DataFrame-like object indexed positionally through ``.iloc``;
            column 0 is read as the sentence and column 1 as its label vector.
        tokenizer: Tokenizer exposing ``encode_plus`` (e.g. ``BertTokenizer``).
        max_len: Length every encoded sequence is padded or truncated to.
    """

    def __init__(self, data, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = data
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the underlying data."""
        return len(self.data)

    def __getitem__(self, index):
        """Encode the row at position ``index``.

        Returns:
            A dict with ``"input_ids"`` and ``"attention_mask"`` (the
            tokenizer's tensors flattened to 1-D) and ``"labels"`` (the row's
            label vector as a float tensor).
        """
        sentence = str(self.data.iloc[index, 0])
        labels = self.data.iloc[index, 1]

        # `padding="max_length"` alone controls padding; the deprecated
        # `pad_to_max_length` flag duplicated it and has been dropped.
        # Token type ids are not requested because this method never returns them.
        encoding = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            # The tokenizer returns a leading batch axis; flatten it away so
            # the DataLoader can stack items into (batch, max_len).
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(labels, dtype=torch.float),
        }

In [24]:
# Length of each label vector in the `Labels` column.
num_labels = 11

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Declare the task as multi-label explicitly, so the model's built-in loss
# (used whenever `labels=` is passed to forward) is BCE-with-logits and matches
# the criterion used for training, instead of being inferred at call time.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=num_labels,
    problem_type="multi_label_classification",
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=26,
)

In [26]:
# Training hyper-parameters.
MAX_LEN = 128     # tokens per encoded sentence (padded / truncated to this)
BATCH_SIZE = 32   # sentences per DataLoader batch
# NOTE(review): 1e-3 is well above the 2e-5..5e-5 range commonly used when
# fine-tuning BERT - confirm this value is intentional.
LEARNING_RATE = 1e-3
EPOCHS = 50       # full passes over the training set

Set a maximum input length and pad every sentence to it: the model needs all inputs in a batch to have the same length.

In [27]:
# Wrap each split in the tokenizing dataset.
train_dataset = CustomDataset(data=train_data, tokenizer=tokenizer, max_len=MAX_LEN)
test_dataset = CustomDataset(data=test_data, tokenizer=tokenizer, max_len=MAX_LEN)

# Training batches are reshuffled every epoch; evaluation order stays fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [28]:
from torch import nn, optim

# Multi-label objective: each logit is scored as its own binary decision.
criterion = nn.BCEWithLogitsLoss()
# AdamW updates every model parameter with the configured learning rate.
optimizer = optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

Training Loop

In [29]:
from tqdm import tqdm

checkpoint_interval = 10  # save a checkpoint every N completed epochs
checkpoint_path = "temp_mode.pt"
# To resume from an earlier checkpoint, uncomment the next line:
# model.load_state_dict(torch.load(checkpoint_path))

model.to(device)

# Training loop
model.train()
for epoch in range(EPOCHS):
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Labels are deliberately not passed to the model: the loss comes from
        # `criterion` below, so a model-internal loss would be computed and
        # then thrown away. This also mirrors the evaluation forward pass.
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    # `epoch` is 0-based, so test the number of *completed* epochs; otherwise
    # checkpoints land after epoch 1, 11, 21, ... rather than 10, 20, 30, ...
    if (epoch + 1) % checkpoint_interval == 0:
        torch.save(model.state_dict(), checkpoint_path)
        print(f"Checkpoint saved: {epoch + 1}")

    print("")
    print(f"Average training loss: {total_loss / len(train_loader)}")

# Save the final trained model
final_model_path = "final_model.pt"
torch.save(model.state_dict(), final_model_path)
print(f"Final trained model saved: {final_model_path}")

Epoch 1/50:   1%|▏         | 1/68 [00:19<21:33, 19.30s/it]


KeyboardInterrupt: 

In [ ]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score

# Evaluation loop
model.eval()
all_predictions = []
all_labels = []
total_loss = 0
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = criterion(logits, labels)
        total_loss += loss.item()

        # Turn per-label probabilities into hard 0/1 decisions in one step
        # instead of two in-place masked writes.
        predictions = (torch.sigmoid(logits) >= 0.5).float()

        # Accumulate plain NumPy rows (one per sample), not torch tensors, so
        # the final conversion below is a straightforward 2-D array build.
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Convert predictions and labels to numpy arrays of shape (samples, labels)
all_predictions = np.array(all_predictions)
all_labels = np.array(all_labels)

# Calculate accuracy. For multi-label indicator input scikit-learn reports
# subset accuracy: a sample counts only if every one of its labels is correct.
accuracy = accuracy_score(all_labels, all_predictions)

# NOTE(review): `df` only has the columns "Sentence" and "Labels", so this
# evaluates to ["Labels"] rather than one name per class, and it is not passed
# to classification_report below. Recover the real label column names from the
# spreadsheet before using it as `target_names`.
label_names = df.columns[1:]
# Calculate classification report; zero_division=0 keeps the default scores
# for classes with no predicted/true samples while silencing the warnings.
class_report = classification_report(all_labels, all_predictions, zero_division=0)

In [ ]:
# Calculate average test loss
average_test_loss = total_loss / len(test_loader)

# Print the run configuration and headline metrics first, so that the
# "Classification Report:" header is immediately followed by the report itself.
print(f"Learning rate: {LEARNING_RATE}, Epochs: {EPOCHS}, Batch Size: {BATCH_SIZE}")
print(f"Average test loss: {average_test_loss}")
print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(class_report)