In [18]:
import ast

import pandas as pd
import torch
import torch.nn as nn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertModel

In [19]:
# Read data from Excel file
df = pd.read_excel("../data/adjusted-labels-multiclass.xlsx")

# Extract sentences and labels.
# Two sheet layouts are supported:
#   * a single 'Labels' column whose cells hold a list literal such as
#     "[1, 0, 0, ...]" (this is what the df.head() preview shows), or
#   * one numeric column per label next to 'Sentence'.
sentences = df['Sentence'].tolist()
label_frame = df.drop(columns=['Sentence'])
if 'Labels' in label_frame.columns:
    # Cells read back from a spreadsheet are text, so parse them into real
    # lists; literal_eval only accepts Python literals, never arbitrary code.
    labels = [
        ast.literal_eval(cell) if isinstance(cell, str) else list(cell)
        for cell in label_frame['Labels']
    ]
else:
    labels = label_frame.values.tolist()

# TF-IDF tokenization (dense array so it can be turned into a tensor later)
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split below, so test-set vocabulary/IDF statistics leak into
# the training features — consider fitting on the training sentences only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(sentences).toarray()

# Convert labels to tensors (float32 is what the BCE-style loss expects)
labels_tensor = torch.tensor(labels, dtype=torch.float32)

# Fail early if the label matrix is not (num_sentences, num_labels)
assert labels_tensor.ndim == 2, "labels must form a 2-D matrix"
assert labels_tensor.shape[0] == len(sentences), "one label row per sentence expected"

In [22]:
# Preview the first five rows of the loaded frame (sentences with their labels)
df.head(n=5)

Unnamed: 0,Sentence,Labels
0,this is Charlie,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,Roger over,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,Bravo I didn't find anything relevant just abo...,"[1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0]"
3,nothing really relevant just saying its open I...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,Charlie I've got advertisement feature for the...,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [20]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels_tensor,
    test_size=0.2,
    random_state=47,
)

# labels_tensor was already built as float32 upstream, so the split halves are used as-is
train_labels, test_labels = y_train, y_test

# The TF-IDF features are numpy arrays; cast them to float32 tensors for the model
train_inputs = torch.tensor(X_train, dtype=torch.float32)
test_inputs = torch.tensor(X_test, dtype=torch.float32)

In [21]:
# Training configuration — later cells read these names
BATCH = 32    # mini-batch size for both DataLoaders
LR = 1e-2     # learning rate handed to the optimizer
EPOCHS = 100  # number of passes over the training set

In [23]:
# Pair every feature row with its label row
train_dataset = TensorDataset(train_inputs, train_labels)
test_dataset = TensorDataset(test_inputs, test_labels)

# Only the training loader reshuffles; the test loader keeps its original order
train_dataloader = DataLoader(dataset=train_dataset, batch_size=BATCH, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=BATCH, shuffle=False)

In [24]:
# Multi-label classifier head (dropout + linear layer) over pre-computed features
class BERTClassifier(nn.Module):
    """Dropout + linear layer that maps a feature vector to per-label logits.

    Despite the name, the BERT encoder passed in is only stored on the
    module; ``forward`` never calls it. The prediction is a single linear
    layer applied to the input features.

    Args:
        bert_model: Encoder kept as ``self.bert`` (registered as a submodule
            when it is an ``nn.Module``, so its weights appear in
            ``parameters()`` and ``state_dict()``). Not used in ``forward``.
        input_size: Width of the incoming feature vectors.
        num_labels: Number of independent labels to score.
    """

    def __init__(self, bert_model, input_size, num_labels):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(input_size, num_labels)

    def forward(self, input_ids):
        """Return raw (pre-sigmoid) logits of shape ``(batch, num_labels)``.

        No sigmoid is applied here: the notebook trains with
        ``nn.BCEWithLogitsLoss`` (which applies the sigmoid itself) and calls
        ``torch.sigmoid`` explicitly at evaluation time. The previous version
        called an undefined ``self.sigmoid`` and would also have squashed the
        outputs twice.

        Args:
            input_ids: Float tensor of feature vectors with last dimension
                ``input_size`` (the name is historical; these are not token ids).
        """
        return self.fc(self.dropout(input_ids))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Load pre-trained BERT model
# NOTE(review): BERTClassifier.forward never calls this encoder, so the
# download only adds unused weights to the model — confirm it is still wanted.
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Instantiate the classifier; both sizes come from the data rather than
# hard-coded numbers, so a different label set or vocabulary needs no edit here
input_size = X_train.shape[1]        # number of TF-IDF features per sentence
num_labels = labels_tensor.shape[1]  # one output per label column
model = BERTClassifier(bert_model, input_size, num_labels)

# Define loss function and optimizer.
# BCEWithLogitsLoss applies the sigmoid internally, so it must be fed raw logits.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

In [26]:
# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Kept at notebook level so the evaluation cell can report the last epoch's value
avg_train_loss = 0

num_epochs = EPOCHS
for epoch in range(num_epochs):
    model.train()  # enable dropout
    total_loss = 0
    # Batch-local names: unpacking into `labels` here would overwrite the
    # notebook-level `labels` list created by the data-loading cell.
    for batch_inputs, batch_labels in train_dataloader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        logits = model(batch_inputs)

        loss = criterion(logits, batch_labels)
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Mean of the per-batch losses for this epoch
    avg_train_loss = total_loss / len(train_dataloader)
    if (epoch + 1) % 10 == 0:
        print(f'Epoch {epoch+1}/{num_epochs}, Average Training Loss: {avg_train_loss}')

In [ ]:
import numpy as np
from sklearn.metrics import classification_report, accuracy_score

# Evaluation on test set
model.eval()  # disable dropout
test_loss = 0
y_true = []
y_prob = []
with torch.no_grad():
    # Batch-local names so the notebook-level `labels` list is not overwritten
    for batch_inputs, batch_labels in test_dataloader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)
        logits = model(batch_inputs)
        test_loss += criterion(logits, batch_labels).item()

        y_true.extend(batch_labels.cpu().numpy())
        # The criterion is BCEWithLogitsLoss, so the model output is treated
        # as raw logits and turned into probabilities here
        y_prob.extend(torch.sigmoid(logits).cpu().numpy())

avg_test_loss = test_loss / len(test_dataloader)

print(f'Average Training Loss: {avg_train_loss}')
print(f'Average Test Loss: {avg_test_loss}')
print("")

# Convert predictions and labels to numpy arrays
y_true = np.array(y_true)
y_prob = np.array(y_prob)

# Threshold once at 0.5 and reuse the same 0/1 matrix for every metric
y_pred = (y_prob >= 0.5).astype(y_prob.dtype)

# Calculate accuracy (for multi-label input this is exact-match accuracy:
# a sample counts only if every label is predicted correctly)
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy}")

# Label names come from the same columns the labels were extracted from
# (everything except 'Sentence') instead of relying on column position.
# If the count does not match the number of label outputs (e.g. the labels
# live in one list-valued column), fall back to the report's default names.
label_names = [str(col) for col in df.columns if col != 'Sentence']
if len(label_names) != y_true.shape[1]:
    label_names = None

# Print classification report
print("Classification Report")
print(f"Using BCE Loss, Adam - Learning Rate {LR}, Epochs {EPOCHS}, Batch {BATCH}")
print(classification_report(y_true, y_pred, target_names=label_names))


# Save the trained model
# NOTE(review): the state dict also contains the weights of the `bert`
# submodule stored on the classifier, even though forward() does not use it.
torch.save(model.state_dict(), 'bert_multi_label_model.pth')