In [ ]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertModel
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import LabelEncoder

In [ ]:
class BERTClassifier(nn.Module):
    """Sequence classifier: an encoder followed by dropout and a linear head.

    Args:
        bert_model: Encoder module. It must expose ``config.hidden_size`` and,
            when called as ``bert_model(input_ids, attention_mask=...)``,
            return an indexable output whose element ``1`` is the pooled
            representation (this is the layout ``transformers.BertModel``
            uses for its pooled output).
        output_dim: Number of target classes, i.e. the width of the logits.
        dropout_prob: Probability used by the dropout layer that sits between
            the encoder and the linear head. Defaults to 0.1.
    """

    def __init__(self, bert_model: nn.Module, output_dim: int, dropout_prob: float = 0.1):
        super().__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(dropout_prob)
        # The head maps the encoder's hidden size to one score per class.
        self.linear = nn.Linear(bert_model.config.hidden_size, output_dim)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Return unnormalised class scores for a batch of token ids.

        Args:
            input_ids: Token ids passed positionally to the encoder.
            attention_mask: Mask forwarded to the encoder as ``attention_mask``.

        Returns:
            Logits whose last dimension has size ``output_dim``. No softmax is
            applied here.
        """
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        # Index 1 (rather than an attribute) keeps this working for encoders
        # that return a plain tuple as well as for dict-like model outputs.
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output)
        logits = self.linear(pooled_output)
        return logits

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the labelled sentences and drop every row whose label is "Other".
data = pd.read_csv("../data/adjusted-labels-prioritised-importance.csv")
data = data[data["Label"] != "Other"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data["Sentence"], data["Label"], test_size=0.2, random_state=47
)

# Tokenize both splits with identical settings: sequences are truncated to at
# most 512 tokens, padded within each call, and returned as PyTorch tensors.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenizer_kwargs = dict(truncation=True, padding=True, return_tensors='pt', max_length=512)
train_encodings = tokenizer(list(train_texts), **tokenizer_kwargs)
test_encodings = tokenizer(list(test_texts), **tokenizer_kwargs)

# Map string labels to integer class ids. The encoder is fitted on the training
# labels only, so `transform` raises ValueError if the test split contains a
# label that never occurs in the training split.
# dtype is forced to int64 because nn.CrossEntropyLoss requires long targets and
# NumPy's default integer type is not 64-bit on every platform.
label_encoder = LabelEncoder()
train_labels = torch.tensor(label_encoder.fit_transform(train_labels), dtype=torch.long)
test_labels = torch.tensor(label_encoder.transform(test_labels), dtype=torch.long)

# Pull the token-id and attention-mask tensors out of the tokenizer output
# (they are already tensors because of return_tensors='pt').
train_input_ids = train_encodings['input_ids']
train_attention_masks = train_encodings['attention_mask']
test_input_ids = test_encodings['input_ids']
test_attention_masks = test_encodings['attention_mask']

# Bundle inputs and labels so that each dataset item is
# (input_ids, attention_mask, label).
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)

# Only the training data is shuffled; the test order stays fixed.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [ ]:
# Pre-trained encoder with all of its weights frozen, so that gradient updates
# only reach the layers BERTClassifier adds on top of it.
bert_model = BertModel.from_pretrained('bert-base-uncased')
bert_model.requires_grad_(False)

# One output unit per distinct class id present in the training labels.
distinct_label_ids = set(train_labels.tolist())
output_dim = len(distinct_label_ids)

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BERTClassifier(bert_model, output_dim).to(device)

In [ ]:
# Optimisation setup: Adam over the model's parameters, scored with
# cross-entropy on the raw logits returned by the model.
learning_rate = 0.01
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [ ]:
# Train for a fixed number of epochs, printing the mean batch loss after each one.
num_epochs = 100
for epoch in range(num_epochs):
    model.train()  # training mode, so the dropout layers are active
    loss_sum = 0.0

    for batch in train_loader:
        # Each batch is (input_ids, attention_mask, labels); move all three
        # tensors to the device the model lives on.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        # Clear gradients left over from the previous step before the new
        # backward pass.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        loss_sum += loss.item()

    # Mean of the per-batch losses (every batch counts equally).
    epoch_loss = loss_sum / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

In [ ]:
# Score the trained model on the held-out split.
model.eval()  # inference mode, so the dropout layers are disabled
all_preds, all_labels = [], []

# No gradients are needed for inference.
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        outputs = model(input_ids, attention_mask)
        # The predicted class is the index of the largest logit in each row.
        preds = outputs.max(dim=1).indices

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Turn the integer class ids back into the original label strings.
all_preds = label_encoder.inverse_transform(all_preds)
all_labels = label_encoder.inverse_transform(all_labels)

# Per-class precision / recall / F1, preceded by the run's hyper-parameters.
print("Classification Report")
print(f"Cross Entropy, Learning Rate: {learning_rate} Epochs: {num_epochs}, Batch Size: {batch_size}")
print(classification_report(all_labels, all_preds))