In [4]:
import csv
import time

import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, KFold
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import AutoModel, BertTokenizer

In [5]:
#Define custom dataset
class CustomTextDataset(Dataset):
    """Map-style dataset that tokenizes a single text each time an item is read.

    Args:
        texts: Indexable collection of raw strings.
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked with the text plus Hugging Face-style
            keyword arguments; must return a mapping with ``input_ids`` and
            ``attention_mask`` tensors.
        max_len: Length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its tensors along with its label.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (leading dimension
            squeezed away) and ``labels`` as a ``torch.long`` scalar tensor.
        """
        raw_text = self.texts[idx]
        target = self.labels[idx]
        encoded = self.tokenizer(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # The tokenizer is asked for 'pt' tensors; squeeze(0) drops the leading
        # dimension so the DataLoader can stack samples into a batch.
        sample = {key: encoded[key].squeeze(0) for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

class BertRNN(nn.Module):
    """Frozen BERT encoder feeding a bidirectional RNN and a linear classifier.

    Args:
        bert_model: Encoder module exposing ``config.hidden_size`` and returning
            an object with ``last_hidden_state`` when called.
        rnn_hidden_dim: Hidden size of each RNN direction.
        num_labels: Number of output classes (size of the logits).
    """

    def __init__(self, bert_model, rnn_hidden_dim, num_labels):
        super().__init__()
        self.bert = bert_model
        # Freeze BERT parameters: only the RNN and the classifier are trainable.
        for param in self.bert.parameters():
            param.requires_grad_(False)
        self.rnn = nn.RNN(
            input_size=bert_model.config.hidden_size,
            hidden_size=rnn_hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        # Both directions are concatenated, so the classifier sees 2x the hidden size.
        self.classifier = nn.Linear(2 * rnn_hidden_dim, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return class logits computed from the RNN output at the first time step."""
        token_states = self.bert(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        rnn_states = self.rnn(token_states)[0]
        # Only time step 0 is classified.
        # NOTE(review): at step 0 the forward direction has seen just the first
        # token, and the backward direction has also run over padded positions
        # because the RNN never receives attention_mask. Confirm this is intended.
        first_step = rnn_states[:, 0, :]
        return self.classifier(first_step)

In [6]:
# Define Focal Loss 
class FocalLoss(nn.Module):
    """Multi-class focal loss computed from per-sample cross-entropy.

    For each sample the loss is ``alpha * (1 - p_t) ** gamma * CE``, where
    ``CE`` is the cross-entropy of the sample and ``p_t = exp(-CE)``.

    Args:
        alpha: Scalar weight applied to every sample's loss.
        gamma: Focusing exponent applied to ``(1 - p_t)``.
        num_labels: Number of classes; stored but not used by ``forward``.
        reduction: ``'mean'`` or ``'sum'``; any other value returns the
            unreduced per-sample losses.
    """

    def __init__(self, alpha, gamma, num_labels, reduction):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.num_labels = num_labels
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return the focal loss for ``inputs`` (logits) against ``targets`` (class ids)."""
        # Reached through `nn.functional` because the notebook never imports the
        # conventional `F` alias; the original `F.cross_entropy` raised NameError.
        ce_loss = nn.functional.cross_entropy(inputs, targets, reduction='none')
        # exp(-CE) recovers the probability the model assigned to the target class.
        pt = torch.exp(-ce_loss)
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        if self.reduction == 'mean':
            return focal_loss.mean()
        if self.reduction == 'sum':
            return focal_loss.sum()
        return focal_loss

In [7]:
# Record hyperparameters as csv
def write_hyperparameters(filepath, hyperparams):
    """Create (or overwrite) a CSV log that starts with the hyperparameters.

    Layout written to ``filepath``: a ``Hyperparameters`` title row, one
    ``key,value`` row per entry of ``hyperparams``, an empty row, and finally
    the column header for per-epoch results.

    Args:
        filepath: Destination path; an existing file is truncated.
        hyperparams: Mapping of hyperparameter name to value.
    """
    results_header = [
        'Fold', 'Epoch', 'Train Loss', 'Train Accuracy',
        'Validation Loss', 'Validation Accuracy', 'Learning Rate',
    ]
    # newline='' lets the csv module control line endings itself.
    with open(filepath, mode='w', newline='') as handle:
        out = csv.writer(handle)
        out.writerow(['Hyperparameters'])
        out.writerows([name, setting] for name, setting in hyperparams.items())
        out.writerow([])
        out.writerow(results_header)


# Define hyperparameters
hyperparams = dict(
    # Model and tokenization
    rnn_hidden_dim=128,   # hidden size per RNN direction in BertRNN
    max_len=128,          # pad/truncate length passed to the tokenizer
    # Optimization
    batch_size=8,
    learning_rate=0.0001,
    num_epochs=5,         # epochs trained per cross-validation fold
    # FocalLoss settings (only relevant when FocalLoss is the criterion)
    alpha=0.25,
    gamma=2.0,
    reduction='mean',
)

In [8]:
# Data loading and preprocessing
# The tokenizer and the encoder are both loaded from the same local checkpoint directory.
tokenizer = BertTokenizer.from_pretrained('../Bert/FYP-Bert_model(multiple)')
bert_model = AutoModel.from_pretrained('../Bert/FYP-Bert_model(multiple)')
df = pd.read_csv('../Web_scraping_and_Data_preproecssing/preprocessed_data.csv')
# Map each distinct class name to an integer id, in order of first appearance in the CSV.
label_map = {label: i for i, label in enumerate(df['Classification'].unique())}
texts = df['Title_preprocessed'].tolist()
labels = [label_map[label] for label in df['Classification']]
num_labels = len(df['Classification'].unique())

# Splitting datasets: 80% train / 20% held-out test, with a fixed seed for reproducibility.
# NOTE(review): the split is not stratified, so class proportions may differ between
# the two parts; confirm whether `stratify=labels` is wanted.
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

# train_dataset wraps the full training split; the cross-validation cell builds its own
# per-fold datasets from X_train / y_train instead.
train_dataset = CustomTextDataset(X_train, y_train, tokenizer, max_len=hyperparams['max_len'])
test_dataset = CustomTextDataset(X_test, y_test, tokenizer, max_len=hyperparams['max_len'])

Some weights of BertModel were not initialized from the model checkpoint at ../Bert/FYP-Bert_model(multiple) and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def train(model, train_loader, optimizer, criterion, device):
    """Run one training epoch and report its mean batch loss and accuracy.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` returning logits.
        train_loader: Iterable of batches, each a dict with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors; must support ``len()``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(average_loss, accuracy)``: the sum of per-batch losses divided by
        the number of batches, and correct predictions divided by samples seen.
    """
    model.train()
    running_loss = 0
    n_correct = 0
    n_seen = 0

    batches = tqdm(train_loader, desc='Training', leave=False)
    for batch in batches:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['labels'].to(device)

        logits = model(input_ids, attention_mask)
        loss = criterion(logits, targets)

        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss

        # Predicted class is the index of the largest logit in each row.
        predicted = logits.max(dim=1).indices
        n_correct += (predicted == targets).sum().item()
        n_seen += targets.size(0)

        batches.set_postfix({'Loss': batch_loss})

    return running_loss / len(train_loader), n_correct / n_seen


def evaluate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` returning logits.
        loader: Iterable of batches, each a dict with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors; must support ``len()``.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        ``(average_loss, accuracy)``: the sum of per-batch losses divided by
        the number of batches, and correct predictions divided by the number
        of samples actually evaluated.
    """
    model.eval()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            _, predicted_labels = torch.max(outputs, dim=1)
            correct_predictions += torch.sum(predicted_labels == labels).item()
            # Count the samples actually seen instead of using len(loader.dataset):
            # with a sampler or drop_last=True the loader yields fewer samples than
            # the dataset holds, which would under-report accuracy.
            total_predictions += labels.size(0)
    average_loss = total_loss / len(loader)
    accuracy = correct_predictions / total_predictions
    return average_loss, accuracy

In [10]:
# Set up 3-fold cross-validation over the training split
kf = KFold(n_splits=3, shuffle=True, random_state=42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Cross-validation process: each fold trains a fresh RNN + classifier head on top of
# the shared BERT encoder (which BertRNN freezes) and validates on the held-out fold.
for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train)):
    train_texts = [X_train[i] for i in train_idx]
    train_labels = [y_train[i] for i in train_idx]
    valid_texts = [X_train[i] for i in valid_idx]
    valid_labels = [y_train[i] for i in valid_idx]

    train_subdataset = CustomTextDataset(train_texts, train_labels, tokenizer, max_len=hyperparams['max_len'])
    valid_subdataset = CustomTextDataset(valid_texts, valid_labels, tokenizer, max_len=hyperparams['max_len'])

    train_loader = DataLoader(train_subdataset, batch_size=hyperparams['batch_size'], shuffle=True)
    valid_loader = DataLoader(valid_subdataset, batch_size=hyperparams['batch_size'])

    # num_labels is computed once in the data-loading cell. len(set(labels)) would
    # silently give a wrong class count if the global `labels` is ever rebound
    # (e.g. to a batch tensor) before this cell is re-run.
    model = BertRNN(bert_model, hyperparams['rnn_hidden_dim'], num_labels).to(device)
    optimizer = Adam(model.parameters(), lr=hyperparams['learning_rate'])
    criterion = CrossEntropyLoss()
    #criterion = FocalLoss(alpha=hyperparams['alpha'], gamma=hyperparams['gamma'], num_labels=num_labels, reduction=hyperparams['reduction'])
    # NOTE(review): step_size=10 exceeds num_epochs=5, so the learning rate never
    # decays within a fold; confirm whether that is intended.
    scheduler = StepLR(optimizer, step_size=10, gamma=0.1)


    for epoch in range(hyperparams['num_epochs']):
        train_loss, train_accuracy = train(model, train_loader, optimizer, criterion, device)
        valid_loss, valid_accuracy = evaluate(model, valid_loader, criterion, device)
        scheduler.step()

        print(f'Fold {fold+1}, Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Valid Loss: {valid_loss:.4f}, Valid Accuracy: {valid_accuracy:.4f}')



Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 1, Epoch 1, Train Loss: 2.8028, Train Accuracy: 0.2576, Valid Loss: 2.5594, Valid Accuracy: 0.3108


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 1, Epoch 2, Train Loss: 2.4962, Train Accuracy: 0.3423, Valid Loss: 2.3739, Valid Accuracy: 0.3676


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 1, Epoch 3, Train Loss: 2.3111, Train Accuracy: 0.3853, Valid Loss: 2.2324, Valid Accuracy: 0.3997


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 1, Epoch 4, Train Loss: 2.1698, Train Accuracy: 0.4158, Valid Loss: 2.1419, Valid Accuracy: 0.4151


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 1, Epoch 5, Train Loss: 2.0737, Train Accuracy: 0.4239, Valid Loss: 2.0831, Valid Accuracy: 0.4337


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 2, Epoch 1, Train Loss: 2.8009, Train Accuracy: 0.2557, Valid Loss: 2.5580, Valid Accuracy: 0.3309


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 2, Epoch 2, Train Loss: 2.4838, Train Accuracy: 0.3381, Valid Loss: 2.3573, Valid Accuracy: 0.3760


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 2, Epoch 3, Train Loss: 2.3000, Train Accuracy: 0.3862, Valid Loss: 2.2180, Valid Accuracy: 0.4076


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 2, Epoch 4, Train Loss: 2.1497, Train Accuracy: 0.4281, Valid Loss: 2.1452, Valid Accuracy: 0.4267


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 2, Epoch 5, Train Loss: 2.0639, Train Accuracy: 0.4339, Valid Loss: 2.0883, Valid Accuracy: 0.4346


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 3, Epoch 1, Train Loss: 2.7557, Train Accuracy: 0.2713, Valid Loss: 2.6172, Valid Accuracy: 0.2880


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 3, Epoch 2, Train Loss: 2.4614, Train Accuracy: 0.3418, Valid Loss: 2.4272, Valid Accuracy: 0.3485


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 3, Epoch 3, Train Loss: 2.2797, Train Accuracy: 0.3886, Valid Loss: 2.2998, Valid Accuracy: 0.3765


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 3, Epoch 4, Train Loss: 2.1500, Train Accuracy: 0.4204, Valid Loss: 2.2065, Valid Accuracy: 0.3955


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 3, Epoch 5, Train Loss: 2.0535, Train Accuracy: 0.4388, Valid Loss: 2.1403, Valid Accuracy: 0.4169


In [11]:
# Test set evaluation
# `model` and `criterion` are the ones left over from the last cross-validation fold.
# Batch size comes from hyperparams so it stays in sync with the train/validation loaders.
test_loader = DataLoader(test_dataset, batch_size=hyperparams['batch_size'])
test_loss, test_accuracy = evaluate(model, test_loader, criterion, device)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

Test Loss: 2.1320, Test Accuracy: 0.4156


In [12]:
# Continue training the last fold's model on its training subset and report
# weighted precision / recall / F1 on the test set after every epoch.
# NOTE(review): the test set is used for per-epoch monitoring here, so these
# numbers are not an untouched held-out estimate; confirm this is intended.
for epoch in range(3):
    # Reuse the shared training routine instead of duplicating the batch loop.
    train(model, train_loader, optimizer, criterion, device)

    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for data in test_loader:
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            # Named batch_labels so the module-level `labels` list is not overwritten.
            batch_labels = data['labels'].to(device)
            outputs = model(input_ids, attention_mask)
            _, preds = torch.max(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch_labels.cpu().numpy())

    # zero_division=0 scores classes with no predicted samples as 0 (the same value
    # the default produces) without emitting an undefined-metric warning.
    classification_metrics = classification_report(
        all_labels, all_preds, output_dict=True, zero_division=0
    )
    print(f"Epoch {epoch+1}:")
    print(f"Precision: {classification_metrics['weighted avg']['precision']}")
    print(f"Recall: {classification_metrics['weighted avg']['recall']}")
    print(f"F1-Score: {classification_metrics['weighted avg']['f1-score']}")

    # Step the learning-rate schedule once per epoch, as the cross-validation loop does.
    scheduler.step()

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1:
Precision: 0.3461519313360342
Recall: 0.42431761786600497
F1-Score: 0.3525783234821868


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2:
Precision: 0.3625559722710519
Recall: 0.423697270471464
F1-Score: 0.37308582473038815
Epoch 3:
Precision: 0.3580235792589565
Recall: 0.43300248138957814
F1-Score: 0.36523807235083694


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
