In [8]:
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, BertTokenizer
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from tqdm.auto import tqdm
import csv
from torch.optim.lr_scheduler import StepLR
import torch.nn.functional as F
import time

#Define CustomTextDataset
class CustomTextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

#Define BertClassifier model
class BertClassifier(nn.Module):
    def __init__(self, bert_model, num_labels):
        super(BertClassifier, self).__init__()
        self.bert = bert_model
        self.bert.requires_grad_(False)  # Freeze BERT parameters
        self.classifier = nn.Linear(bert_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.classifier(pooled_output)

# Define Focal Loss 
class FocalLoss(nn.Module):
    def __init__(self, alpha, gamma, num_labels, reduction):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.num_labels = num_labels
        self.reduction = reduction

    def forward(self, inputs, targets):
        BCE_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss
        if self.reduction == 'mean':
            return F_loss.mean()
        elif self.reduction == 'sum':
            return F_loss.sum()
        else:
            return F_loss







In [9]:
# Record hyperparameters as csv
def write_hyperparameters(filepath, hyperparams):
    with open(filepath, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Hyperparameters'])
        for key, value in hyperparams.items():
            writer.writerow([key, value])
        writer.writerow([])
        # 在这里添加 'Learning Rate'
        writer.writerow(['Epoch', 'Train Loss', 'Train Accuracy', 'Validation Loss', 'Validation Accuracy', 'Learning Rate'])


# Define hyperparameters
hyperparams = {
    'max_len': 128,
    'batch_size': 8,
    'learning_rate': 0.0001,
    'num_epochs': 5,
    'alpha': 0.25,
    'gamma': 2.0,
    'reduction': 'mean'
}

In [10]:

tokenizer = BertTokenizer.from_pretrained('../Bert/FYP-Bert_model(multiple)')
bert_model = AutoModel.from_pretrained('../Bert/FYP-Bert_model(multiple)')
df = pd.read_csv('../Web_scraping_and_Data_preproecssing/preprocessed_data.csv')
label_map = {label: i for i, label in enumerate(df['Classification'].unique())}
texts = df['Title_preprocessed'].tolist()
labels = [label_map[label] for label in df['Classification']]
num_labels = len(df['Classification'].unique())


X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels)

train_dataset = CustomTextDataset(X_train, y_train, tokenizer, max_len=hyperparams['max_len'])
test_dataset = CustomTextDataset(X_test, y_test, tokenizer, max_len=hyperparams['max_len'])


Some weights of BertModel were not initialized from the model checkpoint at ../Bert/FYP-Bert_model(multiple) and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def train(model, train_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0
    
    progress_bar = tqdm(train_loader, desc='Training', leave=False)
    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
        _, predicted_labels = torch.max(outputs, dim=1)
        correct_predictions += torch.sum(predicted_labels == labels).item()
        total_predictions += labels.size(0)
        
        progress_bar.set_postfix({'Loss': loss.item()})
    
    average_loss = total_loss / len(train_loader)
    train_accuracy = correct_predictions / total_predictions
    return average_loss, train_accuracy


def evaluate(model, loader, criterion, device):
    model.eval()
    total_loss = 0
    correct_predictions = 0
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            _, predicted_labels = torch.max(outputs, dim=1)
            correct_predictions += torch.sum(predicted_labels == labels).item()
    average_loss = total_loss / len(loader)
    accuracy = correct_predictions / len(loader.dataset)
    return average_loss, accuracy

In [12]:
# Set up 3-Fold cross validation
kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Cross-validation process
for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train, y_train)):  # 注意这里也传入了y_train
    train_texts = [X_train[i] for i in train_idx]
    train_labels = [y_train[i] for i in train_idx]
    valid_texts = [X_train[i] for i in valid_idx]
    valid_labels = [y_train[i] for i in valid_idx]

    train_subdataset = CustomTextDataset(train_texts, train_labels, tokenizer, max_len=hyperparams['max_len'])
    valid_subdataset = CustomTextDataset(valid_texts, valid_labels, tokenizer, max_len=hyperparams['max_len'])

    train_loader = DataLoader(train_subdataset, batch_size=hyperparams['batch_size'], shuffle=True)
    valid_loader = DataLoader(valid_subdataset, batch_size=hyperparams['batch_size'])


    model = BertClassifier(bert_model, len(set(labels))).to(device)
    optimizer = Adam(model.parameters(), lr=hyperparams['learning_rate'])
    #criterion = CrossEntropyLoss()
    criterion = FocalLoss(alpha=hyperparams['alpha'], gamma=hyperparams['gamma'], num_labels=num_labels, reduction=hyperparams['reduction'])
    scheduler = StepLR(optimizer, step_size=10, gamma=0.1)

    # Training and validation loop
    for epoch in range(hyperparams['num_epochs']):
        train_loss, train_accuracy = train(model, train_loader, optimizer, criterion, device)
        valid_loss, valid_accuracy = evaluate(model, valid_loader, criterion, device)
        scheduler.step()

        print(f'Fold {fold+1}, Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, Valid Loss: {valid_loss:.4f}, Valid Accuracy: {valid_accuracy:.4f}')



Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 1, Epoch 1, Train Loss: 0.6714, Train Accuracy: 0.2278, Valid Loss: 0.6141, Valid Accuracy: 0.2331


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 1, Epoch 2, Train Loss: 0.6075, Train Accuracy: 0.2387, Valid Loss: 0.5931, Valid Accuracy: 0.2503


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 1, Epoch 3, Train Loss: 0.5890, Train Accuracy: 0.2664, Valid Loss: 0.5775, Valid Accuracy: 0.2843


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 1, Epoch 4, Train Loss: 0.5746, Train Accuracy: 0.2808, Valid Loss: 0.5650, Valid Accuracy: 0.2973


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 1, Epoch 5, Train Loss: 0.5619, Train Accuracy: 0.2966, Valid Loss: 0.5547, Valid Accuracy: 0.3169


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 2, Epoch 1, Train Loss: 0.6742, Train Accuracy: 0.2150, Valid Loss: 0.6150, Valid Accuracy: 0.2313


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 2, Epoch 2, Train Loss: 0.6083, Train Accuracy: 0.2380, Valid Loss: 0.5937, Valid Accuracy: 0.2443


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 2, Epoch 3, Train Loss: 0.5892, Train Accuracy: 0.2559, Valid Loss: 0.5780, Valid Accuracy: 0.2811


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 2, Epoch 4, Train Loss: 0.5744, Train Accuracy: 0.2843, Valid Loss: 0.5655, Valid Accuracy: 0.2876


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 2, Epoch 5, Train Loss: 0.5627, Train Accuracy: 0.2976, Valid Loss: 0.5545, Valid Accuracy: 0.3053


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 3, Epoch 1, Train Loss: 0.6759, Train Accuracy: 0.2161, Valid Loss: 0.6129, Valid Accuracy: 0.2313


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 3, Epoch 2, Train Loss: 0.6076, Train Accuracy: 0.2403, Valid Loss: 0.5930, Valid Accuracy: 0.2448


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 3, Epoch 3, Train Loss: 0.5887, Train Accuracy: 0.2678, Valid Loss: 0.5777, Valid Accuracy: 0.2690


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 3, Epoch 4, Train Loss: 0.5743, Train Accuracy: 0.2890, Valid Loss: 0.5647, Valid Accuracy: 0.2880


Training:   0%|          | 0/538 [00:00<?, ?it/s]

Fold 3, Epoch 5, Train Loss: 0.5616, Train Accuracy: 0.3029, Valid Loss: 0.5542, Valid Accuracy: 0.3039


In [13]:

test_loader = DataLoader(test_dataset, batch_size=8)
test_loss, test_accuracy = evaluate(model, test_loader, criterion, device)
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

Test Loss: 0.5518, Test Accuracy: 0.3083


In [14]:
from sklearn.metrics import classification_report

for epoch in range(3):
    model.train()
    for data in train_loader:
        input_ids, attention_mask, labels = data['input_ids'].to(device), data['attention_mask'].to(device), data['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for data in test_loader:
            input_ids, attention_mask, labels = data['input_ids'].to(device), data['attention_mask'].to(device), data['labels'].to(device)
            outputs = model(input_ids, attention_mask)
            _, preds = torch.max(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    

    classification_metrics = classification_report(all_labels, all_preds, output_dict=True)
    print(f"Epoch {epoch+1}:")
    print(f"Precision: {classification_metrics['weighted avg']['precision']}")
    print(f"Recall: {classification_metrics['weighted avg']['recall']}")
    print(f"F1-Score: {classification_metrics['weighted avg']['f1-score']}")

scheduler.step()

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 1:
Precision: 0.22914136039120436
Recall: 0.32878411910669975
F1-Score: 0.20633943471037602


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2:
Precision: 0.2253029985012158
Recall: 0.3356079404466501
F1-Score: 0.2204932646974562
Epoch 3:
Precision: 0.2233821060821681
Recall: 0.337468982630273
F1-Score: 0.22155540864421702


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
model.eval()

BertClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a