In [1]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from torchcrf import CRF
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, cohen_kappa_score

class MixedTextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        boundary_idx = int(self.labels[idx])
        words = text.split()
        if len(words) > self.max_len - 2:
            if boundary_idx > self.max_len - 2:
                words = words[-(self.max_len - 2):]
                boundary_idx = 0
            else:
                words = words[:self.max_len - 2]
        word_labels = [0 if i <= boundary_idx else 1 for i in range(len(words))]
        truncated_text = " ".join(words)
        encoding = self.tokenizer(
            truncated_text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            return_special_tokens_mask=True
        )
        special_tokens_mask = encoding['special_tokens_mask'][0]
        token_labels = []
        current_word_idx = 0
        for is_special in special_tokens_mask:
            if is_special:
                token_labels.append(-100)
            else:
                if current_word_idx < len(word_labels):
                    token_labels.append(word_labels[current_word_idx])
                    current_word_idx += 1
                else:
                    token_labels.append(-100)
        token_labels = token_labels[:self.max_len]
        if len(token_labels) < self.max_len:
            token_labels.extend([-100] * (self.max_len - len(token_labels)))
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(token_labels)
        }

class DistilBERTBiGRUCRFTagger(nn.Module):
    def __init__(self, model_name, num_labels, hidden_dim=512, num_layers=2, dropout=0.3):
        super(DistilBERTBiGRUCRFTagger, self).__init__()
        self.num_labels = num_labels
        self.distilbert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        hidden_size = self.distilbert.config.hidden_size
        self.gru = nn.GRU(hidden_size, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True, dropout=dropout if num_layers > 1 else 0)
        self.layer_norm = nn.LayerNorm(hidden_dim * 2)
        self.hidden2hidden = nn.Sequential(nn.Linear(hidden_dim * 2, hidden_dim * 2), nn.ReLU(), nn.Dropout(dropout))
        self.classifier = nn.Linear(hidden_dim * 2, num_labels)
        self.crf = CRF(num_labels, batch_first=True)
        nn.init.xavier_uniform_(self.classifier.weight)
        nn.init.constant_(self.classifier.bias, 0)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.distilbert(input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs.last_hidden_state)
        gru_out, _ = self.gru(sequence_output)
        gru_out = self.layer_norm(gru_out)
        gru_out = self.hidden2hidden(gru_out)
        logits = self.classifier(gru_out)
        if labels is not None:
            mask = attention_mask.bool()
            crf_labels = labels.clone()
            crf_labels[crf_labels == -100] = 0
            loss = -self.crf(logits, crf_labels, mask=mask, reduction='mean')
            return loss
        else:
            mask = attention_mask.bool()
            predictions = self.crf.decode(logits, mask=mask)
            padded_predictions = []
            for pred, mask_len in zip(predictions, attention_mask.sum(1).tolist()):
                pad_len = attention_mask.size(1) - len(pred)
                padded_pred = pred + [0] * pad_len
                padded_predictions.append(padded_pred)
            return torch.tensor(padded_predictions, device=input_ids.device)

def train_model(model, data_loader, optimizer, scheduler, device, clip_value=1.0):
    model.train()
    total_loss = 0
    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        optimizer.zero_grad()
        loss = model(input_ids, attention_mask, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_value)
        optimizer.step()
        if scheduler:
            scheduler.step()
        total_loss += loss.item()
    return total_loss / len(data_loader)

def evaluate_model(model, data_loader, device):
    model.eval()
    all_predictions = []
    all_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            predictions = model(input_ids, attention_mask)
            for pred_seq, label_seq, mask_seq in zip(predictions, labels, attention_mask):
                valid_indices = (mask_seq == 1) & (label_seq != -100)
                valid_pred = pred_seq[valid_indices].cpu().numpy()
                valid_label = label_seq[valid_indices].cpu().numpy()
                all_predictions.extend(valid_pred)
                all_labels.extend(valid_label)
    all_predictions = np.array(all_predictions)
    all_labels = np.array(all_labels)
    absolute_errors = np.abs(all_predictions - all_labels)
    mae = np.mean(absolute_errors)
    std_dev = np.std(absolute_errors)
    accuracy = accuracy_score(all_labels, all_predictions)
    precision = precision_score(all_labels, all_predictions, average='weighted', zero_division=0)
    recall = recall_score(all_labels, all_predictions, average='weighted', zero_division=0)
    f1 = f1_score(all_labels, all_predictions, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(all_labels, all_predictions)
    kappa = cohen_kappa_score(all_labels, all_predictions)
    return accuracy, precision, recall, f1, mcc, mae, std_dev, kappa

def predict_boundary(model, text, tokenizer, max_len, device):
    model.eval()
    encoding = tokenizer(
        text,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
        return_special_tokens_mask=True
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)
    with torch.no_grad():
        predictions = model(input_ids, attention_mask)
    pred_labels = predictions[0]
    boundary_index = next((i for i, label in enumerate(pred_labels) if label == 1), len(pred_labels))
    special_tokens_mask = encoding['special_tokens_mask'][0].tolist()
    actual_tokens = [i for i, is_special in enumerate(special_tokens_mask) if not is_special]
    if boundary_index < len(actual_tokens):
        boundary_index = actual_tokens[boundary_index]
    return boundary_index

MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
config = {
    'batch_size': 32,
    'learning_rate': 2e-5,
    'epochs': 3,
    'max_len': 512,
    'weight_decay': 0.01,
    'gradient_clip': 1.0
}

train_df = pd.read_csv('sentence_train_data.csv')
dev_df = pd.read_csv('sentence_dev_data.csv')
test_df = pd.read_csv('sentence_test.csv')

train_texts = train_df["text"].tolist()
train_labels = train_df["label"].tolist()
dev_texts = dev_df["text"].tolist()
dev_labels = dev_df["label"].tolist()
test_texts = test_df["text"].tolist()
test_labels = test_df["label"].tolist()

train_dataset = MixedTextDataset(train_texts, train_labels, tokenizer, max_len=config['max_len'])
dev_dataset = MixedTextDataset(dev_texts, dev_labels, tokenizer, max_len=config['max_len'])
test_dataset = MixedTextDataset(test_texts, test_labels, tokenizer, max_len=config['max_len'])

train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=config['batch_size'], shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_labels = 2

model = DistilBERTBiGRUCRFTagger(MODEL_NAME, num_labels).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])
total_steps = len(train_loader) * config['epochs']
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=config['learning_rate'],
                                               total_steps=total_steps, pct_start=0.1, anneal_strategy='cos')

best_kappa = -float('inf')
best_epoch = 0
best_model_state = None
train_losses = []
val_metrics = {'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'mcc': [], 'mae': [], 'std_dev': [], 'kappa': []}

for epoch in range(config['epochs']):
    print(f"\nEpoch {epoch + 1}/{config['epochs']}")
    train_loss = train_model(model, train_loader, optimizer, scheduler, device, config['gradient_clip'])
    train_losses.append(train_loss)
    val_accuracy, val_precision, val_recall, val_f1, val_mcc, val_mae, val_std_dev, val_kappa = evaluate_model(model, dev_loader, device)
    val_metrics['accuracy'].append(val_accuracy)
    val_metrics['precision'].append(val_precision)
    val_metrics['recall'].append(val_recall)
    val_metrics['f1'].append(val_f1)
    val_metrics['mcc'].append(val_mcc)
    val_metrics['mae'].append(val_mae)
    val_metrics['std_dev'].append(val_std_dev)
    val_metrics['kappa'].append(val_kappa)
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Metrics:")
    print(f"Accuracy: {val_accuracy:.4f}")
    print(f"Precision: {val_precision:.4f}")
    print(f"Recall: {val_recall:.4f}")
    print(f"F1 Score: {val_f1:.4f}")
    print(f"MCC: {val_mcc:.4f}")
    print(f"MAE: {val_mae:.2f}±{val_std_dev:.2f}")
    print(f"Kappa: {val_kappa:.4f}")
    if val_kappa > best_kappa:
        best_kappa = val_kappa
        best_epoch = epoch + 1
        best_model_state = model.state_dict()
        print(f"New best model with Kappa Score: {val_kappa:.4f}")

print(f"\nBest model at epoch {best_epoch} with Kappa Score: {best_kappa:.4f}")

model.load_state_dict(best_model_state)
model.eval()
test_accuracy, test_precision, test_recall, test_f1, test_mcc, test_mae, test_std_dev, test_kappa = evaluate_model(model, test_loader, device)
print(f"\nTest Results:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1 Score: {test_f1:.4f}")
print(f"MCC: {test_mcc:.4f}")
print(f"MAE: {test_mae:.2f}±{test_std_dev:.2f}")
print(f"Kappa: {test_kappa:.4f}")

test_df_copy = test_df.copy()
test_df_copy["predicted_boundary"] = test_df_copy["text"].apply(
    lambda x: predict_boundary(model, x, tokenizer, max_len=config['max_len'], device=device)
)
difference = (test_df_copy['label'] - test_df_copy['predicted_boundary']).abs()
boundary_mae = difference.mean()
boundary_sd = difference.std()
print(f"Boundary MAE ± SD: {boundary_mae:.4f} ± {boundary_sd:.4f}")

2025-05-16 16:27:58.734492: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered



Epoch 1/3
Train Loss: 80.6846
Validation Metrics:
Accuracy: 0.9317
Precision: 0.9330
Recall: 0.9317
F1 Score: 0.9320
MCC: 0.8577
MAE: 0.07±0.25
Kappa: 0.8570
New best model with Kappa Score: 0.8570

Epoch 2/3
Train Loss: 34.6524
Validation Metrics:
Accuracy: 0.9080
Precision: 0.9099
Recall: 0.9080
F1 Score: 0.9066
MCC: 0.8051
MAE: 0.09±0.29
Kappa: 0.8006

Epoch 3/3
Train Loss: 26.0526
Validation Metrics:
Accuracy: 0.9075
Precision: 0.9094
Recall: 0.9075
F1 Score: 0.9061
MCC: 0.8040
MAE: 0.09±0.29
Kappa: 0.7996

Best model at epoch 1 with Kappa Score: 0.8570

Test Results:
Accuracy: 0.8778
Precision: 0.8830
Recall: 0.8778
F1 Score: 0.8736
MCC: 0.7321
MAE: 0.12±0.33
Kappa: 0.7194
Boundary MAE ± SD: 30.9973 ± 60.0497
