In [None]:
!pip install pytorch-crf

In [None]:
# Colab-only: mount Google Drive so the CSV files read later under
# /content/drive/MyDrive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from torchcrf import CRF
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, matthews_corrcoef, mean_absolute_error
import matplotlib.pyplot as plt

In [None]:
class MixedTextDataset(Dataset):
    """Token-level 0/1 tagging examples built from (text, boundary index) pairs.

    Each text is split on whitespace.  Words at positions ``<= label`` are
    tagged 0 and every later word is tagged 1.  The tags are then laid onto the
    tokenizer output one per non-special token; special/padding positions and
    any tokens left over once the word tags run out receive ``-100``.

    Args:
        texts: indexable collection of raw texts.
        labels: indexable collection of boundary indices (cast with ``int``).
        tokenizer: callable returning ``input_ids``, ``attention_mask`` and
            ``special_tokens_mask`` tensors of shape ``(1, max_len)``.
        max_len: padded/truncated sequence length requested from the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one example."""
        text = str(self.texts[idx])
        boundary_idx = int(self.labels[idx])
        words = text.split()

        # Word budget; the "- 2" presumably leaves room for the tokenizer's two
        # special tokens (TODO confirm for the tokenizer in use).
        budget = self.max_len - 2
        if len(words) > budget:
            if boundary_idx > budget:
                # Keep the tail of the text and shift the boundary by the number
                # of dropped leading words so it still points at the same word.
                # A negative result means the boundary lies before the kept
                # window, i.e. every kept word is tagged 1.
                dropped = len(words) - budget
                words = words[dropped:]
                boundary_idx -= dropped
            else:
                words = words[:budget]

        word_labels = [0 if i <= boundary_idx else 1 for i in range(len(words))]
        truncated_text = " ".join(words)
        encoding = self.tokenizer(
            truncated_text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            return_special_tokens_mask=True
        )

        # NOTE(review): one word tag is consumed per non-special *token*, so a
        # word that the tokenizer splits into several pieces shifts all later
        # tags.  predict_boundary relies on the same token-ordinal convention,
        # so aligning by word ids would have to change both places together.
        special_flags = encoding['special_tokens_mask'][0].tolist()
        token_labels = []
        next_word = 0
        for is_special in special_flags:
            if is_special or next_word >= len(word_labels):
                token_labels.append(-100)
            else:
                token_labels.append(word_labels[next_word])
                next_word += 1

        # Defensive: force the label vector to exactly max_len entries.
        token_labels = token_labels[:self.max_len]
        if len(token_labels) < self.max_len:
            token_labels.extend([-100] * (self.max_len - len(token_labels)))

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(token_labels)
        }

In [None]:
class RNNCRFTagger(nn.Module):
    """Embedding -> bidirectional LSTM -> LayerNorm -> MLP -> CRF sequence tagger.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        hidden_dim: LSTM hidden size per direction.
        num_labels: number of CRF tags.
        embedding_dim: embedding width.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability used after the embedding, between LSTM
            layers (only when ``num_layers > 1``) and inside the MLP.
    """

    def __init__(self, input_dim, hidden_dim, num_labels, embedding_dim=768, num_layers=2, dropout=0.3):
        super(RNNCRFTagger, self).__init__()
        self.num_labels = num_labels
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.embed_dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=True, dropout=dropout if num_layers > 1 else 0)
        self.layer_norm = nn.LayerNorm(hidden_dim * 2)
        self.hidden2hidden = nn.Sequential(nn.Linear(hidden_dim * 2, hidden_dim * 2), nn.ReLU(), nn.Dropout(dropout))
        self.hidden2tag = nn.Linear(hidden_dim * 2, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

        for name, param in self.named_parameters():
            # Leave LayerNorm at its default (gain 1, bias 0).  Its gain is the
            # only 1-D "weight" in the model; drawing it from N(0, 0.01) would
            # scale the normalised activations to almost zero at start.
            if name.startswith('layer_norm.'):
                continue
            if 'weight' in name:
                if param.dim() > 1:
                    nn.init.xavier_uniform_(param)
                else:
                    nn.init.normal_(param, mean=0, std=0.01)
            elif 'bias' in name:
                nn.init.constant_(param, 0)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return the CRF loss when ``labels`` is given, else decoded tags.

        Args:
            input_ids: integer tensor indexed as ``(batch, seq)``.
            attention_mask: same shape as ``input_ids``; non-zero marks tokens
                that take part in decoding.
            labels: optional tag tensor of the same shape; ``-100`` marks
                positions that must not contribute to the loss.

        Returns:
            With ``labels``: the negated CRF log-likelihood (``reduction='mean'``).
            Without: a ``(batch, seq)`` tensor of decoded tags on the device of
            ``input_ids``, right-padded with 0 beyond each decoded path.
        """
        embedded = self.embedding(input_ids)
        embedded = self.embed_dropout(embedded)
        # Zero the embeddings of masked-out positions before the LSTM.
        embedded = embedded * attention_mask.unsqueeze(-1)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = self.layer_norm(lstm_out)
        lstm_out = self.hidden2hidden(lstm_out)
        logits = self.hidden2tag(lstm_out)

        if labels is not None:
            # Score only positions that carry a real tag.  Previously every
            # attended position was scored and -100 was rewritten to tag 0, so
            # ignored positions were trained as class 0.
            loss_mask = attention_mask.bool() & labels.ne(-100)
            # torchcrf rejects a mask whose first timestep is off, so position 0
            # stays in the loss (with tag 0 if it was -100).
            # Assumes the remaining ignored positions form a trailing run, which
            # torchcrf needs to locate the last scored tag of each sequence.
            loss_mask[:, 0] = True
            crf_labels = labels.clone()
            crf_labels[crf_labels == -100] = 0  # placeholder tag; must be a valid index
            loss = -self.crf(logits, crf_labels, mask=loss_mask, reduction='mean')
            return loss

        decode_mask = attention_mask.bool()
        decoded_paths = self.crf.decode(logits, mask=decode_mask)
        seq_len = attention_mask.size(1)
        # decode() returns one variable-length list per sequence; pad to a rectangle.
        padded_predictions = [path + [0] * (seq_len - len(path)) for path in decoded_paths]
        return torch.tensor(padded_predictions, device=input_ids.device)


In [None]:
def train_model(model, data_loader, optimizer, scheduler, device, clip_value=1.0):
    """Run one training pass over ``data_loader`` and return the mean batch loss.

    Args:
        model: callable as ``model(input_ids, attention_mask, labels)`` returning
            a scalar loss tensor.
        data_loader: sized iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        optimizer: optimizer stepped once per batch.
        scheduler: optional LR scheduler, stepped once per batch when truthy.
        device: device the batch tensors are moved to.
        clip_value: maximum gradient norm applied before each optimizer step.

    Returns:
        Sum of per-batch losses divided by ``len(data_loader)``.
    """
    model.train()
    running_loss = 0
    for batch in data_loader:
        ids, mask, targets = (
            batch[key].to(device)
            for key in ("input_ids", "attention_mask", "labels")
        )

        optimizer.zero_grad()
        batch_loss = model(ids, mask, targets)
        batch_loss.backward()
        # Clip after backward and before the step so the update uses clipped gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_value)
        optimizer.step()
        if scheduler:
            scheduler.step()

        running_loss += batch_loss.item()
    return running_loss / len(data_loader)

def evaluate_model(model, data_loader, device):
    """Score token-level predictions of ``model`` over ``data_loader``.

    Only positions whose attention mask equals 1 and whose label is not
    ``-100`` are scored.

    Args:
        model: callable as ``model(input_ids, attention_mask)`` returning a tag
            tensor shaped like the labels.
        data_loader: iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        device: device the batch tensors are moved to.

    Returns:
        ``(accuracy, precision, recall, f1, mcc, mae, std_dev)`` where
        precision/recall/F1 are weighted averages and ``mae``/``std_dev`` are the
        mean and standard deviation of ``|prediction - label|`` over scored tokens.
    """
    model.eval()
    collected_preds, collected_gold = [], []
    with torch.no_grad():
        for batch in data_loader:
            ids = batch["input_ids"].to(device)
            attn = batch["attention_mask"].to(device)
            gold = batch["labels"].to(device)
            predicted = model(ids, attn)
            # Boolean indexing flattens row by row, so the collected order is
            # the same as walking the batch one sequence at a time.
            keep = (attn == 1) & (gold != -100)
            collected_preds.extend(predicted[keep].cpu().numpy())
            collected_gold.extend(gold[keep].cpu().numpy())

    all_predictions = np.array(collected_preds)
    all_labels = np.array(collected_gold)

    absolute_errors = np.abs(all_predictions - all_labels)
    mae, std_dev = np.mean(absolute_errors), np.std(absolute_errors)

    weighted = {'average': 'weighted', 'zero_division': 0}
    accuracy = accuracy_score(all_labels, all_predictions)
    precision = precision_score(all_labels, all_predictions, **weighted)
    recall = recall_score(all_labels, all_predictions, **weighted)
    f1 = f1_score(all_labels, all_predictions, **weighted)
    mcc = matthews_corrcoef(all_labels, all_predictions)
    return accuracy, precision, recall, f1, mcc, mae, std_dev


In [None]:
# Only the tokenizer of this checkpoint is used; the tagger below learns its
# own embedding table from scratch.
MODEL_NAME = 'microsoft/deberta-v3-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
config = {
    'batch_size': 16,
    'learning_rate': 1e-3,
    'hidden_dim': 512,
    'embedding_dim': 768,
    'num_layers': 2,
    'dropout': 0.3,
    'epochs': 3,
    'max_len': 512,
    'weight_decay': 0.01,
    'gradient_clip': 1.0,
    'seed': 42
}

# Seed before the DataLoader and the model are created so shuffling and weight
# initialisation repeat across runs.
torch.manual_seed(config['seed'])
np.random.seed(config['seed'])

train_df = pd.read_csv('/content/drive/MyDrive/SCI AIGC/sentence_train_data.csv')
dev_df = pd.read_csv('/content/drive/MyDrive/SCI AIGC/sentence_dev_data.csv')

train_texts = train_df["text"].tolist()
train_labels = train_df["label"].tolist()
dev_texts = dev_df["text"].tolist()
dev_labels = dev_df["label"].tolist()

train_dataset = MixedTextDataset(train_texts, train_labels, tokenizer, max_len=config['max_len'])
dev_dataset = MixedTextDataset(dev_texts, dev_labels, tokenizer, max_len=config['max_len'])

train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=config['batch_size'], shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# len(tokenizer) counts added tokens as well; tokenizer.vocab_size is only the
# base vocabulary, so an added-token id could index past the embedding table.
vocab_size = len(tokenizer)

# MixedTextDataset emits exactly two tags (0 before/at the boundary, 1 after).
# The "label" column holds boundary indices, so counting its distinct values
# would size the CRF by the number of different boundaries, not by tag classes.
num_labels = 2

model = RNNCRFTagger(vocab_size, config['hidden_dim'], num_labels,
                     embedding_dim=config['embedding_dim'],
                     num_layers=config['num_layers'],
                     dropout=config['dropout']).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])
# OneCycleLR is stepped once per batch by train_model, hence batches * epochs.
total_steps = len(train_loader) * config['epochs']
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=config['learning_rate'],
                                               total_steps=total_steps, pct_start=0.1, anneal_strategy='cos')


In [None]:
# Train for config['epochs'] epochs, checkpoint whenever validation F1 improves,
# stop early after `patience` epochs without improvement, and finally put the
# best weights back into `model` so later cells evaluate the checkpointed model
# rather than whatever the last epoch produced.
best_f1 = 0.0
best_epoch = 0
best_state = None  # CPU copy of the best weights seen so far
patience = 3
patience_counter = 0
best_model_path = "RNNCRF_bestmodel.pth"
train_losses = []
val_metrics = {'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'mcc': [], 'mae': [], 'std_dev': []}

print("Starting training...")
print(f"Training on device: {device}")
print(f"Number of training examples: {len(train_dataset)}")
print(f"Number of validation examples: {len(dev_dataset)}")

for epoch in range(config['epochs']):
    print(f"\nEpoch {epoch + 1}/{config['epochs']}")
    train_loss = train_model(model, train_loader, optimizer, scheduler, device, config['gradient_clip'])
    train_losses.append(train_loss)

    val_accuracy, val_precision, val_recall, val_f1, val_mcc, val_mae, val_std_dev = evaluate_model(model, dev_loader, device)
    epoch_metrics = {
        'accuracy': val_accuracy,
        'precision': val_precision,
        'recall': val_recall,
        'f1': val_f1,
        'mcc': val_mcc,
        'mae': val_mae,
        'std_dev': val_std_dev
    }
    for metric_name, metric_value in epoch_metrics.items():
        val_metrics[metric_name].append(metric_value)

    print(f"Train Loss: {train_loss:.4f}")
    print("Validation Metrics:")
    print(f"Accuracy: {val_accuracy:.4f}")
    print(f"Precision: {val_precision:.4f}")
    print(f"Recall: {val_recall:.4f}")
    print(f"F1 Score: {val_f1:.4f}")
    print(f"MCC: {val_mcc:.4f}")
    print(f"MAE: {val_mae:.2f}±{val_std_dev:.2f}")

    if val_f1 > best_f1:
        best_f1 = val_f1
        best_epoch = epoch + 1
        patience_counter = 0
        # state_dict() returns references to the live tensors, so copy them
        # before the next epoch overwrites them.
        best_state = {name: tensor.detach().cpu().clone() for name, tensor in model.state_dict().items()}
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'config': config,
            'metrics': {
                'f1': val_f1,
                'accuracy': val_accuracy,
                'precision': val_precision,
                'recall': val_recall,
                'mcc': val_mcc,
                'mae': val_mae,
                'std_dev': val_std_dev
            }
        }, best_model_path)
        print(f"New best model saved with F1 Score: {val_f1:.4f}")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping triggered after {epoch + 1} epochs")
            break

# Without this, `model` would still hold the last epoch's weights even when an
# earlier epoch scored higher on validation.
if best_state is not None:
    model.load_state_dict(best_state)

print("\nTraining completed!")
print(f"Best model saved at epoch {best_epoch} with F1 Score: {best_f1:.4f}")

In [None]:
# Token-level evaluation of the current `model` on the held-out test split.
model.eval()

test_df = pd.read_csv('/content/drive/MyDrive/SCI AIGC/sentence_test.csv')
test_texts = test_df["text"].tolist()
test_labels = test_df["label"].tolist()

# Use the configured sequence length instead of repeating the literal, so the
# test data is always prepared exactly like the train/dev data.
test_dataset = MixedTextDataset(test_texts, test_labels, tokenizer, max_len=config['max_len'])
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

test_accuracy, test_precision, test_recall, test_f1, test_mcc, test_mae, test_std_dev = evaluate_model(model, test_loader, device)

print(f"Test Results: Accuracy: {test_accuracy:.4f}, Precision: {test_precision:.4f}, Recall: {test_recall:.4f}, "
      f"F1 Score: {test_f1:.4f}, MCC: {test_mcc:.4f}, MAE: {test_mae:.2f}±{test_std_dev:.2f}")


In [None]:
def predict_boundary(model, text, tokenizer, max_len, device):
    """Predict the boundary index of ``text`` in the units used by the labels.

    MixedTextDataset tags items ``0..label`` with 0 and everything after with 1,
    assigning one tag per non-special token.  The inverse is therefore the
    ordinal (among non-special tokens) of the first token predicted 1, minus one.

    Args:
        model: callable as ``model(input_ids, attention_mask)`` returning a
            ``(1, max_len)`` tag tensor; put into eval mode here.
        text: raw input text.
        tokenizer: callable returning ``input_ids``, ``attention_mask`` and
            ``special_tokens_mask`` tensors of shape ``(1, max_len)``.
        max_len: padded/truncated sequence length.
        device: device the model inputs are moved to.

    Returns:
        Index of the last token predicted as class 0 before the first class-1
        token.  This is the last token's index when no token is predicted 1,
        and ``-1`` when the very first token is already predicted 1 (or the
        text yields no tokens).
    """
    model.eval()
    encoding = tokenizer(
        text,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
        return_special_tokens_mask=True
    )

    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    # Go through the model's own inference path instead of re-implementing its
    # layers here, so the two cannot drift apart.
    with torch.no_grad():
        predicted_tags = model(input_ids, attention_mask)[0].tolist()

    # Drop special/padding positions first, so indices count real tokens only.
    # The previous code searched the full sequence (special tokens included)
    # and then used that position to index the list of token positions, which
    # offset the result by the leading special token twice over.
    special_flags = encoding['special_tokens_mask'][0].tolist()
    token_tags = [tag for tag, is_special in zip(predicted_tags, special_flags) if not is_special]

    first_class_one = next((i for i, tag in enumerate(token_tags) if tag == 1), len(token_tags))
    return first_class_one - 1


In [None]:
# One forward pass per row; the sequence length comes from the shared config so
# inference uses the same length as training.
test_df["predicted_boundary"] = test_df["text"].apply(
    lambda x: predict_boundary(model, x, tokenizer, max_len=config['max_len'], device=device)
)

In [None]:
# Display the test frame, which now includes the predicted_boundary column.
test_df

In [None]:
# Persist the test frame (with predictions) without the index column.
test_df.to_csv('BiLSTM_CRF.csv', index=False)

In [None]:
# Mean absolute error between the labelled and the predicted boundary index.
mae = test_df['label'].sub(test_df['predicted_boundary']).abs().mean()
mae

In [None]:
# Absolute boundary error per row, summarised as mean ± sample standard deviation.
difference = test_df['label'].sub(test_df['predicted_boundary']).abs()
mae, sd = difference.mean(), difference.std()
print(f"MAE ± SD: {mae:.4f} ± {sd:.4f}")
