In [None]:
# One-time setup: uncomment and run to download the spaCy transformer pipeline loaded below.
# !python -m spacy download en_core_web_trf

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import spacy
import os
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch.nn as nn

# Disable Weights & Biases logging for the Hugging Face Trainer.
# NOTE(review): transformers reports this environment variable as deprecated
# (removal announced for v5) and suggests `report_to` as the replacement.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
train_file = "trainingdata-all-annotations.txt"
val_file = "trialdata-all-annotations.txt"
columns = ["ID", "Target", "Tweet", "Stance", "OpinionTowards", "Sentiment"]


def load_annotations(path):
    """Read one tab-separated annotation file into a DataFrame named by `columns`.

    The file is parsed without a header so that `columns` supplies the names.
    An annotation file that carries its own header line would otherwise get
    that line back as the first *data* row ("ID", "Target", "Tweet", ...), so
    such rows are removed here and the index is renumbered from 0.

    Args:
        path: Path of the annotation file (tab separated, latin-1 encoded).

    Returns:
        DataFrame with the six `columns`. Because a header line may have been
        parsed as data, the ID column can hold strings rather than integers.
    """
    frame = pd.read_csv(path, sep="\t", header=None, encoding='latin-1', names=columns)

    # A header line repeats the column labels in its cells; require two
    # matching cells so a genuine row is never mistaken for a header.
    is_header_row = (
        frame["ID"].astype(str).str.strip().eq("ID")
        & frame["Target"].astype(str).str.strip().eq("Target")
    )
    return frame.loc[~is_header_row].reset_index(drop=True)


# Load the data
train_data = load_annotations(train_file)
val_data = load_annotations(val_file)

# Display the first few rows
print("Training data shape:", train_data.shape)
print("\nFirst few rows of training data:")
display(train_data.head())

Training data shape: (2815, 6)

First few rows of training data:


Unnamed: 0,ID,Target,Tweet,Stance,OpinionTowards,Sentiment
0,ID,Target,Tweet,Stance,Opinion towards,Sentiment
1,101,Atheism,dear lord thank u for all of ur blessings forg...,AGAINST,OTHER,POSITIVE
2,102,Atheism,"Blessed are the peacemakers, for they shall be...",AGAINST,OTHER,POSITIVE
3,103,Atheism,I am not conformed to this world. I am transfo...,AGAINST,OTHER,POSITIVE
4,104,Atheism,Salah should be prayed with #focus and #unders...,AGAINST,OTHER,POSITIVE


In [None]:
# Names of the pretrained artefacts and the size of the classification head
SPACY_PIPELINE = "en_core_web_trf"
MODEL_NAME = "microsoft/deberta-v3-large"
NUM_LABELS = 3

# spaCy pipeline used by the tweet preprocessing step
nlp = spacy.load(SPACY_PIPELINE)

# Tokenizer and sequence classifier loaded from the same checkpoint name
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
)

  model.load_state_dict(torch.load(filelike, map_location=device))
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

In [None]:
# Preprocessing function
def preprocess_text(text, nlp):
    """Clean a tweet and re-join its tokens, keeping each named entity in one piece.

    Cleaning, in order: ``#tag`` becomes ``tag``; ``@name`` becomes ``USER``;
    anything starting with ``http`` becomes ``URL``; every character that is
    not a word character, whitespace or one of ``. , ! ?`` is removed; the
    result is lower-cased and stripped (so the placeholders end up lower-case).

    The cleaned text is then run through `nlp`. Tokens outside entities are
    emitted one by one; a run of tokens forming one named entity is emitted as
    a single item using the span's own text, so its internal spacing is kept.

    Args:
        text: Raw tweet text. ``None`` or a float NaN (a missing pandas cell)
            yields ``""`` without calling `nlp`; other non-strings are
            converted with ``str``.
        nlp: Callable returning a spaCy-style document: it must support
            ``len()``, integer indexing and slicing, with tokens exposing
            ``text``, ``ent_type_`` and ``ent_iob_`` and slices exposing ``text``.

    Returns:
        The processed items joined by single spaces.
    """
    # Missing values have nothing to clean and would make re.sub raise.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)

    # Basic cleaning
    text = re.sub(r'#(\w+)', r'\1', text)
    text = re.sub(r'@(\w+)', 'USER', text)
    text = re.sub(r'http\S+', 'URL', text)
    text = re.sub(r'[^\w\s.,!?]', '', text)
    text = text.lower().strip()

    # SpaCy processing
    doc = nlp(text)
    n_tokens = len(doc)

    # Preserve named entities as single items
    processed_tokens = []
    i = 0
    while i < n_tokens:
        if doc[i].ent_type_:
            # ent_iob_ is a one-character code, so its length says nothing
            # about the entity's size. Extend the span while the following
            # tokens are marked "I" (inside the same entity); a "B" starts a
            # new entity and ends this one.
            end = i + 1
            while end < n_tokens and doc[end].ent_iob_ == "I":
                end += 1
            processed_tokens.append(doc[i:end].text)
            i = end
        else:
            processed_tokens.append(doc[i].text)
            i += 1

    return ' '.join(processed_tokens)

# Clean the tweets of both splits; the result is stored in a new "CleanTweet" column
for progress_message, split_frame in (
    ("Preprocessing training data...", train_data),
    ("Preprocessing validation data...", val_data),
):
    print(progress_message)
    split_frame["CleanTweet"] = split_frame["Tweet"].apply(
        lambda tweet: preprocess_text(tweet, nlp)
    )

# Show one raw/cleaned pair as a quick sanity check
print("\nExample of preprocessing:")
sample_idx = 0
sample_row = train_data.iloc[sample_idx]
print("Original:", sample_row["Tweet"])
print("Cleaned:", sample_row["CleanTweet"])

  with torch.cuda.amp.autocast(self._mixed_precision):


Preprocessing training data...
Preprocessing validation data...


  with torch.cuda.amp.autocast(self._mixed_precision):



Example of preprocessing:
Original: Tweet
Cleaned: tweet


In [None]:
# Integer id assigned to each stance label
stance_mapping = {"FAVOR": 0, "AGAINST": 1, "NONE": 2}


def _prepare_split(frame):
    """Keep rows with a recognised stance and add the StanceLabel and Input columns."""
    # Rows without a stance go first, then anything that is not one of the
    # three labels (for instance a header line that was read in as data).
    frame = frame.dropna(subset=["Stance"])
    frame = frame[frame["Stance"].isin(["FAVOR", "AGAINST", "NONE"])]

    # Numeric label for the classifier
    frame["StanceLabel"] = frame["Stance"].map(stance_mapping).astype(int)

    # Model input: the target and the cleaned tweet joined by a separator string
    frame["Input"] = frame["Target"].astype(str) + " </s> " + frame["CleanTweet"].astype(str)
    return frame


train_data = _prepare_split(train_data)
val_data = _prepare_split(val_data)

# Class distribution per split
for heading, frame in (
    ("Training data stance distribution:", train_data),
    ("\nValidation data stance distribution:", val_data),
):
    print(heading)
    print(frame["Stance"].value_counts())

# Confirm that every remaining row received a label
print("\nCheck for NaNs in StanceLabel:")
for tag, frame in (("Train:", train_data), ("Val:", val_data)):
    print(tag, frame["StanceLabel"].isnull().sum())


Training data stance distribution:
Stance
AGAINST    1342
NONE        741
FAVOR       731
Name: count, dtype: int64

Validation data stance distribution:
Stance
AGAINST    53
NONE       25
FAVOR      22
Name: count, dtype: int64


In [None]:
# List the distinct stance values left in each split
for caption, frame in (
    ("Unique values in Stance column (Train):", train_data),
    ("Unique values in Stance column (Validation):", val_data),
):
    print(caption, frame["Stance"].unique())

Unique values in Stance column (Train): ['AGAINST' 'FAVOR' 'NONE']
Unique values in Stance column (Validation): ['AGAINST' 'FAVOR' 'NONE']


In [None]:
# Dataset class
class EnhancedStanceDataset(Dataset):
    """Torch dataset that tokenizes one row of a stance DataFrame per item.

    Rows are addressed by position, and the frame must provide an ``Input``
    column (the text to tokenize) and a ``StanceLabel`` column (the class id).
    """

    def __init__(self, data, tokenizer, max_length=128):
        """Store the frame, the tokenizer and the fixed sequence length."""
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row `idx`."""
        row = self.data.iloc[idx]

        # Pad or truncate every example to the same length
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer(row['Input'], **tokenizer_options)

        # The tokenizer output is flattened to 1-D tensors
        item = {key: encoded[key].reshape(-1) for key in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(row['StanceLabel'], dtype=torch.long)
        return item

# Wrap both splits so they can be handed to the trainer
train_dataset, val_dataset = (
    EnhancedStanceDataset(split, tokenizer) for split in (train_data, val_data)
)

# "Balanced" weights computed from the training labels, one weight per label id
train_labels = train_data["StanceLabel"]
balanced_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels,
)
class_weights = torch.tensor(balanced_weights, dtype=torch.float)

print("Class weights:", class_weights)

Class weights: tensor([1.2832, 0.6990, 1.2659])


In [None]:
# Metrics and Trainer
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision, recall and F1 for an evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class is the
            argmax over the last axis of `logits`.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    # Index 0..2 are precision, recall and F1; the fourth element (support) is not used
    weighted_scores = precision_recall_fscore_support(labels, predicted, average="weighted")

    return {
        "accuracy": accuracy_score(labels, predicted),
        "precision": weighted_scores[0],
        "recall": weighted_scores[1],
        "f1": weighted_scores[2],
    }

class StanceTrainer(Trainer):
    """Trainer whose loss is class-weighted cross entropy.

    The weights come from the module-level ``class_weights`` tensor, which must
    be defined before training starts.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on a batch and return the weighted cross-entropy loss.

        Args:
            model: The model (or a wrapper around it) to call with the batch.
            inputs: Batch dict; its ``"labels"`` entry is removed before the
                remaining entries are passed to the model.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                the loss alone.
            num_items_in_batch: Accepted so the signature matches callers that
                pass it; not used here.

        Returns:
            The loss tensor, or ``(loss, outputs)`` if `return_outputs` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Weighted Cross Entropy Loss.
        # The weights follow the logits' device instead of `model.device`: the
        # object passed in may be a wrapper (e.g. nn.DataParallel) that has no
        # `.device` attribute, while the logits always carry their device.
        loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss


In [None]:
# Training arguments
def get_training_args(**overrides):
    """Build the TrainingArguments used for fine-tuning.

    Called without arguments it returns the notebook's standard configuration.
    Any keyword argument replaces or adds the setting of the same name, e.g.
    ``get_training_args(num_train_epochs=3)``.

    Args:
        **overrides: Keyword arguments forwarded to ``TrainingArguments``;
            they take precedence over the defaults listed below.

    Returns:
        A ``TrainingArguments`` instance.
    """
    settings = dict(
        output_dir="./results",
        evaluation_strategy="steps",
        eval_steps=100,
        logging_dir="./logs",
        logging_steps=50,
        save_strategy="steps",
        # Kept equal to eval_steps so every evaluated step also has a checkpoint
        save_steps=100,
        save_total_limit=2,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=5,
        learning_rate=2e-5,
        weight_decay=0.01,
        warmup_ratio=0.1,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        # Mixed precision only when a CUDA device is present, so the notebook
        # still runs (in full precision) on a CPU-only runtime.
        fp16=torch.cuda.is_available(),
        gradient_accumulation_steps=2,
        max_grad_norm=1.0,
    )
    settings.update(overrides)
    return TrainingArguments(**settings)

# Build the arguments, then assemble the weighted-loss trainer from its parts
training_args = get_training_args()
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": val_dataset,
    "compute_metrics": compute_metrics,
}
trainer = StanceTrainer(**trainer_kwargs)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
# Directory that receives the model once training has finished
FINAL_MODEL_DIR = "./final_stance_model"

# Fine-tune the classifier
print("Starting training...")
trainer.train()

# Write out the model the trainer holds after training
trainer.save_model(FINAL_MODEL_DIR)
print("Training completed and model saved!")

Starting training...


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
100,1.0454,0.918314,0.61,0.580339,0.61,0.586077
200,0.7045,0.568797,0.82,0.833937,0.82,0.805854
300,0.5639,0.284834,0.93,0.932941,0.93,0.930449
400,0.3458,0.305874,0.91,0.917637,0.91,0.910776
500,0.3989,0.265588,0.91,0.923548,0.91,0.912486
600,0.2556,0.262164,0.95,0.95381,0.95,0.950874
700,0.2523,0.385006,0.9,0.904278,0.9,0.894843
800,0.1602,0.333653,0.91,0.909252,0.91,0.90926


Training completed and model saved!
