In [None]:
!pip install transformers datasets evaluate accelerate sentencepiece

In [None]:
import os
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
import numpy as np
import datasets
from tabulate import tabulate
import nltk
from datetime import datetime

In [None]:

# ! pip install datasets
# ! pip install sentencepiece
# ! pip install rouge_score
! pip install wandb
import wandb
# Authenticate with Weights & Biases.
# The API key must not live in the notebook: it is read from the WANDB_API_KEY
# environment variable (set it from your secret store before running this cell).
# If the variable is unset, `key=None` is passed and wandb resolves credentials
# itself (see the wandb.login documentation for the exact fallback order).
# SECURITY: the key that was previously hard-coded on this line has been exposed
# and should be revoked in the W&B account settings.
import os
wandb.login(key=os.environ.get("WANDB_API_KEY"))
# wandb.init(project="LLM", entity="sa07424-habib-university", settings=wandb.Settings(init_timeout=200))

In [None]:
import json
import torch
import random
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from torch.utils.data import Dataset
from collections import defaultdict

# === AEDA helper ===
PUNCTUATIONS = ['.', ',', '!', '?', ';', ':']


def aeda(sentence, num_insertions=3):
    """Return `sentence` with punctuation tokens inserted at random positions.

    The sentence is split on whitespace; `num_insertions` times, one mark drawn
    from PUNCTUATIONS is inserted at a random position (start, between two
    tokens, or end). The tokens are then re-joined with single spaces.

    Args:
        sentence: Text to augment.
        num_insertions: How many punctuation tokens to insert.

    Returns:
        The augmented text, or `sentence` unchanged when it contains no tokens.
    """
    tokens = sentence.split()
    if len(tokens) == 0:
        return sentence
    for _ in range(num_insertions):
        # randint is inclusive on both ends, so `len(tokens)` means "append".
        slot = random.randint(0, len(tokens))
        mark = random.choice(PUNCTUATIONS)
        tokens[slot:slot] = [mark]
    return ' '.join(tokens)

# === Load Data ===
def _load_json(path):
    """Read one UTF-8 JSON file and return the parsed object."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


# Translated training tweets (one file per language) and the soft gold labels.
data_en = _load_json("/kaggle/input/existdatasets/EXIST2025_training_translated_en.json")
data_es = _load_json("/kaggle/input/existdatasets/EXIST2025_training_translated_es.json")
gold_soft = _load_json("/kaggle/input/existdatasets/EXIST2025_training_task1_2_gold_soft.json")

# Map each gold entry's "id" to its "value" (used below as a label -> score mapping).
gold_soft_dict = dict((entry["id"], entry["value"]) for entry in gold_soft)
label_classes = ["NO", "DIRECT", "REPORTED", "JUDGEMENTAL"]

# === Count Label Distribution ===
# Every gold entry is counted once, under its highest-scoring label.
label_counts = defaultdict(int)
for soft in gold_soft_dict.values():
    label_counts[max(soft, key=soft.get)] += 1

# === Identify underrepresented labels (you can tune this threshold) ===
# A label is underrepresented when its count is below the mean count per label.
avg_count = np.mean(list(label_counts.values()))
underrepresented_labels = []
for label, count in label_counts.items():
    if count < avg_count:
        underrepresented_labels.append(label)

# === Process Tweets & Augment Underrepresented Only ===
def process_data_with_soft_labels(data, augment=True, augment_n=2):
    """Build parallel lists of tweets, soft-label vectors and ids.

    Entries whose "id_EXIST" has no entry in `gold_soft_dict` are skipped.
    When `augment` is true and a tweet's highest-scoring label is listed in
    `underrepresented_labels`, `augment_n` AEDA variants of the tweet are
    appended right after it, sharing its label vector and using ids of the
    form "<id>_aug<k>" (k starting at 1).

    Args:
        data: Mapping whose values are dicts with "id_EXIST" and "tweet" keys.
        augment: Whether to add AEDA variants for underrepresented labels.
        augment_n: Number of variants per augmented tweet.

    Returns:
        (tweets, labels, ids): three lists of equal length; each label is a
        list of scores ordered like `label_classes`.
    """
    tweets, labels, ids = [], [], []

    for record in data.values():
        tweet_id = record["id_EXIST"]
        text = record["tweet"]

        if tweet_id in gold_soft_dict:
            distribution = gold_soft_dict[tweet_id]
            # Missing classes get a score of 0.0.
            vector = [distribution.get(name, 0.0) for name in label_classes]

            # The original tweet always comes first, followed by its variants.
            variants = [(text, tweet_id)]
            dominant = max(distribution, key=distribution.get)
            if augment and dominant in underrepresented_labels:
                for k in range(1, augment_n + 1):
                    variants.append((aeda(text), f"{tweet_id}_aug{k}"))

            for variant_text, variant_id in variants:
                tweets.append(variant_text)
                labels.append(vector)
                ids.append(variant_id)

    return tweets, labels, ids

# Process English and Spanish data
# Build (tweets, soft-label vectors, ids) per language with the function's default arguments.
tweets_en, labels_en, ids_en = process_data_with_soft_labels(data_en)
tweets_es, labels_es, ids_es = process_data_with_soft_labels(data_es)

# tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
# Tokenizer for the same checkpoint name that train_model loads; it is shared by
# both languages and read as a global by get_datasets.
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large")

# === Dataset Class ===
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item.

    Args:
        texts: Sequence of tweet strings.
        labels: Sequence of per-tweet label vectors (converted to float tensors).
        ids: Sequence of tweet identifiers, parallel to `texts`.
        tokenizer: Callable invoked as
            tokenizer(text, truncation=True, padding='max_length',
                      max_length=..., return_tensors='pt').
        max_length: Length every encoded tweet is padded/truncated to.
    """

    def __init__(self, texts, labels, ids, tokenizer, max_length=256):
        self.texts, self.labels, self.ids = texts, labels, ids
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of tweets in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with "id", "input_ids", "attention_mask" and "labels"."""
        item = {"id": self.ids[idx]}
        target = torch.tensor(self.labels[idx], dtype=torch.float)
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        # The tokenizer output carries a leading dimension; squeeze(0) drops it
        # when that dimension has size 1.
        for field in ("input_ids", "attention_mask"):
            item[field] = encoded[field].squeeze(0)
        item["labels"] = target
        return item

# === Train-validation split ===
def get_datasets(tweets, labels, ids):
    """Split the parallel lists 80/20 and wrap each side in a TweetDataset.

    The split uses a fixed random_state of 42; the module-level `tokenizer`
    is passed to both datasets.

    Returns:
        (train_dataset, val_dataset)
    """
    parts = train_test_split(tweets, labels, ids, test_size=0.2, random_state=42)
    # train_test_split yields a (train, test) pair per input, in input order,
    # so even positions are the training parts and odd positions the validation parts.
    train_parts, val_parts = parts[0::2], parts[1::2]
    return (
        TweetDataset(*train_parts, tokenizer),
        TweetDataset(*val_parts, tokenizer),
    )

# === Create datasets ===
# One train/validation pair per language.
# NOTE: these four names are assigned again further down in this cell.
train_dataset_en, val_dataset_en = get_datasets(tweets_en, labels_en, ids_en)
train_dataset_es, val_dataset_es = get_datasets(tweets_es, labels_es, ids_es)

# Report the training-set sizes produced by the split above.
print(f"✅ English train set size: {len(train_dataset_en)} (with selective augmentation)")
print(f"✅ Spanish train set size: {len(train_dataset_es)} (with selective augmentation)")

# Alias of label_classes; train_model uses its length as the number of output labels.
CORRECT_LABELS = label_classes
import json
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

# === Load Tweets ===
def _read_json(path):
    """Parse the UTF-8 JSON file at `path`."""
    with open(path, "r", encoding="utf-8") as fh:
        return json.load(fh)


# Same three files that are read at the top of this cell.
data_en = _read_json("/kaggle/input/existdatasets/EXIST2025_training_translated_en.json")
data_es = _read_json("/kaggle/input/existdatasets/EXIST2025_training_translated_es.json")

# === Load gold_soft_train ===
gold_soft = _read_json("/kaggle/input/existdatasets/EXIST2025_training_task1_2_gold_soft.json")

# Index the gold entries by "id" for constant-time lookup of their "value".
gold_soft_dict = {}
for entry in gold_soft:
    gold_soft_dict[entry["id"]] = entry["value"]

# Label order used for the soft-label vectors (same list object as label_classes).
CORRECT_LABELS = label_classes

# === Process Tweets with Corresponding Soft Labels ===
def process_data_with_soft_labels(data, augment=True, augment_n=2):
    """Build parallel lists of tweets, soft-label vectors and ids.

    This is the definition in effect when the datasets used for training are
    built, so it has to keep the AEDA augmentation of the earlier definition in
    this cell. The previous version of this function silently dropped it, which
    meant the runs saved under "*_aeda" were trained on unaugmented data.

    Entries whose "id_EXIST" has no entry in `gold_soft_dict` are skipped.
    When `augment` is true and a tweet's highest-scoring label is listed in
    `underrepresented_labels`, `augment_n` variants produced by `aeda` are
    appended right after the original tweet. They share its label vector and
    get ids of the form "<id>_aug<k>" (k starting at 1).

    NOTE(review): augmented copies are created before the train/validation
    split, so a variant and its original can land on different sides of the
    split. Confirm this is acceptable for how the validation set is used.

    Args:
        data: Mapping whose values are dicts with "id_EXIST" and "tweet" keys.
        augment: Whether to add AEDA variants for underrepresented labels.
            Pass False to get exactly one row per labelled tweet.
        augment_n: Number of variants per augmented tweet.

    Returns:
        (tweets, labels, ids): three lists of equal length; each label is a
        list of scores ordered like `CORRECT_LABELS`.
    """
    tweets = []
    labels = []
    ids = []

    for entry in data.values():
        tweet_id = entry["id_EXIST"]
        tweet = entry["tweet"]

        if tweet_id not in gold_soft_dict:
            continue  # Skip if soft label not found

        soft_label_dict = gold_soft_dict[tweet_id]

        # One score per class in CORRECT_LABELS order; absent classes score 0.0.
        soft_label_vector = [soft_label_dict.get(label, 0.0) for label in CORRECT_LABELS]

        # Original tweet
        tweets.append(tweet)
        labels.append(soft_label_vector)
        ids.append(tweet_id)

        if not augment:
            continue

        # Augment only tweets whose dominant label is underrepresented.
        main_label = max(soft_label_dict, key=soft_label_dict.get)
        if main_label in underrepresented_labels:
            for i in range(augment_n):
                tweets.append(aeda(tweet))
                labels.append(soft_label_vector)
                ids.append(f"{tweet_id}_aug{i+1}")

    return tweets, labels, ids

# Process both English and Spanish tweets
# Re-run the processing with the definition directly above; this overwrites the
# tweets_*/labels_*/ids_* lists computed earlier in this cell.
tweets_en, labels_en, ids_en = process_data_with_soft_labels(data_en)
tweets_es, labels_es, ids_es = process_data_with_soft_labels(data_es)

# === Custom Dataset Class ===
class TweetDataset(Dataset):
    """Dataset yielding one tokenized tweet, its id and its label vector per index.

    Args:
        texts: Sequence of tweet strings.
        labels: Sequence of label vectors, parallel to `texts`.
        ids: Sequence of tweet identifiers, parallel to `texts`.
        tokenizer: Callable used to encode each tweet on access.
        max_length: Padding/truncation length passed to the tokenizer.
    """

    def __init__(self, texts, labels, ids, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return how many tweets the dataset holds."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode tweet `idx` and return it with its id and float label tensor."""
        target = torch.tensor(self.labels[idx], dtype=torch.float)
        tokenizer_kwargs = dict(
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        encoded = self.tokenizer(self.texts[idx], **tokenizer_kwargs)
        # squeeze(0) removes the leading size-1 dimension of the encoded tensors.
        return dict(
            id=self.ids[idx],
            input_ids=encoded["input_ids"].squeeze(0),
            attention_mask=encoded["attention_mask"].squeeze(0),
            labels=target,
        )

# === Train-validation split ===
def get_datasets(tweets, labels, ids):
    """Create (train_dataset, val_dataset) from parallel tweet/label/id lists.

    20% of the rows go to validation (random_state=42); both datasets use the
    module-level `tokenizer`.
    """
    (
        train_texts, val_texts,
        train_labels, val_labels,
        train_ids, val_ids,
    ) = train_test_split(tweets, labels, ids, test_size=0.2, random_state=42)

    splits = (
        (train_texts, train_labels, train_ids),
        (val_texts, val_labels, val_ids),
    )
    train_dataset, val_dataset = [
        TweetDataset(texts, targets, idents, tokenizer)
        for texts, targets, idents in splits
    ]
    return train_dataset, val_dataset

# === Create datasets ===
# These assignments replace the datasets created earlier in this cell and are
# the ones passed to train_model below.
train_dataset_en, val_dataset_en = get_datasets(tweets_en, labels_en, ids_en)
train_dataset_es, val_dataset_es = get_datasets(tweets_es, labels_es, ids_es)


# === Train Model ===
def train_model(train_dataset, val_dataset, output_dir):
    """Fine-tune "FacebookAI/xlm-roberta-large" on `train_dataset`.

    A fresh sequence-classification model with one output per entry of
    CORRECT_LABELS is loaded for every call, trained for 4 epochs with a
    per-device batch size of 16, and the Trainer is returned after training.

    Args:
        train_dataset: Dataset used for training.
        val_dataset: Dataset handed to the Trainer as `eval_dataset`.
        output_dir: Directory the Trainer writes its outputs to.

    Returns:
        The Trainer instance after `train()` has completed.
    """
    # problem_type="multi_label_classification" is what lets the float soft-label
    # vectors be used as targets (presumably via a BCE-with-logits loss; confirm
    # against the transformers documentation).
    model = AutoModelForSequenceClassification.from_pretrained(
        "FacebookAI/xlm-roberta-large",
        num_labels=len(CORRECT_LABELS),
        problem_type="multi_label_classification",
    )

    # NOTE(review): no evaluation or save strategy is set explicitly, so when
    # evaluation runs and which checkpoints exist at the end of training depend
    # on the TrainingArguments defaults of the installed version. TODO confirm.
    hyperparameters = dict(
        output_dir=output_dir,
        do_train=True,
        do_eval=True,
        num_train_epochs=4,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        logging_dir="./logs",
        logging_steps=100,
        save_total_limit=1,
    )

    trainer = Trainer(
        model=model,
        args=TrainingArguments(**hyperparameters),
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )
    trainer.train()
    return trainer

# === Train English and Spanish models ===
# One independent fine-tuning run per language. Each run writes to its own
# output directory, which the prediction section later scans for checkpoints.
trainer_en = train_model(train_dataset_en, val_dataset_en, output_dir="./results/en_mbert_xlmR_large_aeda")
trainer_es = train_model(train_dataset_es, val_dataset_es, output_dir="./results/es_mbert_xlmR_large_aeda")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

✅ English train set size: 9763 (with selective augmentation)
✅ Spanish train set size: 9763 (with selective augmentation)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss


### Prediction

In [None]:
import json

# Load the dev dataset
with open("/kaggle/input/existdatasets/EXIST2025_dev.json", "r", encoding="utf-8") as f:
    dev_data = json.load(f)

# Split into English & Spanish: one (tweets, ids) pair of lists per language code.
english_dev_tweets, english_dev_ids = [], []
spanish_dev_tweets, spanish_dev_ids = [], []
_dev_buckets = (
    ("en", english_dev_tweets, english_dev_ids),
    ("es", spanish_dev_tweets, spanish_dev_ids),
)

for entry in dev_data.values():
    tweet_id = entry["id_EXIST"]
    tweet = entry["tweet"]
    lang = entry["lang"]

    # Entries whose "lang" is neither "en" nor "es" are ignored.
    for code, bucket_tweets, bucket_ids in _dev_buckets:
        if lang == code:
            bucket_tweets.append(tweet)
            bucket_ids.append(tweet_id)
            break

# Debugging: Check split sizes
print(f"English Dev Samples: {len(english_dev_tweets)}")
print(f"Spanish Dev Samples: {len(spanish_dev_tweets)}")


In [None]:
import os
from transformers import BertForSequenceClassification, AutoModelForSequenceClassification, AutoTokenizer

# Function to get the latest checkpoint
def get_latest_checkpoint(directory="./results"):
    """Return the path of the highest-numbered "checkpoint-<step>" directory.

    Only sub-directories of `directory` whose name is "checkpoint-" followed by
    a decimal step number are considered. Plain files and entries with a
    non-numeric suffix (e.g. "checkpoint-tmp") are ignored instead of crashing
    the integer conversion or being selected by mistake.

    Args:
        directory: Folder that contains the checkpoint sub-directories.

    Returns:
        `os.path.join(directory, name)` for the checkpoint with the largest step.

    Raises:
        ValueError: If no valid checkpoint directory exists in `directory`.
        OSError: Propagated from `os.listdir` (e.g. the folder does not exist).
    """
    prefix = "checkpoint-"
    step_by_name = {}
    for name in os.listdir(directory):
        suffix = name[len(prefix):]
        if not (name.startswith(prefix) and suffix.isdecimal()):
            continue
        if not os.path.isdir(os.path.join(directory, name)):
            continue
        step_by_name[name] = int(suffix)

    if not step_by_name:
        raise ValueError(f"No checkpoints found in {directory}")

    # Compare numerically so that e.g. checkpoint-1500 beats checkpoint-500.
    latest_checkpoint = max(step_by_name, key=step_by_name.get)
    return os.path.join(directory, latest_checkpoint)

# Load the latest (highest step number) checkpoint for English and Spanish.
# Note: this is the most recent checkpoint on disk, not one selected by a metric.
latest_checkpoint_en = get_latest_checkpoint("./results/en_mbert_xlmR_large_aeda")
latest_checkpoint_es = get_latest_checkpoint("./results/es_mbert_xlmR_large_aeda")

print(f"Using latest checkpoint for English: {latest_checkpoint_en}")
print(f"Using latest checkpoint for Spanish: {latest_checkpoint_es}")

# One fine-tuned model per language; the tokenizer is loaded from the base
# checkpoint name and shared by both.
model_en = AutoModelForSequenceClassification.from_pretrained(latest_checkpoint_en)
model_es = AutoModelForSequenceClassification.from_pretrained(latest_checkpoint_es)
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large")


## HARD Predictions

In [None]:
def predict_hard_labels_from_soft_model(tweets, ids, model, tokenizer, label_classes, output_file):
    """Predict one hard label per tweet with the soft-label model and save them.

    For every tweet the class with the highest model score is selected from
    `label_classes`. The predictions are written to `output_file` as a JSON
    list (the previous version printed a "saved" message without writing
    anything) and are also returned.

    Args:
        tweets: Iterable of tweet texts.
        ids: Iterable of tweet ids, parallel to `tweets`.
        model: Classification model; called with the tokenizer output and
            expected to return an object with a `.logits` tensor.
        tokenizer: Callable producing a dict of tensors for one tweet.
        label_classes: Label names indexed like the model's output positions.
        output_file: Path of the JSON file to write.

    Returns:
        List of dicts with keys "test_case", "id" and "value" (a one-element
        list holding the predicted label).
    """
    import json  # local import so the function does not depend on another cell

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # faster on gpu
    model.to(device)
    # Make sure dropout is disabled regardless of the mode the caller left the model in.
    model.eval()

    results = []

    for tweet, tweet_id in zip(tweets, ids):
        encoding = tokenizer(tweet, truncation=True, padding="max_length", max_length=256, return_tensors="pt")
        encoding = {key: val.to(device) for key, val in encoding.items()}
        with torch.no_grad():
            outputs = model(**encoding)

        # Drop the batch dimension (one tweet per forward pass).
        logits = outputs.logits.squeeze(0)

        # Sigmoid is monotonic, so the arg-max of the logits is also the
        # arg-max of the per-class probabilities.
        max_index = int(torch.argmax(logits))
        predicted_label = label_classes[max_index]

        results.append({
            "test_case": "EXIST2025",
            "id": tweet_id,
            "value": [predicted_label]  # Only one label
        })

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)
    print(f"Hard label predictions saved to {output_file}")

    return results

# Run hard-label prediction per language, each with its own fine-tuned model.
# The last argument is the JSON file name the predictions are meant to go to.
predict_hard_labels_from_soft_model(english_dev_tweets, english_dev_ids, model_en, tokenizer, label_classes, "EXIST2025_dev_predictions_hard_xlmR_en.json")
predict_hard_labels_from_soft_model(spanish_dev_tweets, spanish_dev_ids, model_es, tokenizer, label_classes, "EXIST2025_dev_predictions_hard_xlmR_es.json")

In [None]:
import json

# NOTE(review): these input names ("..._hard_merged_*.json") differ from the
# file names passed to predict_hard_labels_from_soft_model above
# ("..._hard_xlmR_*.json"). Confirm which prediction files should be merged.
with open("/kaggle/working/EXIST2025_dev_predictions_hard_merged_es.json", "r", encoding="utf-8") as f:
    es_data = json.load(f)
with open("/kaggle/working/EXIST2025_dev_predictions_hard_merged_en.json", "r", encoding="utf-8") as f:
    en_data = json.load(f)

# Both files must hold a list of predictions; Spanish entries come first in the merge.
if not (isinstance(es_data, list) and isinstance(en_data, list)):
    raise ValueError("JSON structure is not a list. Ensure both files contain lists.")
merged_data = es_data + en_data

predictions = merged_data

# Flatten "value" from a list to its first label; non-list values pass through unchanged.
converted = [
    {
        "test_case": entry["test_case"],
        "id": entry["id"],
        "value": entry["value"][0] if isinstance(entry["value"], list) else entry["value"],
    }
    for entry in predictions
]

output_file = "EXIST2025_dev_predictions_merged_hard_flat.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(converted, f, indent=4)

print(f"Predictions converted to gold format and saved to {output_file}")

In [None]:
import torch
import json
from tqdm import tqdm
!pip install pyEvall
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils

In [None]:
# Paths of the flattened predictions and of the hard gold labels for the dev set.
predictions = "/kaggle/working/EXIST2025_dev_predictions_merged_hard_flat.json"
gold = "/kaggle/input/existdatasets/EXIST2025_dev_task1_2_gold_hard.json"

# Metrics used for the hard-label evaluation.
metrics = ["ICM", "ICMNorm", "FMeasure"]

# Request the embedded report format.
params = {PyEvALLUtils.PARAM_REPORT: PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED}

test = PyEvALLEvaluation()
report = test.evaluate(predictions, gold, metrics, **params)
report.print_report()