In [3]:
!pip install transformers datasets evaluate accelerate sentencepiece

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=2.0.0->accelerate)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=2.0.0->accelerate)
  Downloading nvidia_

In [4]:
import os
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
import numpy as np
import datasets
from tabulate import tabulate
import nltk
from datetime import datetime

In [5]:

# ! pip install datasets
# ! pip install sentencepiece
# ! pip install rouge_score
! pip install wandb
import wandb
# wandb login}
wandb.login(key="6930a5bf7436e98e8f1d44766c7b999ee9621ba9")
# wandb.init(project="LLM", entity="sa07424-habib-university", settings=wandb.Settings(init_timeout=200))



[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msa07424[0m ([33msa07424-habib-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
label_classes = ['NO', 'DIRECT','REPORTED','JUDGEMENTAL']
label2id = {'NO': 0, 'DIRECT': 1, 'REPORTED': 2, 'JUDGEMENTAL': 3}
id2label  = {v: k for k, v in class2id.items()}

#### Gold set ignores "UNKNOWN"

In [58]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [11]:
class2id = {'NO': 0, 'DIRECT': 1, 'REPORTED': 2, 'JUDGEMENTAL': 3}
id2class = {v: k for k, v in class2id.items()}

### Hyperparameter Tuning through randomized search

In [None]:
import torch
import numpy as np
import random
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd

# Label mappings
class2id = {'NO': 0, 'DIRECT': 1, 'REPORTED': 2, 'JUDGEMENTAL': 3}
id2class = {v: k for k, v in class2id.items()}
import random
import numpy as np

random.seed(42)
np.random.seed(42)


# Load and preprocess dataset
# def load_and_prepare_dataset(csv_path):
#     df = pd.read_csv(csv_path)
#     df = df[['tweet', 'label']].rename(columns={'tweet': 'text'})
#     df['label'] = df['label'].map(class2id)
#     df = df.dropna().astype({'label': 'int'})
#     df = df.sample(frac=1, random_state=42)  # Shuffle
#     dataset = Dataset.from_pandas(df)
#     train_test = dataset.train_test_split(test_size=0.1)
#     return train_test['train'], train_test['test']
def load_and_prepare_dataset(csv_path):
    df = pd.read_csv(csv_path)
    df = df[['tweet', 'label']].rename(columns={'tweet': 'text'})
    df = df.dropna().astype({'label': 'int'})  # Ensure labels are integers
    df = df.sample(frac=1, random_state=42)  # Shuffle
    dataset = Dataset.from_pandas(df)
    train_test = dataset.train_test_split(test_size=0.1)
    return train_test['train'], train_test['test']

# Tokenization
def preprocess(train_data, val_data, tokenizer, max_length=256):
    def tokenize_fn(example):
        return tokenizer(example['text'], truncation=True, padding='max_length', max_length=max_length)
    train_data = train_data.map(tokenize_fn, batched=True)
    val_data = val_data.map(tokenize_fn, batched=True)
    return train_data, val_data

# Metrics computation
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

# Hyperparameter search space
param_dist = {
    "learning_rate": [1e-5, 3e-5, 5e-5],
    "per_device_train_batch_size": [8, 16, 32, 64],
    "num_train_epochs": [2, 3, 4],
    "weight_decay": [0.0, 0.01, 0.1],
    "warmup_steps": [100, 200, 500],
}

def random_search(train_data, val_data, tokenizer, model, n_iter=25):
    best_score = -1
    best_params = None

    for i in range(n_iter):
        print(f"Iter {i}")
        # Randomly sample hyperparameters
        params = {
            "learning_rate": random.choice(param_dist["learning_rate"]),
            "per_device_train_batch_size": random.choice(param_dist["per_device_train_batch_size"]),
            "num_train_epochs": 3,
            # "num_train_epochs": random.choice(param_dist["num_train_epochs"]),
            "weight_decay": random.choice(param_dist["weight_decay"]),
            "warmup_steps": random.choice(param_dist["warmup_steps"]),
        }

        # Training arguments with sampled params
        training_args = TrainingArguments(
            output_dir="./results",
            evaluation_strategy="epoch",
            save_strategy="epoch",
            logging_steps=50,
            load_best_model_at_end=True,
            **params
        )

        # Trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_data,
            eval_dataset=val_data,
            tokenizer=tokenizer,
            data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
            compute_metrics=compute_metrics,
        )

        # Train and evaluate
        trainer.train()
        eval_results = trainer.evaluate()

        # Track best model
        current_score = eval_results["eval_f1"]
        if current_score > best_score:
            best_score = current_score
            best_params = params

    return best_params, best_score

def train_model(csv_path, model_checkpoint, save_name, n_iter=25):
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=4, )

    train_data, val_data = load_and_prepare_dataset(csv_path)
    train_data, val_data = preprocess(train_data, val_data, tokenizer)

    # Perform random search
    best_params, best_score = random_search(train_data, val_data, tokenizer, model, n_iter=n_iter)
    print(f"Best Hyperparameters: {best_params}")
    print(f"Best F1 Score: {best_score:.4f}")

    # Train final model with best params
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_steps=50,
        load_best_model_at_end=True,
        **best_params
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=val_data,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
    )

    trainer.train()
    trainer.save_model(f"{save_name}_best_model")

# Run training with hyperparameter tuning
train_model("/kaggle/input/existdatasets/trainin_gold_labels_en.csv", "distilroberta-base", "distilroberta-base", n_iter=25)

* #### For English
Best Hyperparameters: {'learning_rate': 5e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 2, 'weight_decay': 0.1, 'warmup_steps': 200}
#### Best F1 Score: 0.5134

Best Hyperparameters: {'learning_rate': 5e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 3, 'weight_decay': 0.0, 'warmup_steps': 500}
#### Best F1 Score: 0.5676

#### For Spanish
Best Hyperparameters: {'learning_rate': 5e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 2, 'weight_decay': 0.1, 'warmup_steps': 200}
#### Best F1 Score: 0.5181

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

import pandas as pd
import numpy as np

def load_and_prepare_dataset(csv_path):
    df = pd.read_csv(csv_path)
    df = df[['tweet', 'label']].rename(columns={'tweet': 'text'})
    df = df.dropna().astype({'label': 'int'})  # Ensure labels are integers
    df = df.sample(frac=1, random_state=42)  # Shuffle
    dataset = Dataset.from_pandas(df)
    train_test = dataset.train_test_split(test_size=0.1)
    return train_test['train'], train_test['test']


def preprocess(train_data, val_data, tokenizer, max_length=256):
    def tokenize_fn(example):
        return tokenizer(example['text'], truncation=True, padding='max_length', max_length=max_length)
    train_data = train_data.map(tokenize_fn, batched=True)
    val_data = val_data.map(tokenize_fn, batched=True)
    return train_data, val_data

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

def train_model(json_path, model_checkpoint, save_name):
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=4)

    train_data, val_data = load_and_prepare_dataset(json_path)
    train_data, val_data = preprocess(train_data, val_data, tokenizer)

    training_args = TrainingArguments(
        output_dir="results",
        num_train_epochs=4,
        # learning_rate=5e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        # warmup_steps=500,
        # weight_decay=0.0,
        logging_dir="logs",
        logging_steps=50,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    # data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_data,
        eval_dataset=val_data,
        tokenizer=tokenizer,
        # data_collator=data_collator,
        compute_metrics=compute_metrics
    )
    trainer.train()
    metrics = trainer.evaluate()
    # print(metrics)
    print("Evaluation Results:")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")
    
# Best Hyperparameters: {'learning_rate': 5e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 2, 'weight_decay': 0.1, 'warmup_steps': 200}

    save_path = f"{save_name}_mergedlang_es"
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"Model saved to {save_path}")
    return model,tokenizer

model,tokenizer = train_model("/kaggle/input/existdatasets/trainin_gold_labels_es.csv", "FacebookAI/xlm-roberta-base", "FacebookAI/xlm-roberta-base")

In [60]:
model.save_pretrained("/kaggle/working/distilroberta-base_mergedlang_en")
tokenizer.save_pretrained("/kaggle/working/distilroberta-base_mergedlang_en")


('/kaggle/working/distilroberta-base_mergedlang_en/tokenizer_config.json',
 '/kaggle/working/distilroberta-base_mergedlang_en/special_tokens_map.json',
 '/kaggle/working/distilroberta-base_mergedlang_en/vocab.json',
 '/kaggle/working/distilroberta-base_mergedlang_en/merges.txt',
 '/kaggle/working/distilroberta-base_mergedlang_en/added_tokens.json',
 '/kaggle/working/distilroberta-base_mergedlang_en/tokenizer.json')

In [89]:
def train_model(encodings, labels, num_labels, modelname):
    """Train a BERT model on the given encodings and labels."""
    # Split the data into training and validation sets
    train_inputs, val_inputs, train_labels, val_labels = train_test_split(
        encodings['input_ids'], labels, test_size=0.2, random_state=42
    )
    train_masks, val_masks = train_test_split(
        encodings['attention_mask'], test_size=0.2, random_state=42
    )

    train_encodings = {'input_ids': train_inputs, 'attention_mask': train_masks}
    val_encodings = {'input_ids': val_inputs, 'attention_mask': val_masks}

    # # Convert labels to tensors
    # train_labels = torch.tensor(train_labels)
    # val_labels = torch.tensor(val_labels)

    train_dataset = CustomDataset(train_encodings, train_labels)
    val_dataset = CustomDataset(val_encodings, val_labels)

    # if 'bert-' in modelname:
    #     model = BertForSequenceClassification.from_pretrained(modelname,num_labels=4,problem_type="multi_label_classification")    
    # elif 'distilbert' in modelname:
    #     model = DistilBertTokenizer.from_pretrained(model)
    # else:
    model = AutoModelForSequenceClassification.from_pretrained(modelname, num_labels=4, id2label=id2label, label2id=label2id)
    
    training_args = TrainingArguments(
        output_dir="./dismiss",
        learning_rate=2e-5,
        eval_strategy="epoch",
        weight_decay=0.0048,
        num_train_epochs=4,              #4 epochs were found to be optimal after which performance decreases
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        save_steps=500,
        save_strategy="epoch", 
        save_total_limit=2,
        logging_dir='./logs',  # optional: directory for logs
        greater_is_better=False,      # Lower eval_loss is better
        warmup_steps=500,             # Learning rate warmup
        load_best_model_at_end=True,


    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
   
    )
    # print("Model: "+modelname+"Mode: "+)
    trainer.train()
    return trainer, model

### Base Training with Translated Augmented Data

In [None]:
import json
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

# === Load Tweets ===
with open("/kaggle/input/existdatasets/EXIST2025_training_translated_en.json", "r", encoding="utf-8") as f:
    data_en = json.load(f)

with open("/kaggle/input/existdatasets/EXIST2025_training_translated_es.json", "r", encoding="utf-8") as f:
    data_es = json.load(f)

# === Load gold_soft_train ===
with open("/kaggle/input/existdatasets/EXIST2025_training_task1_2_gold_soft.json", "r", encoding="utf-8") as f:
    gold_soft = json.load(f)

# Convert gold_soft to a dict for fast access
gold_soft_dict = {entry["id"]: entry["value"] for entry in gold_soft}

CORRECT_LABELS = label_classes

# === Process Tweets with Corresponding Soft Labels ===
def process_data_with_soft_labels(data):
    tweets = []
    labels = []
    ids = []

    for entry in data.values():
        tweet_id = entry["id_EXIST"]

        if tweet_id not in gold_soft_dict:
            continue  # Skip if soft label not found

        soft_label_dict = gold_soft_dict[tweet_id]

        soft_label_vector = [soft_label_dict.get(label, 0.0) for label in CORRECT_LABELS]
        tweet = entry["tweet"]
        tweets.append(tweet)
        labels.append(soft_label_vector)
        ids.append(tweet_id)

    return tweets, labels, ids

# Process both English and Spanish tweets
tweets_en, labels_en, ids_en = process_data_with_soft_labels(data_en)
tweets_es, labels_es, ids_es = process_data_with_soft_labels(data_es)

# === Tokenizer ===
# tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")

# === Custom Dataset Class ===
class TweetDataset(Dataset):
    def __init__(self, texts, labels, ids, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tweet_id = self.ids[idx]
        labels = torch.tensor(self.labels[idx], dtype=torch.float)
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        return {
            "id": tweet_id,
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": labels
        }

# === Train-validation split ===
def get_datasets(tweets, labels, ids):
    train_texts, val_texts, train_labels, val_labels, train_ids, val_ids = train_test_split(
        tweets, labels, ids, test_size=0.2, random_state=42
    )
    train_dataset = TweetDataset(train_texts, train_labels, train_ids, tokenizer)
    val_dataset = TweetDataset(val_texts, val_labels, val_ids, tokenizer)
    return train_dataset, val_dataset

# === Create datasets ===
train_dataset_en, val_dataset_en = get_datasets(tweets_en, labels_en, ids_en)
train_dataset_es, val_dataset_es = get_datasets(tweets_es, labels_es, ids_es)

# === Train Model ===
def train_model(train_dataset, val_dataset, output_dir):


    # model_en = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/distilroberta-base_mergedlang_en")
# tokenizer_en = AutoTokenizer.from_pretrained("/kaggle/working/distilroberta-base_mergedlang_en")
    model = AutoModelForSequenceClassification.from_pretrained(
            "FacebookAI/xlm-roberta-base",
            num_labels=len(CORRECT_LABELS),
            problem_type="multi_label_classification"
        )

    # model = BertForSequenceClassification.from_pretrained(
    #     "FacebookAI/xlm-roberta-base",
    #     num_labels=len(CORRECT_LABELS),
    #     problem_type="multi_label_classification"
    # )

    training_args = TrainingArguments(
    output_dir=output_dir,
    do_train=True,
    do_eval=True,
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2
)


    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    trainer.train()
    return trainer

# === Train English and Spanish models ===
trainer_en = train_model(train_dataset_en, val_dataset_en, output_dir="./results/en_xlm_roberta")
# trainer_es = train_model(train_dataset_es, val_dataset_es, output_dir="./results/es")

In [15]:
trainer_es.evaluate()



{'eval_loss': 0.4226243197917938,
 'eval_runtime': 14.1753,
 'eval_samples_per_second': 97.635,
 'eval_steps_per_second': 3.104,
 'epoch': 4.0}

In [90]:
from sklearn.metrics import accuracy_score, precision_score, f1_score
import torch
from transformers import EvalPrediction

def compute_metrics(eval_pred: EvalPrediction):
    logits, labels = eval_pred
    logits = torch.tensor(logits)  # Convert logits to a PyTorch tensor
    preds = torch.argmax(logits, dim=1).numpy()  # Get the predicted class indices
    
    # No need to call .numpy() on labels, as it is already a NumPy array
    labels = labels  # labels are already a NumPy array

    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')

    return {'accuracy': accuracy, 'precision': precision, 'f1': f1}


### Models Tried
##### Best ones: FacebookAI/xlm-roberta-base, FacebookAI/xlm-roberta-large, distilroberta-base, cardiffnlp/twitter-xlm-roberta-base


In [91]:
# PlanTL-GOB-ES/RoBERTalex
# FacebookAI/xlm-roberta-base
# distilbert/distilbert-base-uncased
# google-bert/bert-base-multilingual-uncased
# JonatanGk/roberta-base-bne-finetuned-hate-speech-offensive-spanish
# distilroberta-base
modelname = "distilroberta-base"

In [92]:
encodings, labels, tokenizer = initializetokenizer(modelname, "en")

In [None]:
trainer,model = train_model(encodings, labels, 4, modelname)

In [18]:
# models = ["PlanTL-GOB-ES/RoBERTalex",
# "FacebookAI/xlm-roberta-base",
# "distilbert/distilbert-base-uncased",
# "google-bert/bert-base-multilingual-uncased",
# "JonatanGk/roberta-base-bne-finetuned-hate-speech-offensive-spanish",
# "FacebookAI/xlm-roberta-base"]

In [19]:
model.save_pretrained("/kaggle/working/bert-base-multilingual-cased_en_backtrans")
tokenizer.save_pretrained("/kaggle/working/bert-base-multilingual-cased_en_backtrans")

('/kaggle/working/bert-base-multilingual-cased_en_backtrans/tokenizer_config.json',
 '/kaggle/working/bert-base-multilingual-cased_en_backtrans/special_tokens_map.json',
 '/kaggle/working/bert-base-multilingual-cased_en_backtrans/vocab.txt',
 '/kaggle/working/bert-base-multilingual-cased_en_backtrans/added_tokens.json',
 '/kaggle/working/bert-base-multilingual-cased_en_backtrans/tokenizer.json')

In [None]:
trainedmodels=[]
for modelname in models:
    model = "model"+modelname+"_en_backtrans"
    encodings, labels, tokenizer = initializetokenizer(modelname, "en")
    trainer,model = train_model(encodings, labels, 4, modelname)
    trainedmodels.append([trainer,model])

In [None]:
for modelname in range(len(models)):
    trained[model][0].save_pretrained("/kaggle/working/"+models[modelname]+"_en_backtrans")
    trained[model][1].save_pretrained("/kaggle/working/"+models[modelname]+"_en_backtrans")

In [None]:
for modelname in range(len(models)):
    trained[model]

In [37]:
model.save_pretrained("/kaggle/working/xlmroberta_es_aug")
tokenizer.save_pretrained("/kaggle/working/xlmroberta_es_aug")

('/kaggle/working/xlmroberta_es_aug/tokenizer_config.json',
 '/kaggle/working/xlmroberta_es_aug/special_tokens_map.json',
 '/kaggle/working/xlmroberta_es_aug/sentencepiece.bpe.model',
 '/kaggle/working/xlmroberta_es_aug/added_tokens.json',
 '/kaggle/working/xlmroberta_es_aug/tokenizer.json')

In [61]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

def classify_text_with_model(tokenizer, text, model):
    """
    Classify the input text using the given model and tokenizer.
    Assumes multi-class classification task.
    """
    # Automatically detect available device (GPU if available, otherwise CPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Automatically choose GPU if available
    
    # Move model to the correct device
    model.to(device)

    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Move the inputs to the same device as the model
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Set the model to evaluation mode
    model.eval()

    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted class (highest logit)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()  # Get the index of the predicted class

    return predicted_class

# Example usage:
# Assuming the model and tokenizer are loaded
model = DistilBertForSequenceClassification.from_pretrained('./english_model')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

text = "I dont support women who judge women"
predicted_class = classify_text_with_model(tokenizer, text, model)
print(f"Predicted class: {id2label[predicted_class]}")

Predicted class: REPORTED


### Preprocessing

In [19]:
import json
import re

def clean_and_save_tweets(input_path, output_path):
    def clean(text):
            # text = re.sub(r'https?://\S+', '[URL]', text)   # Replace URL with token
            text = re.sub(r'\s+', ' ', text).strip()        # Normalize spaces
            return text

    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for entry in data.values():
        if 'tweet' in entry:
            entry['tweet'] = clean(entry['tweet'])

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)

# Example usage
clean_and_save_tweets(
    '/kaggle/input/existdatasets/EXIST2025_training_translated_en.json',
    'EXIST2025_training_translated_en_cleaned.json'
)

## With Annotator Data

In [None]:
import torch
torch.cuda.empty_cache()

In [2]:
import os
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
import numpy as np
import datasets
from tabulate import tabulate
import nltk
from datetime import datetime

2025-05-11 17:34:42.688565: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746984882.875198      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746984882.926624      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Attempt 1

In [10]:
import json
import numpy as np
from collections import defaultdict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

# === Helper function to split dataset by annotator ===
def split_data_by_annotator(data):
    split_data = defaultdict(list)  # dictionary to hold data for each annotator

    # Iterate through each entry in the dataset
    for entry in data.values():
        tweet_id = entry["id_EXIST"]
        tweet = entry["tweet"]

        # Collect metadata for each annotator
        for i in range(6):  # Assuming there are 6 annotators
            annotator_key = f"Annotator_{i+1}"
            annotator_metadata = {
                "gender": entry["gender_annotators"][i],
                "age": entry["age_annotators"][i],
                "ethnicity": entry["ethnicities_annotators"][i],
                "study_level": entry["study_levels_annotators"][i],
                "country": entry["countries_annotators"][i]
            }

            # For each annotator, create a separate dataset with metadata
            split_data[annotator_key].append({
                "id_EXIST": tweet_id,
                "tweet": tweet,
                "metadata": annotator_metadata,
                "labels_task1_1": entry["labels_task1_1"][i],
                "labels_task1_2": entry["labels_task1_2"][i],
                "labels_task1_3": entry["labels_task1_3"][i]
            })

    return split_data

# === Define a custom Dataset class ===
class TweetDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=256):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]['tweet']
        labels = [self.data[idx]['labels_task1_2']]  # Use labels from task 1.2 only
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

# === Fine-tune the Component Model for each annotator ===
def fine_tune_component_model(train_data, model_name="cardiffnlp/twitter-xlm-roberta-base", num_labels=3):
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    
    # Prepare dataset for training
    train_dataset = TweetDataset(train_data, tokenizer)
    
    training_args = TrainingArguments(
        output_dir="./results",  
        num_train_epochs=4,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        logging_dir="./logs",
        logging_steps=100,
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset
    )
    
    trainer.train()
    return model

# === Get soft and hard labels from predictions ===
def get_soft_labels(predictions):
    # Soft label: Compute the probability distribution for each class
    soft_labels = np.mean(predictions, axis=0)  # Average the predicted probabilities
    soft_labels /= np.sum(soft_labels)  # Ensure the sum is 1
    return soft_labels

def get_hard_labels(predictions, threshold=2):
    # Hard label: Majority vote based on predictions
    hard_labels = np.array([1 if np.sum(pred == 1) > threshold else 0 for pred in zip(*predictions)])
    return hard_labels

# === Main code ===
def train_and_process_language_data(data, language, tokenizer, model_name="cardiffnlp/twitter-xlm-roberta-base"):
    # 1. Split the dataset by annotator
    split_data = split_data_by_annotator(data)

    # 2. Fine-tune models for each annotator
    component_models = []
    for annotator_key, train_data in split_data.items():
        model = fine_tune_component_model(train_data, model_name)
        component_models.append(model)

    # 3. Collect predictions from all models
    predictions = []
    for model in component_models:
        # Assuming each model outputs probabilities for each class
        pred = model.predict(val_dataset)  # Modify as per your model prediction logic
        predictions.append(pred)

    # 4. Calculate soft labels (probability distribution)
    soft_labels = get_soft_labels(predictions)

    # 5. Calculate hard labels (majority voting)
    hard_labels = get_hard_labels(predictions)

    # 6. Save results in required submission format
    def save_results(soft_labels, hard_labels, output_file):
        results = {
            "soft_labels": soft_labels,
            "hard_labels": hard_labels
        }
        with open(output_file, "w") as f:
            json.dump(results, f)

    save_results(soft_labels, hard_labels, f"final_output_{language}.json")


# === Load and train for English ===
with open("/kaggle/input/existdatasets/EXIST2025_training_translated_en.json", "r", encoding="utf-8") as f:
    data_en = json.load(f)

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base")
train_and_process_language_data(data_en, "en", tokenizer)

# === Load and train for Spanish ===
with open("/kaggle/input/existdatasets/EXIST2025_training_translated_es.json", "r", encoding="utf-8") as f:
    data_es = json.load(f)

train_and_process_language_data(data_es, "es", tokenizer)

config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.




Step,Training Loss
100,0.4644
200,0.4212
300,0.4072
400,0.3926
500,0.386
600,0.368


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
100,0.4613
200,0.4215
300,0.4005
400,0.3865
500,0.3784
600,0.3597




#### Attempt 2

In [13]:
import json
import torch
import random
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from collections import defaultdict

# === Load Data ===
with open("/kaggle/input/existdatasets/EXIST2025_training_translated_en.json", "r", encoding="utf-8") as f:
    data_en = json.load(f)
with open("/kaggle/input/existdatasets/EXIST2025_training_translated_es.json", "r", encoding="utf-8") as f:
    data_es = json.load(f)
with open("/kaggle/input/existdatasets/EXIST2025_training_task1_2_gold_soft.json", "r", encoding="utf-8") as f:
    gold_soft = json.load(f)

gold_soft_dict = {entry["id"]: entry["value"] for entry in gold_soft}
label_classes = ["NO", "DIRECT", "REPORTED", "JUDGEMENTAL"]

# === Count Label Distribution ===
label_counts = defaultdict(int)
for soft in gold_soft_dict.values():
    max_label = max(soft, key=soft.get)
    label_counts[max_label] += 1

# === Identify underrepresented labels (you can tune this threshold) ===
avg_count = np.mean(list(label_counts.values()))
underrepresented_labels = [label for label, count in label_counts.items() if count < avg_count]
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base")

config.json:   0%|          | 0.00/652 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [16]:
# === AEDA helper ===
PUNCTUATIONS = ['.', ',', '!', '?', ';', ':']
def aeda(sentence, num_insertions=3):
    words = sentence.split()
    if not words:
        return sentence
    new_words = words.copy()
    for _ in range(num_insertions):
        insert_pos = random.randint(0, len(new_words))
        punct = random.choice(PUNCTUATIONS)
        new_words.insert(insert_pos, punct)
    return ' '.join(new_words)
class TweetDataset(Dataset):
    def __init__(self, texts, labels, ids, tokenizer, max_length=256, is_soft=True):
        self.texts = texts
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_soft = is_soft

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tweet_id = self.ids[idx]
        label = torch.tensor(self.labels[idx], dtype=torch.float if self.is_soft else torch.long)
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        return {
            "id": tweet_id,
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": label
        }
def process_data_with_soft_labels(data, gold_soft_dict, label_classes, augment=True, augment_n=2):
    tweets, labels, ids = [], [], []

    for entry in data.values():
        tweet_id = entry["id_EXIST"]
        tweet = entry["tweet"]

        if tweet_id not in gold_soft_dict:
            continue

        soft_label_dict = gold_soft_dict[tweet_id]
        soft_label_vector = [soft_label_dict.get(label, 0.0) for label in label_classes]

        # Original tweet
        tweets.append(tweet)
        labels.append(soft_label_vector)
        ids.append(tweet_id)

        # Determine primary label
        main_label = max(soft_label_dict, key=soft_label_dict.get)

        # Augment only if underrepresented
        if augment and main_label in underrepresented_labels:
            for i in range(augment_n):
                augmented_tweet = aeda(tweet)
                tweets.append(augmented_tweet)
                labels.append(soft_label_vector)
                ids.append(f"{tweet_id}_aug{i+1}")

    return tweets, labels, ids
def process_data_with_hard_labels(data, gold_soft_dict, label_classes):
    tweets, labels, ids = [], [], []

    for entry in data.values():
        tweet_id = entry["id_EXIST"]
        tweet = entry["tweet"]

        if tweet_id not in gold_soft_dict:
            continue

        soft_label_dict = gold_soft_dict[tweet_id]
        hard_label = max(soft_label_dict, key=soft_label_dict.get)
        hard_label_vector = [1 if label == hard_label else 0 for label in label_classes]

        # Collect original tweet and label
        tweets.append(tweet)
        labels.append(hard_label_vector)
        ids.append(tweet_id)

    return tweets, labels, ids

def get_datasets(tweets, labels, ids, tokenizer, is_soft=True):
    train_texts, val_texts, train_labels, val_labels, train_ids, val_ids = train_test_split(
        tweets, labels, ids, test_size=0.2, random_state=42
    )
    train_dataset = TweetDataset(train_texts, train_labels, train_ids, tokenizer, is_soft=is_soft)
    val_dataset = TweetDataset(val_texts, val_labels, val_ids, tokenizer, is_soft=is_soft)
    return train_dataset, val_dataset
def train_model(train_dataset, val_dataset, output_dir, model_name="cardiffnlp/twitter-xlm-roberta-base", num_labels=4):
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        problem_type="multi_label_classification"
    )

    training_args = TrainingArguments(
        output_dir=output_dir,
        do_train=True,
        do_eval=True,
        num_train_epochs=4,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        logging_dir="./logs",
        logging_steps=100,
        save_total_limit=1,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    trainer.train()
    return trainer


In [17]:
# Process and train for soft labels
tweets_en, labels_en, ids_en = process_data_with_soft_labels(data_en, gold_soft_dict, label_classes)
tweets_es, labels_es, ids_es = process_data_with_soft_labels(data_es, gold_soft_dict, label_classes)

train_dataset_en, val_dataset_en = get_datasets(tweets_en, labels_en, ids_en, tokenizer, is_soft=True)
train_dataset_es, val_dataset_es = get_datasets(tweets_es, labels_es, ids_es, tokenizer, is_soft=True)

trainer_en = train_model(train_dataset_en, val_dataset_en, output_dir="./results/en_xlm_roberta_soft")
trainer_es = train_model(train_dataset_es, val_dataset_es, output_dir="./results/es_xlm_roberta_soft")

# Process and train for hard labels
# tweets_en_hard, labels_en_hard, ids_en_hard = process_data_with_hard_labels(data_en, gold_soft_dict, label_classes)
# tweets_es_hard, labels_es_hard, ids_es_hard = process_data_with_hard_labels(data_es, gold_soft_dict, label_classes)

# train_dataset_en_hard, val_dataset_en_hard = get_datasets(tweets_en_hard, labels_en_hard, ids_en_hard, tokenizer, is_soft=False)
# train_dataset_es_hard, val_dataset_es_hard = get_datasets(tweets_es_hard, labels_es_hard, ids_es_hard, tokenizer, is_soft=False)

# trainer_en_hard = train_model(train_dataset_en_hard, val_dataset_en_hard, output_dir="./results/en_xlm_roberta_hard")
# trainer_es_hard = train_model(train_dataset_es_hard, val_dataset_es_hard, output_dir="./results/es_xlm_roberta_hard")


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
100,0.4915
200,0.4627
300,0.4429
400,0.4292
500,0.4193
600,0.4109
700,0.3943
800,0.3898
900,0.3911
1000,0.3759


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
100,0.5032
200,0.4693
300,0.4505
400,0.4368
500,0.4251
600,0.4167
700,0.4
800,0.3943
900,0.3959
1000,0.3788




### Testing Annotator

In [29]:
import json

# Load the dev dataset
with open("/kaggle/input/existdatasets/EXIST2025_dev.json", "r", encoding="utf-8") as f:
    dev_data = json.load(f)

# Split into English & Spanish
english_dev_tweets = []
english_dev_ids = []
spanish_dev_tweets = []
spanish_dev_ids = []

for entry in dev_data.values():
    tweet_id = entry["id_EXIST"]
    tweet = entry["tweet"]
    lang = entry["lang"]

    if lang == "en":
        english_dev_tweets.append(tweet)
        english_dev_ids.append(tweet_id)
    elif lang == "es":
        spanish_dev_tweets.append(tweet)
        spanish_dev_ids.append(tweet_id)

# Debugging: Check split sizes
print(f"English Dev Samples: {len(english_dev_tweets)}")
print(f"Spanish Dev Samples: {len(spanish_dev_tweets)}")

English Dev Samples: 489
Spanish Dev Samples: 549


In [30]:
import os
from transformers import BertForSequenceClassification, AutoModelForSequenceClassification, AutoTokenizer

# Function to get the latest checkpoint
def get_latest_checkpoint(directory="./results"):
    checkpoints = [d for d in os.listdir(directory) if d.startswith("checkpoint-")]
    if not checkpoints:
        raise ValueError(f"No checkpoints found in {directory}")
    latest_checkpoint = sorted(checkpoints, key=lambda x: int(x.split('-')[-1]))[-1]
    return os.path.join(directory, latest_checkpoint)

# Load the best model checkpoint for English and Spanish
latest_checkpoint_en = get_latest_checkpoint("./results/en_xlm_roberta_soft")
latest_checkpoint_es = get_latest_checkpoint("./results/es_xlm_roberta_soft")

print(f"Using latest checkpoint for English: {latest_checkpoint_en}")
print(f"Using latest checkpoint for Spanish: {latest_checkpoint_es}")

model_en = AutoModelForSequenceClassification.from_pretrained(latest_checkpoint_en)
model_es = AutoModelForSequenceClassification.from_pretrained(latest_checkpoint_es)
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base")


Using latest checkpoint for English: ./results/en_xlm_roberta_soft/checkpoint-1224
Using latest checkpoint for Spanish: ./results/es_xlm_roberta_soft/checkpoint-1224


## HARD Predictions

In [33]:
def predict_hard_labels_from_soft_model(tweets, ids, model, tokenizer, label_classes, output_file):
    """
    Uses the soft model to predict a single hard label: "YES" or "NO".
    - Assigns the label with the higher probability.
    """
    # model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")#faster on gpu
    model.to(device)

    results = []

    for tweet, tweet_id in zip(tweets, ids):
        # encoding = tokenizer(text=tweet, truncation=True, padding="max_length", max_length=256, return_tensors="pt")
        
        encoding = tokenizer(tweet, truncation=True, padding="max_length", max_length=256, return_tensors="pt")
        encoding = {key: val.to(device) for key, val in encoding.items()}
        with torch.no_grad():
            outputs = model(**encoding)

        logits = outputs.logits.squeeze()
        probs = torch.sigmoid(logits).cpu().numpy()

        # Pick the label with the highest probability (YES or NO)
        max_index = int(probs.argmax())
        predicted_label = label_classes[max_index]

        results.append({
            "test_case": "EXIST2025",
            "id": tweet_id,
            "value": [predicted_label]  # Only one label
        })
    print(f"Hard label predictions saved to {output_file}")

predict_hard_labels_from_soft_model(english_dev_tweets, english_dev_ids, model_en, tokenizer, label_classes, "EXIST2025_dev_predictions_hard_merged_en.json")
predict_hard_labels_from_soft_model(spanish_dev_tweets, spanish_dev_ids, model_es, tokenizer, label_classes, "EXIST2025_dev_predictions_hard_merged_es.json")

Hard label predictions saved to EXIST2025_dev_predictions_hard_merged_en.json
Hard label predictions saved to EXIST2025_dev_predictions_hard_merged_es.json


In [34]:
import json

with open("/kaggle/working/EXIST2025_dev_predictions_hard_merged_es.json", "r", encoding="utf-8") as f:
    es_data = json.load(f)
with open("/kaggle/working/EXIST2025_dev_predictions_hard_merged_en.json", "r", encoding="utf-8") as f:
    en_data = json.load(f)

# Assuming both files contain lists of predictions, merge them
if isinstance(es_data, list) and isinstance(en_data, list):
    merged_data = es_data + en_data
else:
    raise ValueError("JSON structure is not a list. Ensure both files contain lists.")

import json

predictions = merged_data

converted = []
for entry in predictions:
    # Convert the "value" list to a single string (first label only)
    new_entry = {
        "test_case": entry["test_case"],
        "id": entry["id"],
        "value": entry["value"][0] if isinstance(entry["value"], list) else entry["value"]
    }
    converted.append(new_entry)
output_file = "EXIST2025_dev_predictions_merged_hard_flat.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(converted, f, indent=4)

print(f"Predictions converted to gold format and saved to {output_file}")

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/working/EXIST2025_dev_predictions_hard_merged_es.json'

In [None]:
# Run inference on each tweet
output = []
for case_id, case_data in tqdm(test_data.items()):
    if case_data['lang']=='es':
        continue
    tweet = case_data["tweet"]

    # Tokenize the tweet
    inputs = tokenizer(tweet, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Ensure the model and inputs are on the same device (use GPU if available)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Get model predictions
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted class index
    logits = outputs.logits
    predicted_class_idx = torch.argmax(logits, dim=1).item()

    # Map the predicted class index to label
    predicted_label = id2label[predicted_class_idx]

    # Append the result to the output list
    output.append({
        "test_case": "EXIST2025",
        "id": case_data["id_EXIST"],
        "value": predicted_label
    })

# Save the results to an output JSON file
output_json_file = "distil_en_hard_predictions_mergedlang.json"  # Specify the output file path
with open(output_json_file, "w") as f:
    json.dump(output, f, indent=4)

print(f"Results saved to {output_json_file}")

from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils
predictions = "/kaggle/working/distil_en_hard_predictions_mergedlang.json"         
gold = "/kaggle/input/existdatasets/EXIST2025_dev_task1_2_gold_hard.json" 
test = PyEvALLEvaluation() 
params= dict() 
params[PyEvALLUtils.PARAM_REPORT]= PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED  
metrics=["ICM", "ICMNorm" ,"FMeasure"]                  # for hard        
report= test.evaluate(predictions, gold, metrics, **params) 
report.print_report()

In [None]:
from numba import cuda
cuda.select_device(0)
cuda.close()
cuda.select_device(0)


--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.11/logging/__init__.py", line 1114, in emit
    self.flush()
  File "/usr/lib/python3.11/logging/__init__.py", line 1094, in flush
    self.stream.flush()
OSError: [Errno 28] No space left on device
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>
    ColabKernelApp.launch_instance()
  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start
    self.io_loop.start()
  File "/usr/local/lib/python3.11/dist-packages/tornado/platform/asyncio.py", line 205, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.11/asyncio/base_events.py", line 608, in run_forever
    self._run_

2025-04-24 09:41:17,568 - numba.cuda.cudadrv.driver - INFO -   ensure_initialized() - init
2025-04-24 09:41:17,570 - numba.cuda.cudadrv.driver - INFO -                reset() - reset context of device 0


# To Generate Prediction File

In [15]:
model = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/distilroberta-base_mergedlang_en")
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/distilroberta-base_mergedlang_en")

In [6]:
import json

# Load the dev dataset
with open("/kaggle/input/existdatasets/EXIST2025_dev.json", "r", encoding="utf-8") as f:
    dev_data = json.load(f)

# Split into English & Spanish
english_dev_tweets = []
english_dev_ids = []
spanish_dev_tweets = []
spanish_dev_ids = []

for entry in dev_data.values():
    # tweet = entry["tweet"]
    # annotator_info = {
    #     # "country": entry.get("countries_annotators", []),
    #     "study_level": entry.get("study_levels_annotators", []),
    #     "ethnicity": entry.get("ethnicities_annotators", []),
    #     # "age": entry.get("age_annotators", []),
    #     # "gender": entry.get("gender_annotators", [])
    # }

    # # Flatten and format metadata into string
    # annotator_str = " | ".join(
    #     f"{key}: {', '.join(map(str, value))}" for key, value in annotator_info.items()
    # )
    # full_text = f"{tweet} [ANNOTATORS] {annotator_str}"
    tweet_id = entry["id_EXIST"]
    tweet = entry["tweet"]
    lang = entry["lang"]

    if lang == "en":
        english_dev_tweets.append(tweet)
        english_dev_ids.append(tweet_id)
    elif lang == "es":
        spanish_dev_tweets.append(tweet)
        spanish_dev_ids.append(tweet_id)

# Debugging: Check split sizes
print(f"English Dev Samples: {len(english_dev_tweets)}")
print(f"Spanish Dev Samples: {len(spanish_dev_tweets)}")


English Dev Samples: 489
Spanish Dev Samples: 549


In [30]:
print(english_dev_tweets[10])


@esjayXX @EcuadorianMum @monsalore They so remind me of MGTOW (Men go their own way) in US full of men who hate women obessively talking about women. Just go your own way, we don't fcking care. And the envy pics of creepy men not having lunch but staring into their camera alone or from a women's loo!


In [12]:
import os
from transformers import BertForSequenceClassification, AutoModelForSequenceClassification, AutoTokenizer

# Function to get the latest checkpoint
def get_latest_checkpoint(directory="./results"):
    checkpoints = [d for d in os.listdir(directory) if d.startswith("checkpoint-")]
    if not checkpoints:
        raise ValueError(f"No checkpoints found in {directory}")
    latest_checkpoint = sorted(checkpoints, key=lambda x: int(x.split('-')[-1]))[-1]
    return os.path.join(directory, latest_checkpoint)

# Load the best model checkpoint for English and Spanish
latest_checkpoint_en = get_latest_checkpoint("./results/en_xlm_roberta_fb_aeda")
latest_checkpoint_es = get_latest_checkpoint("./results/es_xlm_roberta_fb_aeda")

print(f"Using latest checkpoint for English: {latest_checkpoint_en}")
print(f"Using latest checkpoint for Spanish: {latest_checkpoint_es}")

# Load models
# model_en = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/distilroberta-base_mergedlang_en")
# tokenizer_en = AutoTokenizer.from_pretrained("/kaggle/working/distilroberta-base_mergedlang_en")
# model_es = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/distilroberta-base_mergedlang_es")
# tokenizer_es = AutoTokenizer.from_pretrained("/kaggle/working/distilroberta-base_mergedlang_es")
# Load models
model_en = AutoModelForSequenceClassification.from_pretrained(latest_checkpoint_en)
model_es = AutoModelForSequenceClassification.from_pretrained(latest_checkpoint_es)
# tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
# tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base")
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")


Using latest checkpoint for English: ./results/en_xlm_roberta_fb_aeda/checkpoint-865
Using latest checkpoint for Spanish: ./results/es_xlm_roberta_fb_aeda/checkpoint-865


### hard

In [33]:
# Check for None or empty tweets in Spanish data
for i, (tweet, tweet_id) in enumerate(zip(spanish_dev_tweets, spanish_dev_ids)):
    if not tweet:
        print(f"Empty tweet at index {i}, ID: {tweet_id}")

In [37]:
def predict_hard_labels_from_soft_model(tweets, ids, model, tokenizer, label_classes, output_file):
    """
    Uses the soft model to predict a single hard label: "YES" or "NO".
    - Assigns the label with the higher probability.
    """
    # model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")#faster on gpu
    model.to(device)

    results = []

    for tweet, tweet_id in zip(tweets, ids):
        # encoding = tokenizer(text=tweet, truncation=True, padding="max_length", max_length=256, return_tensors="pt")
        
        encoding = tokenizer(tweet, truncation=True, padding="max_length", max_length=256, return_tensors="pt")
        encoding = {key: val.to(device) for key, val in encoding.items()}
        with torch.no_grad():
            outputs = model(**encoding)

        logits = outputs.logits.squeeze()
        probs = torch.sigmoid(logits).cpu().numpy()

        # Pick the label with the highest probability (YES or NO)
        max_index = int(probs.argmax())
        predicted_label = label_classes[max_index]

        results.append({
            "test_case": "EXIST2025",
            "id": tweet_id,
            "value": [predicted_label]  # Only one label
        })
    print(f"Hard label predictions saved to {output_file}")

predict_hard_labels_from_soft_model(english_dev_tweets, english_dev_ids, model_en, tokenizer, label_classes, "EXIST2025_dev_predictions_hard_merged_en.json")
predict_hard_labels_from_soft_model(spanish_dev_tweets, spanish_dev_ids, model_es, tokenizer, label_classes, "EXIST2025_dev_predictions_hard_merged_es.json")

Hard label predictions saved to EXIST2025_dev_predictions_hard_merged_en.json
Hard label predictions saved to EXIST2025_dev_predictions_hard_merged_es.json


In [38]:
import json

# Load the Spanish predictions
with open("/kaggle/working/EXIST2025_dev_predictions_hard_merged_es.json", "r", encoding="utf-8") as f:
    es_data = json.load(f)

# Load the English predictions
with open("/kaggle/working/EXIST2025_dev_predictions_hard_merged_en.json", "r", encoding="utf-8") as f:
    en_data = json.load(f)

# Assuming both files contain lists of predictions, merge them
if isinstance(es_data, list) and isinstance(en_data, list):
    merged_data = es_data + en_data
else:
    raise ValueError("JSON structure is not a list. Ensure both files contain lists.")

import json

predictions = merged_data

converted = []
for entry in predictions:
    # Convert the "value" list to a single string (first label only)
    new_entry = {
        "test_case": entry["test_case"],
        "id": entry["id"],
        "value": entry["value"][0] if isinstance(entry["value"], list) else entry["value"]
    }
    converted.append(new_entry)
output_file = "EXIST2025_dev_predictions_merged_hard_flat.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(converted, f, indent=4)

print(f"Predictions converted to gold format and saved to {output_file}")

Predictions converted to gold format and saved to EXIST2025_dev_predictions_merged_hard_flat.json


In [39]:
import torch
import json
from tqdm import tqdm


In [23]:
!pip install pyEvall

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [40]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils
predictions = "/kaggle/working/EXIST2025_dev_predictions_merged_hard_flat.json"         
gold = "/kaggle/input/existdatasets/EXIST2025_dev_task1_2_gold_hard.json" 
test = PyEvALLEvaluation() 
params= dict() 
params[PyEvALLUtils.PARAM_REPORT]= PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED  
metrics=["ICM", "ICMNorm" ,"FMeasure"]                  # for hard        
report= test.evaluate(predictions, gold, metrics, **params) 
report.print_report()

2025-04-24 10:01:38,707 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM', 'ICMNorm', 'FMeasure']
2025-04-24 10:01:38,799 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-24 10:01:39,136 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Normalized evaluation method
2025-04-24 10:01:39,139 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-24 10:01:39,497 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-24 10:01:39,818 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method
{
  "metrics": {
    "ICM": {
      "name": "Information Contrast model",
      "acronym": "ICM",
      "description": "Coming soon!",
      "status": "OK",
      "results": {
        "test_cases": [{
          "name": "EXIST2025",
          "average": -0.42508

### SOFT


In [28]:
def predict_on_dev(tweets, ids, model, tokenizer, label_classes, output_file):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")#faster on gpu
    model.to(device)
    model.eval()
    results = []

    for tweet, tweet_id in zip(tweets, ids):
        # encoding = tokenizer(tweet, truncation=True, padding="max_length", max_length=256, return_tensors="pt")
        encoding = tokenizer(tweet, truncation=True, padding="max_length", max_length=256, return_tensors="pt")
        encoding = {key: val.to(device) for key, val in encoding.items()}

        with torch.no_grad():
            outputs = model(**encoding)

        logits = outputs.logits.squeeze()
        probs = torch.sigmoid(logits).cpu().numpy()

        # Convert probabilities to dictionary format and sort by highest probability
        soft_label_dict = {label_classes[i]: float(probs[i]) for i in range(len(label_classes))}
        sorted_soft_label_dict = dict(sorted(soft_label_dict.items(), key=lambda item: item[1], reverse=True))  # Sort descending

        results.append({
            "test_case": "EXIST2025",
            "id": tweet_id,
            "value": sorted_soft_label_dict  # Rename "soft_label" to "value" and sort it
        })

    # Save results
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)

    print(f"Predictions saved to {output_file}")
    
# label_classes = CORRECT_LABELS


# Run predictions
predict_on_dev(english_dev_tweets, english_dev_ids, model_en, tokenizer, label_classes, "EXIST2025_dev_predictions_soft_merged_en.json")
predict_on_dev(spanish_dev_tweets, spanish_dev_ids, model_es, tokenizer, label_classes, "EXIST2025_dev_predictions_soft_merged_es.json")


Predictions saved to EXIST2025_dev_predictions_soft_merged_en.json
Predictions saved to EXIST2025_dev_predictions_soft_merged_es.json


In [29]:
import json

# Load the Spanish predictions
with open("/kaggle/working/EXIST2025_dev_predictions_soft_merged_es.json", "r", encoding="utf-8") as f:
    es_data = json.load(f)

# Load the English predictions
with open("/kaggle/working/EXIST2025_dev_predictions_soft_merged_en.json", "r", encoding="utf-8") as f:
    en_data = json.load(f)

# Assuming both files contain lists of predictions, merge them
if isinstance(es_data, list) and isinstance(en_data, list):
    merged_data = es_data + en_data
else:
    raise ValueError("JSON structure is not a list. Ensure both files contain lists.")

# Save to a new file
output_filename = "EXIST2025_dev_predictions_merged_soft_distilroberta.json"
with open(output_filename, "w", encoding="utf-8") as f:
    json.dump(merged_data, f, indent=4, ensure_ascii=False)

print(f"Merging complete! Saved to {output_filename}")


Merging complete! Saved to EXIST2025_dev_predictions_merged_soft_distilroberta.json


In [30]:
import json
import numpy as np

# Load your predictions file
with open('EXIST2025_dev_predictions_merged_soft_distilroberta.json', 'r') as f:
    predictions = json.load(f)

# Define the snapping values (multiples of 1/6)
snap_vals = np.array([i / 6 for i in range(7)])  # [0.0, 0.1667, ..., 1.0]

def snap_to_nearest_sixth(value):
    return float(snap_vals[np.argmin(np.abs(snap_vals - value))])

# Snap each value in the 'value' dict
for entry in predictions:
    entry['value'] = {k: snap_to_nearest_sixth(v) for k, v in entry['value'].items()}

# Save the snapped predictions to a new file
with open('EXIST2025_dev_predictions_snapped_soft.json', 'w') as f:
    json.dump(predictions, f, indent=2)


In [16]:
# print(predictions)

In [31]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils
predictions = "/kaggle/working/EXIST2025_dev_predictions_snapped_soft.json"         
gold = "/kaggle/input/existdatasets/EXIST2025_dev_task1_2_gold_soft.json" 
test = PyEvALLEvaluation() 
params= dict() 
params[PyEvALLUtils.PARAM_REPORT]= PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED  
metrics=["ICMSoft", "ICMSoftNorm", "CrossEntropy"]      # for soft    
report= test.evaluate(predictions, gold, metrics, **params) 
report.print_report()

2025-04-24 08:58:38,226 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICMSoft', 'ICMSoftNorm', 'CrossEntropy']
2025-04-24 08:58:38,503 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Soft evaluation method
2025-04-24 08:58:39,453 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM-Soft Normalized evaluation method
2025-04-24 08:58:39,456 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Soft evaluation method
2025-04-24 08:58:40,380 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Soft evaluation method
2025-04-24 08:58:41,274 - pyevall.metrics.metrics - INFO -             evaluate() - Executing Cross Entropy evaluation method
{
  "metrics": {
    "ICMSoft": {
      "name": "Information Contrast Model Soft",
      "acronym": "ICM-Soft",
      "description": "Coming soon!",
      "status": "OK",
      "results": {
        "test_cases": [{
         

### AEDA AUGMENTATION

First create combined dataset with gold labels

In [11]:
import json
import torch
import random
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from torch.utils.data import Dataset

# AEDA helper
PUNCTUATIONS = ['.', ',', '!', '?', ';', ':']
def aeda(sentence, num_insertions=3):
    words = sentence.split()
    if not words:
        return sentence
    new_words = words.copy()
    for _ in range(num_insertions):
        insert_pos = random.randint(0, len(new_words))
        punct = random.choice(PUNCTUATIONS)
        new_words.insert(insert_pos, punct)
    return ' '.join(new_words)

# === Load Data ===
with open("/kaggle/input/existdatasets/EXIST2025_training_translated_en.json", "r", encoding="utf-8") as f:
    data_en = json.load(f)
with open("/kaggle/input/existdatasets/EXIST2025_training_translated_es.json", "r", encoding="utf-8") as f:
    data_es = json.load(f)
with open("/kaggle/input/existdatasets/EXIST2025_training_task1_2_gold_soft.json", "r", encoding="utf-8") as f:
    gold_soft = json.load(f)

gold_soft_dict = {entry["id"]: entry["value"] for entry in gold_soft}

# === Process Tweets & Apply AEDA ===
def process_data_with_soft_labels(data, augment=True, augment_n=2):
    tweets, labels, ids = [], [], []

    for entry in data.values():
        tweet_id = entry["id_EXIST"]
        tweet = entry["tweet"]

        if tweet_id not in gold_soft_dict:
            continue

        soft_label_dict = gold_soft_dict[tweet_id]
        soft_label_vector = [soft_label_dict.get(label, 0.0) for label in label_classes]

        # Original tweet
        tweets.append(tweet)
        labels.append(soft_label_vector)
        ids.append(tweet_id)

        # Augmented tweets
        if augment:
            for i in range(augment_n):
                augmented_tweet = aeda(tweet)
                tweets.append(augmented_tweet)
                labels.append(soft_label_vector)
                ids.append(f"{tweet_id}_aug{i+1}")

    return tweets, labels, ids

# Process both languages
tweets_en, labels_en, ids_en = process_data_with_soft_labels(data_en)
tweets_es, labels_es, ids_es = process_data_with_soft_labels(data_es)

# === Tokenizer ===
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# === Dataset Class ===
class TweetDataset(Dataset):
    def __init__(self, texts, labels, ids, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tweet_id = self.ids[idx]
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        return {
            "id": tweet_id,
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": label
        }

# === Train-validation split ===
def get_datasets(tweets, labels, ids):
    train_texts, val_texts, train_labels, val_labels, train_ids, val_ids = train_test_split(
        tweets, labels, ids, test_size=0.2, random_state=42
    )
    train_dataset = TweetDataset(train_texts, train_labels, train_ids, tokenizer)
    val_dataset = TweetDataset(val_texts, val_labels, val_ids, tokenizer)
    return train_dataset, val_dataset

# === Create datasets ===
train_dataset_en, val_dataset_en = get_datasets(tweets_en, labels_en, ids_en)
train_dataset_es, val_dataset_es = get_datasets(tweets_es, labels_es, ids_es)

print(f"✅ English train set size: {len(train_dataset_en)} (with augmentation)")
print(f"✅ Spanish train set size: {len(train_dataset_es)} (with augmentation)")


✅ English train set size: 16608 (with augmentation)
✅ Spanish train set size: 16608 (with augmentation)


### With underrepresented labels only

In [16]:
import json
import torch
import random
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from torch.utils.data import Dataset
from collections import defaultdict

# === AEDA helper ===
PUNCTUATIONS = ['.', ',', '!', '?', ';', ':']
def aeda(sentence, num_insertions=3):
    words = sentence.split()
    if not words:
        return sentence
    new_words = words.copy()
    for _ in range(num_insertions):
        insert_pos = random.randint(0, len(new_words))
        punct = random.choice(PUNCTUATIONS)
        new_words.insert(insert_pos, punct)
    return ' '.join(new_words)

# === Load Data ===
with open("/kaggle/input/existdatasets/EXIST2025_training_translated_en.json", "r", encoding="utf-8") as f:
    data_en = json.load(f)
with open("/kaggle/input/existdatasets/EXIST2025_training_translated_es.json", "r", encoding="utf-8") as f:
    data_es = json.load(f)
with open("/kaggle/input/existdatasets/EXIST2025_training_task1_2_gold_soft.json", "r", encoding="utf-8") as f:
    gold_soft = json.load(f)

gold_soft_dict = {entry["id"]: entry["value"] for entry in gold_soft}
label_classes = ["NO", "DIRECT", "REPORTED", "JUDGEMENTAL"]

# === Count Label Distribution ===
label_counts = defaultdict(int)
for soft in gold_soft_dict.values():
    max_label = max(soft, key=soft.get)
    label_counts[max_label] += 1

# === Identify underrepresented labels (you can tune this threshold) ===
avg_count = np.mean(list(label_counts.values()))
underrepresented_labels = [label for label, count in label_counts.items() if count < avg_count]

# === Process Tweets & Augment Underrepresented Only ===
def process_data_with_soft_labels(data, augment=True, augment_n=2):
    tweets, labels, ids = [], [], []

    for entry in data.values():
        tweet_id = entry["id_EXIST"]
        tweet = entry["tweet"]

        if tweet_id not in gold_soft_dict:
            continue

        soft_label_dict = gold_soft_dict[tweet_id]
        soft_label_vector = [soft_label_dict.get(label, 0.0) for label in label_classes]

        # Original tweet
        tweets.append(tweet)
        labels.append(soft_label_vector)
        ids.append(tweet_id)

        # Determine primary label
        main_label = max(soft_label_dict, key=soft_label_dict.get)

        # Augment only if underrepresented
        if augment and main_label in underrepresented_labels:
            for i in range(augment_n):
                augmented_tweet = aeda(tweet)
                tweets.append(augmented_tweet)
                labels.append(soft_label_vector)
                ids.append(f"{tweet_id}_aug{i+1}")

    return tweets, labels, ids

# Process English and Spanish data
tweets_en, labels_en, ids_en = process_data_with_soft_labels(data_en)
tweets_es, labels_es, ids_es = process_data_with_soft_labels(data_es)

In [17]:
# === Tokenizer ===
# model = AutoModelForSequenceClassification.from_pretrained(
#         "cardiffnlp/twitter-xlm-roberta-base",
#         num_labels=len(CORRECT_LABELS),
#         problem_type="multi_label_classification"
#     )
# tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base")

# === Dataset Class ===
class TweetDataset(Dataset):
    def __init__(self, texts, labels, ids, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tweet_id = self.ids[idx]
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        return {
            "id": tweet_id,
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": label
        }

# === Train-validation split ===
def get_datasets(tweets, labels, ids):
    train_texts, val_texts, train_labels, val_labels, train_ids, val_ids = train_test_split(
        tweets, labels, ids, test_size=0.2, random_state=42
    )
    train_dataset = TweetDataset(train_texts, train_labels, train_ids, tokenizer)
    val_dataset = TweetDataset(val_texts, val_labels, val_ids, tokenizer)
    return train_dataset, val_dataset

# === Create datasets ===
train_dataset_en, val_dataset_en = get_datasets(tweets_en, labels_en, ids_en)
train_dataset_es, val_dataset_es = get_datasets(tweets_es, labels_es, ids_es)

print(f"English train set size: {len(train_dataset_en)} (with selective augmentation)")
print(f"Spanish train set size: {len(train_dataset_es)} (with selective augmentation)")


✅ English train set size: 9763 (with selective augmentation)
✅ Spanish train set size: 9763 (with selective augmentation)


train

In [18]:
CORRECT_LABELS = label_classes
# === Train Model ===
def train_model(train_dataset, val_dataset, output_dir):


    # model_en = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/distilroberta-base_mergedlang_en")
# tokenizer_en = AutoTokenizer.from_pretrained("/kaggle/working/distilroberta-base_mergedlang_en")
    model = AutoModelForSequenceClassification.from_pretrained(
            "cardiffnlp/twitter-xlm-roberta-base",
            num_labels=len(CORRECT_LABELS),
            problem_type="multi_label_classification"
        )
    # model = AutoModelForSequenceClassification.from_pretrained(
    #         "FacebookAI/xlm-roberta-base",
    #         num_labels=len(CORRECT_LABELS),
    #         problem_type="multi_label_classification"
    #     )

    # model = BertForSequenceClassification.from_pretrained(
    #     "FacebookAI/xlm-roberta-base",
    #     num_labels=len(CORRECT_LABELS),
    #     problem_type="multi_label_classification"
    # )

    training_args = TrainingArguments(
    output_dir=output_dir,
    do_train=True,
    do_eval=True,
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=1,
)


    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    trainer.train()
    return trainer

# === Train English and Spanish models ===
# trainer_en = train_model(train_dataset_en, val_dataset_en, output_dir="./results/en_xlm_roberta_aeda")
trainer_es = train_model(train_dataset_es, val_dataset_es, output_dir="./results/es_xlm_roberta_aeda")

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
100,0.5032
200,0.4693
300,0.4505
400,0.4368
500,0.4251
600,0.4167
700,0.4
800,0.3943
900,0.3959
1000,0.3788




### Combined training with AEDA

### BERT

In [20]:
import json
import torch
import random
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from torch.utils.data import Dataset
from collections import defaultdict

# === AEDA helper ===
PUNCTUATIONS = ['.', ',', '!', '?', ';', ':']
def aeda(sentence, num_insertions=3):
    words = sentence.split()
    if not words:
        return sentence
    new_words = words.copy()
    for _ in range(num_insertions):
        insert_pos = random.randint(0, len(new_words))
        punct = random.choice(PUNCTUATIONS)
        new_words.insert(insert_pos, punct)
    return ' '.join(new_words)

# === Load Data ===
with open("/kaggle/input/existdatasets/EXIST2025_training_translated_en.json", "r", encoding="utf-8") as f:
    data_en = json.load(f)
with open("/kaggle/input/existdatasets/EXIST2025_training_translated_es.json", "r", encoding="utf-8") as f:
    data_es = json.load(f)
with open("/kaggle/input/existdatasets/EXIST2025_training_task1_2_gold_soft.json", "r", encoding="utf-8") as f:
    gold_soft = json.load(f)

gold_soft_dict = {entry["id"]: entry["value"] for entry in gold_soft}
label_classes = ["NO", "DIRECT", "REPORTED", "JUDGEMENTAL"]

# === Count Label Distribution ===
label_counts = defaultdict(int)
for soft in gold_soft_dict.values():
    max_label = max(soft, key=soft.get)
    label_counts[max_label] += 1

# === Identify underrepresented labels (you can tune this threshold) ===
avg_count = np.mean(list(label_counts.values()))
underrepresented_labels = [label for label, count in label_counts.items() if count < avg_count]

# === Process Tweets & Augment Underrepresented Only ===
def process_data_with_soft_labels(data, augment=True, augment_n=2):
    tweets, labels, ids = [], [], []

    for entry in data.values():
        tweet_id = entry["id_EXIST"]
        tweet = entry["tweet"]

        if tweet_id not in gold_soft_dict:
            continue

        soft_label_dict = gold_soft_dict[tweet_id]
        soft_label_vector = [soft_label_dict.get(label, 0.0) for label in label_classes]

        # Original tweet
        tweets.append(tweet)
        labels.append(soft_label_vector)
        ids.append(tweet_id)

        # Determine primary label
        main_label = max(soft_label_dict, key=soft_label_dict.get)

        # Augment only if underrepresented
        if augment and main_label in underrepresented_labels:
            for i in range(augment_n):
                augmented_tweet = aeda(tweet)
                tweets.append(augmented_tweet)
                labels.append(soft_label_vector)
                ids.append(f"{tweet_id}_aug{i+1}")

    return tweets, labels, ids

# Process English and Spanish data
tweets_en, labels_en, ids_en = process_data_with_soft_labels(data_en)
tweets_es, labels_es, ids_es = process_data_with_soft_labels(data_es)

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
# tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base")

# === Dataset Class ===
class TweetDataset(Dataset):
    def __init__(self, texts, labels, ids, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tweet_id = self.ids[idx]
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        return {
            "id": tweet_id,
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": label
        }

# === Train-validation split ===
def get_datasets(tweets, labels, ids):
    train_texts, val_texts, train_labels, val_labels, train_ids, val_ids = train_test_split(
        tweets, labels, ids, test_size=0.2, random_state=42
    )
    train_dataset = TweetDataset(train_texts, train_labels, train_ids, tokenizer)
    val_dataset = TweetDataset(val_texts, val_labels, val_ids, tokenizer)
    return train_dataset, val_dataset

# === Create datasets ===
train_dataset_en, val_dataset_en = get_datasets(tweets_en, labels_en, ids_en)
train_dataset_es, val_dataset_es = get_datasets(tweets_es, labels_es, ids_es)

print(f"✅ English train set size: {len(train_dataset_en)} (with selective augmentation)")
print(f"✅ Spanish train set size: {len(train_dataset_es)} (with selective augmentation)")

CORRECT_LABELS = label_classes
import json
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

# === Load Tweets ===
with open("/kaggle/input/existdatasets/EXIST2025_training_translated_en.json", "r", encoding="utf-8") as f:
    data_en = json.load(f)

with open("/kaggle/input/existdatasets/EXIST2025_training_translated_es.json", "r", encoding="utf-8") as f:
    data_es = json.load(f)

# === Load gold_soft_train ===
with open("/kaggle/input/existdatasets/EXIST2025_training_task1_2_gold_soft.json", "r", encoding="utf-8") as f:
    gold_soft = json.load(f)

# Convert gold_soft to a dict for fast access
gold_soft_dict = {entry["id"]: entry["value"] for entry in gold_soft}

CORRECT_LABELS = label_classes

# === Process Tweets with Corresponding Soft Labels ===
def process_data_with_soft_labels(data):
    tweets = []
    labels = []
    ids = []

    for entry in data.values():
        tweet_id = entry["id_EXIST"]
        tweet = entry["tweet"]

        if tweet_id not in gold_soft_dict:
            continue  # Skip if soft label not found

        soft_label_dict = gold_soft_dict[tweet_id]

        soft_label_vector = [soft_label_dict.get(label, 0.0) for label in CORRECT_LABELS]

        tweets.append(tweet)
        labels.append(soft_label_vector)
        ids.append(tweet_id)

    return tweets, labels, ids

# Process both English and Spanish tweets
tweets_en, labels_en, ids_en = process_data_with_soft_labels(data_en)
tweets_es, labels_es, ids_es = process_data_with_soft_labels(data_es)

# === Custom Dataset Class ===
class TweetDataset(Dataset):
    def __init__(self, texts, labels, ids, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tweet_id = self.ids[idx]
        labels = torch.tensor(self.labels[idx], dtype=torch.float)
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        return {
            "id": tweet_id,
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": labels
        }

# === Train-validation split ===
def get_datasets(tweets, labels, ids):
    train_texts, val_texts, train_labels, val_labels, train_ids, val_ids = train_test_split(
        tweets, labels, ids, test_size=0.2, random_state=42
    )
    train_dataset = TweetDataset(train_texts, train_labels, train_ids, tokenizer)
    val_dataset = TweetDataset(val_texts, val_labels, val_ids, tokenizer)
    return train_dataset, val_dataset

# === Create datasets ===
train_dataset_en, val_dataset_en = get_datasets(tweets_en, labels_en, ids_en)
train_dataset_es, val_dataset_es = get_datasets(tweets_es, labels_es, ids_es)


# === Train Model ===
def train_model(train_dataset, val_dataset, output_dir):


    # model_en = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/distilroberta-base_mergedlang_en")
# tokenizer_en = AutoTokenizer.from_pretrained("/kaggle/working/distilroberta-base_mergedlang_en")
    # model = AutoModelForSequenceClassification.from_pretrained(
    #         "cardiffnlp/twitter-xlm-roberta-base",
    #         num_labels=len(CORRECT_LABELS),
    #         problem_type="multi_label_classification"
    #     )
    # model = AutoModelForSequenceClassification.from_pretrained(
    #         "FacebookAI/xlm-roberta-base",
    #         num_labels=len(CORRECT_LABELS),
    #         problem_type="multi_label_classification"
    #     )

    model = BertForSequenceClassification.from_pretrained(
        "bert-base-multilingual-cased",
        num_labels=len(CORRECT_LABELS),
        problem_type="multi_label_classification"
    )

    training_args = TrainingArguments(
    output_dir=output_dir,
    do_train=True,
    do_eval=True,
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=1,
)


    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    trainer.train()
    return trainer

# === Train English and Spanish models ===
trainer_en = train_model(train_dataset_en, val_dataset_en, output_dir="./results/en_mbert_aeda")
trainer_es = train_model(train_dataset_es, val_dataset_es, output_dir="./results/es_mbert_aeda")

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

✅ English train set size: 9763 (with selective augmentation)
✅ Spanish train set size: 9763 (with selective augmentation)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`




model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
100,0.4703
200,0.4278
300,0.4122
400,0.3944
500,0.3862
600,0.3635


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
100,0.4726
200,0.4368
300,0.4168
400,0.4026
500,0.389
600,0.3672




### xlm Roberta fb

In [None]:
import json
import torch
import random
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from torch.utils.data import Dataset
from collections import defaultdict

# === AEDA helper ===
PUNCTUATIONS = ['.', ',', '!', '?', ';', ':']
def aeda(sentence, num_insertions=3):
    words = sentence.split()
    if not words:
        return sentence
    new_words = words.copy()
    for _ in range(num_insertions):
        insert_pos = random.randint(0, len(new_words))
        punct = random.choice(PUNCTUATIONS)
        new_words.insert(insert_pos, punct)
    return ' '.join(new_words)

# === Load Data ===
with open("/kaggle/input/existdatasets/EXIST2025_training_translated_en.json", "r", encoding="utf-8") as f:
    data_en = json.load(f)
with open("/kaggle/input/existdatasets/EXIST2025_training_translated_es.json", "r", encoding="utf-8") as f:
    data_es = json.load(f)
with open("/kaggle/input/existdatasets/EXIST2025_training_task1_2_gold_soft.json", "r", encoding="utf-8") as f:
    gold_soft = json.load(f)

gold_soft_dict = {entry["id"]: entry["value"] for entry in gold_soft}
label_classes = ["NO", "DIRECT", "REPORTED", "JUDGEMENTAL"]

# === Count Label Distribution ===
label_counts = defaultdict(int)
for soft in gold_soft_dict.values():
    max_label = max(soft, key=soft.get)
    label_counts[max_label] += 1

# === Identify underrepresented labels (you can tune this threshold) ===
avg_count = np.mean(list(label_counts.values()))
underrepresented_labels = [label for label, count in label_counts.items() if count < avg_count]

# === Process Tweets & Augment Underrepresented Only ===
def process_data_with_soft_labels(data, augment=True, augment_n=2):
    tweets, labels, ids = [], [], []

    for entry in data.values():
        tweet_id = entry["id_EXIST"]
        tweet = entry["tweet"]

        if tweet_id not in gold_soft_dict:
            continue

        soft_label_dict = gold_soft_dict[tweet_id]
        soft_label_vector = [soft_label_dict.get(label, 0.0) for label in label_classes]

        # Original tweet
        tweets.append(tweet)
        labels.append(soft_label_vector)
        ids.append(tweet_id)

        # Determine primary label
        main_label = max(soft_label_dict, key=soft_label_dict.get)

        # Augment only if underrepresented
        if augment and main_label in underrepresented_labels:
            for i in range(augment_n):
                augmented_tweet = aeda(tweet)
                tweets.append(augmented_tweet)
                labels.append(soft_label_vector)
                ids.append(f"{tweet_id}_aug{i+1}")

    return tweets, labels, ids

# Process English and Spanish data
tweets_en, labels_en, ids_en = process_data_with_soft_labels(data_en)
tweets_es, labels_es, ids_es = process_data_with_soft_labels(data_es)

# tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
# tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base")
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")

# === Dataset Class ===
class TweetDataset(Dataset):
    def __init__(self, texts, labels, ids, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tweet_id = self.ids[idx]
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        return {
            "id": tweet_id,
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": label
        }

# === Train-validation split ===
def get_datasets(tweets, labels, ids):
    train_texts, val_texts, train_labels, val_labels, train_ids, val_ids = train_test_split(
        tweets, labels, ids, test_size=0.2, random_state=42
    )
    train_dataset = TweetDataset(train_texts, train_labels, train_ids, tokenizer)
    val_dataset = TweetDataset(val_texts, val_labels, val_ids, tokenizer)
    return train_dataset, val_dataset

# === Create datasets ===
train_dataset_en, val_dataset_en = get_datasets(tweets_en, labels_en, ids_en)
train_dataset_es, val_dataset_es = get_datasets(tweets_es, labels_es, ids_es)

print(f"✅ English train set size: {len(train_dataset_en)} (with selective augmentation)")
print(f"✅ Spanish train set size: {len(train_dataset_es)} (with selective augmentation)")

CORRECT_LABELS = label_classes
import json
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

# === Load Tweets ===
with open("/kaggle/input/existdatasets/EXIST2025_training_translated_en.json", "r", encoding="utf-8") as f:
    data_en = json.load(f)

with open("/kaggle/input/existdatasets/EXIST2025_training_translated_es.json", "r", encoding="utf-8") as f:
    data_es = json.load(f)

# === Load gold_soft_train ===
with open("/kaggle/input/existdatasets/EXIST2025_training_task1_2_gold_soft.json", "r", encoding="utf-8") as f:
    gold_soft = json.load(f)

# Convert gold_soft to a dict for fast access
gold_soft_dict = {entry["id"]: entry["value"] for entry in gold_soft}

# Define labels
CORRECT_LABELS = label_classes

# === Process Tweets with Corresponding Soft Labels ===
def process_data_with_soft_labels(data):
    tweets = []
    labels = []
    ids = []

    for entry in data.values():
        tweet_id = entry["id_EXIST"]
        tweet = entry["tweet"]

        if tweet_id not in gold_soft_dict:
            continue  # Skip if soft label not found

        soft_label_dict = gold_soft_dict[tweet_id]
        soft_label_vector = [soft_label_dict.get(label, 0.0) for label in CORRECT_LABELS]

        tweets.append(tweet)
        labels.append(soft_label_vector)
        ids.append(tweet_id)

    return tweets, labels, ids

# Process both English and Spanish tweets
tweets_en, labels_en, ids_en = process_data_with_soft_labels(data_en)
tweets_es, labels_es, ids_es = process_data_with_soft_labels(data_es)

# === Custom Dataset Class ===
class TweetDataset(Dataset):
    def __init__(self, texts, labels, ids, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tweet_id = self.ids[idx]
        labels = torch.tensor(self.labels[idx], dtype=torch.float)
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        return {
            "id": tweet_id,
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": labels
        }

# === Train-validation split ===
def get_datasets(tweets, labels, ids):
    train_texts, val_texts, train_labels, val_labels, train_ids, val_ids = train_test_split(
        tweets, labels, ids, test_size=0.2, random_state=42
    )
    train_dataset = TweetDataset(train_texts, train_labels, train_ids, tokenizer)
    val_dataset = TweetDataset(val_texts, val_labels, val_ids, tokenizer)
    return train_dataset, val_dataset

# === Create datasets ===
train_dataset_en, val_dataset_en = get_datasets(tweets_en, labels_en, ids_en)
train_dataset_es, val_dataset_es = get_datasets(tweets_es, labels_es, ids_es)


# === Train Model ===
def train_model(train_dataset, val_dataset, output_dir):


    # model_en = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/distilroberta-base_mergedlang_en")
# tokenizer_en = AutoTokenizer.from_pretrained("/kaggle/working/distilroberta-base_mergedlang_en")
    # model = AutoModelForSequenceClassification.from_pretrained(
    #         "cardiffnlp/twitter-xlm-roberta-base",
    #         num_labels=len(CORRECT_LABELS),
    #         problem_type="multi_label_classification"
    #     )
    model = AutoModelForSequenceClassification.from_pretrained(
            "FacebookAI/xlm-roberta-base",
            num_labels=len(CORRECT_LABELS),
            problem_type="multi_label_classification"
        )
# model = AutoModelForSequenceClassification.from_pretrained(
    #         "FacebookAI/xlm-roberta-base",
    #         num_labels=len(CORRECT_LABELS),
    #         problem_type="multi_label_classification"
    #     )

    # model = BertForSequenceClassification.from_pretrained(
    #     "bert-base-multilingual-cased",
    #     num_labels=len(CORRECT_LABELS),
    #     problem_type="multi_label_classification"
    # )

    training_args = TrainingArguments(
    output_dir=output_dir,
    do_train=True,
    do_eval=True,
    num_train_epochs=5,#increasing epocs for more data
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=1,
)


    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    trainer.train()
    return trainer

# === Train English and Spanish models ===
trainer_en = train_model(train_dataset_en, val_dataset_en, output_dir="./results/en_xlm_roberta_fb_aeda")
trainer_es = train_model(train_dataset_es, val_dataset_es, output_dir="./results/es_xlm_roberta_fb_aeda")

### HARD Training

In [None]:
import json
import torch
import random
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from torch.utils.data import Dataset
from collections import defaultdict

# === AEDA helper ===
PUNCTUATIONS = ['.', ',', '!', '?', ';', ':']
def aeda(sentence, num_insertions=3):
    words = sentence.split()
    if not words:
        return sentence
    new_words = words.copy()
    for _ in range(num_insertions):
        insert_pos = random.randint(0, len(new_words))
        punct = random.choice(PUNCTUATIONS)
        new_words.insert(insert_pos, punct)
    return ' '.join(new_words)

# === Load Data ===
with open("/kaggle/input/existdatasets/EXIST2025_training_translated_en.json", "r", encoding="utf-8") as f:
    data_en = json.load(f)
with open("/kaggle/input/existdatasets/EXIST2025_training_translated_es.json", "r", encoding="utf-8") as f:
    data_es = json.load(f)
with open("/kaggle/input/existdatasets/EXIST2025_training_task1_2_gold_soft.json", "r", encoding="utf-8") as f:
    gold_hard = json.load(f)

gold_hard_dict = {entry["id"]: entry["value"] for entry in gold_hard}
label_classes = ["NO", "DIRECT", "REPORTED", "JUDGEMENTAL"]

# === Count Label Distribution ===
label_counts = defaultdict(int)
for hard in gold_hard_dict.values():
    label_counts[hard] += 1

# === Identify underrepresented labels (you can tune this threshold) ===
avg_count = np.mean(list(label_counts.values()))
underrepresented_labels = [label for label, count in label_counts.items() if count < avg_count]

# === Process Tweets & Augment Underrepresented Only ===
def process_data_with_soft_labels(data, augment=True, augment_n=2):
    tweets, labels, ids = [], [], []

    for entry in data.values():
        tweet_id = entry["id_EXIST"]
        tweet = entry["tweet"]

        if tweet_id not in gold_hard_dict:
            continue

        hard_label = gold_hard_dict[tweet_id]
        # soft_label_vector = [soft_label_dict.get(label, 0.0) for label in label_classes]

        # Original tweet
        tweets.append(tweet)
        labels.append(hard_label)
        ids.append(tweet_id)

        # Determine primary label
        main_label = max(soft_label_dict, key=soft_label_dict.get)

        # Augment only if underrepresented
        if augment and main_label in underrepresented_labels:
            for i in range(augment_n):
                augmented_tweet = aeda(tweet)
                tweets.append(augmented_tweet)
                labels.append(hard_label)
                ids.append(f"{tweet_id}_aug{i+1}")

    return tweets, labels, ids

# Process English and Spanish data
tweets_en, labels_en, ids_en = process_data_with_soft_labels(data_en)
tweets_es, labels_es, ids_es = process_data_with_soft_labels(data_es)

# tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base")
# tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-base")

# === Dataset Class ===
class TweetDataset(Dataset):
    def __init__(self, texts, labels, ids, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tweet_id = self.ids[idx]
        label = self.labels[idx]
        # label = torch.tensor(self.labels[idx], dtype=torch.float)
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        return {
            "id": tweet_id,
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": label
        }

# === Train-validation split ===
def get_datasets(tweets, labels, ids):
    train_texts, val_texts, train_labels, val_labels, train_ids, val_ids = train_test_split(
        tweets, labels, ids, test_size=0.2, random_state=42
    )
    train_dataset = TweetDataset(train_texts, train_labels, train_ids, tokenizer)
    val_dataset = TweetDataset(val_texts, val_labels, val_ids, tokenizer)
    return train_dataset, val_dataset

# === Create datasets ===
train_dataset_en, val_dataset_en = get_datasets(tweets_en, labels_en, ids_en)
train_dataset_es, val_dataset_es = get_datasets(tweets_es, labels_es, ids_es)

print(f"✅ English train set size: {len(train_dataset_en)} (with selective augmentation)")
print(f"✅ Spanish train set size: {len(train_dataset_es)} (with selective augmentation)")

CORRECT_LABELS = label_classes
import json
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

# === Load Tweets ===
with open("/kaggle/input/existdatasets/EXIST2025_training_translated_en.json", "r", encoding="utf-8") as f:
    data_en = json.load(f)

with open("/kaggle/input/existdatasets/EXIST2025_training_translated_es.json", "r", encoding="utf-8") as f:
    data_es = json.load(f)

# === Load gold_soft_train ===
with open("/kaggle/input/existdatasets/EXIST2025_training_task1_2_gold_hard.json", "r", encoding="utf-8") as f:
    gold_soft = json.load(f)

# Convert gold_soft to a dict for fast access
gold_hard_dict = {entry["id"]: entry["value"] for entry in gold_soft}

# Define labels
CORRECT_LABELS = label_classes

# === Process Tweets with Corresponding Soft Labels ===
def process_data_with_soft_labels(data):
    tweets = []
    labels = []
    ids = []

    for entry in data.values():
        tweet_id = entry["id_EXIST"]
        tweet = entry["tweet"]

        if tweet_id not in gold_hard_dict:
            continue  # Skip if soft label not found

        hard_label = gold_hard_dict[tweet_id]

        # soft_label_vector = [soft_label_dict.get(label, 0.0) for label in CORRECT_LABELS]

        tweets.append(tweet)
        labels.append(hard_label)
        ids.append(tweet_id)

    return tweets, labels, ids

# Process both English and Spanish tweets
tweets_en, labels_en, ids_en = process_data_with_soft_labels(data_en)
tweets_es, labels_es, ids_es = process_data_with_soft_labels(data_es)

# === Custom Dataset Class ===
class TweetDataset(Dataset):
    def __init__(self, texts, labels, ids, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tweet_id = self.ids[idx]
        labels = self.labels[idx]
        # label = torch.tensor(self.labels[idx], dtype=torch.float)
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        return {
            "id": tweet_id,
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": labels
        }

# === Train-validation split ===
def get_datasets(tweets, labels, ids):
    train_texts, val_texts, train_labels, val_labels, train_ids, val_ids = train_test_split(
        tweets, labels, ids, test_size=0.2, random_state=42
    )
    train_dataset = TweetDataset(train_texts, train_labels, train_ids, tokenizer)
    val_dataset = TweetDataset(val_texts, val_labels, val_ids, tokenizer)
    return train_dataset, val_dataset

# === Create datasets ===
train_dataset_en, val_dataset_en = get_datasets(tweets_en, labels_en, ids_en)
train_dataset_es, val_dataset_es = get_datasets(tweets_es, labels_es, ids_es)


# === Train Model ===
def train_model(train_dataset, val_dataset, output_dir):


    # model_en = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/distilroberta-base_mergedlang_en")
# tokenizer_en = AutoTokenizer.from_pretrained("/kaggle/working/distilroberta-base_mergedlang_en")
    model = AutoModelForSequenceClassification.from_pretrained(
            "cardiffnlp/twitter-xlm-roberta-base",
            num_labels=len(CORRECT_LABELS),
            problem_type="multi_label_classification"
        )
    # model = AutoModelForSequenceClassification.from_pretrained(
    #         "FacebookAI/xlm-roberta-base",
    #         num_labels=len(CORRECT_LABELS),
    #         problem_type="multi_label_classification"
    #     )
# model = AutoModelForSequenceClassification.from_pretrained(
    #         "FacebookAI/xlm-roberta-base",
    #         num_labels=len(CORRECT_LABELS),
    #         problem_type="multi_label_classification"
    #     )

    # model = BertForSequenceClassification.from_pretrained(
    #     "bert-base-multilingual-cased",
    #     num_labels=len(CORRECT_LABELS),
    #     problem_type="multi_label_classification"
    # )

    training_args = TrainingArguments(
    output_dir=output_dir,
    do_train=True,
    do_eval=True,
    num_train_epochs=5,#increasing epocs for more data
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=1,
)


    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    trainer.train()
    return trainer

# === Train English and Spanish models ===
trainer_en = train_model(train_dataset_en, val_dataset_en, output_dir="./results/en_xlm_roberta_fb_aeda_hard")
trainer_es = train_model(train_dataset_es, val_dataset_es, output_dir="./results/es_xlm_roberta_fb_aeda_hard")

### EASE S Approach

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
!pip install nltk transformers torch tqdm
import nltk
nltk.download('punkt', download_dir='/content/nltk_data')
nltk.download('wordnet', download_dir='/content/nltk_data')
nltk.data.path.append('/content/nltk_data')
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')  # Optional, for better synonym support


nltk.download("punkt_tab")

In [None]:
import json
import torch
import nltk
import random
from tqdm import tqdm
from nltk.corpus import wordnet
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from torch.nn.functional import softmax

nltk.download("punkt")
nltk.download("wordnet")

# Load tweet data (X)
file_path = "/content/EXIST2025_training_translated_es.json"
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Load label data (Y)
label_path = "/content/EXIST2025_training_task1_3_gold_soft.json"
with open(label_path, "r", encoding="utf-8") as f:
    labels = json.load(f)

# Map from tweet ID to label
label_map = {item["id"]: item for item in labels}

# Load Pretrained DistilBERT
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Augmentation functions
def extract_units(text):
    sentences = nltk.sent_tokenize(text)
    return sentences if len(sentences) > 1 else [text]

def sift_sentences(sentences):
    return [s for s in sentences if len(s.split()) > 3]

def synonym_replacement(sentence, num_replacements=1):
    words = sentence.split()
    if len(words) < 2:
        return sentence
    words_to_replace = random.sample(words, min(num_replacements, len(words)))
    for i, word in enumerate(words):
        if word in words_to_replace:
            syns = wordnet.synsets(word)
            if syns:
                synonyms = [lemma.name().replace('_', ' ') for lemma in syns[0].lemmas()]
                if synonyms:
                    words[i] = random.choice(synonyms)
    return ' '.join(words)

# Augmentation setup
augmented_data = {}
augmented_labels = []
augmented_count = 0
AUGMENT_LIMIT = 1000
used_ids = []

# Shuffle and iterate
shuffled_items = list(data.items())
random.shuffle(shuffled_items)

for key, value in tqdm(shuffled_items, desc="Augmenting Tweets"):
    if augmented_count >= AUGMENT_LIMIT:
        break

    original_id = value["id_EXIST"]
    if original_id in used_ids:
        continue

    original_text = value["tweet"]
    extracted_units = extract_units(original_text)
    filtered_units = sift_sentences(extracted_units)

    for i, unit in enumerate(filtered_units):
        if augmented_count >= AUGMENT_LIMIT:
            break

        sr_unit = synonym_replacement(unit)
        aug_key = f"{key}_AUG_{i}"
        aug_id = f"{original_id}_AUG_{i}"

        # Save augmented tweet
        augmented_data[aug_key] = {
            "id_EXIST": aug_id,
            "lang": value["lang"],
            "tweet": sr_unit,
            "number_annotators": value["number_annotators"],
            "annotators": value["annotators"],
            "gender_annotators": value["gender_annotators"],
            "age_annotators": value["age_annotators"],
            "ethnicities_annotators": value["ethnicities_annotators"],
            "study_levels_annotators": value["study_levels_annotators"],
            "countries_annotators": value["countries_annotators"],
            "labels_task1_1": value["labels_task1_1"],
            "labels_task1_2": value["labels_task1_2"],
            "labels_task1_3": value["labels_task1_3"],
            "split": "AUG_EN",
        }

        # Save corresponding label
        if original_id in label_map:
            original_label = label_map[original_id]
            new_label = {
                "test_case": original_label["test_case"],
                "id": aug_id,
                "value": original_label["value"]
            }
            augmented_labels.append(new_label)
            augmented_count += 1

    used_ids.append(original_id)

# Merge original + augmented data
data.update(augmented_data)
all_labels = labels + augmented_labels

# Save tweet data
augmented_file_path = "EXIST2025_training_augmented_S_es.json"
with open(augmented_file_path, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

# Save label data
augmented_label_path = "EXIST2025_training_augmented_gold_es.json"
with open(augmented_label_path, "w", encoding="utf-8") as f:
    json.dump(all_labels, f, ensure_ascii=False, indent=4)

# Save used tweet IDs (optional, for reloading later)
with open("used_tweet_ids.json", "w") as f:
    json.dump(used_ids, f, indent=2)

print(f"\n✅ Augmentation Complete: {augmented_count} new samples added.")
print(f"Tweets saved at: {augmented_file_path}")
print(f"Labels saved at: {augmented_label_path}")
print(f"Used tweet IDs saved at: used_tweet_ids.json")

### MBert

In [None]:
import json
import torch
import random
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from torch.utils.data import Dataset
from collections import defaultdict

# === AEDA helper ===
PUNCTUATIONS = ['.', ',', '!', '?', ';', ':']
def aeda(sentence, num_insertions=3):
    words = sentence.split()
    if not words:
        return sentence
    new_words = words.copy()
    for _ in range(num_insertions):
        insert_pos = random.randint(0, len(new_words))
        punct = random.choice(PUNCTUATIONS)
        new_words.insert(insert_pos, punct)
    return ' '.join(new_words)

# === Load Data ===
with open("/kaggle/input/existdatasets/EXIST2025_training_translated_en.json", "r", encoding="utf-8") as f:
    data_en = json.load(f)
with open("/kaggle/input/existdatasets/EXIST2025_training_translated_es.json", "r", encoding="utf-8") as f:
    data_es = json.load(f)
with open("/kaggle/input/existdatasets/EXIST2025_training_task1_2_gold_soft.json", "r", encoding="utf-8") as f:
    gold_soft = json.load(f)

gold_soft_dict = {entry["id"]: entry["value"] for entry in gold_soft}
label_classes = ["NO", "DIRECT", "REPORTED", "JUDGEMENTAL"]

# === Count Label Distribution ===
label_counts = defaultdict(int)
for soft in gold_soft_dict.values():
    max_label = max(soft, key=soft.get)
    label_counts[max_label] += 1

# === Identify underrepresented labels (you can tune this threshold) ===
avg_count = np.mean(list(label_counts.values()))
underrepresented_labels = [label for label, count in label_counts.items() if count < avg_count]

# === Process Tweets & Augment Underrepresented Only ===
def process_data_with_soft_labels(data, augment=True, augment_n=2):
    tweets, labels, ids = [], [], []

    for entry in data.values():
        tweet_id = entry["id_EXIST"]
        tweet = entry["tweet"]

        if tweet_id not in gold_soft_dict:
            continue

        soft_label_dict = gold_soft_dict[tweet_id]
        soft_label_vector = [soft_label_dict.get(label, 0.0) for label in label_classes]

        # Original tweet
        tweets.append(tweet)
        labels.append(soft_label_vector)
        ids.append(tweet_id)

        # Determine primary label
        main_label = max(soft_label_dict, key=soft_label_dict.get)

        # Augment only if underrepresented
        if augment and main_label in underrepresented_labels:
            for i in range(augment_n):
                augmented_tweet = aeda(tweet)
                tweets.append(augmented_tweet)
                labels.append(soft_label_vector)
                ids.append(f"{tweet_id}_aug{i+1}")

    return tweets, labels, ids

# Process English and Spanish data
tweets_en, labels_en, ids_en = process_data_with_soft_labels(data_en)
tweets_es, labels_es, ids_es = process_data_with_soft_labels(data_es)

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
# tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base")

# === Dataset Class ===
class TweetDataset(Dataset):
    def __init__(self, texts, labels, ids, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tweet_id = self.ids[idx]
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        return {
            "id": tweet_id,
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": label
        }

# === Train-validation split ===
def get_datasets(tweets, labels, ids):
    train_texts, val_texts, train_labels, val_labels, train_ids, val_ids = train_test_split(
        tweets, labels, ids, test_size=0.2, random_state=42
    )
    train_dataset = TweetDataset(train_texts, train_labels, train_ids, tokenizer)
    val_dataset = TweetDataset(val_texts, val_labels, val_ids, tokenizer)
    return train_dataset, val_dataset

# === Create datasets ===
train_dataset_en, val_dataset_en = get_datasets(tweets_en, labels_en, ids_en)
train_dataset_es, val_dataset_es = get_datasets(tweets_es, labels_es, ids_es)

print(f"✅ English train set size: {len(train_dataset_en)} (with selective augmentation)")
print(f"✅ Spanish train set size: {len(train_dataset_es)} (with selective augmentation)")

CORRECT_LABELS = label_classes
import json
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

# === Load Tweets ===
with open("/kaggle/input/existdatasets/EXIST2025_training_translated_en.json", "r", encoding="utf-8") as f:
    data_en = json.load(f)

with open("/kaggle/input/existdatasets/EXIST2025_training_translated_es.json", "r", encoding="utf-8") as f:
    data_es = json.load(f)

# === Load gold_soft_train ===
with open("/kaggle/input/existdatasets/EXIST2025_training_task1_2_gold_soft.json", "r", encoding="utf-8") as f:
    gold_soft = json.load(f)

# Convert gold_soft to a dict for fast access
gold_soft_dict = {entry["id"]: entry["value"] for entry in gold_soft}

CORRECT_LABELS = label_classes

# === Process Tweets with Corresponding Soft Labels ===
def process_data_with_soft_labels(data):
    tweets = []
    labels = []
    ids = []

    for entry in data.values():
        tweet_id = entry["id_EXIST"]
        tweet = entry["tweet"]

        if tweet_id not in gold_soft_dict:
            continue  # Skip if soft label not found

        soft_label_dict = gold_soft_dict[tweet_id]

        soft_label_vector = [soft_label_dict.get(label, 0.0) for label in CORRECT_LABELS]

        tweets.append(tweet)
        labels.append(soft_label_vector)
        ids.append(tweet_id)

    return tweets, labels, ids

# Process both English and Spanish tweets
tweets_en, labels_en, ids_en = process_data_with_soft_labels(data_en)
tweets_es, labels_es, ids_es = process_data_with_soft_labels(data_es)

# === Custom Dataset Class ===
class TweetDataset(Dataset):
    def __init__(self, texts, labels, ids, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tweet_id = self.ids[idx]
        labels = torch.tensor(self.labels[idx], dtype=torch.float)
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        return {
            "id": tweet_id,
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": labels
        }

# === Train-validation split ===
def get_datasets(tweets, labels, ids):
    train_texts, val_texts, train_labels, val_labels, train_ids, val_ids = train_test_split(
        tweets, labels, ids, test_size=0.2, random_state=42
    )
    train_dataset = TweetDataset(train_texts, train_labels, train_ids, tokenizer)
    val_dataset = TweetDataset(val_texts, val_labels, val_ids, tokenizer)
    return train_dataset, val_dataset

# === Create datasets ===
train_dataset_en, val_dataset_en = get_datasets(tweets_en, labels_en, ids_en)
train_dataset_es, val_dataset_es = get_datasets(tweets_es, labels_es, ids_es)


# === Train Model ===
def train_model(train_dataset, val_dataset, output_dir):


    # model_en = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/distilroberta-base_mergedlang_en")
# tokenizer_en = AutoTokenizer.from_pretrained("/kaggle/working/distilroberta-base_mergedlang_en")
    # model = AutoModelForSequenceClassification.from_pretrained(
    #         "cardiffnlp/twitter-xlm-roberta-base",
    #         num_labels=len(CORRECT_LABELS),
    #         problem_type="multi_label_classification"
    #     )
    # model = AutoModelForSequenceClassification.from_pretrained(
    #         "FacebookAI/xlm-roberta-base",
    #         num_labels=len(CORRECT_LABELS),
    #         problem_type="multi_label_classification"
    #     )

    model = BertForSequenceClassification.from_pretrained(
        "bert-base-multilingual-cased",
        num_labels=len(CORRECT_LABELS),
        problem_type="multi_label_classification"
    )

    training_args = TrainingArguments(
    output_dir=output_dir,
    do_train=True,
    do_eval=True,
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=1,
)


    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    trainer.train()
    return trainer

# === Train English and Spanish models ===
trainer_en = train_model(train_dataset_en, val_dataset_en, output_dir="./results/en_mbert_aeda")
trainer_es = train_model(train_dataset_es, val_dataset_es, output_dir="./results/es_mbert_aeda")

#### Previous Evaluation

In [61]:
# Run inference on each tweet
output = []
for case_id, case_data in tqdm(test_data.items()):
    if case_data['lang']=='es':
        continue
    tweet = case_data["tweet"]

    # Tokenize the tweet
    inputs = tokenizer(tweet, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Ensure the model and inputs are on the same device (use GPU if available)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Get model predictions
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted class index
    logits = outputs.logits
    predicted_class_idx = torch.argmax(logits, dim=1).item()

    # Map the predicted class index to label
    predicted_label = id2label[predicted_class_idx]

    # Append the result to the output list
    output.append({
        "test_case": "EXIST2025",
        "id": case_data["id_EXIST"],
        "value": predicted_label
    })

# Save the results to an output JSON file
output_json_file = "distil_en_hard_predictions_mergedlang.json"  # Specify the output file path
with open(output_json_file, "w") as f:
    json.dump(output, f, indent=4)

print(f"Results saved to {output_json_file}")


100%|██████████| 1038/1038 [00:03<00:00, 326.52it/s]

Results saved to distil_en_hard_predictions_mergedlang.json





In [62]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils
predictions = "/kaggle/working/distil_en_hard_predictions_mergedlang.json"         
gold = "/kaggle/input/existdatasets/EXIST2025_dev_task1_2_gold_hard.json" 
test = PyEvALLEvaluation() 
params= dict() 
params[PyEvALLUtils.PARAM_REPORT]= PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED  
metrics=["ICM", "ICMNorm" ,"FMeasure"]                  # for hard        
report= test.evaluate(predictions, gold, metrics, **params) 
report.print_report()

2025-04-23 07:25:37,560 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM', 'ICMNorm', 'FMeasure']
2025-04-23 07:25:37,647 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-23 07:25:37,954 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Normalized evaluation method
2025-04-23 07:25:37,957 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-23 07:25:38,265 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-23 07:25:38,664 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method
{
  "metrics": {
    "ICM": {
      "name": "Information Contrast model",
      "acronym": "ICM",
      "description": "Coming soon!",
      "status": "OK",
      "results": {
        "test_cases": [{
          "name": "EXIST2025",
          "average": -1.04738

soft

In [63]:
import torch
import json
from tqdm import tqdm
output = []

# Run inference on each tweet
for case_id, case_data in tqdm(test_data.items()):
    if case_data['lang']=='es':
        continue

    tweet = case_data["tweet"]

    # Tokenize the tweet
    inputs = tokenizer(tweet, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Ensure the model and inputs are on the same device (use GPU if available)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Get model predictions
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        outputs = model(**inputs)

    # Get probabilities for all labels
    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=1).squeeze().tolist()
    # print(probabilities)
    # Map class indices to labels with probabilities
    label_probs = {id2label[idx]: prob for idx, prob in enumerate(probabilities)}
    values = {
            "NO": label_probs["NO"],
            "REPORTED": label_probs["REPORTED"],
            "JUDGEMENTAL": label_probs["JUDGEMENTAL"],
            "DIRECT": label_probs["DIRECT"],
        }
    # Append the result to the output list
    output.append({
        "test_case": "EXIST2025",
        "id": str(case_data["id_EXIST"]),
        "value": dict(sorted(values.items(), key=lambda item: item[1]))
    })

# Save the results to an output JSON file
output_json_file = "distil_en_soft_predictions_mergedlang.json"  # Specify the output file path
with open(output_json_file, "w") as f:
    json.dump(output, f, indent=4)

print(f"Results saved to {output_json_file}")


100%|██████████| 1038/1038 [00:03<00:00, 327.55it/s]

Results saved to distil_en_soft_predictions_mergedlang.json





In [68]:
softs = pd.read_json("distil_en_soft_predictions_mergedlang.json")
softs.head()

Unnamed: 0,test_case,id,value
0,EXIST2025,400001,"{'JUDGEMENTAL': 0.120749652385711, 'REPORTED':..."
1,EXIST2025,400002,"{'NO': 0.187171712517738, 'JUDGEMENTAL': 0.224..."
2,EXIST2025,400003,"{'NO': 0.176499783992767, 'JUDGEMENTAL': 0.213..."
3,EXIST2025,400004,"{'REPORTED': 0.007521138526499001, 'JUDGEMENTA..."
4,EXIST2025,400005,"{'DIRECT': 0.013363457284867, 'JUDGEMENTAL': 0..."


In [65]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils
predictions = "/kaggle/working/distil_en_soft_predictions_mergedlang.json"         
gold = "/kaggle/input/existdatasets/EXIST2025_dev_task1_2_soft_hard.json" 
test = PyEvALLEvaluation() 
params= dict() 
params[PyEvALLUtils.PARAM_REPORT]= PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED  
metrics=["ICMSoft", "ICMSoftNorm", "CrossEntropy"]      # for soft    
report= test.evaluate(predictions, gold, metrics, **params) 
report.print_report()

2025-04-23 07:31:24,302 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICMSoft', 'ICMSoftNorm', 'CrossEntropy']
{
  "metrics": {},
  "files": {
    "distil_en_soft_predictions_mergedlang.json": {
      "name": "distil_en_soft_predictions_mergedlang.json",
      "status": "OK",
      "gold": false,
      "errors": {}
    },
    "EXIST2025_dev_task1_2_soft_hard.json": {
      "name": "EXIST2025_dev_task1_2_soft_hard.json",
      "status": "FAIL",
      "gold": true,
      "errors": {
        "FORMAT_FILE_NOT_EXIST_ERROR": {
          "description": "File not found error: wrong file's path.\\nFile name: EXIST2025_dev_task1_2_soft_hard.json.\\nThe evaluation STOP.",
          "exception": null,
          "status": "STOP"
        }
      }
    }
  }
}


## Free up working dir

In [4]:
# Clear output folder
import os

def remove_folder_contents(folder):
    for the_file in os.listdir(folder):
        file_path = os.path.join(folder, the_file)
        try:
            if os.path.isfile(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                remove_folder_contents(file_path)
                os.rmdir(file_path)
        except Exception as e:
            print(e)

folder_path = '/kaggle/working/results/es_mbert_aeda'
remove_folder_contents(folder_path)
os.rmdir(folder_path)
