In [62]:
!pip install transformers datasets evaluate accelerate sentencepiece

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [63]:
import os
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
import torch
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
import numpy as np
import datasets
from tabulate import tabulate
import nltk
from datetime import datetime

In [64]:
label_classes = ['NO', 'DIRECT','REPORTED','JUDGEMENTAL']
class2id = {'NO': 0, 'DIRECT': 1, 'REPORTED': 2, 'JUDGEMENTAL': 3}
id2class = {v: k for k, v in class2id.items()}
id2label = id2class
label_mapping = class2id

In [65]:

# ! pip install datasets
# ! pip install sentencepiece
# ! pip install rouge_score
! pip install wandb
import wandb
# wandb login}
wandb.login(key="6930a5bf7436e98e8f1d44766c7b999ee9621ba9")
# wandb.init(project="LLM", entity="sa07424-habib-university", settings=wandb.Settings(init_timeout=200))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)






True

### With AEDA

In [89]:
import json
import torch
import random
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from torch.utils.data import Dataset
from collections import defaultdict
 
# AEDA helper
PUNCTUATIONS = ['.', ',', '!', '?', ';', ':']
def aeda(sentence, num_insertions=3):
    words = sentence.split()
    if not words:
        return sentence
    new_words = words.copy()
    for _ in range(num_insertions):
        insert_pos = random.randint(0, len(new_words))
        punct = random.choice(PUNCTUATIONS)
        new_words.insert(insert_pos, punct)
    return ' '.join(new_words)
    
# === Load Tweets ===
with open("/kaggle/input/existdatasets/EXIST2025_training_translated_en.json", "r", encoding="utf-8") as f:
    data_en = json.load(f)

with open("/kaggle/input/existdatasets/EXIST2025_training_translated_es.json", "r", encoding="utf-8") as f:
    data_es = json.load(f)

# === Loading gold_hard_train === (Using hard labels for Task 2 - multi-class classification)
with open("/kaggle/input/existdatasets/EXIST2025_training_task1_2_gold_hard.json", "r", encoding="utf-8") as f:
    gold_hard = json.load(f)

# Convert gold_hard to a dict for fast access
gold_hard_dict = {entry["id"]: entry["value"] for entry in gold_hard}

# Define the label classes
CORRECT_LABELS = label_classes  # Make sure you define the correct label classes
NUM_CLASSES = len(CORRECT_LABELS)  # This should be the number of classes

# === Count Label Distribution ===
label_counts = defaultdict(int)
for hard in gold_hard_dict.values():
    label_counts[hard] += 1

# === Identify underrepresented labels (you can tune this threshold) ===
avg_count = np.mean(list(label_counts.values()))
underrepresented_labels = [label for label, count in label_counts.items() if count < 1294]


In [86]:
print(label_counts)
print(avg_count)

defaultdict(<class 'int'>, {'REPORTED': 459, 'NO': 3367, 'DIRECT': 1294, 'JUDGEMENTAL': 376})
1374.0


In [90]:
# === Process Tweets with Corresponding Hard Labels ===
def process_data_with_hard_labels(data, augment=True, augment_n=2):
    tweets = []
    labels = []
    ids = []

    for entry in data.values():
        tweet_id = entry["id_EXIST"]
        tweet = entry["tweet"]

        if tweet_id not in gold_hard_dict:
            continue  # Skip if hard label not found

        # Get the hard label from the dictionary
        hard_label = gold_hard_dict[tweet_id]

        # Check if the label is in the mapping, if not, raise an error or handle it
        if hard_label not in label_mapping:
            raise ValueError(f"Unexpected label '{hard_label}' for tweet ID {tweet_id}")

        # Convert the string label to the corresponding integer
        label_int = label_mapping[hard_label]

        # Tried combining tweet with annotator attributes
        # annotator_info = {
        #     "country": entry.get("countries_annotators", []),
        #     "study_level": entry.get("study_levels_annotators", []),
        #     "ethnicity": entry.get("ethnicities_annotators", []),
        #     "age": entry.get("age_annotators", []),
        #     "gender": entry.get("gender_annotators", []),
        # }

        # # Flatten and format metadata into string
        # annotator_str = " | ".join(
        #     f"{key}: {', '.join(map(str, value))}" for key, value in annotator_info.items()
        # )
        # full_text = f"{tweet} [ANNOTATORS] {annotator_str}"

        tweets.append(tweet)  # input everything
        labels.append(label_int)  # single label for multi-class classification
        ids.append(tweet_id)
        # Augment only if underrepresented
        if augment and hard_label in underrepresented_labels:
            for i in range(augment_n):
                augmented_tweet = aeda(tweet)
                tweets.append(augmented_tweet)
                labels.append(label_int)
                ids.append(f"{tweet_id}_aug{i+1}")

    return tweets, labels, ids
# Process both English and Spanish tweets
tweets_en, labels_en, ids_en = process_data_with_hard_labels(data_en)
tweets_es, labels_es, ids_es = process_data_with_hard_labels(data_es)
print(f"English train set size: {len(train_dataset_en)} (with augmentation)")
print(f"Spanish train set size: {len(train_dataset_es)} (with augmentation)")


English train set size: 7803 (with augmentation)
Spanish train set size: 7803 (with augmentation)


In [91]:

# === Tokenizer ===
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base")

# === Custom Dataset Class ===
class TweetDataset(Dataset):
    def __init__(self, texts, labels, ids, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tweet_id = self.ids[idx]
        labels = torch.tensor(self.labels[idx], dtype=torch.long)  # Use long for multi-class
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        return {
            "id": tweet_id,
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": labels
        }

# === Train-validation split ===
def get_datasets(tweets, labels, ids):
    train_texts, val_texts, train_labels, val_labels, train_ids, val_ids = train_test_split(
        tweets, labels, ids, test_size=0.2, random_state=42
    )
    train_dataset = TweetDataset(train_texts, train_labels, train_ids, tokenizer)
    val_dataset = TweetDataset(val_texts, val_labels, val_ids, tokenizer)
    return train_dataset, val_dataset

# === Create datasets ===
train_dataset_en, val_dataset_en = get_datasets(tweets_en, labels_en, ids_en)
train_dataset_es, val_dataset_es = get_datasets(tweets_es, labels_es, ids_es)

# === Train Model ===
def train_model(train_dataset, val_dataset, output_dir):
    model = AutoModelForSequenceClassification.from_pretrained(
            "cardiffnlp/twitter-xlm-roberta-base",
            num_labels=NUM_CLASSES,  # Set number of classes for multi-class classification
            problem_type="single_label_classification"  # For multi-class, use single_label_classification
        )

    training_args = TrainingArguments(
        output_dir=output_dir,
        do_train=True,
        do_eval=True,
        num_train_epochs=4,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        logging_dir="./logs",
        logging_steps=100,
        save_total_limit=1,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )
    trainer.train()
    return trainer

# === Train English and Spanish models ===
trainer_en = train_model(train_dataset_en, val_dataset_en, output_dir="./results/en_xlm_twt_roberta_hard_aeda")
trainer_es = train_model(train_dataset_es, val_dataset_es, output_dir="./results/es_xlm_twt_roberta_hard_aeda")

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
100,0.8919
200,0.6641
300,0.5352
400,0.4397
500,0.3045
600,0.2114
700,0.1453


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
100,0.9028
200,0.6823
300,0.5539
400,0.4666
500,0.2867
600,0.2255
700,0.1473




In [None]:
# FacebookAI/xlm-roberta-large

In [7]:
class2id = {'NO': 0,'DIRECT':1,'REPORTED':2,'JUDGEMENTAL':3}
id2class = {v: k for k, v in class2id.items()}

In [8]:
df = pd.read_json("/kaggle/input/existdatasets/EXIST2025_training_translated_en.json")
df.head()

Unnamed: 0,100001,100002,100003,100004,100005,100006,100007,100008,100009,100010,...,203251,203252,203253,203254,203255,203256,203257,203258,203259,203260
id_EXIST,100001,100002,100003,100004,100005,100006,100007,100008,100009,100010,...,203251,203252,203253,203254,203255,203256,203257,203258,203259,203260
lang,es,es,es,es,es,es,es,es,es,es,...,en,en,en,en,en,en,en,en,en,en
tweet,"@TheChiflis Ignora to the other, he's a jerk.T...",@ultinameda_ If comicsgate looks like somethin...,"@Steven2897 Read about Gamergate, and as that ...",@Lunariita7 A rather unfortunate social retard...,@novadragon21 @icep4ck @TvDannyZ Then as this ...,@yonkykong Aaah yes. Andrew Dobson. The one wh...,@glutamatom @JoaquinAdolphoC I was on the Game...,@BestKabest This gringa is still crying for th...,Do you know the #DECORATION #GAMER style for #...,CES 2022 ASUS ROG Rise of Gamers Launch event ...,...,"""you look like a whore"" I'm literally wearing ...",“You look like a whore” if you think I’m cute ...,Who fucking lied to you? You look like a whore...,@ShefVaidya Ma'am if I say that you look like ...,I forgot I have a m*d that changes the drachen...,idk why y’all bitches think having half your a...,This has been a part of an experiment with @Wo...,"""Take me already"" ""Not yet. You gotta be ready...",@clintneedcoffee why do you look like a whore?...,ik when mandy says “you look like a whore” i l...
number_annotators,6,6,6,6,6,6,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
annotators,"[Annotator_1, Annotator_2, Annotator_3, Annota...","[Annotator_7, Annotator_8, Annotator_9, Annota...","[Annotator_7, Annotator_8, Annotator_9, Annota...","[Annotator_13, Annotator_14, Annotator_15, Ann...","[Annotator_19, Annotator_20, Annotator_21, Ann...","[Annotator_25, Annotator_26, Annotator_27, Ann...","[Annotator_25, Annotator_26, Annotator_27, Ann...","[Annotator_25, Annotator_26, Annotator_27, Ann...","[Annotator_31, Annotator_32, Annotator_33, Ann...","[Annotator_37, Annotator_38, Annotator_39, Ann...",...,"[Annotator_473, Annotator_474, Annotator_475, ...","[Annotator_617, Annotator_618, Annotator_619, ...","[Annotator_617, Annotator_618, Annotator_619, ...","[Annotator_668, Annotator_669, Annotator_670, ...","[Annotator_674, Annotator_675, Annotator_676, ...","[Annotator_478, Annotator_479, Annotator_480, ...","[Annotator_668, Annotator_669, Annotator_670, ...","[Annotator_467, Annotator_468, Annotator_469, ...","[Annotator_674, Annotator_675, Annotator_676, ...","[Annotator_473, Annotator_474, Annotator_475, ..."


In [13]:
# PlanTL-GOB-ES/RoBERTalex
# FacebookAI/xlm-roberta-base
# distilbert/distilbert-base-uncased
# google-bert/bert-base-multilingual-uncased
# JonatanGk/roberta-base-bne-finetuned-hate-speech-offensive-spanish
# FacebookAI/xlm-roberta-base
# distilroberta-base
# modelname = "distilroberta-base"
modelname = "cardiffnlp/twitter-xlm-roberta-base"

### Hard predict

In [35]:
import json

# Load the dev dataset
with open("/kaggle/input/existdatasets/EXIST2025_dev.json", "r", encoding="utf-8") as f:
    dev_data = json.load(f)

# Split into English & Spanish
english_dev_tweets = []
english_dev_ids = []
spanish_dev_tweets = []
spanish_dev_ids = []

for entry in dev_data.values():
    tweet_id = entry["id_EXIST"]
    tweet = entry["tweet"]
    lang = entry["lang"]

    if lang == "en":
        english_dev_tweets.append(tweet)
        english_dev_ids.append(tweet_id)
    elif lang == "es":
        spanish_dev_tweets.append(tweet)
        spanish_dev_ids.append(tweet_id)

# Debugging: Check split sizes
print(f"English Dev Samples: {len(english_dev_tweets)}")
print(f"Spanish Dev Samples: {len(spanish_dev_tweets)}")


English Dev Samples: 489
Spanish Dev Samples: 549


In [29]:
# Check for None or empty tweets in Spanish data
for i, (tweet, tweet_id) in enumerate(zip(spanish_dev_tweets, spanish_dev_ids)):
    if not tweet:
        print(f"Empty tweet at index {i}, ID: {tweet_id}")

In [98]:
import os
from transformers import BertForSequenceClassification
import os
from transformers import BertForSequenceClassification

# Function to get the latest checkpoint
def get_latest_checkpoint(directory="./results"):
    checkpoints = [d for d in os.listdir(directory) if d.startswith("checkpoint-")]
    if not checkpoints:
        raise ValueError(f"No checkpoints found in {directory}")
    latest_checkpoint = sorted(checkpoints, key=lambda x: int(x.split('-')[-1]))[-1]
    return os.path.join(directory, latest_checkpoint)

# Load the best model checkpoint for English and Spanish
latest_checkpoint_en = get_latest_checkpoint("./results/en_xlm_twt_roberta_hard_aeda")
latest_checkpoint_es = get_latest_checkpoint("./results/es_xlm_twt_roberta_hard_aeda")

print(f"Using latest checkpoint for English: {latest_checkpoint_en}")
print(f"Using latest checkpoint for Spanish: {latest_checkpoint_es}")

# Load models
# model_en = BertForSequenceClassification.from_pretrained(latest_checkpoint_en)
# model_es = BertForSequenceClassification.from_pretrained(latest_checkpoint_es)
#for non bert models
model_en = AutoModelForSequenceClassification.from_pretrained(latest_checkpoint_en)
model_es = AutoModelForSequenceClassification.from_pretrained(latest_checkpoint_es)

Using latest checkpoint for English: ./results/en_xlm_twt_roberta_hard_aeda/checkpoint-976
Using latest checkpoint for Spanish: ./results/es_xlm_twt_roberta_hard_aeda/checkpoint-976


In [99]:
def predict_hard_labels(tweets, ids, model, tokenizer, label_classes, output_file):
    # model.eval()
    results = []
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")#faster on gpu
    model.to(device)

    for tweet, tweet_id in zip(tweets, ids):
        # encoding = tokenizer(text=tweet, truncation=True, padding="max_length", max_length=256, return_tensors="pt")
        encoding = tokenizer(tweet, truncation=True, padding="max_length", max_length=256, return_tensors="pt")
        encoding = {key: val.to(device) for key, val in encoding.items()}

        with torch.no_grad():
            outputs = model(**encoding)

        logits = outputs.logits.squeeze()
        probs = torch.sigmoid(logits).cpu().numpy()

        # Pick the label with the highest probability (YES or NO)
        max_index = int(probs.argmax())
        predicted_label = label_classes[max_index]

        results.append({
            "test_case": "EXIST2025",
            "id": tweet_id,
            "value": [predicted_label]  # Only one label
        })

    # Save results
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)

    print(f"Hard label predictions saved to {output_file}")

predict_hard_labels(english_dev_tweets, english_dev_ids, model_en, tokenizer, label_classes, "EXIST2025_dev_predictions_hard_merged_en_aeda1.json")
predict_hard_labels(spanish_dev_tweets, spanish_dev_ids, model_es, tokenizer, label_classes, "EXIST2025_dev_predictions_hard_merged_es_aeda1.json")

Hard label predictions saved to EXIST2025_dev_predictions_hard_merged_en_aeda1.json
Hard label predictions saved to EXIST2025_dev_predictions_hard_merged_es_aeda1.json


In [100]:
import json

# Load the Spanish predictions
with open("/kaggle/working/EXIST2025_dev_predictions_hard_merged_es_aeda1.json", "r", encoding="utf-8") as f:
    es_data = json.load(f)

# Load the English predictions
with open("/kaggle/working/EXIST2025_dev_predictions_hard_merged_en_aeda1.json", "r", encoding="utf-8") as f:
    en_data = json.load(f)

# Assuming both files contain lists of predictions, merge them
if isinstance(es_data, list) and isinstance(en_data, list):
    merged_data = es_data + en_data
else:
    raise ValueError("JSON structure is not a list. Ensure both files contain lists.")

# Save to a new file
output_filename = "EXIST2025_dev_predictions_merged_hard_xlmroberta_aeda1.json"
with open(output_filename, "w", encoding="utf-8") as f:
    json.dump(merged_data, f, indent=4, ensure_ascii=False)

print(f"Merging complete! Saved to {output_filename}")

import json

def convert_prediction_format(input_file, output_file):
    with open(input_file, "r", encoding="utf-8") as f:
        predictions = json.load(f)

    converted = []
    for entry in predictions:
        # Convert the "value" list to a single string (first label only)
        new_entry = {
            "test_case": entry["test_case"],
            "id": entry["id"],
            "value": entry["value"][0] if isinstance(entry["value"], list) else entry["value"]
        }
        converted.append(new_entry)

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(converted, f, indent=4)

    print(f"Predictions converted to gold format and saved to {output_file}")

convert_prediction_format("EXIST2025_dev_predictions_merged_hard_xlmroberta_aeda1.json", "EXIST2025_dev_predictions_merged_hard_flat_aeda1.json")

Merging complete! Saved to EXIST2025_dev_predictions_merged_hard_xlmroberta_aeda1.json
Predictions converted to gold format and saved to EXIST2025_dev_predictions_merged_hard_flat_aeda1.json


In [72]:
!pip install pyEvall

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [73]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils

In [101]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils
predictions = "/kaggle/working/EXIST2025_dev_predictions_merged_hard_flat_aeda1.json"         
gold = "/kaggle/input/existdatasets/EXIST2025_dev_task1_2_gold_hard.json" 
test = PyEvALLEvaluation() 
params= dict() 
params[PyEvALLUtils.PARAM_REPORT]= PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED  
metrics=["ICM", "ICMNorm" ,"FMeasure"]                  # for hard        
report= test.evaluate(predictions, gold, metrics, **params) 
report.print_report()

2025-05-11 18:04:07,497 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM', 'ICMNorm', 'FMeasure']
2025-05-11 18:04:07,619 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-11 18:04:07,979 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Normalized evaluation method
2025-05-11 18:04:07,982 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-11 18:04:08,321 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-11 18:04:08,606 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method
{
  "metrics": {
    "ICM": {
      "name": "Information Contrast model",
      "acronym": "ICM",
      "description": "Coming soon!",
      "status": "OK",
      "results": {
        "test_cases": [{
          "name": "EXIST2025",
          "average": -0.20000

In [59]:
import os
import shutil

def remove_folder_contents(folder):
    if os.path.exists(folder):
        for the_file in os.listdir(folder):
            file_path = os.path.join(folder, the_file)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)
            except Exception as e:
                print(f'Failed to delete {file_path}. Reason: {e}')
    else:
        print(f"The folder {folder} does not exist.")

folder_path = '/kaggle/working/results/'
remove_folder_contents(folder_path)

# Only try to remove the folder if it exists
if os.path.exists(folder_path):
    os.rmdir(folder_path)

In [61]:
import torch
torch.cuda.empty_cache()
