In [22]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/exist2025/EXIST2025_training.json
/kaggle/input/exist2025/EXIST2025_dev.json
/kaggle/input/exist2025-all/EXIST2025_dev_task1_2_gold_hard.json
/kaggle/input/exist2025-all/EXIST2025_dev_task1_3_majority_class_soft.json
/kaggle/input/exist2025-all/EXIST2025_dev_task1_3_minority_class_hard.json
/kaggle/input/exist2025-all/training_gold_augmented_en.json
/kaggle/input/exist2025-all/EXIST2025_training_task1_1_gold_soft.json
/kaggle/input/exist2025-all/EXIST2025_training_augmented_gold_en_v3.json
/kaggle/input/exist2025-all/EXIST2025_training_augmented_gold_AEDA_en.json
/kaggle/input/exist2025-all/EXIST2025_training_task1_2_gold_soft.json
/kaggle/input/exist2025-all/EXIST2025_training_task1_3_minority_class_hard.json
/kaggle/input/exist2025-all/EXIST2025_training.json
/kaggle/input/exist2025-all/EXIST2025_training_augmented_S2_es.json
/kaggle/input/exist2025-all/EXIST2025_dev_task1_1_gold_hard.json
/kaggle/input/exist2025-all/EXIST2025_dev_en.json
/kaggle/input/exist2025-all/EXI

In [23]:
import wandb

wandb.login(key="0c5f368f1f51fd942ec7bb3a1c74efb7bdc832d6")




True

In [24]:
CORRECT_LABELS = [
    "IDEOLOGICAL-INEQUALITY",
    "MISOGYNY-NON-SEXUAL-VIOLENCE",
    "OBJECTIFICATION",
    "SEXUAL-VIOLENCE",
    "STEREOTYPING-DOMINANCE",
    "NO"  # Represents non-sexist tweets (previously "-")
]

# Newer version

In [26]:
import json
import torch
import random
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from torch.utils.data import Dataset
from collections import defaultdict

# === AEDA helper ===
PUNCTUATIONS = ['.', ',', '!', '?', ';', ':']
def aeda(sentence, num_insertions=3):
    words = sentence.split()
    if not words:
        return sentence
    new_words = words.copy()
    for _ in range(num_insertions):
        insert_pos = random.randint(0, len(new_words))
        punct = random.choice(PUNCTUATIONS)
        new_words.insert(insert_pos, punct)
    return ' '.join(new_words)

# === Load Data ===
with open("/kaggle/input/exist2025-all/EXIST2025_training_translated_en.json", "r", encoding="utf-8") as f:
    data_en = json.load(f)
with open("/kaggle/input/exist2025-all/EXIST2025_training_translated_es.json", "r", encoding="utf-8") as f:
    data_es = json.load(f)
with open("/kaggle/input/exist2025-all/EXIST2025_training_task1_3_gold_soft.json", "r", encoding="utf-8") as f:
    gold_soft = json.load(f)

gold_soft_dict = {entry["id"]: entry["value"] for entry in gold_soft}
label_classes = [
    "IDEOLOGICAL-INEQUALITY",
    "MISOGYNY-NON-SEXUAL-VIOLENCE",
    "OBJECTIFICATION",
    "SEXUAL-VIOLENCE",
    "STEREOTYPING-DOMINANCE",
    "NO"  # Represents non-sexist tweets (previously "-")
]

# === Count Label Distribution ===
label_counts = defaultdict(int)
for soft in gold_soft_dict.values():
    max_label = max(soft, key=soft.get)
    label_counts[max_label] += 1

# === Identify underrepresented labels (you can tune this threshold) ===
avg_count = np.mean(list(label_counts.values()))
underrepresented_labels = [label for label, count in label_counts.items() if count < avg_count]

# === Process Tweets & Augment Underrepresented Only ===
def process_data_with_soft_labels(data, augment=True, augment_n=2):
    tweets, labels, ids = [], [], []

    for entry in data.values():
        tweet_id = entry["id_EXIST"]
        tweet = entry["tweet"]

        if tweet_id not in gold_soft_dict:
            continue

        soft_label_dict = gold_soft_dict[tweet_id]
        soft_label_vector = [soft_label_dict.get(label, 0.0) for label in label_classes]

        # Original tweet
        tweets.append(tweet)
        labels.append(soft_label_vector)
        ids.append(tweet_id)

        # Determine primary label
        main_label = max(soft_label_dict, key=soft_label_dict.get)

        # Augment only if underrepresented
        if augment and main_label in underrepresented_labels:
            for i in range(augment_n):
                augmented_tweet = aeda(tweet)
                tweets.append(augmented_tweet)
                labels.append(soft_label_vector)
                ids.append(f"{tweet_id}_aug{i+1}")

    return tweets, labels, ids

# Process English and Spanish data
tweets_en, labels_en, ids_en = process_data_with_soft_labels(data_en)
tweets_es, labels_es, ids_es = process_data_with_soft_labels(data_es)

class TweetDataset(Dataset):
    def __init__(self, texts, labels, ids, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tweet_id = self.ids[idx]
        label = torch.tensor(self.labels[idx], dtype=torch.float)
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            "id": tweet_id,
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": label
        }

# === Train-validation split ===
def get_datasets(tweets, labels, ids):
    train_texts, val_texts, train_labels, val_labels, train_ids, val_ids = train_test_split(
        tweets, labels, ids, test_size=0.2, random_state=42
    )
    train_dataset = TweetDataset(train_texts, train_labels, train_ids, tokenizer)
    val_dataset = TweetDataset(val_texts, val_labels, val_ids, tokenizer)
    return train_dataset, val_dataset

# === Create datasets ===
train_dataset_en, val_dataset_en = get_datasets(tweets_en, labels_en, ids_en)
train_dataset_es, val_dataset_es = get_datasets(tweets_es, labels_es, ids_es)

print(f"English train set size: {len(train_dataset_en)} (with selective augmentation)")
print(f"Spanish train set size: {len(train_dataset_es)} (with selective augmentation)")

English train set size: 9731 (with selective augmentation)
Spanish train set size: 9731 (with selective augmentation)


In [27]:
import json
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset


# Define correct label classes in fixed order
CORRECT_LABELS = [
    "IDEOLOGICAL-INEQUALITY",
    "MISOGYNY-NON-SEXUAL-VIOLENCE",
    "OBJECTIFICATION",
    "SEXUAL-VIOLENCE",
    "STEREOTYPING-DOMINANCE",
    "NO"
]


# === Train Model ===
def train_model(train_dataset, val_dataset, output_dir):
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-multilingual-cased",
        num_labels=len(CORRECT_LABELS),
        problem_type="multi_label_classification"
    )

    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        num_train_epochs=5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        logging_dir="./logs",
        logging_steps=10,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    trainer.train()
    return trainer

# === Train English and Spanish models ===
trainer_en = train_model(train_dataset_en, val_dataset_en, output_dir="./results/en")
trainer_es = train_model(train_dataset_es, val_dataset_es, output_dir="./results/es")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.4117,0.405875
2,0.3782,0.385621
3,0.3449,0.373079
4,0.3597,0.36877


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.4257,0.418609
2,0.3851,0.392061
3,0.3503,0.379482
4,0.3609,0.372128




# **Dev Testing starts from here**

In [28]:
with open("/kaggle/input/exist2025-all/EXIST2025_dev.json", "r", encoding="utf-8") as f:
    dev_data = json.load(f)

# Extract tweets and IDs
dev_tweets = [entry["tweet"] for entry in dev_data.values()]
dev_ids = [entry["id_EXIST"] for entry in dev_data.values()]


In [29]:
import json

# Load the dev dataset
with open("/kaggle/input/exist2025-all/EXIST2025_dev.json", "r", encoding="utf-8") as f:
    dev_data = json.load(f)

# Split into English & Spanish
english_dev_tweets = []
english_dev_ids = []
spanish_dev_tweets = []
spanish_dev_ids = []

for entry in dev_data.values():
    tweet_id = entry["id_EXIST"]
    tweet = entry["tweet"]
    lang = entry["lang"]

    if lang == "en":
        english_dev_tweets.append(tweet)
        english_dev_ids.append(tweet_id)
    elif lang == "es":
        spanish_dev_tweets.append(tweet)
        spanish_dev_ids.append(tweet_id)

# Debugging: Check split sizes
print(f"English Dev Samples: {len(english_dev_tweets)}")
print(f"Spanish Dev Samples: {len(spanish_dev_tweets)}")


English Dev Samples: 489
Spanish Dev Samples: 549


In [30]:
import os
from transformers import BertForSequenceClassification

# Function to get the latest checkpoint
def get_latest_checkpoint(directory="./results"):
    checkpoints = [d for d in os.listdir(directory) if d.startswith("checkpoint-")]
    if not checkpoints:
        raise ValueError(f"No checkpoints found in {directory}")
    latest_checkpoint = sorted(checkpoints, key=lambda x: int(x.split('-')[-1]))[-1]
    return os.path.join(directory, latest_checkpoint)

# Load the best model checkpoint for English and Spanish
latest_checkpoint_en = get_latest_checkpoint("./results/en")
latest_checkpoint_es = get_latest_checkpoint("./results/es")

print(f"Using latest checkpoint for English: {latest_checkpoint_en}")
print(f"Using latest checkpoint for Spanish: {latest_checkpoint_es}")

# Load models
model_en = BertForSequenceClassification.from_pretrained(latest_checkpoint_en)
model_es = BertForSequenceClassification.from_pretrained(latest_checkpoint_es)


Using latest checkpoint for English: ./results/en/checkpoint-1220
Using latest checkpoint for Spanish: ./results/es/checkpoint-1220


# This is Soft Soft

In [31]:
def predict_on_dev(tweets, ids, model, tokenizer, label_classes, output_file):
    model.eval()
    results = []

    for tweet, tweet_id in zip(tweets, ids):
        encoding = tokenizer(tweet, truncation=True, padding="max_length", max_length=256, return_tensors="pt")

        with torch.no_grad():
            outputs = model(**encoding)

        logits = outputs.logits.squeeze()
        probs = torch.sigmoid(logits).cpu().numpy()

        # Convert probabilities to dictionary format and sort by highest probability
        soft_label_dict = {label_classes[i]: float(probs[i]) for i in range(len(label_classes))}
        sorted_soft_label_dict = dict(sorted(soft_label_dict.items(), key=lambda item: item[1], reverse=True))  # Sort descending

        results.append({
            "test_case": "EXIST2025",
            "id": tweet_id,
            "value": sorted_soft_label_dict  # Rename "soft_label" to "value" and sort it
        })

    # Save results
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)

    print(f"Predictions saved to {output_file}")
    
label_classes = CORRECT_LABELS


# Run predictions
predict_on_dev(english_dev_tweets, english_dev_ids, model_en, tokenizer, label_classes, "EXIST2025_dev_predictions_en.json")
predict_on_dev(spanish_dev_tweets, spanish_dev_ids, model_es, tokenizer, label_classes, "EXIST2025_dev_predictions_es.json")


Predictions saved to EXIST2025_dev_predictions_en.json
Predictions saved to EXIST2025_dev_predictions_es.json


# This be Hard Hard

In [32]:
def predict_hard_labels_from_soft_model(tweets, ids, model, tokenizer, label_classes, output_file, threshold=0.33):
    """
    Uses the soft model to predict hard labels by applying a threshold.
    - Labels are assigned if their probability > threshold.
    - If no labels pass the threshold, assigns "NO".
    """
    model.eval()
    results = []

    for tweet, tweet_id in zip(tweets, ids):
        encoding = tokenizer(tweet, truncation=True, padding="max_length", max_length=256, return_tensors="pt")

        with torch.no_grad():
            outputs = model(**encoding)

        logits = outputs.logits.squeeze()
        probs = torch.sigmoid(logits).cpu().numpy()

        # Convert probabilities to hard labels using threshold
        hard_labels = [label_classes[i] for i, prob in enumerate(probs) if prob > threshold]

        # If no labels meet the threshold, assign "NO"
        if not hard_labels:
            hard_labels = ["NO"]

        results.append({
            "test_case": "EXIST2025",
            "id": tweet_id,
            "value": hard_labels  # Final hard labels
        })

    # Save results
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)

    print(f"Hard label predictions saved to {output_file}")

# Run hard label prediction using soft model
predict_hard_labels_from_soft_model(english_dev_tweets, english_dev_ids, model_en, tokenizer, label_classes, "EXIST2025_dev_predictions_hard_en.json")
predict_hard_labels_from_soft_model(spanish_dev_tweets, spanish_dev_ids, model_es, tokenizer, label_classes, "EXIST2025_dev_predictions_hard_es.json")


Hard label predictions saved to EXIST2025_dev_predictions_hard_en.json
Hard label predictions saved to EXIST2025_dev_predictions_hard_es.json


# Merging soft models ka dev set predictions

In [33]:
import json

# Load the Spanish predictions
with open("/kaggle/working/EXIST2025_dev_predictions_es.json", "r", encoding="utf-8") as f:
    es_data = json.load(f)

# Load the English predictions
with open("/kaggle/working/EXIST2025_dev_predictions_en.json", "r", encoding="utf-8") as f:
    en_data = json.load(f)

# Assuming both files contain lists of predictions, merge them
if isinstance(es_data, list) and isinstance(en_data, list):
    merged_data = es_data + en_data
else:
    raise ValueError("JSON structure is not a list. Ensure both files contain lists.")

# Save to a new file
output_filename = "EXIST2025_dev_predictions_merged_soft.json"
with open(output_filename, "w", encoding="utf-8") as f:
    json.dump(merged_data, f, indent=4, ensure_ascii=False)

print(f"Merging complete! Saved to {output_filename}")


Merging complete! Saved to EXIST2025_dev_predictions_merged_soft.json


In [34]:
import json
import numpy as np

# Load your predictions file
with open('EXIST2025_dev_predictions_merged_soft.json', 'r') as f:
    predictions = json.load(f)

# Define the snapping values (multiples of 1/6)
snap_vals = np.array([i / 6 for i in range(7)])  # [0.0, 0.1667, ..., 1.0]

def snap_to_nearest_sixth(value):
    return float(snap_vals[np.argmin(np.abs(snap_vals - value))])

# Snap each value in the 'value' dict
for entry in predictions:
    entry['value'] = {k: snap_to_nearest_sixth(v) for k, v in entry['value'].items()}

# Save the snapped predictions to a new file
with open('EXIST2025_dev_predictions_snapped_soft.json', 'w') as f:
    json.dump(predictions, f, indent=2)


# Merging hard models ka dev set predictions

In [35]:
import json

# Load the Spanish predictions
with open("/kaggle/working/EXIST2025_dev_predictions_hard_es.json", "r", encoding="utf-8") as f:
    es_data = json.load(f)

# Load the English predictions
with open("/kaggle/working/EXIST2025_dev_predictions_hard_en.json", "r", encoding="utf-8") as f:
    en_data = json.load(f)

# Assuming both files contain lists of predictions, merge them
if isinstance(es_data, list) and isinstance(en_data, list):
    merged_data = es_data + en_data
else:
    raise ValueError("JSON structure is not a list. Ensure both files contain lists.")

# Save to a new file
output_filename = "EXIST2025_dev_predictions_merged_hard.json"
with open(output_filename, "w", encoding="utf-8") as f:
    json.dump(merged_data, f, indent=4, ensure_ascii=False)

print(f"Merging complete! Saved to {output_filename}")


Merging complete! Saved to EXIST2025_dev_predictions_merged_hard.json


# Metric Calculation

In [36]:
pip install pyevall

Note: you may need to restart the kernel to use updated packages.


# This is hard metrics

In [37]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils

predictions = "/kaggle/working/EXIST2025_dev_predictions_merged_hard.json"         
gold = "/kaggle/input/exist2025-all/EXIST2025_dev_task1_3_gold_hard.json" 
test = PyEvALLEvaluation() 
params= dict() 
params[PyEvALLUtils.PARAM_REPORT]= PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED  
# metrics=["ICMSoft", "ICMSoftNorm", "CrossEntropy"]     # for soft    
metrics=["ICM", "ICMNorm" ,"FMeasure"] 
TASK1_3_HIERARCHY = {"YES":["IDEOLOGICAL-INEQUALITY","STEREOTYPING-DOMINANCE","OBJECTIFICATION", "SEXUAL-VIOLENCE", "MISOGYNY-NON-SEXUAL-VIOLENCE"], "NO":[]}
params[PyEvALLUtils.PARAM_HIERARCHY]= TASK1_3_HIERARCHY  
report= test.evaluate(predictions, gold, metrics, **params) 
report.print_report()



2025-04-24 08:54:16,800 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM', 'ICMNorm', 'FMeasure']
2025-04-24 08:54:16,986 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-24 08:54:17,489 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Normalized evaluation method
2025-04-24 08:54:17,492 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-24 08:54:18,058 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-04-24 08:54:18,546 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method
{
  "metrics": {
    "ICM": {
      "name": "Information Contrast model",
      "acronym": "ICM",
      "description": "Coming soon!",
      "status": "OK",
      "results": {
        "test_cases": [{
          "name": "EXIST2025",
          "average": -0.26853

# These are soft metrics

In [39]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils

# Define file paths
predictions = "/kaggle/working/EXIST2025_dev_predictions_merged_soft.json"  # Change to your actual prediction file
gold = "/kaggle/input/exist2025-all/EXIST2025_dev_task1_3_gold_soft.json"   # Change to your actual gold file

# Define hierarchical structure for subtask 1.3
TASK1_3_HIERARCHY = {
    "YES": ["IDEOLOGICAL-INEQUALITY", "STEREOTYPING-DOMINANCE",
            "OBJECTIFICATION", "SEXUAL-VIOLENCE", "MISOGYNY-NON-SEXUAL-VIOLENCE"],
    "NO": []
}

# Initialize PyEvALL evaluation
evaluator = PyEvALLEvaluation()

# Set evaluation parameters
params = dict()
params[PyEvALLUtils.PARAM_HIERARCHY] = TASK1_3_HIERARCHY
params[PyEvALLUtils.PARAM_REPORT] = PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED  # Embedded report

# Define evaluation metrics
# metrics = ["ICM", "ICMNorm", "FMeasure"]
metrics=["ICMSoft", "ICMSoftNorm"]

# Run evaluation
report = evaluator.evaluate(predictions, gold, metrics, **params)

# Print evaluation report
report.print_report()


2025-04-24 08:57:36,133 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICMSoft', 'ICMSoftNorm']
2025-04-24 08:57:36,483 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Soft evaluation method
2025-04-24 08:57:37,877 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM-Soft Normalized evaluation method
2025-04-24 08:57:37,880 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Soft evaluation method
2025-04-24 08:57:39,252 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Soft evaluation method
{
  "metrics": {
    "ICMSoft": {
      "name": "Information Contrast Model Soft",
      "acronym": "ICM-Soft",
      "description": "Coming soon!",
      "status": "OK",
      "results": {
        "test_cases": [{
          "name": "EXIST2025",
          "average": -2.98347663559963
        }],
        "average_per_test_case": -2.98347663559963
      }
    },
   