In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/exist2025/EXIST2025_training.json
/kaggle/input/exist2025/EXIST2025_dev.json
/kaggle/input/exist2025-all/EXIST2025_dev_task1_3_majority_class_soft.json
/kaggle/input/exist2025-all/EXIST2025_dev_task1_3_minority_class_hard.json
/kaggle/input/exist2025-all/EXIST2025_training_task1_3_minority_class_hard.json
/kaggle/input/exist2025-all/EXIST2025_training.json
/kaggle/input/exist2025-all/EXIST2025_dev_task1_3_minority_class_soft.json
/kaggle/input/exist2025-all/EXIST2025_training_task1_3_majority_class_soft.json
/kaggle/input/exist2025-all/EXIST2025_training_task1_3_majority_class_hard.json
/kaggle/input/exist2025-all/EXIST2025_training_task1_3_minority_class_soft.json
/kaggle/input/exist2025-all/EXIST2025_dev_task1_3_majority_class_hard.json
/kaggle/input/exist2025-all/EXIST2025_dev.json
/kaggle/input/exist2025-all/EXIST2025_dev_task1_3_gold_hard.json
/kaggle/input/exist2025-all/EXIST2025_training_task1_3_gold_soft.json
/kaggle/input/exist2025-all/EXIST2025_dev_task1_3_gold_

In [2]:
import wandb

wandb.login(key="0c5f368f1f51fd942ec7bb3a1c74efb7bdc832d6")


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmshoaibvohra[0m ([33mmshoaibvohra-habib-university[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [3]:
import json
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from collections import Counter

# Load the dataset
with open("/kaggle/input/exist2025/EXIST2025_training.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Define correct label classes
CORRECT_LABELS = [
    "IDEOLOGICAL-INEQUALITY",
    "MISOGYNY-NON-SEXUAL-VIOLENCE",
    "OBJECTIFICATION",
    "SEXUAL-VIOLENCE",
    "STEREOTYPING-DOMINANCE",
    "NO"  # Represents non-sexist tweets
]

# Assign-majority-label function (Threshold = 1)
def assign_majority_label(labels_list):
    """
    Determines which labels are assigned based on majority voting.
    - If a label appears more than once, it is included.
    - If no labels pass the threshold, "NO" is assigned.
    """
    # Flatten and replace "-" with "NO"
    flat_labels = [label if label != "-" else "NO" for sublist in labels_list for label in sublist if label != "UNKNOWN"]

    # Count label occurrences
    label_counts = Counter(flat_labels)

    # Select labels with more than 1 vote
    majority_labels = [label for label, count in label_counts.items() if count > 1]

    return majority_labels if majority_labels else ["NO"]

# Extract relevant fields
def process_data(data, lang):
    tweets = []
    labels = []
    ids = []

    for entry in data.values():
        if entry["lang"] == lang:
            tweet_id = entry["id_EXIST"]
            tweet = entry["tweet"]
            is_sexist = any(label == "YES" for label in entry["labels_task1_1"])  # Check if at least one annotator marked it sexist
            label = entry["labels_task1_3"] if is_sexist else [["NO"]]  # Non-sexist tweets get "NO"

            # Get majority labels
            majority_labels = assign_majority_label(label)

            tweets.append(tweet)
            labels.append(majority_labels)
            ids.append(tweet_id)

    return tweets, labels, ids

# Process data for English and Spanish
english_tweets, english_labels, english_ids = process_data(data, "en")
spanish_tweets, spanish_labels, spanish_ids = process_data(data, "es")

# MultiLabel Binarizer with Fixed Labels
mlb = MultiLabelBinarizer(classes=CORRECT_LABELS)  # Force correct label order
english_labels_bin = mlb.fit_transform(english_labels)
spanish_labels_bin = mlb.transform(spanish_labels)  # Use the same binarizer

label_classes = mlb.classes_
print(f"Corrected Label Classes: {label_classes}")  # Debugging

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Custom Dataset Class
class TweetDataset(Dataset):
    def __init__(self, texts, labels, ids, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tweet_id = self.ids[idx]
        labels = torch.tensor(self.labels[idx], dtype=torch.float)
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        return {
            "id": tweet_id,
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": labels
        }

# Split into train/test
def get_datasets(tweets, labels, ids):
    train_texts, val_texts, train_labels, val_labels, train_ids, val_ids = train_test_split(tweets, labels, ids, test_size=0.2, random_state=42)
    train_dataset = TweetDataset(train_texts, train_labels, train_ids, tokenizer)
    val_dataset = TweetDataset(val_texts, val_labels, val_ids, tokenizer)
    return train_dataset, val_dataset

train_dataset_en, val_dataset_en = get_datasets(english_tweets, english_labels_bin, english_ids)
train_dataset_es, val_dataset_es = get_datasets(spanish_tweets, spanish_labels_bin, spanish_ids)


Corrected Label Classes: ['IDEOLOGICAL-INEQUALITY' 'MISOGYNY-NON-SEXUAL-VIOLENCE' 'OBJECTIFICATION'
 'SEXUAL-VIOLENCE' 'STEREOTYPING-DOMINANCE' 'NO']


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [4]:

# Define and Train Model for English
model_en = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased", 
    num_labels=len(label_classes), 
    problem_type="multi_label_classification"
)

training_args_en = TrainingArguments(
    output_dir="./results/en",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

trainer_en = Trainer(
    model=model_en,
    args=training_args_en,
    train_dataset=train_dataset_en,
    eval_dataset=val_dataset_en
)

# Train English Model
trainer_en.train()


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Epoch,Training Loss,Validation Loss
1,0.4619,0.44781
2,0.4648,0.449641
3,0.4778,0.448908
4,0.4467,0.448907




TrainOutput(global_step=652, training_loss=0.47103177221274817, metrics={'train_runtime': 422.2682, 'train_samples_per_second': 24.705, 'train_steps_per_second': 1.544, 'total_flos': 1372436553203712.0, 'train_loss': 0.47103177221274817, 'epoch': 4.0})

In [5]:

# Define and Train Model for Spanish
model_es = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased", 
    num_labels=len(label_classes), 
    problem_type="multi_label_classification"
)

training_args_es = TrainingArguments(
    output_dir="./results/es",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

trainer_es = Trainer(
    model=model_es,
    args=training_args_es,
    train_dataset=train_dataset_es,
    eval_dataset=val_dataset_es
)

# Train Spanish Model
trainer_es.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.4588,0.448043
2,0.4434,0.409747
3,0.334,0.404565




TrainOutput(global_step=549, training_loss=0.42653483206674264, metrics={'train_runtime': 356.1925, 'train_samples_per_second': 24.661, 'train_steps_per_second': 1.541, 'total_flos': 1155625257222144.0, 'train_loss': 0.42653483206674264, 'epoch': 3.0})

In [6]:
with open("/kaggle/input/exist2025-all/EXIST2025_dev.json", "r", encoding="utf-8") as f:
    dev_data = json.load(f)

# Extract tweets and IDs
dev_tweets = [entry["tweet"] for entry in dev_data.values()]
dev_ids = [entry["id_EXIST"] for entry in dev_data.values()]


In [7]:
import json

# Load the dev dataset
with open("/kaggle/input/exist2025-all/EXIST2025_dev.json", "r", encoding="utf-8") as f:
    dev_data = json.load(f)

# Split into English & Spanish
english_dev_tweets = []
english_dev_ids = []
spanish_dev_tweets = []
spanish_dev_ids = []

for entry in dev_data.values():
    tweet_id = entry["id_EXIST"]
    tweet = entry["tweet"]
    lang = entry["lang"]

    if lang == "en":
        english_dev_tweets.append(tweet)
        english_dev_ids.append(tweet_id)
    elif lang == "es":
        spanish_dev_tweets.append(tweet)
        spanish_dev_ids.append(tweet_id)

# Debugging: Check split sizes
print(f"English Dev Samples: {len(english_dev_tweets)}")
print(f"Spanish Dev Samples: {len(spanish_dev_tweets)}")


English Dev Samples: 489
Spanish Dev Samples: 549


In [8]:
import os
from transformers import BertForSequenceClassification

# Function to get the latest checkpoint
def get_latest_checkpoint(directory="./results"):
    checkpoints = [d for d in os.listdir(directory) if d.startswith("checkpoint-")]
    if not checkpoints:
        raise ValueError(f"No checkpoints found in {directory}")
    latest_checkpoint = sorted(checkpoints, key=lambda x: int(x.split('-')[-1]))[-1]
    return os.path.join(directory, latest_checkpoint)

# Load the best model checkpoint for English and Spanish
latest_checkpoint_en = get_latest_checkpoint("./results/en")
latest_checkpoint_es = get_latest_checkpoint("./results/es")

print(f"Using latest checkpoint for English: {latest_checkpoint_en}")
print(f"Using latest checkpoint for Spanish: {latest_checkpoint_es}")

# Load models
model_en = BertForSequenceClassification.from_pretrained(latest_checkpoint_en)
model_es = BertForSequenceClassification.from_pretrained(latest_checkpoint_es)


Using latest checkpoint for English: ./results/en/checkpoint-652
Using latest checkpoint for Spanish: ./results/es/checkpoint-549


In [10]:
def predict_on_dev(tweets, ids, model, tokenizer, label_classes, output_file):
    model.eval()
    results = []

    for tweet, tweet_id in zip(tweets, ids):
        encoding = tokenizer(tweet, truncation=True, padding="max_length", max_length=256, return_tensors="pt")

        with torch.no_grad():
            outputs = model(**encoding)

        logits = outputs.logits.squeeze()
        probs = torch.sigmoid(logits).cpu().numpy()

        # Select labels with the highest probabilities (let the model decide)
        hard_labels = [label_classes[i] for i, prob in enumerate(probs) if prob > 0.5]  # Default threshold

        # If no labels meet the threshold, assign "NO"
        if not hard_labels:
            hard_labels = ["NO"]

        results.append({
            "test_case": "EXIST2025",
            "id": tweet_id,
            "value": hard_labels  # The model's final predicted labels
        })

    # Save results
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)

    print(f"Predictions saved to {output_file}")

# Run hard-label predictions using the pretrained model
predict_on_dev(english_dev_tweets, english_dev_ids, model_en, tokenizer, label_classes, "EXIST2025_dev_predictions_hard_en.json")
predict_on_dev(spanish_dev_tweets, spanish_dev_ids, model_es, tokenizer, label_classes, "EXIST2025_dev_predictions_hard_es.json")


Predictions saved to EXIST2025_dev_predictions_hard_en.json
Predictions saved to EXIST2025_dev_predictions_hard_es.json


In [31]:
import json

# Load the Spanish predictions
with open("/kaggle/working/EXIST2025_dev_predictions_es.json", "r", encoding="utf-8") as f:
    es_data = json.load(f)

# Load the English predictions
with open("/kaggle/working/EXIST2025_dev_predictions_en.json", "r", encoding="utf-8") as f:
    en_data = json.load(f)

# Assuming both files contain lists of predictions, merge them
if isinstance(es_data, list) and isinstance(en_data, list):
    merged_data = es_data + en_data
else:
    raise ValueError("JSON structure is not a list. Ensure both files contain lists.")

# Save to a new file
output_filename = "EXIST2025_dev_predictions_merged2.json"
with open(output_filename, "w", encoding="utf-8") as f:
    json.dump(merged_data, f, indent=4, ensure_ascii=False)

print(f"Merging complete! Saved to {output_filename}")


Merging complete! Saved to EXIST2025_dev_predictions_merged2.json


In [23]:
import json
import numpy as np

# File paths
predictions_file = "/kaggle/working/EXIST2025_dev_predictions_merged.json"
gold_labels_file = "/kaggle/input/exist2025-all/EXIST2025_dev_task1_3_gold_soft.json"

# Load predictions
with open(predictions_file, "r", encoding="utf-8") as f:
    predictions_data = json.load(f)

# Load gold labels
with open(gold_labels_file, "r", encoding="utf-8") as f:
    gold_data = json.load(f)

# Convert gold labels into a dictionary for quick lookup
gold_dict = {entry["id"]: entry["value"] for entry in gold_data}

# Extract all category names
categories = ["IDEOLOGICAL-INEQUALITY", "MISOGYNY-NON-SEXUAL-VIOLENCE", 
              "OBJECTIFICATION", "SEXUAL-VIOLENCE", "STEREOTYPING-DOMINANCE", "NO"]

# Compute metrics
icm_soft_values = []
icm_soft_norm_values = []

for entry in predictions_data:
    pred_id = entry["id"]
    if pred_id in gold_dict:
        pred_values = np.array([entry["value"][cat] for cat in categories])
        gold_values = np.array([gold_dict[pred_id][cat] for cat in categories])

        # ICM Soft (Mean Squared Error)
        mse = np.mean((pred_values - gold_values) ** 2)
        icm_soft_values.append(mse)

        # ICM Soft Norm (MSE normalized by gold label mean)
        norm_factor = np.mean(gold_values ** 2)
        icm_soft_norm_values.append(mse / norm_factor if norm_factor != 0 else mse)

# Final aggregated scores
final_icm_soft = np.mean(icm_soft_values)
final_icm_soft_norm = np.mean(icm_soft_norm_values)

print(f"ICM Soft Score: {final_icm_soft:.4f}")
print(f"ICM Soft Norm Score: {final_icm_soft_norm:.4f}")


ICM Soft Score: 0.1570
ICM Soft Norm Score: 1.7035


In [None]:
import json
import numpy as np
from scipy.stats import norm

# File paths
predictions_file = "/kaggle/working/EXIST2025_dev_predictions_merged1.json"
gold_labels_file = "/kaggle/input/exist2025-all/EXIST2025_dev_task1_3_gold_soft.json"

# Load predictions
with open(predictions_file, "r", encoding="utf-8") as f:
    predictions_data = json.load(f)

# Load gold labels
with open(gold_labels_file, "r", encoding="utf-8") as f:
    gold_data = json.load(f)

# Convert gold labels into a dictionary for quick lookup
gold_dict = {entry["id"]: entry["value"] for entry in gold_data}

# Extract all category names
categories = ["IDEOLOGICAL-INEQUALITY", "MISOGYNY-NON-SEXUAL-VIOLENCE", 
              "OBJECTIFICATION", "SEXUAL-VIOLENCE", "STEREOTYPING-DOMINANCE", "NO"]

# Compute mean and std for each category in gold labels
category_means = {cat: np.mean([gold_dict[d][cat] for d in gold_dict]) for cat in categories}
category_stds = {cat: np.std([gold_dict[d][cat] for d in gold_dict]) for cat in categories}

# Function to compute Information Content (IC)
def compute_ic(value, category):
    mean = category_means[category]
    std = category_stds[category]

    if std == 0:
        return 0  # No IC if all values are the same

    # Probability of instances in gold standard exceeding the value
    prob = 1 - norm.cdf(value, mean, std)
    prob = max(prob, 1e-10)  # Avoid log(0)
    
    return -np.log2(prob)

# Compute metrics
icm_soft_values = []
icm_soft_norm_values = []

for entry in predictions_data:
    pred_id = entry["id"]
    if pred_id in gold_dict:
        pred_values = {cat: entry["soft_label"][cat] for cat in categories}
        gold_values = {cat: gold_dict[pred_id][cat] for cat in categories}

        # Compute IC for system output, gold standard, and their union
        ic_pred = sum(compute_ic(pred_values[cat], cat) for cat in categories)
        ic_gold = sum(compute_ic(gold_values[cat], cat) for cat in categories)
        
        # Fuzzy union (max values)
        union_values = {cat: max(pred_values[cat], gold_values[cat]) for cat in categories}
        ic_union = sum(compute_ic(union_values[cat], cat) for cat in categories)

        # Compute ICM Soft
        icm_score = 2 * ic_pred + 2 * ic_gold - 3 * ic_union
        icm_score = max(icm_score, 0)  # Truncate negative scores to 0
        icm_soft_values.append(icm_score)

        # Compute ICM Soft Norm
        icm_norm_score = icm_score / ic_gold if ic_gold != 0 else icm_score
        icm_soft_norm_values.append(icm_norm_score)

# Final aggregated scores
final_icm_soft = np.mean(icm_soft_values)
final_icm_soft_norm = np.mean(icm_soft_norm_values)

print(f"ICM Soft Score: {final_icm_soft:.4f}")
print(f"ICM Soft Norm Score: {final_icm_soft_norm:.4f}")


In [20]:
pip install pyevall

Note: you may need to restart the kernel to use updated packages.


In [32]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils

predictions = "/kaggle/working/EXIST2025_dev_predictions_merged2.json"         
gold = "/kaggle/input/exist2025-all/EXIST2025_dev_task1_3_gold_soft.json" 
test = PyEvALLEvaluation() 
params= dict() 
params[PyEvALLUtils.PARAM_REPORT]= PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED  
metrics=["ICMSoft", "ICMSoftNorm", "CrossEntropy"]     # for soft    
# metrics=["ICM", "ICMNorm" ,"FMeasure"]           # for hard     
report= test.evaluate(predictions, gold, metrics, **params) 
report.print_report()

2025-03-26 16:06:03,206 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICMSoft', 'ICMSoftNorm', 'CrossEntropy']
2025-03-26 16:06:03,561 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Soft evaluation method
2025-03-26 16:06:04,703 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM-Soft Normalized evaluation method
2025-03-26 16:06:04,707 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Soft evaluation method
2025-03-26 16:06:05,821 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Soft evaluation method
2025-03-26 16:06:07,330 - pyevall.metrics.metrics - INFO -             evaluate() - Executing Cross Entropy evaluation method
{
  "metrics": {
    "ICMSoft": {
      "name": "Information Contrast Model Soft",
      "acronym": "ICM-Soft",
      "description": "Coming soon!",
      "status": "OK",
      "results": {
        "test_cases": [{
         