In [1]:
## File cleaning
import json

# Input and output files
input_files = ['/kaggle/input/dataset/EXIST2025_training.json', '/kaggle/input/dataset/EXIST2025_dev.json', '/kaggle/input/testing/EXIST2025_test_clean.json'] 

for infile in input_files:
    with open(infile, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Containers for language-specific cleaned data
    data_en = {}
    data_es = {}

    for key, entry in data.items():
        # Remove unnecessary fields
        entry.pop('labels_task1_2', None)
        entry.pop('labels_task1_3', None)

        # Split based on language
        if entry['lang'] == 'en':
            data_en[key] = entry
        elif entry['lang'] == 'es':
            data_es[key] = entry

    # Base name without path and extension
    base_name = infile.split('/')[-1].split('.')[0]

    # Save cleaned English and Spanish files in the writable `/kaggle/working/` directory
    with open(f"/kaggle/working/{base_name}_cleaned_en.json", 'w', encoding='utf-8') as f_en:
        json.dump(data_en, f_en, ensure_ascii=False, indent=2)

    with open(f"/kaggle/working/{base_name}_cleaned_es.json", 'w', encoding='utf-8') as f_es:
        json.dump(data_es, f_es, ensure_ascii=False, indent=2)

    print(f"{base_name}_cleaned_en.json and {base_name}_cleaned_es.json saved successfully.")

EXIST2025_training_cleaned_en.json and EXIST2025_training_cleaned_es.json saved successfully.
EXIST2025_dev_cleaned_en.json and EXIST2025_dev_cleaned_es.json saved successfully.
EXIST2025_test_clean_cleaned_en.json and EXIST2025_test_clean_cleaned_es.json saved successfully.


In [2]:
pip install transformers datasets pandas scikit-learn torch

Note: you may need to restart the kernel to use updated packages.


In [3]:
import wandb

wandb.login(key="a40bf999db96c982783dc52dd0594d3347848f02")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mfaisalsara124[0m ([33mfaisalsara124-habib-university[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [10]:
pip install PyEvALL

Collecting PyEvALL
  Downloading PyEvALL-0.1.78.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jsbeautifier==1.14.9 (from PyEvALL)
  Downloading jsbeautifier-1.14.9.tar.gz (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting setuptools==69.5.1 (from PyEvALL)
  Downloading setuptools-69.5.1-py3-none-any.whl.metadata (6.2 kB)
Collecting editorconfig>=0.12.2 (from jsbeautifier==1.14.9->PyEvALL)
  Downloading EditorConfig-0.17.0-py3-none-any.whl.metadata (3.8 kB)
Downloading setuptools-69.5.1-py3-none-any.whl (894 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m894.6/894.6 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading EditorConfig-0.17.0-py3-none-any.whl 

In [5]:
##Ensemble for english
##Models used: DistilRoberta, bert-base, roberta-base

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset
import pandas as pd
import numpy as np
import json
import os

def prepare_gold_dataset(clean_path, gold_path, output_path):
    with open(clean_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    with open(gold_path, "r", encoding="utf-8") as f:
        gold_labels = json.load(f)
    label_dict = {entry["id"]: 1 if entry["value"] == "YES" else 0 for entry in gold_labels}
    updated_data = {}
    for tweet_id, tweet_info in data.items():
        tweet_info = tweet_info.copy()
        gold_id = tweet_info.get("id_EXIST")
        if gold_id in label_dict:
            tweet_info["label"] = label_dict[gold_id]
        updated_data[tweet_id] = tweet_info
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(updated_data, f, indent=2, ensure_ascii=False)
    print(f"✅ Gold-labeled training set saved to {output_path}")

def load_and_tokenize_dataset(json_path, tokenizer, max_length=256):
    df = pd.read_json(json_path)
    df = df.T
    df = df[df['lang'] == 'en']
    df = df.dropna(subset=['label'])
    df = df[['tweet', 'label']].rename(columns={'tweet': 'text'})
    df = df.sample(frac=1, random_state=42)
    dataset = Dataset.from_pandas(df)
    return dataset.train_test_split(test_size=0.1)

def train_model(json_path, model_checkpoint, save_name):
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
    splits = load_and_tokenize_dataset(json_path, tokenizer)
    
    def preprocess(example):
        return tokenizer(example['text'], truncation=True, padding='max_length', max_length=256)
    
    train_ds = splits['train'].map(preprocess, batched=True)
    val_ds = splits['test'].map(preprocess, batched=True)

    training_args = TrainingArguments(
        output_dir=f"results/{save_name}",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir=f"logs/{save_name}",
        logging_steps=50,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
    )

    trainer.train()
    model.save_pretrained(f"{save_name}_sexism_classifier")
    tokenizer.save_pretrained(f"{save_name}_sexism_classifier")
    print(f"✅ Model saved to {save_name}_sexism_classifier")


In [6]:
prepare_gold_dataset(
    clean_path="/kaggle/input/translated/EXIST2025_training_translated_en.json",
    gold_path="/kaggle/input/gold-hard/EXIST2025_training_task1_1_gold_hard.json",
    output_path="EXIST2025_training_with_gold.json"
)
train_model("EXIST2025_training_with_gold.json", "distilroberta-base", "distilroberta-base")
train_model("EXIST2025_training_with_gold.json", "bert-base-uncased", "bert-base-uncased")
train_model("EXIST2025_training_with_gold.json", "roberta-base", "roberta-base")

✅ Gold-labeled training set saved to EXIST2025_training_with_gold.json


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2583 [00:00<?, ? examples/s]

Map:   0%|          | 0/287 [00:00<?, ? examples/s]

  trainer = Trainer(




Epoch,Training Loss,Validation Loss
1,0.4564,0.422307
2,0.3455,0.445668
3,0.209,0.455466




✅ Model saved to distilroberta-base_sexism_classifier


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2583 [00:00<?, ? examples/s]

Map:   0%|          | 0/287 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.4809,0.381859
2,0.2854,0.363987
3,0.1846,0.389703




✅ Model saved to bert-base-uncased_sexism_classifier


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2583 [00:00<?, ? examples/s]

Map:   0%|          | 0/287 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.5012,0.418501
2,0.3568,0.370732
3,0.2424,0.451769




✅ Model saved to roberta-base_sexism_classifier


In [None]:
##Checking how the models are doing individually

In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import json
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

# === Load Gold-Labeled Dev Set ===
df = pd.read_json("EXIST2025_dev_cleaned_en.json").T
df = df[df['lang'] == 'en']

# Load gold labels for DEV set only
with open("/kaggle/input/gold-hard-dev/EXIST2025_dev_task1_1_gold_hard (1).json", "r", encoding="utf-8") as f:
    gold = json.load(f)
label_map = {entry["id"]: 1 if entry["value"] == "YES" else 0 for entry in gold}

# Add labels to df
df['label'] = df['id_EXIST'].map(label_map)
df = df.dropna(subset=['label'])  # drop rows without gold labels
df['label'] = df['label'].astype(int)
df = df[['id_EXIST', 'tweet', 'label']].rename(columns={'id_EXIST': 'id', 'tweet': 'text'})

print(f"✅ Total gold-labeled dev tweets: {len(df)}")

# === Models to Evaluate ===
model_paths = {
    "DistilRoBERTa": "distilroberta-base_sexism_classifier",
    "BERT-base": "bert-base-uncased_sexism_classifier",
    "RoBERTa-base": "roberta-base_sexism_classifier"
}

# === Evaluate each model ===
for model_name, path in model_paths.items():
    tokenizer = AutoTokenizer.from_pretrained(path)
    model = AutoModelForSequenceClassification.from_pretrained(path)
    model.eval()

    preds = []
    for text in df['text']:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            logits = model(**inputs).logits
            pred = torch.argmax(logits, dim=1).item()
            preds.append(pred)

    true_labels = df['label'].tolist()
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, preds, average='binary')
    accuracy = accuracy_score(true_labels, preds)

    print(f"\n🔍 Evaluation for {model_name}")
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print("\n📄 Classification Report:")
    print(classification_report(true_labels, preds, target_names=["non-sexist", "sexist"]))


✅ Total gold-labeled dev tweets: 444

🔍 Evaluation for DistilRoBERTa
Accuracy:  0.8311
Precision: 0.8251
Recall:    0.7784
F1 Score:  0.8011

📄 Classification Report:
              precision    recall  f1-score   support

  non-sexist       0.84      0.87      0.85       250
      sexist       0.83      0.78      0.80       194

    accuracy                           0.83       444
   macro avg       0.83      0.83      0.83       444
weighted avg       0.83      0.83      0.83       444


🔍 Evaluation for BERT-base
Accuracy:  0.8063
Precision: 0.8971
Recall:    0.6289
F1 Score:  0.7394

📄 Classification Report:
              precision    recall  f1-score   support

  non-sexist       0.77      0.94      0.85       250
      sexist       0.90      0.63      0.74       194

    accuracy                           0.81       444
   macro avg       0.83      0.79      0.79       444
weighted avg       0.82      0.81      0.80       444


🔍 Evaluation for RoBERTa-base
Accuracy:  0.8423
Prec

In [None]:
## Checking Accuracy for the emsemble model

In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import json
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

# === Load Gold-Labeled Dev Set ===
df = pd.read_json("EXIST2025_dev_cleaned_en.json").T
df = df[df['lang'] == 'en']

with open("/kaggle/input/gold-hard-dev/EXIST2025_dev_task1_1_gold_hard (1).json", "r", encoding="utf-8") as f:
    gold = json.load(f)
label_map = {entry["id"]: 1 if entry["value"] == "YES" else 0 for entry in gold}

df['label'] = df['id_EXIST'].map(label_map)
df = df.dropna(subset=['label'])
df['label'] = df['label'].astype(int)
df = df[['id_EXIST', 'tweet', 'label']].rename(columns={'id_EXIST': 'id', 'tweet': 'text'})

print(f"✅ Total gold-labeled dev tweets: {len(df)}")

# === Ensemble models (equal weights by default) ===
model_paths = [
    "distilroberta-base_sexism_classifier",
    "bert-base-uncased_sexism_classifier",
    "roberta-base_sexism_classifier"
]

# weights = [0.5, 0.2, 0.3]  # Equal weighting
# weights = [0.4, 0.3, 0.3] 
# weights = [0.5, 0.1, 0.4] 
weights = [0.5, 0.1, 0.4] 
weights = np.array(weights) / sum(weights)  # Normalize

all_model_probs = []

# === Collect probabilities from each model ===
for path in model_paths:
    tokenizer = AutoTokenizer.from_pretrained(path)
    model = AutoModelForSequenceClassification.from_pretrained(path)
    model.eval()

    model_probs = []
    for text in df['text']:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            logits = model(**inputs).logits
            probs = torch.softmax(logits, dim=1).squeeze(0).cpu().numpy()
            model_probs.append(probs)

    all_model_probs.append(np.array(model_probs))  # shape: [num_samples, 2]

# === Weighted soft voting ===
all_model_probs = np.array(all_model_probs)  # shape: [num_models, num_samples, 2]
weighted_probs = np.average(all_model_probs, axis=0, weights=weights)  # shape: [num_samples, 2]
ensemble_preds = np.argmax(weighted_probs, axis=1)

# === Evaluation ===
true_labels = df['label'].tolist()
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, ensemble_preds, average='binary')
accuracy = accuracy_score(true_labels, ensemble_preds)

print("\n🔗 Ensemble Evaluation on GOLD Dev Set (Soft Voting, Equal Weights):")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")
print("\n📄 Classification Report:")
print(classification_report(true_labels, ensemble_preds, target_names=["non-sexist", "sexist"]))

# === Save Output for PyEvALL ===
output = []
for tweet_id, pred in zip(df['id'], ensemble_preds):
    output.append({
        "test_case": "EXIST2025",
        "id": str(tweet_id),
        "value": "YES" if pred == 1 else "NO"
    })

# Sort by ID to ensure consistency
output_sorted = sorted(output, key=lambda x: int(x["id"]))

# Save to JSON
with open("ensemble_predictions_output.json", "w", encoding="utf-8") as f:
    json.dump(output_sorted, f, indent=2, ensure_ascii=False)

print("✅ Predictions saved for PyEvALL evaluation")


✅ Total gold-labeled dev tweets: 444

🔗 Ensemble Evaluation on GOLD Dev Set (Soft Voting, Equal Weights):
Accuracy:  0.8559
Precision: 0.8916
Recall:    0.7629
F1 Score:  0.8222

📄 Classification Report:
              precision    recall  f1-score   support

  non-sexist       0.83      0.93      0.88       250
      sexist       0.89      0.76      0.82       194

    accuracy                           0.86       444
   macro avg       0.86      0.85      0.85       444
weighted avg       0.86      0.86      0.85       444

✅ Predictions saved for PyEvALL evaluation


In [None]:
##ICM Score ensemble model predictions english

In [13]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils

predictions = "ensemble_predictions_output.json"
gold = "/kaggle/input/gold-hard-dev/EXIST2025_dev_task1_1_gold_hard (1).json"

# Initialize evaluator
evaluator = PyEvALLEvaluation()

# Set parameters
params = {
    PyEvALLUtils.PARAM_REPORT: PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED
}

# Choose metrics (ICM for hard labels)
metrics = ["ICM", "ICMNorm", "FMeasure"]  # You can also try ICMSoft for soft scores

# Run evaluation
report = evaluator.evaluate(predictions, gold, metrics, **params)
report.print_report()


2025-05-08 08:04:45,947 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM', 'ICMNorm', 'FMeasure']
2025-05-08 08:04:46,037 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-08 08:04:46,385 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Normalized evaluation method
2025-05-08 08:04:46,389 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-08 08:04:46,722 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-08 08:04:47,110 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method
{
  "metrics": {
    "ICM": {
      "name": "Information Contrast model",
      "acronym": "ICM",
      "description": "Coming soon!",
      "status": "OK",
      "results": {
        "test_cases": [{
          "name": "EXIST2025",
          "average": -0.25954

In [None]:
## Trying AEDA+ensemble for english

In [6]:
pip install -q nltk

Note: you may need to restart the kernel to use updated packages.


In [12]:
import random

# Set of punctuations AEDA uses
PUNCTUATIONS = ['.', ',', '!', '?', ';', ':']

def aeda(sentence, punc_ratio=0.3, max_insert=3):
    words = sentence.split()
    n = len(words)
    num_puncs = min(max_insert, max(1, int(punc_ratio * n)))

    insert_positions = random.sample(range(n), num_puncs)
    for pos in insert_positions:
        punct = random.choice(PUNCTUATIONS)
        words[pos] = words[pos] + punct
    return ' '.join(words)


In [14]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset
import pandas as pd
import numpy as np
import json
import os

def prepare_gold_dataset(clean_path, gold_path, output_path):
    with open(clean_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    with open(gold_path, "r", encoding="utf-8") as f:
        gold_labels = json.load(f)
    label_dict = {entry["id"]: 1 if entry["value"] == "YES" else 0 for entry in gold_labels}
    updated_data = {}
    for tweet_id, tweet_info in data.items():
        tweet_info = tweet_info.copy()
        gold_id = tweet_info.get("id_EXIST")
        if gold_id in label_dict:
            tweet_info["label"] = label_dict[gold_id]
        updated_data[tweet_id] = tweet_info
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(updated_data, f, indent=2, ensure_ascii=False)
    print(f"✅ Gold-labeled training set saved to {output_path}")

def load_and_tokenize_dataset(json_path, tokenizer, max_length=256, apply_aeda=False):
    df = pd.read_json(json_path).T
    df = df[df['lang'] == 'en']
    df = df.dropna(subset=['label'])
    df = df[['tweet', 'label']].rename(columns={'tweet': 'text'})

    if apply_aeda:
        augmented_rows = []
        for _, row in df.iterrows():
            aug_text = aeda(row['text'])
            augmented_rows.append({'text': aug_text, 'label': row['label']})
        aug_df = pd.DataFrame(augmented_rows)
        df = pd.concat([df, aug_df], ignore_index=True)  # add augmented examples

    df = df.sample(frac=1, random_state=42)
    dataset = Dataset.from_pandas(df)
    return dataset.train_test_split(test_size=0.1)


def train_model(json_path, model_checkpoint, save_name, use_aeda=False):
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
    splits = load_and_tokenize_dataset(json_path, tokenizer, apply_aeda=True)
    
    def preprocess(example):
        return tokenizer(example['text'], truncation=True, padding='max_length', max_length=256)
    
    train_ds = splits['train'].map(preprocess, batched=True)
    val_ds = splits['test'].map(preprocess, batched=True)

    training_args = TrainingArguments(
        output_dir=f"results/{save_name}",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir=f"logs/{save_name}",
        logging_steps=50,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
    )

    trainer.train()
    model.save_pretrained(f"{save_name}_sexism_classifier_aeda")
    tokenizer.save_pretrained(f"{save_name}_sexism_classifier_aeda")
    print(f"✅ Model saved to {save_name}_sexism_classifier_aeda")


In [None]:
prepare_gold_dataset(
    clean_path="/kaggle/input/translated/EXIST2025_training_translated_en.json",
    gold_path="/kaggle/input/gold-hard/EXIST2025_training_task1_1_gold_hard.json",
    output_path="EXIST2025_training_with_gold.json"
)
train_model("EXIST2025_training_with_gold.json", "distilroberta-base", "distilroberta-base", use_aeda=True)
train_model("EXIST2025_training_with_gold.json", "bert-base-uncased", "bert-base-uncased", use_aeda=True)
train_model("EXIST2025_training_with_gold.json", "roberta-base", "roberta-base", use_aeda=True)

✅ Gold-labeled training set saved to EXIST2025_training_with_gold.json


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/5166 [00:00<?, ? examples/s]

Map:   0%|          | 0/574 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.3893,0.319167
2,0.2195,0.231406




In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import json
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from scipy.optimize import minimize

# === Load Gold-Labeled Dev Set ===
df = pd.read_json("EXIST2025_dev_cleaned_en.json").T
df = df[df['lang'] == 'en']

with open("/kaggle/input/gold-hard-dev/EXIST2025_dev_task1_1_gold_hard (1).json", "r", encoding="utf-8") as f:
    gold = json.load(f)

label_map = {entry["id"]: 1 if entry["value"] == "YES" else 0 for entry in gold}
df['label'] = df['id_EXIST'].map(label_map)
df = df.dropna(subset=['label'])
df['label'] = df['label'].astype(int)
df = df[['id_EXIST', 'tweet', 'label']].rename(columns={'id_EXIST': 'id', 'tweet': 'text'})

print(f"✅ Total gold-labeled dev tweets: {len(df)}")

# === Load Predictions from Each Model ===
model_paths = [
    "distilroberta-base_sexism_classifier_aeda",
    "bert-base-uncased_sexism_classifier_aeda",
    "roberta-base_sexism_classifier_aeda"
]

all_model_probs = []

for path in model_paths:
    tokenizer = AutoTokenizer.from_pretrained(path)
    model = AutoModelForSequenceClassification.from_pretrained(path)
    model.eval()

    model_probs = []
    for text in df['text']:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            logits = model(**inputs).logits
            probs = torch.softmax(logits, dim=1).squeeze(0).cpu().numpy()
            model_probs.append(probs)

    all_model_probs.append(np.array(model_probs))

all_model_probs = np.array(all_model_probs)  # shape: [num_models, num_samples, 2]
true_labels = df['label'].tolist()

# === Define Objective Function for Optimization (maximize F1 → minimize -F1) ===
def evaluate_weights(weights, all_model_probs, true_labels):
    weights = np.array(weights)
    weights = weights / weights.sum()  # Normalize weights
    weighted_probs = np.average(all_model_probs, axis=0, weights=weights)
    preds = np.argmax(weighted_probs, axis=1)
    _, _, f1, _ = precision_recall_fscore_support(true_labels, preds, average='binary')
    return -f1  # Minimize negative F1

# === Optimize Weights ===
initial_weights = np.ones(len(model_paths)) / len(model_paths)
bounds = [(0, 1)] * len(model_paths)
constraints = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1}

result = minimize(
    evaluate_weights,
    initial_weights,
    args=(all_model_probs, true_labels),
    method='SLSQP',
    bounds=bounds,
    constraints=constraints
)

best_weights = result.x
print(f"\n Optimal Ensemble Weights: {best_weights}")

# === Use Optimal Weights to Make Final Prediction ===
weighted_probs = np.average(all_model_probs, axis=0, weights=best_weights)
ensemble_preds = np.argmax(weighted_probs, axis=1)

# === Evaluation ===
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, ensemble_preds, average='binary')
accuracy = accuracy_score(true_labels, ensemble_preds)

print("\n🔗 Ensemble Evaluation on GOLD Dev Set (Optimized Soft Voting):")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")
print("\n📄 Classification Report:")
print(classification_report(true_labels, ensemble_preds, target_names=["non-sexist", "sexist"]))

# === Save Output for PyEvALL ===
output = []
for tweet_id, pred in zip(df['id'], ensemble_preds):
    output.append({
        "test_case": "EXIST2025",
        "id": str(tweet_id),
        "value": "YES" if pred == 1 else "NO"
    })

output_sorted = sorted(output, key=lambda x: int(x["id"]))
with open("ensemble_predictions_output.json", "w", encoding="utf-8") as f:
    json.dump(output_sorted, f, indent=2, ensure_ascii=False)

print("✅ Predictions saved for PyEvALL evaluation")

In [None]:
## Results for AEDA+ensemble for english (Better than just ensemble)

In [7]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils

predictions = "/kaggle/input/ensemble-aeda/ensemble_predictions_output_aeda.json"
gold = "/kaggle/input/gold-hard-dev/EXIST2025_dev_task1_1_gold_hard (1).json"

# Initialize evaluator
evaluator = PyEvALLEvaluation()

# Set parameters
params = {
    PyEvALLUtils.PARAM_REPORT: PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED
}

# Choose metrics (ICM for hard labels)
metrics = ["ICM", "ICMNorm", "FMeasure"]  # You can also try ICMSoft for soft scores

# Run evaluation
report = evaluator.evaluate(predictions, gold, metrics, **params)
report.print_report()

2025-05-08 17:26:27,850 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM', 'ICMNorm', 'FMeasure']
2025-05-08 17:26:27,955 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-08 17:26:28,308 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Normalized evaluation method
2025-05-08 17:26:28,310 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-08 17:26:28,635 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-08 17:26:29,035 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method
cargado 29
{
  "metrics": {
    "ICM": {
      "name": "Information Contrast model",
      "acronym": "ICM",
      "description": "Coming soon!",
      "status": "OK",
      "results": {
        "test_cases": [{
          "name": "EXIST2025",
          "average

In [None]:
##For spanish (Base model was distilroberta-base)
##Ensemble model (distilroberta, bert-spanish, xlmroberta)

In [5]:
import json
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)

# === Step 1: Prepare gold-labeled Spanish training data ===
def prepare_gold_labeled_training_data(cleaned_path, gold_path, output_path):
    with open(cleaned_path, "r", encoding="utf-8") as f:
        cleaned_data = json.load(f)

    with open(gold_path, "r", encoding="utf-8") as f:
        gold_labels = json.load(f)

    # Map gold labels without checking for 'language'
    label_map = {entry["id"]: 1 if entry["value"] == "YES" else 0 for entry in gold_labels}

    updated_data = {}
    for tweet_id, tweet in cleaned_data.items():
        tweet = tweet.copy()
        id_ = tweet.get("id_EXIST")
        if tweet.get("lang") == "es" and id_ in label_map:
            tweet["label"] = label_map[id_]
            updated_data[tweet_id] = tweet

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(updated_data, f, indent=2, ensure_ascii=False)

    print(f"✅ Gold-labeled Spanish training data saved to: {output_path}")

# === Step 2: Generic training function ===
def train_model(json_path, model_checkpoint, save_name):
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

    df = pd.read_json(json_path)
    df = df.T
    df = df[df['lang'] == 'es']
    df = df.dropna(subset=['label'])
    df = df[['tweet', 'label']].rename(columns={'tweet': 'text'})
    df = df.sample(frac=1, random_state=42)

    dataset = Dataset.from_pandas(df)
    splits = dataset.train_test_split(test_size=0.1)

    def tokenize_fn(example):
        return tokenizer(example['text'], truncation=True, padding='max_length', max_length=256)

    train_ds = splits['train'].map(tokenize_fn, batched=True)
    val_ds = splits['test'].map(tokenize_fn, batched=True)

    training_args = TrainingArguments(
        output_dir=f"results/{save_name}_es",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir=f"logs/{save_name}_es",
        logging_steps=50,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        save_total_limit=1,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
    )

    trainer.train()

    model.save_pretrained(f"{save_name}_es_sexism_classifier")
    tokenizer.save_pretrained(f"{save_name}_es_sexism_classifier")
    print(f"✅ Model saved: {save_name}_es_sexism_classifier")

# === Step 3: Run everything ===
prepare_gold_labeled_training_data(
    "/kaggle/input/translated/EXIST2025_training_translated_es.json",
    "/kaggle/input/gold-hard/EXIST2025_training_task1_1_gold_hard.json",
    "EXIST2025_training_with_gold_es.json"
)

# Train all 3 models
train_model("EXIST2025_training_with_gold_es.json", "distilroberta-base", "distilroberta-base")
# train_model("EXIST2025_training_with_gold_es.json", "bert-base-uncased", "bert-base-uncased")
train_model("EXIST2025_training_with_gold_es.json", "dccuchile/bert-base-spanish-wwm-cased", "bert-spanish")
train_model("EXIST2025_training_with_gold_es.json", "xlm-roberta-base", "xlm-roberta-base")

✅ Gold-labeled Spanish training data saved to: EXIST2025_training_with_gold_es.json


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2874 [00:00<?, ? examples/s]

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

  trainer = Trainer(




Epoch,Training Loss,Validation Loss
1,0.6083,0.64548
2,0.5141,0.657212
3,0.4144,0.590656




✅ Model saved: distilroberta-base_es_sexism_classifier


tokenizer_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/480k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2874 [00:00<?, ? examples/s]

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.4846,0.460834
2,0.2676,0.4464
3,0.1141,0.752613




✅ Model saved: bert-spanish_es_sexism_classifier


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2874 [00:00<?, ? examples/s]

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.6537,0.544988
2,0.4162,0.465979
3,0.31,0.498115




✅ Model saved: xlm-roberta-base_es_sexism_classifier


In [6]:
train_model("EXIST2025_training_with_gold_es.json", "PlanTL-GOB-ES/roberta-base-bne", "roberta-bne")

tokenizer_config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/851k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/509k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.21M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2874 [00:00<?, ? examples/s]

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.4525,0.585962
2,0.2385,0.510444
3,0.0699,0.74542




✅ Model saved: roberta-bne_es_sexism_classifier


In [None]:
##Finds out the best weights for the models and gets the evaluations ensemble (Spanish)

In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import json
import numpy as np
from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score, classification_report
from itertools import product
from tqdm import tqdm

# === Load Spanish Dev Set ===
df = pd.read_json("EXIST2025_dev_cleaned_es.json").T
df = df[df['lang'] == 'es']

with open("/kaggle/input/gold-hard-dev/EXIST2025_dev_task1_1_gold_hard (1).json", "r", encoding="utf-8") as f:
    gold = json.load(f)

label_map = {entry["id"]: 1 if entry["value"] == "YES" else 0 for entry in gold}
df['label'] = df['id_EXIST'].map(label_map)
df = df.dropna(subset=['label'])
df['label'] = df['label'].astype(int)
df = df[['id_EXIST', 'tweet', 'label']].rename(columns={'id_EXIST': 'id', 'tweet': 'text'})

print(f"✅ Total Spanish dev samples with gold labels: {len(df)}")

# === Load all 3 trained Spanish models ===
model_paths = [
    # "distilroberta-base_es_sexism_classifier",
     "roberta-bne_es_sexism_classifier",
    # "bert-base-uncased_es_sexism_classifier",
    "bert-spanish_es_sexism_classifier",
    "xlm-roberta-base_es_sexism_classifier"
]

all_model_probs = []
for path in model_paths:
    tokenizer = AutoTokenizer.from_pretrained(path)
    model = AutoModelForSequenceClassification.from_pretrained(path)
    model.eval()

    probs = []
    for text in df['text']:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            logits = model(**inputs).logits
            prob = torch.softmax(logits, dim=1).squeeze(0).cpu().numpy()
            probs.append(prob)

    all_model_probs.append(np.array(probs))  # shape: [num_samples, 2]

# === Grid search to find best weights ===
all_model_probs = np.array(all_model_probs)  # shape: [3, num_samples, 2]
true_labels = np.array(df['label'].tolist())

search_space = np.arange(0.0, 1.1, 0.1)
best_f1 = 0
best_weights = None
best_preds = None

print("🔍 Grid searching for best weights (Spanish)...")
for w1, w2, w3 in tqdm(product(search_space, repeat=3)):
    weights = np.array([w1, w2, w3])
    if np.isclose(weights.sum(), 1.0):
        weighted_probs = np.average(all_model_probs, axis=0, weights=weights)
        preds = np.argmax(weighted_probs, axis=1)
        f1 = f1_score(true_labels, preds, average='binary')
        if f1 > best_f1:
            best_f1 = f1
            best_weights = weights
            best_preds = preds

# === Print best weights and metrics
print("\n Best F1 Score (Spanish Ensemble):")
print(f"Weights → DistilRoBERTa: {best_weights[0]:.2f}, BERT: {best_weights[1]:.2f}, RoBERTa: {best_weights[2]:.2f}")
print(f"F1 Score: {best_f1:.4f}")

precision, recall, _, _ = precision_recall_fscore_support(true_labels, best_preds, average='binary')
accuracy = accuracy_score(true_labels, best_preds)

print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print("\n📄 Classification Report:")
print(classification_report(true_labels, best_preds, target_names=["non-sexist", "sexist"]))

# === Save predictions for PyEvALL
output = []
for tweet_id, pred in zip(df['id'], best_preds):
    output.append({
        "test_case": "EXIST2025",
        "id": str(tweet_id),
        "value": "YES" if pred == 1 else "NO"
    })

output_sorted = sorted(output, key=lambda x: int(x["id"]))

with open("spanish_ensemble_predictions.json", "w", encoding="utf-8") as f:
    json.dump(output_sorted, f, indent=2, ensure_ascii=False)

print("✅ Spanish ensemble predictions saved for PyEvALL: 'spanish_ensemble_predictions.json'")

✅ Total Spanish dev samples with gold labels: 490
🔍 Grid searching for best weights (Spanish)...


1331it [00:00, 9682.30it/s]


 Best F1 Score (Spanish Ensemble):
Weights → DistilRoBERTa: 0.20, BERT: 0.30, RoBERTa: 0.50
F1 Score: 0.8402
Accuracy:  0.8347
Precision: 0.8659
Recall:    0.8161

📄 Classification Report:
              precision    recall  f1-score   support

  non-sexist       0.80      0.86      0.83       229
      sexist       0.87      0.82      0.84       261

    accuracy                           0.83       490
   macro avg       0.83      0.84      0.83       490
weighted avg       0.84      0.83      0.83       490

✅ Spanish ensemble predictions saved for PyEvALL: 'spanish_ensemble_predictions.json'





In [None]:
##Testing models to see which one performs the best ensemble spanish

In [5]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import json
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report
)

# === Load Spanish Dev Set ===
df = pd.read_json("EXIST2025_dev_cleaned_es.json").T
df = df[df['lang'] == 'es']

with open("/kaggle/input/gold-hard-dev/EXIST2025_dev_task1_1_gold_hard (1).json", "r", encoding="utf-8") as f:
    gold = json.load(f)

label_map = {entry["id"]: 1 if entry["value"] == "YES" else 0 for entry in gold}
df['label'] = df['id_EXIST'].map(label_map)
df = df.dropna(subset=['label'])
df['label'] = df['label'].astype(int)
df = df[['id_EXIST', 'tweet', 'label']].rename(columns={'id_EXIST': 'id', 'tweet': 'text'})

print(f"✅ Total gold-labeled Spanish dev tweets: {len(df)}")

# === Define model paths and names ===
models_to_evaluate = {
    # "DistilRoBERTa (ES)": "distilroberta-base_es_sexism_classifier",
    # "BERT-base (ES)": "bert-base-uncased_es_sexism_classifier",
    "Roberta-BNE (ES)": "roberta-bne_es_sexism_classifier",
    "BERT (ES)": "bert-spanish_es_sexism_classifier",
    "XLM-RoBERTa (ES)": "xlm-roberta-base_es_sexism_classifier",
}

# === Evaluate each model
for model_name, model_path in models_to_evaluate.items():
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    model.eval()

    preds = []
    for text in df['text']:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            logits = model(**inputs).logits
            pred = torch.argmax(logits, dim=1).item()
            preds.append(pred)

    true_labels = df['label'].tolist()
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, preds, average='binary')
    accuracy = accuracy_score(true_labels, preds)

    print(f"\n🔍 Evaluation for {model_name}")
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1 Score:  {f1:.4f}")
    print("\n📄 Classification Report:")
    print(classification_report(true_labels, preds, target_names=["non-sexist", "sexist"]))


✅ Total gold-labeled Spanish dev tweets: 490

🔍 Evaluation for DistilRoBERTa (ES)
Accuracy:  0.7510
Precision: 0.7405
Recall:    0.8199
F1 Score:  0.7782

📄 Classification Report:
              precision    recall  f1-score   support

  non-sexist       0.77      0.67      0.72       229
      sexist       0.74      0.82      0.78       261

    accuracy                           0.75       490
   macro avg       0.75      0.75      0.75       490
weighted avg       0.75      0.75      0.75       490


🔍 Evaluation for BERT (ES)
Accuracy:  0.7980
Precision: 0.8716
Recall:    0.7280
F1 Score:  0.7933

📄 Classification Report:
              precision    recall  f1-score   support

  non-sexist       0.74      0.88      0.80       229
      sexist       0.87      0.73      0.79       261

    accuracy                           0.80       490
   macro avg       0.81      0.80      0.80       490
weighted avg       0.81      0.80      0.80       490


🔍 Evaluation for XLM-RoBERTa (ES)
Accur

In [10]:
pip install PyEvALL

Collecting PyEvALL
  Downloading PyEvALL-0.1.78.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jsbeautifier==1.14.9 (from PyEvALL)
  Downloading jsbeautifier-1.14.9.tar.gz (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting setuptools==69.5.1 (from PyEvALL)
  Downloading setuptools-69.5.1-py3-none-any.whl.metadata (6.2 kB)
Collecting editorconfig>=0.12.2 (from jsbeautifier==1.14.9->PyEvALL)
  Downloading EditorConfig-0.17.0-py3-none-any.whl.metadata (3.8 kB)
Downloading setuptools-69.5.1-py3-none-any.whl (894 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m894.6/894.6 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading EditorConfig-0.17.0-py3-none-any.whl 

In [None]:
##ICM SCORES for ensemble spanish 

In [8]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils

evaluator = PyEvALLEvaluation()
params = {PyEvALLUtils.PARAM_REPORT: PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED}
metrics = ["ICM", "ICMNorm", "FMeasure"]

report = evaluator.evaluate("spanish_ensemble_predictions.json", "/kaggle/input/gold-hard-dev/EXIST2025_dev_task1_1_gold_hard (1).json", metrics, **params)
report.print_report()


2025-05-08 09:09:26,044 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM', 'ICMNorm', 'FMeasure']
2025-05-08 09:09:26,158 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-08 09:09:26,500 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Normalized evaluation method
2025-05-08 09:09:26,504 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-08 09:09:26,854 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-08 09:09:27,264 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method
cargado 29
{
  "metrics": {
    "ICM": {
      "name": "Information Contrast model",
      "acronym": "ICM",
      "description": "Coming soon!",
      "status": "OK",
      "results": {
        "test_cases": [{
          "name": "EXIST2025",
          "average

In [None]:
# Trying AEDA+ensemble for spanish

In [None]:
import json
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import random

# === AEDA Function ===
PUNCTUATIONS = ['.', ',', '!', '?', ';', ':']

def aeda(sentence, punc_ratio=0.3, max_insert=3):
    words = sentence.split()
    n = len(words)
    num_puncs = min(max_insert, max(1, int(punc_ratio * n)))

    insert_positions = random.sample(range(n), num_puncs)
    for pos in insert_positions:
        punct = random.choice(PUNCTUATIONS)
        words[pos] = words[pos] + punct
    return ' '.join(words)

# === Step 1: Prepare gold-labeled Spanish training data ===
def prepare_gold_labeled_training_data(cleaned_path, gold_path, output_path):
    with open(cleaned_path, "r", encoding="utf-8") as f:
        cleaned_data = json.load(f)

    with open(gold_path, "r", encoding="utf-8") as f:
        gold_labels = json.load(f)

    label_map = {entry["id"]: 1 if entry["value"] == "YES" else 0 for entry in gold_labels}

    updated_data = {}
    for tweet_id, tweet in cleaned_data.items():
        tweet = tweet.copy()
        id_ = tweet.get("id_EXIST")
        if tweet.get("lang") == "es" and id_ in label_map:
            tweet["label"] = label_map[id_]
            updated_data[tweet_id] = tweet

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(updated_data, f, indent=2, ensure_ascii=False)

    print(f"✅ Gold-labeled Spanish training data saved to: {output_path}")

# === Step 2: Generic training function with AEDA ===
def train_model(json_path, model_checkpoint, save_name, use_aeda=True):
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

    df = pd.read_json(json_path)
    df = df.T
    df = df[df['lang'] == 'es']
    df = df.dropna(subset=['label'])
    df = df[['tweet', 'label']].rename(columns={'tweet': 'text'})
    df = df.sample(frac=1, random_state=42)

    # Apply AEDA augmentation
    if use_aeda:
        augmented_rows = []
        for _, row in df.iterrows():
            aug_text = aeda(row['text'])
            augmented_rows.append({'text': aug_text, 'label': row['label']})
        aug_df = pd.DataFrame(augmented_rows)
        df = pd.concat([df, aug_df], ignore_index=True)
        df = df.sample(frac=1, random_state=42)  # shuffle again

    dataset = Dataset.from_pandas(df)
    splits = dataset.train_test_split(test_size=0.1)

    def tokenize_fn(example):
        return tokenizer(example['text'], truncation=True, padding='max_length', max_length=256)

    train_ds = splits['train'].map(tokenize_fn, batched=True)
    val_ds = splits['test'].map(tokenize_fn, batched=True)

    training_args = TrainingArguments(
        output_dir=f"results/{save_name}_es",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir=f"logs/{save_name}_es",
        logging_steps=50,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        save_total_limit=1,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
    )

    trainer.train()

    model.save_pretrained(f"{save_name}_es_sexism_classifier_aeda")
    tokenizer.save_pretrained(f"{save_name}_es_sexism_classifier_aeda")
    print(f"✅ Model saved: {save_name}_es_sexism_classifier_aeda")

# === Step 3: Run everything ===
prepare_gold_labeled_training_data(
    "/kaggle/input/translated/EXIST2025_training_translated_es.json",
    "/kaggle/input/gold-hard/EXIST2025_training_task1_1_gold_hard.json",
    "EXIST2025_training_with_gold_es.json"
)

# Train all 3 Spanish models with AEDA
train_model("EXIST2025_training_with_gold_es.json", "PlanTL-GOB-ES/roberta-base-bne", "roberta-bne")
train_model("EXIST2025_training_with_gold_es.json", "dccuchile/bert-base-spanish-wwm-cased", "bert-spanish")
train_model("EXIST2025_training_with_gold_es.json", "xlm-roberta-base", "xlm-roberta-base")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import json
import numpy as np
from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score, classification_report
from itertools import product
from tqdm import tqdm

# === Load Spanish Dev Set ===
df = pd.read_json("EXIST2025_dev_cleaned_es.json").T
df = df[df['lang'] == 'es']

with open("/kaggle/input/gold-hard-dev/EXIST2025_dev_task1_1_gold_hard (1).json", "r", encoding="utf-8") as f:
    gold = json.load(f)

label_map = {entry["id"]: 1 if entry["value"] == "YES" else 0 for entry in gold}
df['label'] = df['id_EXIST'].map(label_map)
df = df.dropna(subset=['label'])
df['label'] = df['label'].astype(int)
df = df[['id_EXIST', 'tweet', 'label']].rename(columns={'id_EXIST': 'id', 'tweet': 'text'})

print(f"✅ Total Spanish dev samples with gold labels: {len(df)}")

# === Load all 3 trained Spanish models ===
model_paths = [
    # "distilroberta-base_es_sexism_classifier",
     "roberta-bne_es_sexism_classifier_aeda",
    # "bert-base-uncased_es_sexism_classifier",
    "bert-spanish_es_sexism_classifier_aeda",
    "xlm-roberta-base_es_sexism_classifier_aeda"
]

all_model_probs = []
for path in model_paths:
    tokenizer = AutoTokenizer.from_pretrained(path)
    model = AutoModelForSequenceClassification.from_pretrained(path)
    model.eval()

    probs = []
    for text in df['text']:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            logits = model(**inputs).logits
            prob = torch.softmax(logits, dim=1).squeeze(0).cpu().numpy()
            probs.append(prob)

    all_model_probs.append(np.array(probs))  # shape: [num_samples, 2]

# === Grid search to find best weights ===
all_model_probs = np.array(all_model_probs)  # shape: [3, num_samples, 2]
true_labels = np.array(df['label'].tolist())

search_space = np.arange(0.0, 1.1, 0.1)
best_f1 = 0
best_weights = None
best_preds = None

print("🔍 Grid searching for best weights (Spanish)...")
for w1, w2, w3 in tqdm(product(search_space, repeat=3)):
    weights = np.array([w1, w2, w3])
    if np.isclose(weights.sum(), 1.0):
        weighted_probs = np.average(all_model_probs, axis=0, weights=weights)
        preds = np.argmax(weighted_probs, axis=1)
        f1 = f1_score(true_labels, preds, average='binary')
        if f1 > best_f1:
            best_f1 = f1
            best_weights = weights
            best_preds = preds

# === Print best weights and metrics
print("\n Best F1 Score (Spanish Ensemble):")
print(f"Weights → DistilRoBERTa: {best_weights[0]:.2f}, BERT: {best_weights[1]:.2f}, RoBERTa: {best_weights[2]:.2f}")
print(f"F1 Score: {best_f1:.4f}")

precision, recall, _, _ = precision_recall_fscore_support(true_labels, best_preds, average='binary')
accuracy = accuracy_score(true_labels, best_preds)

print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print("\n📄 Classification Report:")
print(classification_report(true_labels, best_preds, target_names=["non-sexist", "sexist"]))

# === Save predictions for PyEvALL
output = []
for tweet_id, pred in zip(df['id'], best_preds):
    output.append({
        "test_case": "EXIST2025",
        "id": str(tweet_id),
        "value": "YES" if pred == 1 else "NO"
    })

output_sorted = sorted(output, key=lambda x: int(x["id"]))

with open("spanish_ensemble_predictions.json", "w", encoding="utf-8") as f:
    json.dump(output_sorted, f, indent=2, ensure_ascii=False)

print("✅ Spanish ensemble predictions saved for PyEvALL: 'spanish_ensemble_predictions.json'")

In [None]:
## Results for AEDA+Ensemble spanish

In [8]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils

evaluator = PyEvALLEvaluation()
params = {PyEvALLUtils.PARAM_REPORT: PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED}
metrics = ["ICM", "ICMNorm", "FMeasure"]

report = evaluator.evaluate("/kaggle/input/ensemble-aeda/spanish_ensemble_predictions_aeda.json", "/kaggle/input/gold-hard-dev/EXIST2025_dev_task1_1_gold_hard (1).json", metrics, **params)
report.print_report()

2025-05-08 17:28:18,749 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM', 'ICMNorm', 'FMeasure']
2025-05-08 17:28:18,843 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-08 17:28:19,237 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Normalized evaluation method
2025-05-08 17:28:19,239 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-08 17:28:19,627 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-08 17:28:20,014 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method
{
  "metrics": {
    "ICM": {
      "name": "Information Contrast model",
      "acronym": "ICM",
      "description": "Coming soon!",
      "status": "OK",
      "results": {
        "test_cases": [{
          "name": "EXIST2025",
          "average": -0.18939

In [None]:
## Final Score for english and spanish just for ensemble

In [9]:
import json

# === Load English & Spanish predictions ===
with open("/kaggle/input/ensemble-final-preds/ensemble_predictions_output (2).json", "r", encoding="utf-8") as f:
    en_preds = json.load(f)

with open("/kaggle/input/ensemble-final-preds/spanish_ensemble_predictions (3).json", "r", encoding="utf-8") as f:
    es_preds = json.load(f)

# === Merge and sort by ID ===
combined_preds = en_preds + es_preds
combined_preds_sorted = sorted(combined_preds, key=lambda x: int(x["id"]))

# === Save to a single file ===
with open("combined_ensemble_predictions.json", "w", encoding="utf-8") as f:
    json.dump(combined_preds_sorted, f, indent=2, ensure_ascii=False)

print("✅ Combined predictions saved as 'combined_ensemble_predictions.json'")


✅ Combined predictions saved as 'combined_ensemble_predictions.json'


In [10]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils

evaluator = PyEvALLEvaluation()
params = {PyEvALLUtils.PARAM_REPORT: PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED}
metrics = ["ICM", "ICMNorm", "FMeasure"]

report = evaluator.evaluate(
    "combined_ensemble_predictions.json",
    "/kaggle/input/gold-hard-dev/EXIST2025_dev_task1_1_gold_hard (1).json",
    metrics,
    **params
)
report.print_report()

2025-05-08 09:23:32,788 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM', 'ICMNorm', 'FMeasure']
2025-05-08 09:23:32,903 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-08 09:23:33,311 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Normalized evaluation method
2025-05-08 09:23:33,314 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-08 09:23:33,750 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-08 09:23:34,178 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method
{
  "metrics": {
    "ICM": {
      "name": "Information Contrast model",
      "acronym": "ICM",
      "description": "Coming soon!",
      "status": "OK",
      "results": {
        "test_cases": [{
          "name": "EXIST2025",
          "average": 0.531855

In [None]:
## Final Score for english and spanish for ensemble + AEDA (Better than just ensemble)

In [10]:
import json

# === Load English & Spanish predictions ===
with open("/kaggle/input/ensemble-aeda/ensemble_predictions_output_aeda.json", "r", encoding="utf-8") as f:
    en_preds = json.load(f)

with open("/kaggle/input/ensemble-aeda/spanish_ensemble_predictions_aeda.json", "r", encoding="utf-8") as f:
    es_preds = json.load(f)

# === Merge and sort by ID ===
combined_preds = en_preds + es_preds
combined_preds_sorted = sorted(combined_preds, key=lambda x: int(x["id"]))

# === Save to a single file ===
with open("combined_ensemble_predictions_aeda.json", "w", encoding="utf-8") as f:
    json.dump(combined_preds_sorted, f, indent=2, ensure_ascii=False)

print("✅ Combined predictions saved as 'combined_ensemble_predictions.json'")

✅ Combined predictions saved as 'combined_ensemble_predictions.json'


In [11]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils

evaluator = PyEvALLEvaluation()
params = {PyEvALLUtils.PARAM_REPORT: PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED}
metrics = ["ICM", "ICMNorm", "FMeasure"]

report = evaluator.evaluate(
    "combined_ensemble_predictions_aeda.json",
    "/kaggle/input/gold-hard-dev/EXIST2025_dev_task1_1_gold_hard (1).json",
    metrics,
    **params
)
report.print_report()

2025-05-08 17:30:16,176 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM', 'ICMNorm', 'FMeasure']
2025-05-08 17:30:16,286 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-08 17:30:16,696 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Normalized evaluation method
2025-05-08 17:30:16,698 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-08 17:30:17,108 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-08 17:30:17,503 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method
{
  "metrics": {
    "ICM": {
      "name": "Information Contrast model",
      "acronym": "ICM",
      "description": "Coming soon!",
      "status": "OK",
      "results": {
        "test_cases": [{
          "name": "EXIST2025",
          "average": 0.554787

In [None]:
### Final Code after all the modifications

In [None]:
#Training aeda+ensemble for english (Custom training and dev sets used instead of splitting)

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import Dataset
import pandas as pd
import numpy as np
import json
import os

# === AEDA Function (stub) ===
def aeda(text):
    # Replace this with your actual AEDA function
    puncts = [';', ':', '!', '?', ',', '.', '。', '،']
    words = text.split()
    n_insertions = max(1, int(0.1 * len(words)))  # Insert punctuation into ~10% of words
    for _ in range(n_insertions):
        idx = np.random.randint(0, len(words))
        punct = np.random.choice(puncts)
        words.insert(idx, punct)
    return " ".join(words)

# === Prepare Gold Labels ===
def prepare_gold_dataset(clean_path, gold_path, output_path):
    with open(clean_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    with open(gold_path, "r", encoding="utf-8") as f:
        gold_labels = json.load(f)
    label_dict = {entry["id"]: 1 if entry["value"] == "YES" else 0 for entry in gold_labels}
    updated_data = {}
    for tweet_id, tweet_info in data.items():
        tweet_info = tweet_info.copy()
        gold_id = tweet_info.get("id_EXIST")
        if gold_id in label_dict:
            tweet_info["label"] = label_dict[gold_id]
        updated_data[tweet_id] = tweet_info
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(updated_data, f, indent=2, ensure_ascii=False)
    print(f" Gold-labeled dataset saved to {output_path}")

# === Dataset Loader (for separate train/dev) ===
def load_and_tokenize_dataset(json_path, tokenizer, max_length=256, apply_aeda=False):
    df = pd.read_json(json_path).T
    df = df[df['lang'] == 'en']
    df = df.dropna(subset=['label'])
    df = df[['tweet', 'label']].rename(columns={'tweet': 'text'})

    if apply_aeda:
        augmented_rows = []
        for _, row in df.iterrows():
            aug_text = aeda(row['text'])
            augmented_rows.append({'text': aug_text, 'label': row['label']})
        aug_df = pd.DataFrame(augmented_rows)
        df = pd.concat([df, aug_df], ignore_index=True)

    df = df.sample(frac=1, random_state=42)
    dataset = Dataset.from_pandas(df)

    def preprocess(example):
        return tokenizer(example['text'], truncation=True, padding='max_length', max_length=max_length)

    return dataset.map(preprocess, batched=True)

# === Model Training ===
def train_model(train_json_path, val_json_path, model_checkpoint, save_name, use_aeda=False):
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

    train_ds = load_and_tokenize_dataset(train_json_path, tokenizer, apply_aeda=use_aeda)
    val_ds = load_and_tokenize_dataset(val_json_path, tokenizer, apply_aeda=False)

    training_args = TrainingArguments(
        output_dir=f"results/{save_name}",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir=f"logs/{save_name}",
        logging_steps=50,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
    )

    trainer.train()
    suffix = "_aeda" if use_aeda else ""
    model.save_pretrained(f"{save_name}_sexism_classifier{suffix}")
    tokenizer.save_pretrained(f"{save_name}_sexism_classifier{suffix}")
    print(f" Model saved to {save_name}_sexism_classifier{suffix}")

# === Run Pipeline ===
prepare_gold_dataset(
    clean_path="/kaggle/input/translated/EXIST2025_training_translated_en.json",
    gold_path="/kaggle/input/gold-hard/EXIST2025_training_task1_1_gold_hard.json",
    output_path="EXIST2025_training_with_gold.json"
)

prepare_gold_dataset(
    clean_path="/kaggle/input/dev-tanslated/EXIST2025_dev_translated_en.json",
    gold_path="/kaggle/input/gold-hard-dev/EXIST2025_dev_task1_1_gold_hard (1).json",
    output_path="EXIST2025_dev_with_gold.json"
)

model_list = ["distilroberta-base", "bert-base-uncased", "roberta-base"]

for model_checkpoint in model_list:
    save_name = model_checkpoint.replace("/", "-")
    train_model(
        train_json_path="EXIST2025_training_with_gold.json",
        val_json_path="EXIST2025_dev_with_gold.json",
        model_checkpoint=model_checkpoint,
        save_name=save_name,
        use_aeda=True
    )


 Gold-labeled dataset saved to EXIST2025_training_with_gold.json
 Gold-labeled dataset saved to EXIST2025_dev_with_gold.json


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/5740 [00:00<?, ? examples/s]

Map:   0%|          | 0/444 [00:00<?, ? examples/s]

  trainer = Trainer(




Epoch,Training Loss,Validation Loss
1,0.348,0.405793
2,0.1993,0.58574
3,0.066,0.785042




 Model saved to distilroberta-base_sexism_classifier_aeda


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/5740 [00:00<?, ? examples/s]

Map:   0%|          | 0/444 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.3196,0.353956
2,0.0952,0.760569
3,0.0158,0.819183




 Model saved to bert-base-uncased_sexism_classifier_aeda


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/5740 [00:00<?, ? examples/s]

Map:   0%|          | 0/444 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.3757,0.375355
2,0.223,0.505997
3,0.0816,0.658462




 Model saved to roberta-base_sexism_classifier_aeda


In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import json
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
from scipy.optimize import minimize

# === Load Gold-Labeled Dev Set ===
df = pd.read_json("EXIST2025_dev_with_gold.json").T
df = df[df['lang'] == 'en']
df = df.dropna(subset=['label'])
df['label'] = df['label'].astype(int)
df = df[['id_EXIST', 'tweet', 'label']].rename(columns={'id_EXIST': 'id', 'tweet': 'text'})

print(f" Total gold-labeled dev tweets: {len(df)}")

# === Ensemble models ===
model_paths = [
    "distilroberta-base_sexism_classifier_aeda",
    "bert-base-uncased_sexism_classifier_aeda",
    "roberta-base_sexism_classifier_aeda"
]

all_model_probs = []

# === Collect probabilities from each model ===
for path in model_paths:
    tokenizer = AutoTokenizer.from_pretrained(path)
    model = AutoModelForSequenceClassification.from_pretrained(path)
    model.eval()

    model_probs = []
    for text in df['text']:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            logits = model(**inputs).logits
            probs = torch.softmax(logits, dim=1).squeeze(0).cpu().numpy()
            model_probs.append(probs)

    all_model_probs.append(np.array(model_probs))  # shape: [num_samples, 2]

all_model_probs = np.array(all_model_probs)  # shape: [num_models, num_samples, 2]
true_labels = np.array(df['label'].tolist())

# === Optimize weights using F1 score ===
def f1_objective(weights):
    weighted_probs = np.average(all_model_probs, axis=0, weights=weights)
    preds = np.argmax(weighted_probs, axis=1)
    f1 = precision_recall_fscore_support(true_labels, preds, average='binary')[2]
    return 1 - f1  # because we want to maximize F1

# Constraint: sum of weights = 1
constraints = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1}

# Bounds: all weights between 0 and 1
bounds = [(0, 1)] * len(model_paths)

# Initial guess: equal weights
initial_weights = np.ones(len(model_paths)) / len(model_paths)

# Run optimization
result = minimize(f1_objective, initial_weights, bounds=bounds, constraints=constraints)
best_weights = result.x
print(f"\n Optimal Weights Found: {best_weights.round(3)}")

# === Weighted soft voting with optimized weights ===
weighted_probs = np.average(all_model_probs, axis=0, weights=best_weights)
ensemble_preds = np.argmax(weighted_probs, axis=1)

# === Evaluation ===
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, ensemble_preds, average='binary')
accuracy = accuracy_score(true_labels, ensemble_preds)

print("\n Ensemble Evaluation on GOLD Dev Set (Soft Voting, Optimized Weights):")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")
print("\n Classification Report:")
print(classification_report(true_labels, ensemble_preds, target_names=["non-sexist", "sexist"]))

# === Save Output for PyEvALL ===
output = []
for tweet_id, pred in zip(df['id'], ensemble_preds):
    output.append({
        "test_case": "EXIST2025",
        "id": str(tweet_id),
        "value": "YES" if pred == 1 else "NO"
    })

output_sorted = sorted(output, key=lambda x: int(x["id"]))

with open("ensemble_predictions_output.json", "w", encoding="utf-8") as f:
    json.dump(output_sorted, f, indent=2, ensure_ascii=False)

print("Predictions saved for PyEvALL evaluation")

 Total gold-labeled dev tweets: 444

 Optimal Weights Found: [0.333 0.333 0.333]

 Ensemble Evaluation on GOLD Dev Set (Soft Voting, Optimized Weights):
Accuracy:  0.8649
Precision: 0.8350
Recall:    0.8608
F1 Score:  0.8477

 Classification Report:
              precision    recall  f1-score   support

  non-sexist       0.89      0.87      0.88       250
      sexist       0.83      0.86      0.85       194

    accuracy                           0.86       444
   macro avg       0.86      0.86      0.86       444
weighted avg       0.87      0.86      0.87       444

Predictions saved for PyEvALL evaluation


In [9]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils

predictions = "ensemble_predictions_output.json"
gold = "/kaggle/input/gold-hard-dev/EXIST2025_dev_task1_1_gold_hard (1).json"

# Initialize evaluator
evaluator = PyEvALLEvaluation()

# Set parameters
params = {
    PyEvALLUtils.PARAM_REPORT: PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED
}

# Choose metrics (ICM for hard labels)
metrics = ["ICM", "ICMNorm", "FMeasure"]  # You can also try ICMSoft for soft scores

# Run evaluation
report = evaluator.evaluate(predictions, gold, metrics, **params)
report.print_report()

2025-05-10 15:01:07,143 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM', 'ICMNorm', 'FMeasure']
2025-05-10 15:01:07,260 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-10 15:01:07,654 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Normalized evaluation method
2025-05-10 15:01:07,657 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-10 15:01:08,010 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-10 15:01:08,422 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method
{
  "metrics": {
    "ICM": {
      "name": "Information Contrast model",
      "acronym": "ICM",
      "description": "Coming soon!",
      "status": "OK",
      "results": {
        "test_cases": [{
          "name": "EXIST2025",
          "average": -0.24534

In [None]:
#Training aeda+ensemble for spanish (Custom training and dev sets used instead of splitting)

In [5]:
import json
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import random

# === AEDA Function ===
PUNCTUATIONS = ['.', ',', '!', '?', ';', ':']

def aeda(sentence, punc_ratio=0.3, max_insert=3):
    words = sentence.split()
    n = len(words)
    num_puncs = min(max_insert, max(1, int(punc_ratio * n)))

    insert_positions = random.sample(range(n), num_puncs)
    for pos in insert_positions:
        punct = random.choice(PUNCTUATIONS)
        words[pos] = words[pos] + punct
    return ' '.join(words)

# === Gold Label Merger ===
def prepare_gold_labeled_data(cleaned_path, gold_path, output_path, lang='es'):
    with open(cleaned_path, "r", encoding="utf-8") as f:
        cleaned_data = json.load(f)
    with open(gold_path, "r", encoding="utf-8") as f:
        gold_labels = json.load(f)

    label_map = {entry["id"]: 1 if entry["value"] == "YES" else 0 for entry in gold_labels}
    updated_data = {}

    for tweet_id, tweet in cleaned_data.items():
        tweet = tweet.copy()
        id_ = tweet.get("id_EXIST")
        if tweet.get("lang") == lang and id_ in label_map:
            tweet["label"] = label_map[id_]
            updated_data[tweet_id] = tweet

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(updated_data, f, indent=2, ensure_ascii=False)
    print(f" Gold-labeled {lang.upper()} data saved to: {output_path}")

# === Load & Tokenize Spanish Dataset ===
def load_and_tokenize_dataset(json_path, tokenizer, apply_aeda=False):
    df = pd.read_json(json_path).T
    df = df[df['lang'] == 'es']
    df = df.dropna(subset=['label'])
    df = df[['tweet', 'label']].rename(columns={'tweet': 'text'})

    if apply_aeda:
        augmented_rows = []
        for _, row in df.iterrows():
            aug_text = aeda(row['text'])
            augmented_rows.append({'text': aug_text, 'label': row['label']})
        aug_df = pd.DataFrame(augmented_rows)
        df = pd.concat([df, aug_df], ignore_index=True)
        df = df.sample(frac=1, random_state=42)

    dataset = Dataset.from_pandas(df)
    return dataset.map(lambda x: tokenizer(x['text'], truncation=True, padding='max_length', max_length=256), batched=True)

# === Train Spanish Model ===
def train_model(train_json_path, val_json_path, model_checkpoint, save_name, use_aeda=True):
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

    train_ds = load_and_tokenize_dataset(train_json_path, tokenizer, apply_aeda=use_aeda)
    val_ds = load_and_tokenize_dataset(val_json_path, tokenizer, apply_aeda=False)

    training_args = TrainingArguments(
        output_dir=f"results/{save_name}_es",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir=f"logs/{save_name}_es",
        logging_steps=50,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        save_total_limit=1,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
    )

    trainer.train()

    model.save_pretrained(f"{save_name}_es_sexism_classifier_aeda")
    tokenizer.save_pretrained(f"{save_name}_es_sexism_classifier_aeda")
    print(f" Model saved: {save_name}_es_sexism_classifier_aeda")

# === Run Everything ===

# Step 1: Prepare gold-labeled train and dev sets
prepare_gold_labeled_data(
    "/kaggle/input/translated/EXIST2025_training_translated_es.json",
    "/kaggle/input/gold-hard/EXIST2025_training_task1_1_gold_hard.json",
    "EXIST2025_training_with_gold_es.json"
)

prepare_gold_labeled_data(
    "/kaggle/input/dev-tanslated/EXIST2025_dev_translated_es.json",
    "/kaggle/input/gold-hard-dev/EXIST2025_dev_task1_1_gold_hard (1).json",
    "EXIST2025_dev_with_gold_es.json"
)

# Step 2: Train models
model_list = [
    ("PlanTL-GOB-ES/roberta-base-bne", "roberta-bne"),
    ("dccuchile/bert-base-spanish-wwm-cased", "bert-spanish"),
    ("xlm-roberta-base", "xlm-roberta-base")
]

for checkpoint, name in model_list:
    train_model(
        train_json_path="EXIST2025_training_with_gold_es.json",
        val_json_path="EXIST2025_dev_with_gold_es.json",
        model_checkpoint=checkpoint,
        save_name=name,
        use_aeda=True
    )


 Gold-labeled ES data saved to: EXIST2025_training_with_gold_es.json
 Gold-labeled ES data saved to: EXIST2025_dev_with_gold_es.json


tokenizer_config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/851k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/509k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.21M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/957 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at PlanTL-GOB-ES/roberta-base-bne and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/6388 [00:00<?, ? examples/s]

Map:   0%|          | 0/490 [00:00<?, ? examples/s]

  trainer = Trainer(




Epoch,Training Loss,Validation Loss
1,0.2526,0.456768
2,0.0627,0.700236
3,0.0007,0.863545




 Model saved: roberta-bne_es_sexism_classifier_aeda


tokenizer_config.json:   0%|          | 0.00/364 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/480k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/6388 [00:00<?, ? examples/s]

Map:   0%|          | 0/490 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.26,0.586947
2,0.0658,0.976499
3,0.004,1.075608




 Model saved: bert-spanish_es_sexism_classifier_aeda


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/6388 [00:00<?, ? examples/s]

Map:   0%|          | 0/490 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.5097,0.544351
2,0.3286,0.47176
3,0.2336,0.609794




 Model saved: xlm-roberta-base_es_sexism_classifier_aeda


In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
import json
import numpy as np
from sklearn.metrics import f1_score, precision_recall_fscore_support, accuracy_score, classification_report
from itertools import product
from tqdm import tqdm

# === Load gold-labeled Spanish dev set ===
df = pd.read_json("EXIST2025_dev_with_gold_es.json").T
df = df[df['lang'] == 'es']
df = df.dropna(subset=['label'])
df['label'] = df['label'].astype(int)
df = df[['id_EXIST', 'tweet', 'label']].rename(columns={'id_EXIST': 'id', 'tweet': 'text'})

print(f"Total Spanish dev samples with gold labels: {len(df)}")

# === Load trained Spanish models ===
model_paths = [
    "roberta-bne_es_sexism_classifier_aeda",
    "bert-spanish_es_sexism_classifier_aeda",
    "xlm-roberta-base_es_sexism_classifier_aeda"
]

model_labels = ["RoBERTa-bne", "BERT-Spanish", "XLM-RoBERTa"]

all_model_probs = []
for path in model_paths:
    tokenizer = AutoTokenizer.from_pretrained(path)
    model = AutoModelForSequenceClassification.from_pretrained(path)
    model.eval()

    probs = []
    for text in df['text']:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        with torch.no_grad():
            logits = model(**inputs).logits
            prob = torch.softmax(logits, dim=1).squeeze(0).cpu().numpy()
            probs.append(prob)

    all_model_probs.append(np.array(probs))  # shape: [num_samples, 2]

# === Grid search for best weights ===
all_model_probs = np.array(all_model_probs)  # shape: [3, num_samples, 2]
true_labels = np.array(df['label'].tolist())

search_space = np.arange(0.0, 1.1, 0.1)
best_f1 = 0
best_weights = None
best_preds = None

print("Grid searching for best weights (Spanish)...")
for w1, w2, w3 in tqdm(product(search_space, repeat=3)):
    weights = np.array([w1, w2, w3])
    if np.isclose(weights.sum(), 1.0):
        weighted_probs = np.average(all_model_probs, axis=0, weights=weights)
        preds = np.argmax(weighted_probs, axis=1)
        f1 = f1_score(true_labels, preds, average='binary')
        if f1 > best_f1:
            best_f1 = f1
            best_weights = weights
            best_preds = preds

# === Print best weights and evaluation metrics ===
print("\nBest Ensemble F1 Score (Spanish):")
for label, weight in zip(model_labels, best_weights):
    print(f"  {label}: {weight:.2f}")
print(f"\nF1 Score:  {best_f1:.4f}")

precision, recall, _, _ = precision_recall_fscore_support(true_labels, best_preds, average='binary')
accuracy = accuracy_score(true_labels, best_preds)

print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print("\nClassification Report:")
print(classification_report(true_labels, best_preds, target_names=["non-sexist", "sexist"]))

# === Save predictions for PyEvALL ===
output = []
for tweet_id, pred in zip(df['id'], best_preds):
    output.append({
        "test_case": "EXIST2025",
        "id": str(tweet_id),
        "value": "YES" if pred == 1 else "NO"
    })

output_sorted = sorted(output, key=lambda x: int(x["id"]))

with open("spanish_ensemble_predictions.json", "w", encoding="utf-8") as f:
    json.dump(output_sorted, f, indent=2, ensure_ascii=False)

print("Predictions saved for PyEvALL: 'spanish_ensemble_predictions.json'")

Total Spanish dev samples with gold labels: 490
Grid searching for best weights (Spanish)...


1331it [00:00, 8892.22it/s]


Best Ensemble F1 Score (Spanish):
  RoBERTa-bne: 0.30
  BERT-Spanish: 0.40
  XLM-RoBERTa: 0.30

F1 Score:  0.8642
Accuracy:  0.8551
Precision: 0.8626
Recall:    0.8659

Classification Report:
              precision    recall  f1-score   support

  non-sexist       0.85      0.84      0.84       229
      sexist       0.86      0.87      0.86       261

    accuracy                           0.86       490
   macro avg       0.85      0.85      0.85       490
weighted avg       0.86      0.86      0.86       490

Predictions saved for PyEvALL: 'spanish_ensemble_predictions.json'





In [9]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils

predictions = "spanish_ensemble_predictions.json"
gold = "/kaggle/input/gold-hard-dev/EXIST2025_dev_task1_1_gold_hard (1).json"

# Initialize evaluator
evaluator = PyEvALLEvaluation()

# Set parameters
params = {
    PyEvALLUtils.PARAM_REPORT: PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED
}

# Choose metrics (ICM for hard labels)
metrics = ["ICM", "ICMNorm", "FMeasure"]  # You can also try ICMSoft for soft scores

# Run evaluation
report = evaluator.evaluate(predictions, gold, metrics, **params)
report.print_report()

2025-05-10 15:54:21,381 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM', 'ICMNorm', 'FMeasure']
2025-05-10 15:54:21,509 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-10 15:54:21,882 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Normalized evaluation method
2025-05-10 15:54:21,885 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-10 15:54:22,267 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-10 15:54:22,727 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method
cargado 29
{
  "metrics": {
    "ICM": {
      "name": "Information Contrast model",
      "acronym": "ICM",
      "description": "Coming soon!",
      "status": "OK",
      "results": {
        "test_cases": [{
          "name": "EXIST2025",
          "average

In [3]:
import json

# === Load English & Spanish predictions ===
with open("/kaggle/input/final-predictions/ensemble_predictions_output_final.json", "r", encoding="utf-8") as f:
    en_preds = json.load(f)

with open("/kaggle/input/final-predictions/spanish_ensemble_predictions_final.json", "r", encoding="utf-8") as f:
    es_preds = json.load(f)

# === Merge and sort by ID ===
combined_preds = en_preds + es_preds
combined_preds_sorted = sorted(combined_preds, key=lambda x: int(x["id"]))

# === Save to a single file ===
with open("combined_ensemble_predictions_aeda.json", "w", encoding="utf-8") as f:
    json.dump(combined_preds_sorted, f, indent=2, ensure_ascii=False)

print(" Combined predictions saved as 'combined_ensemble_predictions.json'")

 Combined predictions saved as 'combined_ensemble_predictions.json'


In [None]:
#Final Results after all the modifications

In [12]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils

evaluator = PyEvALLEvaluation()
params = {PyEvALLUtils.PARAM_REPORT: PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED}
metrics = ["ICM", "ICMNorm", "FMeasure"]

report = evaluator.evaluate(
    "combined_ensemble_predictions_aeda.json",
    "/kaggle/input/gold-hard-dev/EXIST2025_dev_task1_1_gold_hard (1).json",
    metrics,
    **params
)
report.print_report()

2025-05-10 15:57:36,502 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM', 'ICMNorm', 'FMeasure']
2025-05-10 15:57:36,613 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-10 15:57:37,070 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Normalized evaluation method
2025-05-10 15:57:37,074 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-10 15:57:37,459 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-05-10 15:57:37,839 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method
{
  "metrics": {
    "ICM": {
      "name": "Information Contrast model",
      "acronym": "ICM",
      "description": "Coming soon!",
      "status": "OK",
      "results": {
        "test_cases": [{
          "name": "EXIST2025",
          "average": 0.578830