In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/exist2025/EXIST2025_training.json
/kaggle/input/exist2025/EXIST2025_dev.json
/kaggle/input/exist2025-all/EXIST2025_dev_task1_3_majority_class_soft.json
/kaggle/input/exist2025-all/EXIST2025_dev_task1_3_minority_class_hard.json
/kaggle/input/exist2025-all/EXIST2025_training_task1_3_minority_class_hard.json
/kaggle/input/exist2025-all/EXIST2025_training.json
/kaggle/input/exist2025-all/EXIST2025_dev_task1_3_minority_class_soft.json
/kaggle/input/exist2025-all/EXIST2025_training_task1_3_majority_class_soft.json
/kaggle/input/exist2025-all/EXIST2025_training_task1_3_majority_class_hard.json
/kaggle/input/exist2025-all/EXIST2025_training_task1_3_minority_class_soft.json
/kaggle/input/exist2025-all/EXIST2025_dev_task1_3_majority_class_hard.json
/kaggle/input/exist2025-all/EXIST2025_dev.json
/kaggle/input/exist2025-all/EXIST2025_dev_task1_3_gold_hard.json
/kaggle/input/exist2025-all/EXIST2025_training_task1_3_gold_soft.json
/kaggle/input/exist2025-all/EXIST2025_dev_task1_3_gold_

In [4]:
import wandb

wandb.login(key="0c5f368f1f51fd942ec7bb3a1c74efb7bdc832d6")


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmshoaibvohra[0m ([33mmshoaibvohra-habib-university[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [5]:
import json
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

# Load the dataset
with open("/kaggle/input/exist2025/EXIST2025_training.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Define correct label classes
CORRECT_LABELS = [
    "IDEOLOGICAL-INEQUALITY",
    "MISOGYNY-NON-SEXUAL-VIOLENCE",
    "OBJECTIFICATION",
    "SEXUAL-VIOLENCE",
    "STEREOTYPING-DOMINANCE",
    "NO"  # Represents non-sexist tweets (previously "-")
]

# Extract relevant fields
def process_data(data, lang):
    tweets = []
    labels = []
    ids = []

    for entry in data.values():
        if entry["lang"] == lang:
            tweet_id = entry["id_EXIST"]
            tweet = entry["tweet"]
            is_sexist = any(label == "YES" for label in entry["labels_task1_1"])  # Check if at least one annotator marked it sexist
            label = entry["labels_task1_3"] if is_sexist else [["NO"]]  # Non-sexist tweets get "NO"

            # Flatten labels
            processed_labels = [l if l != "-" else "NO" for sublist in label for l in sublist]

            # Remove "UNKNOWN"
            processed_labels = [l for l in processed_labels if l != "UNKNOWN"]

            # Ensure every tweet has at least one label
            if not processed_labels:
                processed_labels = ["NO"]

            tweets.append(tweet)
            labels.append(processed_labels)
            ids.append(tweet_id)

    return tweets, labels, ids

# Process data for English and Spanish
english_tweets, english_labels, english_ids = process_data(data, "en")
spanish_tweets, spanish_labels, spanish_ids = process_data(data, "es")

# MultiLabel Binarizer with Fixed Labels
mlb = MultiLabelBinarizer(classes=CORRECT_LABELS)  # Force correct label order
english_labels_bin = mlb.fit_transform(english_labels)
spanish_labels_bin = mlb.transform(spanish_labels)  # Use the same binarizer

label_classes = mlb.classes_
print(f"Corrected Label Classes: {label_classes}")  # Debugging

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Custom Dataset Class
class TweetDataset(Dataset):
    def __init__(self, texts, labels, ids, tokenizer, max_length=256):
        self.texts = texts
        self.labels = labels
        self.ids = ids
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tweet_id = self.ids[idx]
        labels = torch.tensor(self.labels[idx], dtype=torch.float)
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')

        return {
            "id": tweet_id,
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": labels
        }

# Split into train/test
def get_datasets(tweets, labels, ids):
    train_texts, val_texts, train_labels, val_labels, train_ids, val_ids = train_test_split(tweets, labels, ids, test_size=0.2, random_state=42)
    train_dataset = TweetDataset(train_texts, train_labels, train_ids, tokenizer)
    val_dataset = TweetDataset(val_texts, val_labels, val_ids, tokenizer)
    return train_dataset, val_dataset

train_dataset_en, val_dataset_en = get_datasets(english_tweets, english_labels_bin, english_ids)
train_dataset_es, val_dataset_es = get_datasets(spanish_tweets, spanish_labels_bin, spanish_ids)


Corrected Label Classes: ['IDEOLOGICAL-INEQUALITY' 'MISOGYNY-NON-SEXUAL-VIOLENCE' 'OBJECTIFICATION'
 'SEXUAL-VIOLENCE' 'STEREOTYPING-DOMINANCE' 'NO']


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [6]:
# EN model
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased", 
    num_labels=len(label_classes), 
    problem_type="multi_label_classification"
)

# Training Arguments
training_args = TrainingArguments(
    output_dir="./results/en",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_en,  # Change to train_dataset_es for Spanish
    eval_dataset=val_dataset_en,  # Change to val_dataset_es for Spanish
)

# Train the model
trainer.train()


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Epoch,Training Loss,Validation Loss
1,0.5381,0.548271
2,0.4799,0.500603
3,0.4241,0.485042
4,0.35,0.510568
5,0.3351,0.519737




TrainOutput(global_step=815, training_loss=0.44129459082714617, metrics={'train_runtime': 550.5446, 'train_samples_per_second': 23.686, 'train_steps_per_second': 1.48, 'total_flos': 1715545691504640.0, 'train_loss': 0.44129459082714617, 'epoch': 5.0})

In [7]:
# Es model
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased", 
    num_labels=len(label_classes), 
    problem_type="multi_label_classification"
)

# Training Arguments
training_args = TrainingArguments(
    output_dir="./results/es",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_es,  # Change to train_dataset_es for Spanish
    eval_dataset=val_dataset_es,  # Change to val_dataset_es for Spanish
)

# Train the model
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.5652,0.55968
2,0.5318,0.533694
3,0.4323,0.530696
4,0.377,0.535597
5,0.3335,0.543649




TrainOutput(global_step=915, training_loss=0.46197543209367764, metrics={'train_runtime': 616.4443, 'train_samples_per_second': 23.749, 'train_steps_per_second': 1.484, 'total_flos': 1926042095370240.0, 'train_loss': 0.46197543209367764, 'epoch': 5.0})

In [8]:
with open("/kaggle/input/exist2025-all/EXIST2025_dev.json", "r", encoding="utf-8") as f:
    dev_data = json.load(f)

# Extract tweets and IDs
dev_tweets = [entry["tweet"] for entry in dev_data.values()]
dev_ids = [entry["id_EXIST"] for entry in dev_data.values()]


In [9]:
import json

# Load the dev dataset
with open("/kaggle/input/exist2025-all/EXIST2025_dev.json", "r", encoding="utf-8") as f:
    dev_data = json.load(f)

# Split into English & Spanish
english_dev_tweets = []
english_dev_ids = []
spanish_dev_tweets = []
spanish_dev_ids = []

for entry in dev_data.values():
    tweet_id = entry["id_EXIST"]
    tweet = entry["tweet"]
    lang = entry["lang"]

    if lang == "en":
        english_dev_tweets.append(tweet)
        english_dev_ids.append(tweet_id)
    elif lang == "es":
        spanish_dev_tweets.append(tweet)
        spanish_dev_ids.append(tweet_id)

# Debugging: Check split sizes
print(f"English Dev Samples: {len(english_dev_tweets)}")
print(f"Spanish Dev Samples: {len(spanish_dev_tweets)}")


English Dev Samples: 489
Spanish Dev Samples: 549


In [10]:
import os
from transformers import BertForSequenceClassification

# Function to get the latest checkpoint
def get_latest_checkpoint(directory="./results"):
    checkpoints = [d for d in os.listdir(directory) if d.startswith("checkpoint-")]
    if not checkpoints:
        raise ValueError(f"No checkpoints found in {directory}")
    latest_checkpoint = sorted(checkpoints, key=lambda x: int(x.split('-')[-1]))[-1]
    return os.path.join(directory, latest_checkpoint)

# Load the best model checkpoint for English and Spanish
latest_checkpoint_en = get_latest_checkpoint("./results/en")
latest_checkpoint_es = get_latest_checkpoint("./results/es")

print(f"Using latest checkpoint for English: {latest_checkpoint_en}")
print(f"Using latest checkpoint for Spanish: {latest_checkpoint_es}")

# Load models
model_en = BertForSequenceClassification.from_pretrained(latest_checkpoint_en)
model_es = BertForSequenceClassification.from_pretrained(latest_checkpoint_es)


Using latest checkpoint for English: ./results/en/checkpoint-815
Using latest checkpoint for Spanish: ./results/es/checkpoint-915


In [30]:
def predict_on_dev(tweets, ids, model, tokenizer, label_classes, output_file):
    model.eval()
    results = []

    for tweet, tweet_id in zip(tweets, ids):
        encoding = tokenizer(tweet, truncation=True, padding="max_length", max_length=256, return_tensors="pt")

        with torch.no_grad():
            outputs = model(**encoding)

        logits = outputs.logits.squeeze()
        probs = torch.sigmoid(logits).cpu().numpy()

        # Convert probabilities to dictionary format and sort by highest probability
        soft_label_dict = {label_classes[i]: float(probs[i]) for i in range(len(label_classes))}
        sorted_soft_label_dict = dict(sorted(soft_label_dict.items(), key=lambda item: item[1], reverse=True))  # Sort descending

        results.append({
            "test_case": "EXIST2025",
            "id": tweet_id,
            "value": sorted_soft_label_dict  # Rename "soft_label" to "value" and sort it
        })

    # Save results
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)

    print(f"Predictions saved to {output_file}")

# Run predictions
predict_on_dev(english_dev_tweets, english_dev_ids, model_en, tokenizer, label_classes, "EXIST2025_dev_predictions_en.json")
predict_on_dev(spanish_dev_tweets, spanish_dev_ids, model_es, tokenizer, label_classes, "EXIST2025_dev_predictions_es.json")


Predictions saved to EXIST2025_dev_predictions_en.json
Predictions saved to EXIST2025_dev_predictions_es.json


In [11]:
def predict_hard_labels_from_soft_model(tweets, ids, model, tokenizer, label_classes, output_file, threshold=0.5):
    """
    Uses the soft model to predict hard labels by applying a threshold.
    - Labels are assigned if their probability > threshold.
    - If no labels pass the threshold, assigns "NO".
    """
    model.eval()
    results = []

    for tweet, tweet_id in zip(tweets, ids):
        encoding = tokenizer(tweet, truncation=True, padding="max_length", max_length=256, return_tensors="pt")

        with torch.no_grad():
            outputs = model(**encoding)

        logits = outputs.logits.squeeze()
        probs = torch.sigmoid(logits).cpu().numpy()

        # Convert probabilities to hard labels using threshold
        hard_labels = [label_classes[i] for i, prob in enumerate(probs) if prob > threshold]

        # If no labels meet the threshold, assign "NO"
        if not hard_labels:
            hard_labels = ["NO"]

        results.append({
            "test_case": "EXIST2025",
            "id": tweet_id,
            "value": hard_labels  # Final hard labels
        })

    # Save results
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4)

    print(f"Hard label predictions saved to {output_file}")

# Run hard label prediction using soft model
predict_hard_labels_from_soft_model(english_dev_tweets, english_dev_ids, model_en, tokenizer, label_classes, "EXIST2025_dev_predictions_hard_en.json")
predict_hard_labels_from_soft_model(spanish_dev_tweets, spanish_dev_ids, model_es, tokenizer, label_classes, "EXIST2025_dev_predictions_hard_es.json")


Hard label predictions saved to EXIST2025_dev_predictions_hard_en.json
Hard label predictions saved to EXIST2025_dev_predictions_hard_es.json


In [13]:
import json

# Load the Spanish predictions
with open("/kaggle/working/EXIST2025_dev_predictions_hard_es.json", "r", encoding="utf-8") as f:
    es_data = json.load(f)

# Load the English predictions
with open("/kaggle/working/EXIST2025_dev_predictions_hard_en.json", "r", encoding="utf-8") as f:
    en_data = json.load(f)

# Assuming both files contain lists of predictions, merge them
if isinstance(es_data, list) and isinstance(en_data, list):
    merged_data = es_data + en_data
else:
    raise ValueError("JSON structure is not a list. Ensure both files contain lists.")

# Save to a new file
output_filename = "EXIST2025_dev_predictions_merged2.json"
with open(output_filename, "w", encoding="utf-8") as f:
    json.dump(merged_data, f, indent=4, ensure_ascii=False)

print(f"Merging complete! Saved to {output_filename}")


Merging complete! Saved to EXIST2025_dev_predictions_merged2.json


In [23]:
import json
import numpy as np

# File paths
predictions_file = "/kaggle/working/EXIST2025_dev_predictions_merged.json"
gold_labels_file = "/kaggle/input/exist2025-all/EXIST2025_dev_task1_3_gold_soft.json"

# Load predictions
with open(predictions_file, "r", encoding="utf-8") as f:
    predictions_data = json.load(f)

# Load gold labels
with open(gold_labels_file, "r", encoding="utf-8") as f:
    gold_data = json.load(f)

# Convert gold labels into a dictionary for quick lookup
gold_dict = {entry["id"]: entry["value"] for entry in gold_data}

# Extract all category names
categories = ["IDEOLOGICAL-INEQUALITY", "MISOGYNY-NON-SEXUAL-VIOLENCE", 
              "OBJECTIFICATION", "SEXUAL-VIOLENCE", "STEREOTYPING-DOMINANCE", "NO"]

# Compute metrics
icm_soft_values = []
icm_soft_norm_values = []

for entry in predictions_data:
    pred_id = entry["id"]
    if pred_id in gold_dict:
        pred_values = np.array([entry["value"][cat] for cat in categories])
        gold_values = np.array([gold_dict[pred_id][cat] for cat in categories])

        # ICM Soft (Mean Squared Error)
        mse = np.mean((pred_values - gold_values) ** 2)
        icm_soft_values.append(mse)

        # ICM Soft Norm (MSE normalized by gold label mean)
        norm_factor = np.mean(gold_values ** 2)
        icm_soft_norm_values.append(mse / norm_factor if norm_factor != 0 else mse)

# Final aggregated scores
final_icm_soft = np.mean(icm_soft_values)
final_icm_soft_norm = np.mean(icm_soft_norm_values)

print(f"ICM Soft Score: {final_icm_soft:.4f}")
print(f"ICM Soft Norm Score: {final_icm_soft_norm:.4f}")


ICM Soft Score: 0.1570
ICM Soft Norm Score: 1.7035


In [14]:
pip install pyevall

Collecting pyevall
  Downloading PyEvALL-0.1.76.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jsbeautifier==1.14.9 (from pyevall)
  Downloading jsbeautifier-1.14.9.tar.gz (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting setuptools==69.5.1 (from pyevall)
  Downloading setuptools-69.5.1-py3-none-any.whl.metadata (6.2 kB)
Collecting editorconfig>=0.12.2 (from jsbeautifier==1.14.9->pyevall)
  Downloading EditorConfig-0.17.0-py3-none-any.whl.metadata (3.8 kB)
Downloading setuptools-69.5.1-py3-none-any.whl (894 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m894.6/894.6 kB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading EditorConfig-0.17.0-py3-none-any.whl (16 kB)
B

In [17]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils

predictions = "/kaggle/working/EXIST2025_dev_predictions_merged2.json"         
gold = "/kaggle/input/exist2025-all/EXIST2025_dev_task1_3_gold_hard.json" 
test = PyEvALLEvaluation() 
params= dict() 
params[PyEvALLUtils.PARAM_REPORT]= PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED  
# metrics=["ICMSoft", "ICMSoftNorm", "CrossEntropy"]     # for soft    
metrics=["ICM", "ICMNorm" ,"FMeasure"] 
TASK1_3_HIERARCHY = {"YES":["IDEOLOGICAL-INEQUALITY","STEREOTYPING-DOMINANCE","OBJECTIFICATION", "SEXUAL-VIOLENCE", "MISOGYNY-NON-SEXUAL-VIOLENCE"], "NO":[]}
params[PyEvALLUtils.PARAM_HIERARCHY]= TASK1_3_HIERARCHY  
report= test.evaluate(predictions, gold, metrics, **params) 
report.print_report()



2025-03-26 19:37:39,281 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM', 'ICMNorm', 'FMeasure']
2025-03-26 19:37:39,461 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-03-26 19:37:39,466 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Normalized evaluation method
2025-03-26 19:37:39,471 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method
{
  "metrics": {
    "ICM": {
      "name": "Information Contrast model",
      "acronym": "ICM",
      "description": "Coming soon!\\nThe evaluation FAIL.",
      "status": "FAIL",
      "results": {
        "test_cases": [],
        "average_per_test_case": null
      },
      "preconditions": {
        "METRIC_PRECONDITION_NOT_IMPLEMENTED_EVALUATION_CONTEXT": {
          "name": "METRIC_PRECONDITION_NOT_IMPLEMENTED_EVALUATION_CONTEXT",
          "description": " The selected context of

In [21]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils

# Define file paths
predictions = "/kaggle/input/exist2025-all/EXIST2025_dev_task1_3_majority_class_hard.json"  # Change to your actual prediction file
gold = "/kaggle/input/exist2025-all/EXIST2025_dev_task1_3_gold_hard.json"   # Change to your actual gold file

# Define hierarchical structure for subtask 1.3
TASK1_3_HIERARCHY = {
    "YES": ["IDEOLOGICAL-INEQUALITY", "STEREOTYPING-DOMINANCE",
            "OBJECTIFICATION", "SEXUAL-VIOLENCE", "MISOGYNY-NON-SEXUAL-VIOLENCE"],
    "NO": []
}

# Initialize PyEvALL evaluation
evaluator = PyEvALLEvaluation()

# Set evaluation parameters
params = dict()
params[PyEvALLUtils.PARAM_HIERARCHY] = TASK1_3_HIERARCHY
params[PyEvALLUtils.PARAM_REPORT] = PyEvALLUtils.PARAM_OPTION_REPORT_EMBEDDED  # Embedded report

# Define evaluation metrics
metrics = ["ICM", "ICMNorm", "FMeasure"]
# metrics=["ICMSoft", "ICMSoftNorm"]

# Run evaluation
report = evaluator.evaluate(predictions, gold, metrics, **params)

# Print evaluation report
report.print_report()


2025-03-26 19:53:25,358 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM', 'ICMNorm', 'FMeasure']
2025-03-26 19:53:25,541 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-03-26 19:53:26,014 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Normalized evaluation method
2025-03-26 19:53:26,017 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-03-26 19:53:26,489 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2025-03-26 19:53:26,951 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method
{
  "metrics": {
    "ICM": {
      "name": "Information Contrast model",
      "acronym": "ICM",
      "description": "Coming soon!",
      "status": "OK",
      "results": {
        "test_cases": [{
          "name": "EXIST2025",
          "average": -1.72372