In [7]:
!pip install sentence-transformers rouge nltk

import json
import pandas as pd
import glob
import numpy as np
from sentence_transformers import SentenceTransformer, util
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge

# --------------------------
# Helper Functions
# --------------------------
# Running tally of comparisons that involved empty text, updated by the metric helpers
# in this cell each time they score one (prediction, gold) pair:
#   "both_empty"     -> both sides were empty (the pair is scored 1.0)
#   "only_one_empty" -> exactly one side was empty (the pair is scored 0.0)
# The counts are per scored pair, not per record. Nothing in this cell resets the dict,
# so the totals presumably keep growing if the evaluation is re-run in the same kernel
# without re-executing this line — TODO confirm before relying on the printed stats.
empty_cases = {"both_empty": 0, "only_one_empty": 0}

def exact_match_accuracy(pred: str, gold: str) -> float:
    """Score a prediction against a reference by exact string equality.

    Both values are converted to text and stripped of surrounding whitespace
    before they are compared. A missing value (``None`` or a float NaN) is
    treated as empty text; calling ``str()`` on it directly would yield the
    non-empty strings ``"None"`` / ``"nan"``, hiding the missing value from the
    empty-case accounting and letting it match a literal "None" answer.

    Args:
        pred: Model output (any value; non-strings are stringified).
        gold: Reference answer (any value; non-strings are stringified).

    Returns:
        1.0 if both sides are empty or the stripped texts are identical,
        0.0 otherwise (including when exactly one side is empty).

    Side effects:
        Increments ``empty_cases["both_empty"]`` or
        ``empty_cases["only_one_empty"]`` when an empty side is seen.
    """
    # `value != value` is only true for NaN; the isinstance guard keeps the
    # check away from list/array values, where `!=` is element-wise.
    p, g = (
        "" if value is None or (isinstance(value, float) and value != value)
        else str(value).strip()
        for value in (pred, gold)
    )
    if not p and not g:
        empty_cases["both_empty"] += 1
        return 1.0
    if not p or not g:
        empty_cases["only_one_empty"] += 1
        return 0.0
    return 1.0 if p == g else 0.0

def normalized_cosine_similarity(pred: str, gold: str, model: SentenceTransformer) -> float:
    """Score a prediction against a reference by embedding cosine similarity.

    Both values are converted to text and stripped. A missing value (``None``
    or a float NaN) is treated as empty text instead of being stringified to
    ``"None"`` / ``"nan"`` and embedded as if it were a real answer.

    Args:
        pred: Model output (any value; non-strings are stringified).
        gold: Reference answer (any value; non-strings are stringified).
        model: Sentence-embedding model; its ``encode`` method is called once
            per text with ``convert_to_tensor=True``.

    Returns:
        1.0 if both sides are empty, 0.0 if exactly one side is empty,
        otherwise the cosine similarity rescaled from [-1, 1] to [0, 1]
        via ``(cos + 1) / 2``.

    Side effects:
        Increments ``empty_cases["both_empty"]`` or
        ``empty_cases["only_one_empty"]`` when an empty side is seen.
    """
    # `value != value` is only true for NaN; the isinstance guard keeps the
    # check away from list/array values, where `!=` is element-wise.
    p, g = (
        "" if value is None or (isinstance(value, float) and value != value)
        else str(value).strip()
        for value in (pred, gold)
    )
    if not p and not g:
        empty_cases["both_empty"] += 1
        return 1.0
    if not p or not g:
        empty_cases["only_one_empty"] += 1
        return 0.0
    emb_pred = model.encode(p, convert_to_tensor=True)
    emb_gold = model.encode(g, convert_to_tensor=True)
    cosine_score = util.cos_sim(emb_pred, emb_gold).item()
    # Shift and scale so the result is comparable with the other 0..1 metrics.
    return (cosine_score + 1) / 2

def bleu_score(pred: str, gold: str) -> float:
    """Score a prediction against a single reference with sentence-level BLEU.

    Both values are converted to text and stripped. A missing value (``None``
    or a float NaN) is treated as empty text instead of being stringified to
    ``"None"`` / ``"nan"`` and tokenized as if it were a real answer.

    Args:
        pred: Model output (any value; non-strings are stringified).
        gold: Reference answer (any value; non-strings are stringified).

    Returns:
        1.0 if both sides are empty, 0.0 if exactly one side is empty,
        otherwise the value returned by ``sentence_bleu`` for the
        whitespace-tokenized texts, using ``SmoothingFunction().method1`` and
        the library's default n-gram weights.

    Side effects:
        Increments ``empty_cases["both_empty"]`` or
        ``empty_cases["only_one_empty"]`` when an empty side is seen.
    """
    # `value != value` is only true for NaN; the isinstance guard keeps the
    # check away from list/array values, where `!=` is element-wise.
    p, g = (
        "" if value is None or (isinstance(value, float) and value != value)
        else str(value).strip()
        for value in (pred, gold)
    )
    if not p and not g:
        empty_cases["both_empty"] += 1
        return 1.0
    if not p or not g:
        empty_cases["only_one_empty"] += 1
        return 0.0
    ref_tokens = g.split()
    pred_tokens = p.split()
    smoothing = SmoothingFunction().method1
    # sentence_bleu expects a list of references, hence the one-element list.
    return sentence_bleu([ref_tokens], pred_tokens, smoothing_function=smoothing)

def rouge_l_score(pred: str, gold: str) -> float:
    """Score a prediction against a reference with the ROUGE-L F-measure.

    Both values are converted to text and stripped. A missing value (``None``
    or a float NaN) is treated as empty text instead of being stringified to
    ``"None"`` / ``"nan"`` and scored as if it were a real answer.

    Args:
        pred: Model output (any value; non-strings are stringified).
        gold: Reference answer (any value; non-strings are stringified).

    Returns:
        1.0 if both sides are empty, 0.0 if exactly one side is empty or the
        scorer raises ``ValueError``, otherwise the ``'f'`` value of the
        ``'rouge-l'`` entry returned by ``Rouge().get_scores(pred, gold)``.

    Side effects:
        Increments ``empty_cases["both_empty"]`` or
        ``empty_cases["only_one_empty"]`` when an empty side is seen.
    """
    # `value != value` is only true for NaN; the isinstance guard keeps the
    # check away from list/array values, where `!=` is element-wise.
    p, g = (
        "" if value is None or (isinstance(value, float) and value != value)
        else str(value).strip()
        for value in (pred, gold)
    )
    if not p and not g:
        empty_cases["both_empty"] += 1
        return 1.0
    if not p or not g:
        empty_cases["only_one_empty"] += 1
        return 0.0
    rouge = Rouge()
    try:
        # Hypothesis first, reference second.
        scores = rouge.get_scores(p, g)
        return scores[0]['rouge-l']['f']
    except ValueError:
        # Deliberate best-effort: a pair the scorer rejects counts as a miss.
        # NOTE(review): presumably raised when a text becomes empty after the
        # library's own preprocessing — confirm against the rouge package.
        return 0.0

# --------------------------
# Task Category Groups
# --------------------------
# Each list below names the task categories that share one scoring metric in
# best_score(). A category that appears in none of them falls back to exact match.

# Scored with exact string match.
CLASSIFICATION_CATEGORIES = [
    'Answer Verification',
    'Answerability Classification',
    'Cause Effect Classification',
    'Coherence Classification',
    'Commonsense Classification',
    'Coreference Resolution',
    'Dialogue Act Recognition',
    'Discourse Connective Identification',
    'Discourse Relation Classification',
    'Ethics Classification',
    'Gender Classification',
    'Irony Detection',
    'Intent Identification',
    'Named Entity Recognition',
    'Negotiation Strategy Detection',
    'Overlap Extraction',
    'Pos Tagging',
    'Preposition Prediction',
    'Section Classification',
    'Spam Classification',
    'Speaker Identification',
    'Speaker Relation Classification',
    'Stance Detection',
    'Stereotype Detection',
    'Toxic Language Detection',
    'Word Relation Classification',
    'Fill in The Blank',
    'Language Identification',
    'Keyword Tagging',
    'Grammar Error Correction',
    'Grammar Error Detection',
    'Punctuation Error Detection',
    'Spelling Error Detection',
    'Misc.',
    'Dialogue State Tracking',
    'Information Extraction',
    'Sentiment Analysis',
    'Entity Relation Classification',
    'Textual Entailment',
    'Text Categorization',
    'Linguistic Probing',
]

# Scored with normalized sentence-embedding cosine similarity.
SBERT_CATEGORIES = [
    'Sentence Composition',
    'Sentence Compression',
    'Sentence Expansion',
    'Sentence Ordering',
    'Sentence Perturbation',
    'Story Composition',
    'Style Transfer',
    'Entity Generation',
    'Text Matching',
    'Text Quality Evaluation',
    'Question Answering',
    'Wrong Candidate Generation',
]

# Scored with smoothed sentence-level BLEU.
BLEU_CATEGORIES = [
    'Paraphrasing',
    'Text Simplification',
    'Text to Code',
    'Title Generation',
    'Translation',
    'Question Rewriting',
    'Question Generation',
    'Question Decomposition',
]

# Scored with ROUGE-L F-measure.
ROUGE_CATEGORIES = [
    'Summarization',
    'Code to Text',
    'Data to Text',
    'Dialogue Generation',
    'Explanation',
    'Paper Review',
    'Text Completion',
    'Poem Generation',
]

# Scored with exact string match, like the classification group.
FACT_NUMERIC_CATEGORIES = [
    'Fact Verification',
    'Question Understanding',
    'Word Analogy',
    'Word Semantics',
    'Mathematics',
    'Number Conversion',
    'Program Execution',
]

# --------------------------
# Compute best score among predictions and golds
# --------------------------
def best_score(preds, golds, category, model):
    """Return the highest score found over every (prediction, gold) pairing.

    The metric is picked from the task category: exact match for the
    classification and fact/numeric groups, embedding cosine similarity for
    the SBERT group, BLEU for the BLEU group, ROUGE-L for the ROUGE group,
    and exact match again for any category not listed in a group.

    Args:
        preds: Iterable of model outputs.
        golds: Iterable of reference answers.
        category: Task category name used to choose the metric.
        model: Embedding model handed to the cosine-similarity metric.

    Returns:
        The largest pairwise score, or 0.0 when there are no pairs or no
        score exceeds 0.0.
    """
    def pair_score(candidate, reference):
        # The order of these checks decides which metric wins if a category
        # were ever listed in more than one group.
        if category in CLASSIFICATION_CATEGORIES or category in FACT_NUMERIC_CATEGORIES:
            return exact_match_accuracy(candidate, reference)
        if category in SBERT_CATEGORIES:
            return normalized_cosine_similarity(candidate, reference, model)
        if category in BLEU_CATEGORIES:
            return bleu_score(candidate, reference)
        if category in ROUGE_CATEGORIES:
            return rouge_l_score(candidate, reference)
        return exact_match_accuracy(candidate, reference)

    pairwise = (pair_score(candidate, reference) for candidate in preds for reference in golds)
    # Seeding with 0.0 gives the floor; max() only replaces its current pick
    # when a later value is strictly greater.
    return max((0.0, *pairwise))

# --------------------------
# Compute metrics per task_name/input group
# --------------------------
def compute_metrics_for_task(df: pd.DataFrame, task_name: str, category: str, model: SentenceTransformer) -> dict:
    """Aggregate score and latency for one task.

    Rows of ``df`` whose ``task_name`` matches are grouped by
    (``task_name``, ``inputs``). Each group contributes one score, the best
    pairwise score between its ``output_text`` and ``targets`` values, and
    all of its ``latency_sec`` values.

    Args:
        df: Result rows for all tasks.
        task_name: Task to aggregate.
        category: Task category, forwarded to ``best_score``.
        model: Embedding model, forwarded to ``best_score``.

    Returns:
        A dict with ``avg_acc`` (mean of the per-group scores),
        ``avg_latency`` (mean of the collected latencies),
        ``num_score_records`` (number of groups) and ``num_latency_records``
        (number of latency values). When the task has no rows, both averages
        are NaN and both counts are 0.
    """
    rows_for_task = df.loc[df['task_name'] == task_name]
    if rows_for_task.empty:
        return {'avg_acc': np.nan, 'avg_latency': np.nan, 'num_score_records': 0, 'num_latency_records': 0}

    # One group per distinct input of this task.
    by_input = rows_for_task.groupby(['task_name', 'inputs'])

    group_scores, all_latencies = [], []
    for _, rows in by_input:
        outputs = rows['output_text'].tolist()
        references = rows['targets'].tolist()
        group_scores.append(best_score(outputs, references, category, model))
        all_latencies.extend(rows['latency_sec'].tolist())

    summary = {}
    summary['avg_acc'] = float(np.mean(group_scores))
    summary['avg_latency'] = float(np.mean(all_latencies))
    summary['num_score_records'] = len(by_input)  # number of unique input groups
    summary['num_latency_records'] = len(all_latencies)  # total number of rows
    return summary

# --------------------------
# Main Script
# --------------------------
def main(results_folder: str, task_map_file: str, task_domain_map_file: str, output_file: str, model_name: str,
         sbert_model_name: str = 'all-mpnet-base-v2'):
    """Score every mapped task found in a folder of result files and write a CSV.

    Args:
        results_folder: Directory whose ``*.parquet`` files hold the result rows.
        task_map_file: JSON file mapping task name to category.
        task_domain_map_file: JSON file mapping task name to domain.
        output_file: Destination CSV path.
        model_name: Label written to the ``model_name`` column of every row.
        sbert_model_name: Sentence-embedding checkpoint used for the
            cosine-similarity metric. Defaults to the checkpoint that was
            previously hard-coded.

    Raises:
        ValueError: If ``results_folder`` contains no ``*.parquet`` files.

    Side effects:
        Prints progress and a warning for each mapped task without rows, and
        writes ``output_file``. If no task has rows, the CSV holds the header only.
    """
    output_columns = [
        'task_name', 'avg_acc', 'avg_latency', 'num_score_records', 'num_latency_records',
        'model_name', 'category', 'domain'
    ]

    # Load parquet files; sorted so row order does not depend on directory listing order.
    parquet_paths = sorted(glob.glob(f"{results_folder}/*.parquet"))
    if not parquet_paths:
        # pd.concat would also raise ValueError here, with a less helpful message.
        raise ValueError(f"No parquet files found in {results_folder!r}")
    df = pd.concat([pd.read_parquet(path) for path in parquet_paths], ignore_index=True)
    print(f"Loaded {len(df)} records from {len(parquet_paths)} parquet files.")

    # Load task_name → category map
    with open(task_map_file, 'r', encoding='utf-8') as f:
        task_to_category = json.load(f)

    # Load task_name → domain map
    with open(task_domain_map_file, 'r', encoding='utf-8') as f:
        task_to_domain = json.load(f)

    # Load SBERT model
    sbert_model = SentenceTransformer(sbert_model_name)

    # One pass over the column replaces a full-frame filter per mapped task.
    tasks_with_records = set(df['task_name'].unique())

    # Compute metrics per task_name
    results = []
    for task_name, category in task_to_category.items():
        if task_name not in tasks_with_records:
            print(f"⚠️ No records found for task: {task_name} (category={category})")
            continue

        metrics = compute_metrics_for_task(df, task_name, category, sbert_model)

        results.append({
            'task_name': task_name,
            'avg_acc': metrics['avg_acc'],
            'avg_latency': metrics['avg_latency'],
            'num_score_records': metrics['num_score_records'],
            'num_latency_records': metrics['num_latency_records'],
            'model_name': model_name,
            'category': category,
            'domain': task_to_domain.get(task_name, "Unknown")
        })

    # Save to CSV. Passing `columns` to the constructor fixes the column order and
    # still works when `results` is empty (indexing an empty frame by these names
    # would raise KeyError).
    pd.DataFrame(results, columns=output_columns).to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

if __name__ == "__main__":
    # Run configuration for this evaluation.
    RESULTS_FOLDER = "results_chunks_google_flan-t5-large"
    TASK_MAP_FILE = "task_to_category_map.json"
    TASK_DOMAIN_MAP_FILE = "task_to_domain_map.json"
    OUTPUT_FILE = "task_metrics.csv"
    MODEL_NAME = "google/flan-t5-large"

    main(
        results_folder=RESULTS_FOLDER,
        task_map_file=TASK_MAP_FILE,
        task_domain_map_file=TASK_DOMAIN_MAP_FILE,
        output_file=OUTPUT_FILE,
        model_name=MODEL_NAME,
    )

    # Report how many scored pairs involved empty text on one or both sides.
    print(
        "\nEmpty case stats:\n"
        f" Both empty (counted as correct): {empty_cases['both_empty']}\n"
        f" One empty (counted as incorrect): {empty_cases['only_one_empty']}"
    )


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Loaded 98417 records from 500 parquet files.
⚠️ No records found for task: task003_mctaco_question_generation_event_duration (category=Question Generation)
⚠️ No records found for task: task004_mctaco_answer_generation_event_duration (category=Question Answering)
⚠️ No records found for task: task005_mctaco_wrong_answer_generation_event_duration (category=Wrong Candidate Generation)
⚠️ No records found for task: task006_mctaco_question_generation_transient_stationary (category=Question Generation)
⚠️ No records found for task: task007_mctaco_answer_generation_transient_stationary (category=Question Answering)
⚠️ No records found for task: task008_mctaco_wrong_answer_generation_transient_stationary (category=Wrong Candidate Generation)
⚠️ No records found for task: task009_mctaco_question_generation_event_ordering (category=Question Generation)
⚠️ No records found for task: task010_mctaco_answer_generation_event_ordering (category=Question Answering)
⚠️ No records found for task: task01

In [5]:
import pandas as pd

# Read back the per-task metrics file
df = pd.read_csv("task_metrics.csv")

# 1️⃣ Tasks whose average score is exactly zero
is_zero = df['avg_acc'] == 0
zero_acc_tasks = df.loc[is_zero, 'task_name'].tolist()
# One print call: the header and each task name end up on their own line.
print("Tasks with avg_acc == 0:", *zero_acc_tasks, sep="\n")

# 2️⃣ Tasks whose average score is positive but at most 0.05
is_low = (df['avg_acc'] > 0) & (df['avg_acc'] <= 0.05)
low_acc_tasks = df.loc[is_low, 'task_name'].tolist()
print("\nTasks with 0 < avg_acc <= 0.05:", *low_acc_tasks, sep="\n")


Tasks with avg_acc == 0:
task080_piqa_answer_generation
task081_piqa_wrong_answer_generation
task085_unnatural_addsub_arithmetic
task087_new_operator_addsub_arithmetic
task093_conala_normalize_lists
task094_conala_calculate_mean
task122_conala_list_index_addition
task123_conala_sort_dictionary
task168_strategyqa_question_decomposition
task176_break_decompose_questions
task183_rhyme_generation
task371_synthetic_product_of_list
task454_swag_incorrect_answer_generation
task592_sciq_incorrect_answer_generation
task598_cuad_answer_generation
task849_pubmedqa_answer_generation
task1087_two_number_sum
task1088_array_of_products
task1151_swap_max_min
task1190_add_integer_to_list
task1318_country_national_dish
task1319_country_by_barcode_prefix
task1425_country_iso_numeric
task1428_country_surface_area

Tasks with 0 < avg_acc <= 0.05:
task024_cosmosqa_answer_generation
task025_cosmosqa_incorrect_answer_generation
task062_bigbench_repeat_copy_logic
task074_squad1.1_question_generation
task076_sp