In [None]:
!pip install sentence-transformers rouge nltk

import os
from datetime import datetime

import json
import pandas as pd
import glob
import numpy as np
from sentence_transformers import SentenceTransformer, util
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge

# Running tally of metric calls that were short-circuited because one or both
# texts were empty. Incremented at most once per metric call.
empty_cases = {"both_empty": 0, "only_one_empty": 0}

def _to_text(value) -> str:
    """Coerce a prediction/reference value to stripped text.

    Missing values (``None`` or a float NaN) become the empty string rather
    than the literal text "None"/"nan" that ``str()`` would produce, so they
    are handled by the empty-case rules instead of being scored as real text.
    """
    if value is None:
        return ""
    # NaN is the only float that is not equal to itself.
    if isinstance(value, float) and value != value:
        return ""
    return str(value).strip()

def _score_if_empty(p: str, g: str):
    """Return the fixed score for an empty side, or ``None`` if both have text.

    Both empty counts as a match (1.0); exactly one empty counts as a miss
    (0.0). The matching ``empty_cases`` counter is bumped in either case.
    """
    if not p and not g:
        empty_cases["both_empty"] += 1
        return 1.0
    if not p or not g:
        empty_cases["only_one_empty"] += 1
        return 0.0
    return None

def exact_match_accuracy(pred: str, gold: str) -> float:
    """Return 1.0 if the stripped prediction equals the stripped reference, else 0.0.

    Args:
        pred: Predicted text (non-string values are converted with ``str``).
        gold: Reference text (same conversion).

    Returns:
        1.0 or 0.0; empty/missing inputs follow the empty-case rules
        (both empty -> 1.0, exactly one empty -> 0.0).
    """
    p, g = _to_text(pred), _to_text(gold)
    shortcut = _score_if_empty(p, g)
    if shortcut is not None:
        return shortcut
    return 1.0 if p == g else 0.0

def normalized_cosine_similarity(pred: str, gold: str, model: SentenceTransformer) -> float:
    """Return the embedding cosine similarity of the two texts, rescaled to [0, 1].

    Args:
        pred: Predicted text.
        gold: Reference text.
        model: Encoder whose ``encode`` output is compared with ``util.cos_sim``.

    Returns:
        ``(cosine + 1) / 2``; empty/missing inputs follow the empty-case
        rules and never reach the model.
    """
    p, g = _to_text(pred), _to_text(gold)
    shortcut = _score_if_empty(p, g)
    if shortcut is not None:
        return shortcut
    emb_pred = model.encode(p, convert_to_tensor=True)
    emb_gold = model.encode(g, convert_to_tensor=True)
    cosine_score = util.cos_sim(emb_pred, emb_gold).item()
    # Shift the [-1, 1] cosine range onto [0, 1] so it is comparable to the other metrics.
    return (cosine_score + 1) / 2

def bleu_score(pred: str, gold: str) -> float:
    """Return sentence-level BLEU of ``pred`` against the single reference ``gold``.

    Both texts are tokenized on whitespace and scored with NLTK's
    ``sentence_bleu`` using ``SmoothingFunction().method1``.

    Returns:
        The BLEU score; empty/missing inputs follow the empty-case rules.
    """
    p, g = _to_text(pred), _to_text(gold)
    shortcut = _score_if_empty(p, g)
    if shortcut is not None:
        return shortcut
    ref_tokens = g.split()
    pred_tokens = p.split()
    smoothing = SmoothingFunction().method1
    return sentence_bleu([ref_tokens], pred_tokens, smoothing_function=smoothing)

def rouge_l_score(pred: str, gold: str) -> float:
    """Return the ROUGE-L F-score of ``pred`` against ``gold``.

    Returns:
        The ``'rouge-l'`` ``'f'`` value reported by ``Rouge.get_scores``, or
        0.0 if the library raises ``ValueError``; empty/missing inputs follow
        the empty-case rules.
    """
    p, g = _to_text(pred), _to_text(gold)
    shortcut = _score_if_empty(p, g)
    if shortcut is not None:
        return shortcut
    rouge = Rouge()
    try:
        scores = rouge.get_scores(p, g)
    except ValueError:
        # Deliberate best-effort: text the library rejects is scored as no overlap.
        return 0.0
    return scores[0]['rouge-l']['f']

# Task Category Groups
# Each list names the task categories that best_score() routes to one metric.

# Scored with exact match.
CLASSIFICATION_CATEGORIES = [
    'Answer Verification',
    'Answerability Classification',
    'Cause Effect Classification',
    'Coherence Classification',
    'Commonsense Classification',
    'Coreference Resolution',
    'Dialogue Act Recognition',
    'Discourse Connective Identification',
    'Discourse Relation Classification',
    'Ethics Classification',
    'Gender Classification',
    'Irony Detection',
    'Intent Identification',
    'Named Entity Recognition',
    'Negotiation Strategy Detection',
    'Overlap Extraction',
    'Pos Tagging',
    'Preposition Prediction',
    'Section Classification',
    'Spam Classification',
    'Speaker Identification',
    'Speaker Relation Classification',
    'Stance Detection',
    'Stereotype Detection',
    'Toxic Language Detection',
    'Word Relation Classification',
    'Fill in The Blank',
    'Language Identification',
    'Keyword Tagging',
    'Grammar Error Correction',
    'Grammar Error Detection',
    'Punctuation Error Detection',
    'Spelling Error Detection',
    'Misc.',
    'Dialogue State Tracking',
    'Information Extraction',
    'Sentiment Analysis',
    'Entity Relation Classification',
    'Textual Entailment',
    'Text Categorization',
    'Linguistic Probing',
]

# Scored with normalized sentence-embedding cosine similarity.
SBERT_CATEGORIES = [
    'Sentence Composition',
    'Sentence Compression',
    'Sentence Expansion',
    'Sentence Ordering',
    'Sentence Perturbation',
    'Story Composition',
    'Style Transfer',
    'Entity Generation',
    'Text Matching',
    'Text Quality Evaluation',
    'Question Answering',
    'Wrong Candidate Generation',
]

# Scored with BLEU.
BLEU_CATEGORIES = [
    'Paraphrasing',
    'Text Simplification',
    'Text to Code',
    'Title Generation',
    'Translation',
    'Question Rewriting',
    'Question Generation',
    'Question Decomposition',
]

# Scored with ROUGE-L.
ROUGE_CATEGORIES = [
    'Summarization',
    'Code to Text',
    'Data to Text',
    'Dialogue Generation',
    'Explanation',
    'Paper Review',
    'Text Completion',
    'Poem Generation',
]

# Scored with exact match, like the classification group.
FACT_NUMERIC_CATEGORIES = [
    'Fact Verification',
    'Question Understanding',
    'Word Analogy',
    'Word Semantics',
    'Mathematics',
    'Number Conversion',
    'Program Execution',
]

# Compute best score among predictions and golds
def best_score(preds, golds, category, model):
    """Return the highest metric score over every (prediction, reference) pair.

    The metric is selected from ``category``:
      * CLASSIFICATION_CATEGORIES / FACT_NUMERIC_CATEGORIES -> exact match
      * SBERT_CATEGORIES -> normalized embedding cosine similarity
      * BLEU_CATEGORIES -> BLEU
      * ROUGE_CATEGORIES -> ROUGE-L
      * anything else -> exact match

    Args:
        preds: Sized collection of predicted texts.
        golds: Sized collection of reference texts.
        category: Task category name used to pick the metric.
        model: Encoder forwarded to the cosine-similarity metric; unused by
            the other metrics.

    Returns:
        The maximum pairwise score, never below 0.0; 0.0 when either
        collection is empty.
    """
    if len(preds) == 0 or len(golds) == 0:
        return 0.0

    # The category is fixed for the whole call, so resolve the scorer once
    # instead of re-scanning the category lists for every pair.
    if category in CLASSIFICATION_CATEGORIES or category in FACT_NUMERIC_CATEGORIES:
        score_pair = exact_match_accuracy
    elif category in SBERT_CATEGORIES:
        def score_pair(p, g):
            return normalized_cosine_similarity(p, g, model)
    elif category in BLEU_CATEGORIES:
        score_pair = bleu_score
    elif category in ROUGE_CATEGORIES:
        score_pair = rouge_l_score
    else:
        # Unlisted categories fall back to exact match.
        score_pair = exact_match_accuracy

    best = 0.0
    for p in preds:
        for g in golds:
            score = score_pair(p, g)
            if score > best:
                best = score
    return best

# Compute metrics per task_name/input group
def compute_metrics_for_task(df: pd.DataFrame, task_name: str, category: str, model: SentenceTransformer) -> dict:
    """Aggregate score and latency for one task.

    Rows of ``df`` whose ``task_name`` equals ``task_name`` are grouped by
    ``inputs``; each group contributes one score (the best pairing of its
    ``output_text`` values against its ``targets`` values, via ``best_score``)
    and all of its ``latency_sec`` values.

    Args:
        df: Result records with columns ``task_name``, ``inputs``,
            ``output_text``, ``targets``, ``latency_sec`` and optionally
            ``definition``.
        task_name: Task to select.
        category: Task category, forwarded to ``best_score`` to pick the metric.
        model: Encoder forwarded to ``best_score``.

    Returns:
        A dict with keys ``avg_acc``, ``avg_latency``, ``num_score_records``
        (number of scored groups), ``num_latency_records`` and ``definition``
        (the task's first ``definition`` value, or "" when unavailable).
        When the task has no rows the averages are NaN and the counts are 0.
    """
    task_df = df[df['task_name'] == task_name]
    if task_df.empty:
        # Same key set as the normal return so callers can index every field.
        return {
            'avg_acc': np.nan,
            'avg_latency': np.nan,
            'num_score_records': 0,
            'num_latency_records': 0,
            'definition': "",
        }

    scores, latencies = [], []
    for _, group in task_df.groupby(['task_name', 'inputs']):
        preds = group['output_text'].tolist()
        golds = group['targets'].tolist()
        scores.append(best_score(preds, golds, category, model))
        latencies.extend(group['latency_sec'].tolist())

    # Get representative definition (first one)
    definition = task_df['definition'].iloc[0] if 'definition' in task_df.columns else ""

    return {
        # Guard the empty case explicitly; np.mean([]) yields NaN with a RuntimeWarning.
        'avg_acc': float(np.mean(scores)) if scores else np.nan,
        'avg_latency': float(np.mean(latencies)) if latencies else np.nan,
        'num_score_records': len(scores),
        'num_latency_records': len(latencies),
        'definition': definition
    }

def main(results_folder: str, task_map_file: str, task_domain_map_file: str, task_summary_file: str, output_file: str, model_name: str, sbert_model_name: str = 'all-mpnet-base-v2'):
    """Score every mapped task found in the result chunks and write one CSV row per task.

    Args:
        results_folder: Directory scanned (non-recursively) for ``*.parquet`` files.
        task_map_file: JSON object mapping task name -> category.
        task_domain_map_file: JSON object mapping task name -> domain; tasks
            missing from it are reported as "Unknown".
        task_summary_file: JSON list of objects; entries carrying both "Name"
            and "Summary" supply the short definition (backticks around the
            name are stripped).
        output_file: Destination CSV path.
        model_name: Label written to the ``model_name`` column of every row.
        sbert_model_name: SentenceTransformer checkpoint loaded for the
            embedding-based metric.

    Raises:
        ValueError: If ``results_folder`` contains no ``*.parquet`` files.
    """
    # Load parquet files (sorted so the concatenation order is reproducible)
    all_files = sorted(glob.glob(f"{results_folder}/*.parquet"))
    if not all_files:
        # pd.concat([]) raises the same exception type, but its message does
        # not say which folder was empty.
        raise ValueError(f"No .parquet files found in {results_folder!r}")
    df = pd.concat([pd.read_parquet(path) for path in all_files], ignore_index=True)
    print(f"Loaded {len(df)} records from {len(all_files)} parquet files.")

    # Load task_name -> category map
    with open(task_map_file, 'r', encoding='utf-8') as f:
        task_to_category = json.load(f)

    # Load task_name -> domain map
    with open(task_domain_map_file, 'r', encoding='utf-8') as f:
        task_to_domain = json.load(f)

    # Load summaries (short definitions)
    with open(task_summary_file, 'r', encoding='utf-8') as f:
        summary_data = json.load(f)
    task_to_summary = {
        item["Name"].strip("`"): item["Summary"]
        for item in summary_data if "Name" in item and "Summary" in item
    }

    # Load SBERT model
    sbert_model = SentenceTransformer(sbert_model_name)

    # One pass over the column instead of filtering the whole frame per task
    # just to learn whether the task has any records.
    tasks_with_records = set(df['task_name'].unique())

    columns = [
        'task_name', 'avg_acc', 'avg_latency', 'num_score_records', 'num_latency_records',
        'model_name', 'category', 'domain', 'definition', 'short_definition',
    ]

    # Compute metrics per task_name
    results = []
    for task_name, category in task_to_category.items():
        if task_name not in tasks_with_records:
            print(f"No records found for task: {task_name} (category={category})")
            continue

        metrics = compute_metrics_for_task(df, task_name, category, sbert_model)

        results.append({
            'task_name': task_name,
            'avg_acc': metrics['avg_acc'],
            'avg_latency': metrics['avg_latency'],
            'num_score_records': metrics['num_score_records'],
            'num_latency_records': metrics['num_latency_records'],
            'model_name': model_name,
            'category': category,
            'domain': task_to_domain.get(task_name, "Unknown"),
            'definition': metrics['definition'],
            'short_definition': task_to_summary.get(task_name, "")
        })

    # Save to CSV. Passing the columns to the constructor keeps the header
    # (and avoids a KeyError) when no task produced a row.
    pd.DataFrame(results, columns=columns).to_csv(output_file, index=False)

    print(f"Results saved to {output_file}")

if __name__ == "__main__":
    MODEL_NAME = "microsoft_Phi-4-mini-instruct"

    # Every invocation writes into its own timestamped folder under runs/.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    RUN_FOLDER = "runs/" + MODEL_NAME + "_" + timestamp
    os.makedirs(RUN_FOLDER, exist_ok=True)

    # Folder that main() scans for *.parquet result chunks; it is created
    # here if it does not exist yet.
    RESULTS_FOLDER = f"results_chunks_{MODEL_NAME}"
    os.makedirs(RESULTS_FOLDER, exist_ok=True)

    # Task metadata files, resolved relative to the working directory.
    TASK_MAP_FILE = "task_to_category_map.json"
    TASK_DOMAIN_MAP_FILE = "task_to_domain_map.json"
    TASK_SUMMARY_FILE = "task_summary.json"

    # The per-task metrics CSV goes inside the run folder.
    OUTPUT_FILE = os.path.join(RUN_FOLDER, f"task_metrics_{MODEL_NAME}.csv")

    main(
        results_folder=RESULTS_FOLDER,
        task_map_file=TASK_MAP_FILE,
        task_domain_map_file=TASK_DOMAIN_MAP_FILE,
        task_summary_file=TASK_SUMMARY_FILE,
        output_file=OUTPUT_FILE,
        model_name=MODEL_NAME,
    )

    # Persist the empty-case counters next to the CSV, then echo them.
    both_empty = empty_cases['both_empty']
    one_empty = empty_cases['only_one_empty']

    stats_path = os.path.join(RUN_FOLDER, "empty_case_stats.txt")
    with open(stats_path, "w") as stats_file:
        stats_file.write(
            f"Both empty (correct): {both_empty}\n"
            f"One empty (incorrect): {one_empty}\n"
        )

    print("\nEmpty case stats:")
    print(f" Both empty (counted as correct): {both_empty}")
    print(f" One empty (counted as incorrect): {one_empty}")
