In [7]:
import os
import sys
import json
from collections import defaultdict
import pandas as pd
from IPython.display import display, HTML
# The project root is taken to be the parent of the current working directory;
# it is put on sys.path so that project-local packages can be imported.
project_root = os.path.abspath("..")
# Guard the append so that re-running this cell does not pile up duplicate
# sys.path entries in a long-lived kernel.
if project_root not in sys.path:
    sys.path.append(project_root)

# Import metric evaluation from your existing codebase
from evaluation.explanation_evaluation_calc import evaluate_all_cot

In [8]:
# Experiment grid. Results for each (dataset, model) pair are looked up in the
# sub-folder "<dataset>_<model>" of base_path.
datasets = [
    "truthfulqa",
    "strategyqa",
    "medqa",
    "commonsenseqa",
]
models = [
    "mistral",
    "llama",
    "qwen",
]
# Metric names that get averaged per dataset/model.
metric_keys = [
    "redundancy",
    "weak_relevance",
    "strong_relevance",
]
# Root folder that holds one sub-directory per generation run.
base_path = os.path.join(project_root, "results", "generation")

In [9]:
# Master dictionary of average scores: summary[dataset]["<model>_<metric>"].
# A value is the mean rounded to 4 decimals, or the string "N/A" when the
# metric could not be averaged for that dataset/model pair.
summary = defaultdict(dict)

for dataset in datasets:
    for model in models:
        subfolder = f"{dataset}_{model}"
        folder_path = os.path.join(base_path, subfolder)

        # isdir() (rather than exists()) also rejects a plain file with the
        # expected name, which would make os.listdir() raise below.
        if not os.path.isdir(folder_path):
            print(f"[SKIPPED] Missing folder: {folder_path}")
            continue

        # os.listdir() returns entries in arbitrary order; sort them so the
        # file that gets evaluated is the same on every run and platform.
        jsonl_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".jsonl"))
        if not jsonl_files:
            print(f"[SKIPPED] No .jsonl file in {folder_path}")
            continue
        if len(jsonl_files) > 1:
            # Only one file per folder is evaluated; make the choice visible.
            print(
                f"[WARNING] {len(jsonl_files)} .jsonl files in {folder_path}; "
                f"using {jsonl_files[0]}"
            )

        filepath = os.path.join(folder_path, jsonl_files[0])
        print(f"[✓] Evaluating: {filepath}")

        try:
            # NOTE(review): evaluate_all_cot is assumed to return a sized
            # collection of per-example mappings keyed by metric name -- confirm
            # against evaluation/explanation_evaluation_calc.py.
            results = evaluate_all_cot(filepath)
        except Exception as e:
            print(f"[ERROR] Failed to evaluate: {filepath}")
            print(e)
            continue

        for metric in metric_keys:
            summary_key = f"{model}_{metric}"
            try:
                avg_score = sum(entry[metric] for entry in results) / len(results)
                summary[dataset][summary_key] = round(avg_score, 4)
            except Exception as e:
                # Best effort: keep the "N/A" placeholder so the table can still
                # be built, but report the cause (e.g. empty results or a
                # missing metric key) instead of hiding it.
                print(f"[WARNING] Could not average '{metric}' for {subfolder}: {e!r}")
                summary[dataset][summary_key] = "N/A"


[✓] Evaluating: c:\Users\rishi\Desktop\ImperialMSc\Individual_projec\Code\ExplainabilityInLLMs-MScThesis\results\generation\truthfulqa_mistral\cot_outputs_ollama_meta_reasoning_conclusion_step_indices_fewshot.jsonl
[✓] Evaluating: c:\Users\rishi\Desktop\ImperialMSc\Individual_projec\Code\ExplainabilityInLLMs-MScThesis\results\generation\truthfulqa_llama\cot_outputs_ollama_meta_reasoning_conclusion_step_indices_fewshot.jsonl
[✓] Evaluating: c:\Users\rishi\Desktop\ImperialMSc\Individual_projec\Code\ExplainabilityInLLMs-MScThesis\results\generation\truthfulqa_qwen\cot_outputs_ollama_meta_reasoning_conclusion_step_indices_fewshot.jsonl
[✓] Evaluating: c:\Users\rishi\Desktop\ImperialMSc\Individual_projec\Code\ExplainabilityInLLMs-MScThesis\results\generation\strategyqa_mistral\cot_outputs_ollama_meta_reasoning_conclusion_step_indices_fewshot.jsonl
[✓] Evaluating: c:\Users\rishi\Desktop\ImperialMSc\Individual_projec\Code\ExplainabilityInLLMs-MScThesis\results\generation\strategyqa_llama\cot_

In [28]:
# Flat column names as written into `summary`, in the order they are displayed.
flat_cols = [f"{model}_{metric}" for model in models for metric in metric_keys]

# Convert summary into a DataFrame with one row per dataset. reindex() makes
# sure every expected column exists (NaN-filled when a model was skipped for
# all datasets, or when summary is empty), so the lookups below cannot raise
# KeyError.
flat_df = (
    pd.DataFrame.from_dict(summary, orient="index")
    .reindex(columns=flat_cols)
    .rename_axis("Dataset")
    .reset_index()
)

# Capitalize dataset names; a key without a pretty name is kept as-is instead
# of being turned into NaN.
dataset_name_map = {
    "truthfulqa": "TruthfulQA",
    "strategyqa": "StrategyQA",
    "medqa": "MedQA",
    "commonsenseqa": "CommonSenseQA"
}
flat_df["Dataset"] = flat_df["Dataset"].map(
    lambda name: dataset_name_map.get(name, name)
)

# Extract column data and build (model, metric) MultiIndex columns
column_tuples = []
new_data = {}

for model in models:
    pretty_model = model.capitalize() if model != "llama" else "LLaMA"
    for metric in metric_keys:
        pretty_metric = metric.replace("_", " ").title()
        flat_col = f"{model}_{metric}"
        multi_col = (pretty_model, pretty_metric)
        column_tuples.append(multi_col)
        new_data[multi_col] = flat_df[flat_col]

# Create MultiIndex DataFrame
multi_df = pd.DataFrame(new_data)
multi_df.insert(0, ("", "Dataset"), flat_df["Dataset"])  # Dataset under a blank top-level header
multi_df.columns = pd.MultiIndex.from_tuples([("", "Dataset"), *column_tuples])

# Later cells read the table through the name `df`
df = multi_df


In [32]:
# Display the multi-index table with centered, padded headers.
# DataFrame.to_html() emits plain <th>/<td> cells (the col_heading/levelN
# classes only exist in Styler output), so the header rows are selected by
# their position inside <thead>. All rules are scoped to the wrapper class so
# they do not leak onto other tables rendered in the notebook.
custom_table_html = f"""
<style>
.cot-metrics-table thead tr:first-child th {{
    text-align: center !important;
    padding: 10px 12px;
    background-color: #f9f9f9;
    font-weight: bold;
}}

.cot-metrics-table thead tr:nth-child(2) th {{
    text-align: center !important;
    padding: 8px 16px; /* Increased left-right padding */
}}

.cot-metrics-table td {{
    text-align: center;
    padding: 6px 10px;
}}
</style>
<div class="cot-metrics-table" style="max-height: 500px; overflow: auto;">
{df.to_html(index=False, escape=False, border=0)}
</div>
"""

display(HTML(custom_table_html))


Unnamed: 0_level_0,Mistral,Mistral,Mistral,LLaMA,LLaMA,LLaMA,Qwen,Qwen,Qwen
Dataset,Redundancy,Weak Relevance,Strong Relevance,Redundancy,Weak Relevance,Strong Relevance,Redundancy,Weak Relevance,Strong Relevance
TruthfulQA,0.1989,1.0,0.3857,0.1434,1.0,0.2542,0.708,1.0,0.843
StrategyQA,0.2251,1.0,0.4366,0.1419,1.0,0.2728,0.7478,1.0,0.8816
MedQA,0.1876,1.0,0.3721,0.2485,1.0,0.2233,0.7859,1.0,0.8852
CommonSenseQA,0.1956,1.0,0.3846,0.1536,1.0,0.2749,0.7461,1.0,0.8624


In [30]:
# Persist the summary table next to the other results.
output_dir = os.path.join(project_root, "results", "cot_method")
# The folder is not created anywhere else in this notebook; without this the
# writes below fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

output_csv = os.path.join(output_dir, "cot_metrics_summary.csv")
output_html = os.path.join(output_dir, "cot_metrics_summary.html")

# Save as CSV
df.to_csv(output_csv, index=False)
print(f"[✓] CSV saved to: {output_csv}")

# Save as HTML
df.to_html(output_html, index=False)
print(f"[✓] HTML saved to: {output_html}")


[✓] CSV saved to: c:\Users\rishi\Desktop\ImperialMSc\Individual_projec\Code\ExplainabilityInLLMs-MScThesis\results\cot_method\cot_metrics_summary.csv
[✓] HTML saved to: c:\Users\rishi\Desktop\ImperialMSc\Individual_projec\Code\ExplainabilityInLLMs-MScThesis\results\cot_method\cot_metrics_summary.html
