In [4]:
import os
import json
import pandas as pd
from typing import List

In [5]:
def distinct_n(sentences: List[str], n: int = 1) -> float:
    """
    Measure lexical diversity of generated text as the Distinct-N ratio.

    Every sentence is stripped, split on whitespace, and broken into
    consecutive n-grams; n-grams never span two sentences. The n-grams of
    all sentences are pooled before the ratio is taken.

    Args:
        sentences (List[str]): Generated text strings to score.
        n (int): Size of the n-gram window (1 -> Distinct-1, 2 -> Distinct-2).

    Returns:
        float: Unique n-grams divided by total n-grams, or 0.0 when no
            n-gram could be formed at all.
    """
    seen = set()
    count = 0

    for text in sentences:
        words = text.strip().split()
        # Zipping n progressively shifted views of the word list yields the
        # sliding n-gram windows; zip stops at the shortest view, so a
        # sentence with fewer than n words contributes nothing.
        shifted_views = [words[offset:] for offset in range(n)]
        for gram in zip(*shifted_views):
            seen.add(gram)
            count += 1

    return len(seen) / count if count else 0.0

In [6]:
def calculate_token_diversity(directory_path: str, column_name: str, n: int = 2) -> None:
    """
    Score every CSV file in a directory with Distinct-N and save the scores as JSON.

    For each file directly inside ``directory_path`` whose name ends in ``.csv``,
    the non-null values of ``column_name`` are converted to strings and passed to
    ``distinct_n``. A file that cannot be read, or that lacks the column, gets an
    ``{"error": ...}`` entry instead of a score, so one bad file does not abort
    the whole run.

    Args:
        directory_path (str): Directory holding the CSV files. The results file
            ``distinct_n_results.json`` is written into this same directory.
        column_name (str): Name of the column that contains the generated text.
        n (int): The n-gram size forwarded to ``distinct_n``.

    Returns:
        None: Results are written to disk and the output path is printed.
    """
    results = {}

    # os.listdir() returns entries in arbitrary order; sorting keeps the key
    # order of the saved JSON identical across runs and machines.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(directory_path, filename)
        try:
            df = pd.read_csv(file_path)
            if column_name in df.columns:
                sentences = df[column_name].dropna().astype(str).tolist()
                score = distinct_n(sentences, n=n)
                results[filename] = {"distinct_n": score}
            else:
                results[filename] = {"error": f"Column '{column_name}' not found"}
        except Exception as e:
            # Deliberate best-effort: record the failure against the file and
            # carry on with the remaining files.
            results[filename] = {"error": str(e)}

    # Save all results to a JSON file
    output_path = os.path.join(directory_path, "distinct_n_results.json")
    with open(output_path, "w") as f:
        json.dump(results, f, indent=4)

    print(f"Results saved to {output_path}")


In [7]:
# Score the `gen_answer` column of every CSV in the results directory with Distinct-2;
# the per-file scores are written to distinct_n_results.json inside that same directory.
# NOTE(review): absolute, machine-specific path — this cell will not run elsewhere as-is.
calculate_token_diversity("/home/praveen/theoden/emnlp25/results/datasets", column_name="gen_answer", n=2)

Results saved to /home/praveen/theoden/emnlp25/results/datasets/distinct_n_results.json


In [8]:
# Load title_df.csv into a DataFrame.
# NOTE(review): `tite` looks like a typo for `title`; the following cells reference it
# under this name, so it is left unchanged. The path is absolute and machine-specific.
tite = pd.read_csv('/home/praveen/theoden/emnlp25/title_df.csv')

In [9]:
# Label rows whose `type_retain` is missing as 'general' (overwrites the column in place).
tite['type_retain'] = tite['type_retain'].fillna('general')

In [10]:
# Share of rows in each `type_retain` category; normalize=True returns proportions
# instead of raw counts.
tite['type_retain'].value_counts(normalize=True)

type_retain
domain     0.635203
entity     0.202110
general    0.162687
Name: proportion, dtype: float64

In [11]:
# Load full_test_set.csv into a DataFrame.
# NOTE(review): absolute, machine-specific path.
test = pd.read_csv('/home/praveen/theoden/emnlp25/full_test_set.csv')

In [12]:
# Preview the first rows of the test set to check its columns.
test.head()

Unnamed: 0,title,question,answer,type
0,Abraham Lincoln,Which new political party did Abraham Lincoln ...,Republican Party,general
1,Abraham Lincoln,"Which act angered Abraham Lincoln in 1854, pro...",Kansas–Nebraska Act,general
2,Abraham Lincoln,In which state was Abraham Lincoln born?,Kentucky,general
3,Abraham Lincoln,What profession did Abraham Lincoln have befor...,Lawyer,general
4,Abraham Lincoln,When was Abraham Lincoln born?,"February 12, 1809",general


In [None]:
# Build one "<algo>_<experiment>_per_title.json" file per experiment, holding a
# per-title FE score (from the "forget" CSV) and MU_T score (from the "test" CSV).
# The paths of all files written are collected in `final_output_paths`.
#
# NOTE(review): `corrected_dirs`, `parse_filename` and `rowwise_hmean` are not defined
# in the cells shown here; they must already exist in the kernel before this cell runs.
final_output_paths = []

# Use corrected nested paths for batch processing
for base_dir in corrected_dirs:
    # os.listdir() returns entries in arbitrary order. Sorting makes the run
    # reproducible: if two files parse to the same (algo, experiment, dtype), the one
    # that wins no longer depends on the filesystem, and `final_output_paths` comes out
    # in a stable order.
    files = sorted(os.listdir(base_dir))
    grouped = {}

    # Group the files as {(algo, experiment): {dtype: full_path}}.
    for fname in files:
        algo, experiment, dtype = parse_filename(fname)
        if not all([algo, experiment, dtype]):
            # Skip names for which any of the three parts came back falsy.
            continue
        key = (algo, experiment)
        grouped.setdefault(key, {})[dtype] = os.path.join(base_dir, fname)

    for key, paths in grouped.items():
        # Only experiments that have both a "forget" and a "test" file are scored.
        if "forget" in paths and "test" in paths:
            algo, experiment = key
            forget_path = paths["forget"]
            test_path = paths["test"]

            forget_df = pd.read_csv(forget_path)
            test_df = pd.read_csv(test_path)

            # Normalize column names
            forget_df.columns = [col.lower().strip().replace("-", "_") for col in forget_df.columns]
            test_df.columns = [col.lower().strip().replace("-", "_") for col in test_df.columns]

            # Check required columns; experiments missing any of them are skipped silently.
            required_cols = {"title", "probs", "rouge_l", "cos_sim"}
            if not required_cols.issubset(forget_df.columns) or not required_cols.issubset(test_df.columns):
                continue

            # FE computation: the per-sample score is the mean of the three metrics, and
            # a title's FE is one minus the mean sample score of that title.
            forget_df["fe_sample_score"] = forget_df[["probs", "rouge_l", "cos_sim"]].mean(axis=1)
            fe_title = 1 - forget_df.groupby("title")["fe_sample_score"].mean()

            # MU-T computation (filter out 'general')
            if "type" in test_df.columns:
                # .copy() detaches the filtered rows from the original frame, so the
                # column assignment below writes to a real DataFrame instead of a
                # slice (which would raise SettingWithCopyWarning).
                test_df = test_df[test_df["type"].str.lower() != "general"].copy()
            test_df["mut_sample_score"] = test_df.apply(rowwise_hmean, axis=1)
            mut_title = test_df.groupby("title")["mut_sample_score"].mean()

            # Combine: a title found on only one side gets NaN for the other metric.
            # NOTE(review): json.dump writes NaN as the bare token `NaN`, which strict
            # JSON parsers reject — confirm that the consumers of these files accept it.
            title_set = sorted(set(fe_title.index).union(mut_title.index))
            result = {
                title: {
                    "FE": round(fe_title.get(title, float("nan")), 6),
                    "MU_T": round(mut_title.get(title, float("nan")), 6)
                }
                for title in title_set
            }

            # Write to JSON
            # NOTE(review): hard-coded output directory; it must already exist.
            out_filename = f"{algo}_{experiment}_per_title.json"
            out_path = os.path.join("/mnt/data/", out_filename)
            with open(out_path, "w") as f:
                json.dump(result, f, indent=2)
            final_output_paths.append(out_path)
