In [None]:
!pip install textstat

import textstat
import pandas as pd

# Function to compute readability metrics
def calculate_readability_metrics(text):
    return {
        "Flesch-Kincaid Reading Ease": textstat.flesch_reading_ease(text),
        "Flesch-Kincaid Grade Level": textstat.flesch_kincaid_grade(text),
        "Gunning Fog Score": textstat.gunning_fog(text),
        "SMOG Index": textstat.smog_index(text)
    }

Collecting textstat
  Downloading textstat-0.7.5-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Collecting cmudict (from textstat)
  Downloading cmudict-1.0.32-py3-none-any.whl.metadata (3.6 kB)
Downloading textstat-0.7.5-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.3/105.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cmudict-1.0.32-py3-none-any.whl (939 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.4/939.4 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, cmudict, textstat
Successfully installed cmudict-1.0.32 pyphen-0.17.2 textstat-0.7.5


In [None]:
#@title ABC
# Load CSV file
file_path = '/content/dyslexia_friendly_texts_all_20.csv'
df = pd.read_csv(file_path)

# Extract column names (LLM names)
llm_names = df.columns[1:]  # Exclude the first column (original text)

# Prepare results storage
results = []

# Process each row (publication)
for index, row in df.iterrows():
    original_text = row.iloc[0]  # First column: Original unsimplified text

    # Compute metrics for original text
    original_metrics = calculate_readability_metrics(original_text)
    results.append({
        "Publication": index + 1,
        "LLM": "Original",
        **original_metrics
    })

    # Compute metrics for each LLM's simplification
    for llm in llm_names:
        simplified_text = row[llm]
        simplified_metrics = calculate_readability_metrics(simplified_text)

        results.append({
            "Publication": index + 1,
            "LLM": llm,
            **simplified_metrics
        })

# Convert to DataFrame
df_results = pd.DataFrame(results)

print(df_results)
# Display results in Jupyter Notebook
#tools.display_dataframe_to_user(name="Readability Metrics", dataframe=df_results)

    Publication         LLM  Flesch-Kincaid Reading Ease  \
0             1    Original                        50.16   
1             1      GPT-4o                        69.07   
2             1     Llama 3                        58.58   
3             1  Gemini 2.0                        79.26   
4             2    Original                        34.26   
..          ...         ...                          ...   
75           19  Gemini 2.0                        88.02   
76           20    Original                        51.04   
77           20      GPT-4o                        51.04   
78           20     Llama 3                        75.20   
79           20  Gemini 2.0                        77.94   

    Flesch-Kincaid Grade Level  Gunning Fog Score  SMOG Index  
0                         11.5              13.64        13.4  
1                          6.3               9.28        10.5  
2                          8.2               9.29        11.2  
3                      

In [None]:
!pip install -U sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-4.0.2-py3-none-any.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import textstat

# Load CSV
file_path = "/content/dyslexia_friendly_texts_all_20.csv"
df = pd.read_csv(file_path, encoding='latin-1')
df.fillna("", inplace=True)

# Extract LLM names
llm_names = df.columns[1:]

# Load sentence embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# --- Helper Functions ---
def semantic_similarity(text1, text2):
    text1, text2 = str(text1), str(text2)
    if not text1.strip() or not text2.strip(): return 0.0
    emb1, emb2 = model.encode(text1), model.encode(text2)
    if np.all(emb1==0) or np.all(emb2==0): return 0.0
    sim = 1 - cosine(emb1, emb2)
    return sim if not np.isnan(sim) else 1.0 if np.array_equal(emb1, emb2) else 0.0

def compression_penalty(original, simplified):
    orig_len = len(str(original).split())
    simp_len = len(str(simplified).split())
    if orig_len == 0: return 1.0
    return max(0, 1 - (simp_len / orig_len))

def balanced_meaning_preservation(original, simplified):
    sim = semantic_similarity(original, simplified)
    penalty = compression_penalty(original, simplified)
    return sim * (1 - penalty)

# --- Define metric types and weights ---
invert_cols = ["Flesch-Kincaid Grade Level", "Gunning Fog Score", "Coleman-Liau Index"]
retain_cols = ["Flesch-Kincaid Reading Ease", "BMPS"]
all_metrics = retain_cols + invert_cols

# Custom weights - adjust as needed
#weights_dict = {
#    "Flesch-Kincaid Reading Ease": 0.20,
#    "BMPS": 0.40,
#    "Flesch-Kincaid Grade Level": 0.15,
#    "Gunning Fog Score": 0.15,
#    "Coleman-Liau Index": 0.10
#}
#weights_array = np.array([weights_dict[col] for col in all_metrics])

# --- Define metric types and weights ---
invert_cols = ["Flesch-Kincaid Grade Level", "Gunning Fog Score", "Coleman-Liau Index"]
retain_cols = ["Flesch-Kincaid Reading Ease", "BMPS"]
all_metrics = retain_cols + invert_cols

# Equal weights for all metrics
num_metrics = len(all_metrics)
weights_array = np.ones(num_metrics) / num_metrics  # Equal weight for each metric (1/n)

print(f"Using equal weights ({1/num_metrics:.2f}) for all {num_metrics} metrics")

# --- Collect raw metrics ---
print("Collecting raw metrics...")
records = []

for index, row in df.iterrows():
    original = str(row["Article"])
    if not original.strip():
        print(f"Warning: Skipping row {index+1} due to empty 'Article'.")
        continue

    for llm in llm_names:
        simplified = str(row[llm])
        if not simplified.strip():
            metrics = {
                "Flesch-Kincaid Reading Ease": 0,
                "Flesch-Kincaid Grade Level": 20,
                "Gunning Fog Score": 20,
                "Coleman-Liau Index": 20,
                "BMPS": 0.0
            }
        else:
            metrics = {
                "Flesch-Kincaid Reading Ease": textstat.flesch_reading_ease(simplified),
                "Flesch-Kincaid Grade Level": textstat.flesch_kincaid_grade(simplified),
                "Gunning Fog Score": textstat.gunning_fog(simplified),
                "Coleman-Liau Index": textstat.coleman_liau_index(simplified),
                "BMPS": balanced_meaning_preservation(original, simplified)
            }
        records.append({
            "Article ID": index + 1,
            "LLM": llm,
            **metrics
        })

df_raw = pd.DataFrame(records)
print(f"Collected metrics for {len(df_raw)} Article x LLM combinations.")

# --- APPROACH 1: Global normalization followed by aggregation and TOPSIS ---
print("\n=== APPROACH 1: Global Normalization → Aggregation → TOPSIS ===")

# Create a copy for global normalization
df_global = df_raw.copy()

# Invert metrics where higher is worse
for col in invert_cols:
    max_val = df_global[col].max()
    min_val = df_global[col].min()
    df_global[col] = max_val + min_val - df_global[col]

# Normalize all metrics globally
for col in all_metrics:
    min_val = df_global[col].min()
    max_val = df_global[col].max()
    if max_val > min_val:
        df_global[col] = (df_global[col] - min_val) / (max_val - min_val)
    else:
        df_global[col] = 0.5  # Handle constant values

# Aggregate metrics by LLM
df_agg_global = df_global.groupby("LLM")[all_metrics].mean()

# Apply TOPSIS on aggregated metrics
decision_matrix = df_agg_global[all_metrics].to_numpy()
ideal_best = np.max(decision_matrix, axis=0)
ideal_worst = np.min(decision_matrix, axis=0)

D_plus = np.sqrt(np.sum(weights_array * (decision_matrix - ideal_best)**2, axis=1))
D_minus = np.sqrt(np.sum(weights_array * (decision_matrix - ideal_worst)**2, axis=1))

topsis_global = D_minus / (D_plus + D_minus)
df_agg_global["TOPSIS_Global"] = topsis_global
df_agg_global["Global_Rank"] = df_agg_global["TOPSIS_Global"].rank(ascending=False, method="min")

# --- APPROACH 2: Per-article normalization and TOPSIS ---
print("\n=== APPROACH 2: Per-Article Normalization → Per-Article TOPSIS → Aggregation ===")

article_scores = []

for article_id, group in df_raw.groupby("Article ID"):
    df_article = group.copy()

    # Invert where higher is worse for this article
    for col in invert_cols:
        max_val = df_article[col].max()
        min_val = df_article[col].min()
        df_article[col] = max_val + min_val - df_article[col]

    # Normalize within this article
    for col in all_metrics:
        min_val = df_article[col].min()
        max_val = df_article[col].max()
        if max_val > min_val:
            df_article[col] = (df_article[col] - min_val) / (max_val - min_val)
        else:
            df_article[col] = 0.5  # Handle constant values

    # TOPSIS for this article
    M = df_article[all_metrics].to_numpy()
    ideal_best = np.max(M, axis=0)
    ideal_worst = np.min(M, axis=0)

    D_pos = np.sqrt(np.sum(weights_array * (M - ideal_best)**2, axis=1))
    D_neg = np.sqrt(np.sum(weights_array * (M - ideal_worst)**2, axis=1))

    topsis_scores = D_neg / (D_pos + D_neg)

    df_article["TOPSIS_Local"] = topsis_scores
    df_article["Local_Rank"] = df_article["TOPSIS_Local"].rank(ascending=False, method="min")

    article_scores.append(df_article)

df_all_local = pd.concat(article_scores)

# Aggregate local TOPSIS scores
df_agg_local = df_all_local.groupby("LLM").agg({
    "TOPSIS_Local": ["mean", "std", "min", "max"],
    "Local_Rank": ["mean", "std", "min", "max"]
})

# Flatten column names
df_agg_local.columns = ['_'.join(col).strip() for col in df_agg_local.columns.values]
df_agg_local["Local_Rank_Agg"] = df_agg_local["TOPSIS_Local_mean"].rank(ascending=False, method="min")

# --- COMBINED APPROACH: Hybrid scoring and comprehensive ranking ---
print("\n=== COMBINED APPROACH: Hybrid Evaluation ===")

# Join the results from both approaches
df_combined = pd.merge(
    df_agg_global[["TOPSIS_Global", "Global_Rank"]],
    df_agg_local[["TOPSIS_Local_mean", "TOPSIS_Local_std", "Local_Rank_Agg"]],
    left_index=True, right_index=True
)

# Calculate a hybrid score (weighted combination of global and local TOPSIS)
# Adjust weights (0.5, 0.5) as needed to prioritize global vs local metrics
global_weight = 0.5
local_weight = 0.5

# Normalize the TOPSIS scores to [0,1] before combining
g_min, g_max = df_combined["TOPSIS_Global"].min(), df_combined["TOPSIS_Global"].max()
l_min, l_max = df_combined["TOPSIS_Local_mean"].min(), df_combined["TOPSIS_Local_mean"].max()

if g_max > g_min:
    df_combined["TOPSIS_Global_Norm"] = (df_combined["TOPSIS_Global"] - g_min) / (g_max - g_min)
else:
    df_combined["TOPSIS_Global_Norm"] = 0.5

if l_max > l_min:
    df_combined["TOPSIS_Local_Norm"] = (df_combined["TOPSIS_Local_mean"] - l_min) / (l_max - l_min)
else:
    df_combined["TOPSIS_Local_Norm"] = 0.5

# Calculate hybrid score and rank
df_combined["Hybrid_Score"] = (
    global_weight * df_combined["TOPSIS_Global_Norm"] +
    local_weight * df_combined["TOPSIS_Local_Norm"]
)
df_combined["Hybrid_Rank"] = df_combined["Hybrid_Score"].rank(ascending=False, method="min")

# Calculate a consistency-adjusted score
# Models with high consistency (low std dev) get a bonus
consistency_weight = 0.2  # Adjust as needed (0.0 to ignore consistency)
max_std = df_combined["TOPSIS_Local_std"].max()
min_std = df_combined["TOPSIS_Local_std"].min()

if max_std > min_std:
    consistency_factor = 1 - ((df_combined["TOPSIS_Local_std"] - min_std) / (max_std - min_std))
else:
    consistency_factor = np.ones(len(df_combined))

df_combined["Consistency_Adjusted_Score"] = (
    df_combined["Hybrid_Score"] * (1 + consistency_weight * consistency_factor)
)
df_combined["Final_Rank"] = df_combined["Consistency_Adjusted_Score"].rank(ascending=False, method="min")

# Add raw metrics averages for reference
df_raw_means = df_raw.groupby("LLM")[all_metrics].mean()
df_combined = pd.merge(df_combined, df_raw_means, left_index=True, right_index=True)

# Sort and display the final results
df_final = df_combined.sort_values("Final_Rank")
print("\n=== FINAL COMBINED RANKINGS ===")
print(df_final[["Final_Rank", "Consistency_Adjusted_Score", "Global_Rank", "Local_Rank_Agg",
                "TOPSIS_Global", "TOPSIS_Local_mean", "TOPSIS_Local_std"]])

# Export to CSV if needed
# df_final.to_csv("llm_combined_evaluation.csv")

# Return dataframes for further analysis
print("\nComplete evaluation finished. Results available in df_final.")

Using equal weights (0.20) for all 5 metrics
Collecting raw metrics...
Collected metrics for 60 Article x LLM combinations.

=== APPROACH 1: Global Normalization → Aggregation → TOPSIS ===

=== APPROACH 2: Per-Article Normalization → Per-Article TOPSIS → Aggregation ===

=== COMBINED APPROACH: Hybrid Evaluation ===

=== FINAL COMBINED RANKINGS ===
            Final_Rank  Consistency_Adjusted_Score  Global_Rank  \
LLM                                                               
Gemini 2.0         1.0                    1.200000          1.0   
GPT-4o             2.0                    0.215746          2.0   
Llama 3            3.0                    0.000000          3.0   

            Local_Rank_Agg  TOPSIS_Global  TOPSIS_Local_mean  TOPSIS_Local_std  
LLM                                                                             
Gemini 2.0             1.0       0.876993           0.736897          0.193925  
GPT-4o                 2.0       0.217510           0.412009          0

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import textstat

# Load CSV
file_path = "/content/dyslexia_friendly_texts_all_20.csv"
df = pd.read_csv(file_path, encoding='latin-1')
df.fillna("", inplace=True)

# Extract LLM names
llm_names = df.columns[1:]

# Load model
model = SentenceTransformer('all-MiniLM-L6-v2')

# --- Helper Functions ---
def semantic_similarity(text1, text2):
    text1, text2 = str(text1), str(text2)
    if not text1.strip() or not text2.strip(): return 0.0
    emb1, emb2 = model.encode(text1), model.encode(text2)
    if np.all(emb1 == 0) or np.all(emb2 == 0): return 0.0
    sim = 1 - cosine(emb1, emb2)
    return sim if not np.isnan(sim) else 1.0 if np.array_equal(emb1, emb2) else 0.0

def compression_penalty(original, simplified):
    orig_len = len(str(original).split())
    simp_len = len(str(simplified).split())
    if orig_len == 0: return 1.0
    return max(0, 1 - (simp_len / orig_len))

def balanced_meaning_preservation(original, simplified):
    sim = semantic_similarity(original, simplified)
    penalty = compression_penalty(original, simplified)
    return sim * (1 - penalty)

# --- Collect Metrics Per Article x LLM ---
records = []

for index, row in df.iterrows():
    original = str(row["Article"])
    for llm in llm_names:
        simplified = str(row[llm])
        if not simplified.strip():
            metrics = {
                "Flesch-Kincaid Reading Ease": 0,
                "Flesch-Kincaid Grade Level": 20,
                "Gunning Fog Score": 20,
                "Coleman-Liau Index": 20,
                "BMPS": 0.0
            }
        else:
            metrics = {
                "Flesch-Kincaid Reading Ease": textstat.flesch_reading_ease(simplified),
                "Flesch-Kincaid Grade Level": textstat.flesch_kincaid_grade(simplified),
                "Gunning Fog Score": textstat.gunning_fog(simplified),
                "Coleman-Liau Index": textstat.coleman_liau_index(simplified),
                "BMPS": balanced_meaning_preservation(original, simplified)
            }
        records.append({
            "Article ID": index + 1,
            "LLM": llm,
            **metrics
        })

df_raw = pd.DataFrame(records)

# --- Normalize & Apply TOPSIS Per Article ---
article_scores = []

invert_cols = ["Flesch-Kincaid Grade Level", "Gunning Fog Score", "Coleman-Liau Index"]
all_metrics = ["Flesch-Kincaid Reading Ease", "BMPS"] + invert_cols

for article_id, group in df_raw.groupby("Article ID"):
    df_article = group.copy()

    # Invert where higher is worse
    for col in invert_cols:
        max_val = df_article[col].max()
        min_val = df_article[col].min()
        df_article[col] = max_val + min_val - df_article[col]

    # Normalize
    for col in all_metrics:
        min_val = df_article[col].min()
        max_val = df_article[col].max()
        if max_val != min_val:
            df_article[col] = (df_article[col] - min_val) / (max_val - min_val)
        else:
            df_article[col] = 0.5

    # TOPSIS
    M = df_article[all_metrics].to_numpy()
    weights = np.ones(len(all_metrics)) / len(all_metrics)
    ideal_best = np.max(M, axis=0)
    ideal_worst = np.min(M, axis=0)
    D_pos = np.sqrt(np.sum(weights * (M - ideal_best)**2, axis=1))
    D_neg = np.sqrt(np.sum(weights * (M - ideal_worst)**2, axis=1))
    topsis_scores = D_neg / (D_pos + D_neg)

    df_article["TOPSIS"] = topsis_scores
    article_scores.append(df_article[["Article ID", "LLM", "TOPSIS"] + all_metrics])

df_all = pd.concat(article_scores)

# --- Aggregate Across Articles ---
df_summary = df_all.groupby("LLM").agg({
    "TOPSIS": ["mean", "std"],
    **{m: "mean" for m in all_metrics}
})

df_summary.columns = ['_'.join(col).strip() for col in df_summary.columns.values]
df_summary["TOPSIS Rank"] = df_summary["TOPSIS_mean"].rank(ascending=False, method="min")

# Show final summary
print(df_summary)

            TOPSIS_mean  TOPSIS_std  Flesch-Kincaid Reading Ease_mean  \
LLM                                                                     
GPT-4o         0.412009    0.267576                          0.244011   
Gemini 2.0     0.736897    0.193925                          0.895808   
Llama 3        0.328071    0.221581                          0.325690   

            BMPS_mean  Flesch-Kincaid Grade Level_mean  \
LLM                                                      
GPT-4o       0.726039                         0.256018   
Gemini 2.0   0.443229                         0.888235   
Llama 3      0.296927                         0.310223   

            Gunning Fog Score_mean  Coleman-Liau Index_mean  TOPSIS Rank  
LLM                                                                       
GPT-4o                    0.269753                 0.351486          2.0  
Gemini 2.0                0.977404                 0.802065          1.0  
Llama 3                   0.222095         

In [None]:
import pandas as pd
import numpy as np
#import ace_tools as tools
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import textstat # Make sure textstat is imported

# Load CSV
file_path = "/content/dyslexia_friendly_texts_all_20.csv" # Make sure this path is correct
df = pd.read_csv(file_path, encoding='latin-1')

# Handle potential missing values by filling with an empty string
df.fillna("", inplace=True)

# Extract LLM names (all columns except "Article")
llm_names = df.columns[1:]

# Load sentence embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

# --- Helper Functions (Semantic Similarity, Compression, BMPS) ---
# (Using the robust versions from the previous iteration)
def semantic_similarity(text1, text2):
    text1 = str(text1)
    text2 = str(text2)
    if not text1 or not text2: return 0.0
    emb1 = model.encode(text1)
    emb2 = model.encode(text2)
    if np.all(emb1==0) or np.all(emb2==0): return 0.0
    similarity = 1 - cosine(emb1, emb2)
    # Handle potential NaN from cosine if vectors are identical or zero
    return similarity if not np.isnan(similarity) else (1.0 if np.array_equal(emb1, emb2) else 0.0)


def compression_penalty(original, simplified):
    original = str(original)
    simplified = str(simplified)
    orig_len = len(original.split())
    simp_len = len(simplified.split())
    if orig_len == 0: return 1.0
    # Avoid division by zero if original length is 0 but simplified is not
    if orig_len == 0 and simp_len > 0: return 0.0 # Or handle as appropriate
    return max(0, 1 - (simp_len / orig_len))

def balanced_meaning_preservation(original, simplified):
    original = str(original)
    simplified = str(simplified)
    if not original or not simplified: return 0.0
    sim_score = semantic_similarity(original, simplified)
    comp_penalty = compression_penalty(original, simplified)
    return sim_score * (1 - comp_penalty)

# --- Compute Scores ---
results = []
print("Processing texts...")
for index, row in df.iterrows():
    original_text = str(row["Article"])
    if not original_text.strip():
        print(f"Warning: Skipping row {index+1} due to empty 'Article'.")
        continue

    for llm in llm_names:
        simplified_text = str(row[llm])
        # print(f"Processing Article {index+1}, LLM: {llm}") # Optional: for detailed progress

        if not simplified_text.strip():
             print(f"Warning: Empty simplified text for Article {index+1}, LLM: {llm}. Assigning default scores.")
             readability_metrics = {
                 "Flesch-Kincaid Reading Ease": 0,
                 "Flesch-Kincaid Grade Level": 20,
                 "Gunning Fog Score": 20,
                 "Coleman-Liau Index": 20 # Using Coleman-Liau
             }
             bmps_score = 0.0
        else:
            readability_metrics = {
                "Flesch-Kincaid Reading Ease": textstat.flesch_reading_ease(simplified_text),
                "Flesch-Kincaid Grade Level": textstat.flesch_kincaid_grade(simplified_text),
                "Gunning Fog Score": textstat.gunning_fog(simplified_text),
                "Coleman-Liau Index": textstat.coleman_liau_index(simplified_text) # Using Coleman-Liau
            }
            bmps_score = balanced_meaning_preservation(original_text, simplified_text)

        results.append({
            "Article ID": index + 1, "LLM": llm, **readability_metrics, "BMPS": bmps_score
        })
print("Finished processing texts.")

# --- Process Results ---
if not results:
    print("Error: No results were generated.")
else:
    df_results = pd.DataFrame(results)

    # Define metric types
    higher_is_better = ["Flesch-Kincaid Reading Ease", "BMPS"]
    higher_is_worse = ["Flesch-Kincaid Grade Level", "Gunning Fog Score", "Coleman-Liau Index"] # Using Coleman-Liau

    # Invert metrics
    print("\nInverting 'higher is worse' metrics...")
    df_inverted = df_results.copy()
    inverted_col_names = {}
    for col in higher_is_worse:
        if col in df_inverted.columns and df_inverted[col].notna().any():
            max_val = df_inverted[col].max()
            min_val = df_inverted[col].min()
            inverted_name = f"{col} (Inverted)"
            if max_val != min_val:
                 df_inverted[col] = max_val + min_val - df_inverted[col]
                 inverted_col_names[col] = inverted_name
                 df_inverted.rename(columns={col: inverted_name}, inplace=True)
            else:
                 inverted_name = f"{col} (Inverted - Constant)"
                 inverted_col_names[col] = inverted_name
                 df_inverted.rename(columns={col: inverted_name}, inplace=True)
                 print(f"Warning: Column '{col}' constant, assigned name '{inverted_name}'.")

        else:
             print(f"Warning: Cannot invert column '{col}'. Skipping.")

    # Update metric list for normalization
    all_metrics_post_inversion = higher_is_better + list(inverted_col_names.values())

    # Normalize metrics (Min-Max)
    print(f"\nNormalizing metrics: {all_metrics_post_inversion}...")
    df_normalized = df_inverted.copy()
    def normalize(df, columns):
        for col in columns:
            if col in df.columns and df[col].notna().any():
                min_val = df[col].min()
                max_val = df[col].max()
                if max_val > min_val:
                    df[col] = (df[col] - min_val) / (max_val - min_val)
                else:
                    df[col] = 0.5 # Assign 0.5 if constant
            else:
                 print(f"Warning: Column '{col}' not found or all NaN during normalization. Skipping.")
        return df
    df_normalized = normalize(df_normalized, all_metrics_post_inversion)

    # Aggregate scores (Mean of Normalized Scores)
    print("\nAggregating scores...")
    # Ensure we only try to aggregate columns that actually exist after inversion/normalization
    numeric_cols_for_agg = [col for col in all_metrics_post_inversion if col in df_normalized.columns and pd.api.types.is_numeric_dtype(df_normalized[col])]

    if not numeric_cols_for_agg:
         print("Error: No valid numeric columns found for aggregation.")
    else:
        df_agg = df_normalized.groupby("LLM")[numeric_cols_for_agg].mean()
        print("\n--- Aggregated Normalized Scores (Input to TOPSIS) ---")
        print(df_agg)

        # <<< --- START: TOPSIS Calculation (replacing simple averaging) --- >>>
        print("\n--- Applying TOPSIS based on aggregated normalized scores ---")

        # Define the list of metric columns to use in TOPSIS
        # This should match the columns used in aggregation
        metrics_for_topsis = numeric_cols_for_agg

        # --- Define Weights ---
        # Option 1: Equal weights (as per user's last snippet)
        num_criteria = len(metrics_for_topsis)
        weights_array = np.array([1/num_criteria] * num_criteria)
        print("Using EQUAL weights for TOPSIS.")

        # Option 2: Custom weights (Recommended - Adjust these values)
        # Ensure weights correspond to the order in 'metrics_for_topsis'
        # Example: {'FK Ease': 0.2, 'BMPS': 0.4, 'FK Grade (Inv)': 0.15, 'Fog (Inv)': 0.15, 'Coleman (Inv)': 0.1}
        # Create the array carefully based on the actual order in metrics_for_topsis
        #weight_dict = {
       #     'Flesch-Kincaid Reading Ease': 0.20,
       #     'BMPS': 0.40,
       #     'Flesch-Kincaid Grade Level (Inverted)': 0.15, # Adjust name if constant col name used
       #     'Gunning Fog Score (Inverted)': 0.15,         # Adjust name if constant col name used
       #     'Coleman-Liau Index (Inverted)': 0.10          # Adjust name if constant col name used
       # }
         # Ensure all metrics for TOPSIS have weights and create array in correct order
      #  try:
      #      weights_array = np.array([weight_dict[col] for col in metrics_for_topsis])
      #      # Normalize weights if they don't sum to 1 (optional but good practice)
      #      if not np.isclose(weights_array.sum(), 1.0):
      #          print("Warning: Custom weights do not sum to 1. Normalizing.")
      #          weights_array = weights_array / weights_array.sum()
      #      print(f"Using CUSTOM weights for TOPSIS: {list(zip(metrics_for_topsis, weights_array))}")
      #  except KeyError as e:
      #      print(f"Error: Missing weight for metric: {e}. Check weight_dict keys and metrics_for_topsis list.")
      #      weights_array = None # Prevent further execution if weights are wrong


        if weights_array is not None:
            # Extract the decision matrix
            # Ensure df_agg only contains the columns defined in metrics_for_topsis for this step
            decision_matrix = df_agg[metrics_for_topsis].to_numpy()

            # Define ideal best and worst (using max/min on the aggregated normalized data)
            ideal_best = np.max(decision_matrix, axis=0)
            ideal_worst = np.min(decision_matrix, axis=0)

            # Compute weighted Euclidean distances
            D_plus = np.sqrt(np.sum(weights_array * (decision_matrix - ideal_best)**2, axis=1))
            D_minus = np.sqrt(np.sum(weights_array * (decision_matrix - ideal_worst)**2, axis=1))

            # Compute TOPSIS score
            sum_D = D_plus + D_minus
            topsis_score = np.divide(D_minus, sum_D, out=np.zeros_like(D_minus, dtype=float), where=sum_D!=0)

            # Add results back to df_agg
            df_agg['TOPSIS Score'] = topsis_score
            df_agg['TOPSIS Rank'] = df_agg['TOPSIS Score'].rank(ascending=False, method="min")

            # Sort by TOPSIS Rank
            df_ranked_topsis = df_agg.sort_values(by="TOPSIS Rank")

            # Display Final Ranked Results
            print("\n--- Final LLM Rankings (Based on TOPSIS) ---")
            # tools.display_dataframe_to_user(name="LLM Rankings (TOPSIS - Coleman-Liau)", dataframe=df_ranked_topsis)
            print(df_ranked_topsis)
        else:
            print("Could not perform TOPSIS due to weight definition error.")

        # <<< --- END: TOPSIS Calculation --- >>>

Processing texts...
Finished processing texts.

Inverting 'higher is worse' metrics...

Normalizing metrics: ['Flesch-Kincaid Reading Ease', 'BMPS', 'Flesch-Kincaid Grade Level (Inverted)', 'Gunning Fog Score (Inverted)', 'Coleman-Liau Index (Inverted)']...

Aggregating scores...

--- Aggregated Normalized Scores (Input to TOPSIS) ---
            Flesch-Kincaid Reading Ease      BMPS  \
LLM                                                 
GPT-4o                         0.436820  0.544060   
Gemini 2.0                     0.721196  0.467568   
Llama 3                        0.449057  0.423807   

            Flesch-Kincaid Grade Level (Inverted)  \
LLM                                                 
GPT-4o                                   0.398684   
Gemini 2.0                               0.664474   
Llama 3                                  0.405263   

            Gunning Fog Score (Inverted)  Coleman-Liau Index (Inverted)  
LLM                                                      

In [None]:
import pandas as pd
import numpy as np
#import ace_tools as tools
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import textstat # Make sure textstat is imported

# Load CSV
# Using the path provided by the user previously
file_path = "/content/dyslexia_prompt.csv"
df = pd.read_csv(file_path, encoding='latin-1')

# Handle potential missing values by filling with an empty string
df.fillna("", inplace=True)

# Extract LLM names (all columns except "Article")
llm_names = df.columns[1:]  # e.g., "GPT-4o", "Llama 3", "Gemini"

# Load sentence embedding model for BMPS calculation
model = SentenceTransformer('all-MiniLM-L6-v2')

# Semantic similarity function
def semantic_similarity(text1, text2):
    text1 = str(text1)
    text2 = str(text2)
    if not text1 or not text2: return 0.0
    emb1 = model.encode(text1)
    emb2 = model.encode(text2)
    if np.all(emb1==0) or np.all(emb2==0): return 0.0
    similarity = 1 - cosine(emb1, emb2)
    return similarity if not np.isnan(similarity) else 1.0

# Compression penalty function
def compression_penalty(original, simplified):
    original = str(original)
    simplified = str(simplified)
    orig_len = len(original.split())
    simp_len = len(simplified.split())
    if orig_len == 0: return 1.0
    return max(0, 1 - (simp_len / orig_len))

# Balanced Meaning Preservation Score (BMPS)
def balanced_meaning_preservation(original, simplified):
    original = str(original)
    simplified = str(simplified)
    if not original or not simplified: return 0.0
    sim_score = semantic_similarity(original, simplified)
    comp_penalty = compression_penalty(original, simplified)
    return sim_score * (1 - comp_penalty)

# Compute readability and BMPS scores for each LLM
results = []

# --- Process Texts ---
print("Processing texts...")
for index, row in df.iterrows():
    original_text = str(row["Article"])

    if not original_text.strip():
        print(f"Warning: Skipping row {index+1} due to empty 'Article'.")
        continue

    for llm in llm_names:
        simplified_text = str(row[llm])
        print(f"Processing Article {index+1}, LLM: {llm}")

        if not simplified_text.strip():
             print(f"Warning: Empty simplified text for Article {index+1}, LLM: {llm}. Assigning default low/high scores.")
             readability_metrics = {
                 "Flesch-Kincaid Reading Ease": 0, # Low ease
                 "Flesch-Kincaid Grade Level": 20, # High grade level
                 "Gunning Fog Score": 20, # High fog
                 # Assign default high score for Coleman-Liau
                 "Coleman-Liau Index": 20
             }
             bmps_score = 0.0
        else:
            # Compute readability metrics
            readability_metrics = {
                "Flesch-Kincaid Reading Ease": textstat.flesch_reading_ease(simplified_text),
                "Flesch-Kincaid Grade Level": textstat.flesch_kincaid_grade(simplified_text),
                "Gunning Fog Score": textstat.gunning_fog(simplified_text),
                # --- Changed ARI to Coleman-Liau Index ---
                "Coleman-Liau Index": textstat.coleman_liau_index(simplified_text)
            }

            # Compute BMPS
            bmps_score = balanced_meaning_preservation(original_text, simplified_text)

        # Store results
        results.append({
            "Article ID": index + 1,
            "LLM": llm,
            **readability_metrics,
            "BMPS": bmps_score
        })

print("Finished processing texts.")

# --- Process Results ---
if not results:
    print("Error: No results were generated. Check input data and processing steps.")
else:
    # Convert results to DataFrame
    df_results = pd.DataFrame(results)
    print("\n--- Raw Scores DataFrame ---")
    print(df_results.head())

    # Define metric types
    higher_is_better = ["Flesch-Kincaid Reading Ease", "BMPS"]
    # --- Updated higher_is_worse list to include Coleman-Liau ---
    higher_is_worse = ["Flesch-Kincaid Grade Level", "Gunning Fog Score", "Coleman-Liau Index"]

    # --- Invert metrics where higher score means worse performance ---
    print("\nInverting 'higher is worse' metrics...")
    df_inverted = df_results.copy()
    inverted_col_names = {} # To track original -> inverted names
    for col in higher_is_worse:
        if col in df_inverted.columns and df_inverted[col].notna().any():
            max_val = df_inverted[col].max()
            min_val = df_inverted[col].min()
            inverted_name = f"{col} (Inverted)"
            if max_val != min_val:
                 df_inverted[col] = max_val + min_val - df_inverted[col]
                 # Track rename for later list update
                 inverted_col_names[col] = inverted_name
                 df_inverted.rename(columns={col: inverted_name}, inplace=True)
            else:
                 print(f"Warning: Cannot invert column '{col}' as all values are the same ({min_val}). Keeping original values.")
                 inverted_name = f"{col} (Inverted - Constant)"
                 # Track rename for later list update
                 inverted_col_names[col] = inverted_name
                 df_inverted.rename(columns={col: inverted_name}, inplace=True)
        else:
             print(f"Warning: Column '{col}' not found or contains only NaNs. Skipping inversion.")

    print("\n--- Scores DataFrame After Inversion ---")
    print(df_inverted.head())

    # Update metric list for normalization using the new inverted names
    all_metrics_for_norm = higher_is_better + list(inverted_col_names.values())

    # --- Normalize metrics using Min-Max scaling ---
    print(f"\nNormalizing metrics: {all_metrics_for_norm}...")
    df_normalized = df_inverted.copy()

    def normalize(df, columns):
        for col in columns:
            if col in df.columns and df[col].notna().any():
                min_val = df[col].min()
                max_val = df[col].max()
                if max_val > min_val:
                    df[col] = (df[col] - min_val) / (max_val - min_val)
                else:
                    df[col] = 0.5
                    print(f"Warning: Column '{col}' has constant values after inversion. Normalized to 0.5.")
            else:
                print(f"Warning: Column '{col}' not found or is all NaN during normalization. Skipping.")
        return df

    # Apply normalization
    df_normalized = normalize(df_normalized, all_metrics_for_norm)

    print("\n--- Scores DataFrame After Normalization ---")
    print(df_normalized.head())


    # --- Aggregate scores for each LLM ---
    print("\nAggregating scores...")
    numeric_cols_for_agg = [col for col in all_metrics_for_norm if col in df_normalized.columns and pd.api.types.is_numeric_dtype(df_normalized[col])]

    if not numeric_cols_for_agg:
         print("Error: No valid numeric columns found for aggregation.")
    else:
        df_agg = df_normalized.groupby("LLM")[numeric_cols_for_agg].mean()

        print("\n--- Aggregated Normalized Scores ---")
        print(df_agg)

        # --- Compute a total score and Rank ---
        print("\nCalculating Total Score and Rank...")
        df_agg["Total Score"] = df_agg.mean(axis=1)
        df_agg["Rank"] = df_agg["Total Score"].rank(ascending=False, method="min")

        # Sort by rank
        df_ranked = df_agg.sort_values(by="Rank")

        # --- Display Final Ranked Results ---
        print("\n--- Final LLM Rankings (Based on Averaged Normalized Scores using Coleman-Liau) ---")
        # tools.display_dataframe_to_user(name="LLM Rankings (Coleman-Liau)", dataframe=df_ranked)
        print(df_ranked)

Processing texts...
Processing Article 1, LLM: GPT-4o
Processing Article 1, LLM: Llama 3
Processing Article 1, LLM: Gemini 2.0
Finished processing texts.

--- Raw Scores DataFrame ---
   Article ID         LLM  Flesch-Kincaid Reading Ease  \
0           1      GPT-4o                        50.63   
1           1     Llama 3                        42.07   
2           1  Gemini 2.0                        51.14   

   Flesch-Kincaid Grade Level  Gunning Fog Score  Coleman-Liau Index      BMPS  
0                         9.2              10.36               11.70  0.261720  
1                        10.4              11.41               14.43  0.115761  
2                         9.0              11.54               11.70  0.260710  

Inverting 'higher is worse' metrics...

--- Scores DataFrame After Inversion ---
   Article ID         LLM  Flesch-Kincaid Reading Ease  \
0           1      GPT-4o                        50.63   
1           1     Llama 3                        42.07   
2  

In [None]:
import numpy as np
import pandas as pd

# --- PREREQUISITE ---
# Assume 'df_agg' is the DataFrame resulting from the main script:
# It contains the MEAN of NORMALIZED scores for each LLM.
# Crucially, columns where lower was better (like Grade Level, Fog, Coleman-Liau)
# have already been INVERTED and RENAMED (e.g., "Metric Name (Inverted)").
# All columns in df_agg intended for TOPSIS should now be "higher is better".

# --- Example df_agg structure (replace with your actual df_agg) ---
# This structure MUST match the output of your main processing script
data_agg_example = {
    'Flesch-Kincaid Reading Ease': [0.8, 0.7, 0.9], # Already normalized
    'BMPS': [0.75, 0.85, 0.7], # Already normalized
    'Flesch-Kincaid Grade Level (Inverted)': [0.85, 0.9, 0.8], # Inverted & normalized
    'Gunning Fog Score (Inverted)': [0.9, 0.88, 0.92], # Inverted & normalized
    'Coleman-Liau Index (Inverted)': [0.78, 0.82, 0.75] # Inverted & normalized
}
llm_names_example = ['GPT-4o', 'Llama 3', 'Gemini']
df_agg = pd.DataFrame(data_agg_example, index=llm_names_example)
# ----------------------------------------------------------------

# --- Define the list of metric columns to use in TOPSIS ---
# These names MUST match the columns in your actual df_agg DataFrame
# (after inversion and normalization in the main script)
metrics_for_topsis = [
    "Flesch-Kincaid Reading Ease",
    "BMPS",
    "Flesch-Kincaid Grade Level (Inverted)", # Use the inverted name
    "Gunning Fog Score (Inverted)",        # Use the inverted name
    "Coleman-Liau Index (Inverted)"         # Use the inverted name for Coleman-Liau
    # Add/remove names based on the exact output columns of your main script
]

# Filter df_agg to include only the metrics for TOPSIS
df_topsis_input = df_agg[metrics_for_topsis]

# Use equal weights as per the original snippet's logic.
# You can adjust weights here if desired.
num_criteria = len(metrics_for_topsis)
weights = np.array([1/num_criteria] * num_criteria)

# Extract the decision matrix (ensure only numeric data)
decision_matrix = df_topsis_input.to_numpy()

# --- Calculations based on the user's provided snippet logic ---
# (Note: This is a weighted Euclidean distance approach, not standard TOPSIS with vector norm)

# Define the ideal best and ideal worst solutions for each criterion.
# Assumes all criteria in decision_matrix are benefit type (higher is better)
ideal_best = np.max(decision_matrix, axis=0)
ideal_worst = np.min(decision_matrix, axis=0)

# Compute the weighted Euclidean distance from each alternative to the ideal best and ideal worst.
# Ensure weights array shape aligns for broadcasting if necessary (it should be 1D here)
D_plus = np.sqrt(np.sum(weights * (decision_matrix - ideal_best)**2, axis=1))
D_minus = np.sqrt(np.sum(weights * (decision_matrix - ideal_worst)**2, axis=1))

# Compute the TOPSIS score (relative closeness to the ideal solution)
# Handle potential division by zero if D_plus + D_minus is zero
sum_D = D_plus + D_minus
topsis_score = np.divide(D_minus, sum_D, out=np.zeros_like(D_minus, dtype=float), where=sum_D!=0)
# If sum_D is 0, means D_plus=0 and D_minus=0 (alternative is both best/worst), score becomes 0. Adjust if needed.

# Add the score and rank to a copy of the input df or the original df_agg
df_ranked_topsis = df_topsis_input.copy() # Or use df_agg.copy() if you want other columns too
df_ranked_topsis["TOPSIS Score"] = topsis_score

# The alternative with the higher TOPSIS Score is considered better.
# Using 'min' rank method to handle ties like in the main script
df_ranked_topsis["TOPSIS Rank"] = df_ranked_topsis["TOPSIS Score"].rank(ascending=False, method="min")

# Sort the LLMs based on their TOPSIS ranking.
df_ranked_topsis = df_ranked_topsis.sort_values(by="TOPSIS Rank")

print("--- TOPSIS-Based Ranking (Using User Snippet Logic) ---")
# Add other columns back if needed from the original df_agg before printing
# For example: df_ranked_topsis = df_agg.join(df_ranked_topsis[['TOPSIS Score', 'TOPSIS Rank']]).sort_values(by="TOPSIS Rank")
print(df_ranked_topsis)

--- TOPSIS-Based Ranking (Using User Snippet Logic) ---
         Flesch-Kincaid Reading Ease  BMPS  \
Gemini                           0.9  0.70   
Llama 3                          0.7  0.85   
GPT-4o                           0.8  0.75   

         Flesch-Kincaid Grade Level (Inverted)  Gunning Fog Score (Inverted)  \
Gemini                                    0.80                          0.92   
Llama 3                                   0.90                          0.88   
GPT-4o                                    0.85                          0.90   

         Coleman-Liau Index (Inverted)  TOPSIS Score  TOPSIS Rank  
Gemini                            0.75      0.513301          1.0  
Llama 3                           0.82      0.486699          2.0  
GPT-4o                            0.78      0.449237          3.0  


In [None]:

import pandas as pd
import numpy as np
#import ace_tools as tools
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine

# Load CSV
file_path = "/content/dyslexia_prompt.csv"
df = pd.read_csv(file_path, encoding='latin-1')
# Extract LLM names (all columns except "Article")
llm_names = df.columns[1:]  # "GPT-4o", "Llama 3", "Gemini"

# Load sentence embedding model for BMPS calculation
# The 'from_tf' parameter has been removed.
# If you need to load a model from TensorFlow, use the SentenceTransformer.from_pretrained() method.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Semantic similarity function
def semantic_similarity(text1, text2):
    emb1 = model.encode(text1)
    emb2 = model.encode(text2)
    return 1 - cosine(emb1, emb2)  # Cosine similarity

# Compression penalty function
def compression_penalty(original, simplified):
    orig_len = len(original.split())
    simp_len = len(simplified.split())
    return max(0, 1 - (simp_len / orig_len))  # Penalize longer texts

# Balanced Meaning Preservation Score (BMPS)
def balanced_meaning_preservation(original, simplified):
    sim_score = semantic_similarity(original, simplified)
    comp_penalty = compression_penalty(original, simplified)
    return sim_score * (1 - comp_penalty)  # Adjusted score

# Compute readability and BMPS scores for each LLM
results = []


#
for index, row in df.iterrows():
    original_text = row["Article"]  # The complex version

    for llm in llm_names:
        simplified_text = row[llm]
        print(llm)

        # Compute readability metrics
        readability_metrics = {
            "Flesch-Kincaid Reading Ease": textstat.flesch_reading_ease(simplified_text),
            "Flesch-Kincaid Grade Level": textstat.flesch_kincaid_grade(simplified_text),
            "Gunning Fog Score": textstat.gunning_fog(simplified_text),
            "SMOG Index": textstat.smog_index(simplified_text)
        }

        # Compute BMPS
        bmps_score = balanced_meaning_preservation(original_text, simplified_text)

        # Store results
        results.append({
            "Article ID": index + 1,
            "LLM": llm,
            **readability_metrics,
            "BMPS": bmps_score
        })

# Convert results to DataFrame
df_results = pd.DataFrame(results)

# Readability metric columns
higher_is_better = ["Flesch-Kincaid Reading Ease", "BMPS"]  # Higher is better
higher_is_worse = ["Flesch-Kincaid Grade Level", "Gunning Fog Score", "SMOG Index"]  # Higher is worse

# Invert metrics where higher = worse (so higher is always better)
for col in higher_is_worse:
    max_val = df_results[col].max()
    min_val = df_results[col].min()
    df_results[col] = max_val + min_val - df_results[col]  # Flip scale

# Normalize readability metrics using Min-Max scaling
def normalize(df, columns):
    for col in columns:
        min_val = df[col].min()
        max_val = df[col].max()
        df[col] = (df[col] - min_val) / (max_val - min_val)  # Normalize between 0 and 1
    return df

# Apply normalization
all_metrics = higher_is_better + higher_is_worse
df_results = normalize(df_results, all_metrics)

# Aggregate scores for each LLM (mean across all articles)
df_agg = df_results.groupby("LLM")[all_metrics].mean()

# Compute a total score by averaging across all metrics (including BMPS)
df_agg["Total Score"] = df_agg.mean(axis=1)

# Rank the LLMs based on Total Score (higher score = better ranking)
df_agg["Rank"] = df_agg["Total Score"].rank(ascending=False, method="dense")

# Sort by rank
df_ranked = df_agg.sort_values(by="Rank")

# Display results in Jupyter Notebook
#tools.display_dataframe_to_user(name="LLM Rankings with BMPS", dataframe=df_ranked)
print(df_ranked)

GPT-4o
Llama 3
Gemini 2.0
            Flesch-Kincaid Reading Ease      BMPS  Flesch-Kincaid Grade Level  \
LLM                                                                             
GPT-4o                         0.943771  1.000000                    0.857143   
Gemini 2.0                     1.000000  0.993077                    1.000000   
Llama 3                        0.000000  0.000000                    0.000000   

            Gunning Fog Score  SMOG Index  Total Score  Rank  
LLM                                                           
GPT-4o               1.000000         NaN     0.950228   1.0  
Gemini 2.0           0.000000         NaN     0.748269   2.0  
Llama 3              0.110169         NaN     0.027542   3.0  


In [None]:
import numpy as np
import pandas as pd

# For demonstration, assume you already have an aggregated DataFrame "df_agg" with normalized metrics.
# Here, all_metrics is a list of the metric columns.
all_metrics = ["Flesch-Kincaid Reading Ease", "Flesch-Kincaid Grade Level",
               "Gunning Fog Score", "SMOG Index", "BMPS"]

# If you haven't computed df_agg yet from df_results, you might have computed something like:
df_agg = df_results.groupby("LLM")[all_metrics].mean()

# Use equal weights if you don't have a data-driven reason to favor any metric.
# You can adjust the weights if necessary.
num_criteria = len(all_metrics)
weights = np.array([1/num_criteria] * num_criteria)

# Extract the decision matrix (each row corresponds to an alternative LLM)
decision_matrix = df_agg[all_metrics].to_numpy()

# For TOPSIS, define the ideal best and ideal worst solutions for each criterion.
ideal_best = np.max(decision_matrix, axis=0)   # For benefit criteria, the ideal is the maximum value.
ideal_worst = np.min(decision_matrix, axis=0)    # The worst solution is the minimum value.

# Compute the weighted Euclidean distance from each alternative to the ideal best and ideal worst.
D_plus = np.sqrt(np.sum(weights * (decision_matrix - ideal_best)**2, axis=1))
D_minus = np.sqrt(np.sum(weights * (decision_matrix - ideal_worst)**2, axis=1))

# Compute the TOPSIS score (relative closeness to the ideal solution) for each alternative.
df_agg["TOPSIS Score"] = D_minus / (D_plus + D_minus)

# The alternative with the higher TOPSIS Score is considered better.
df_agg["TOPSIS Rank"] = df_agg["TOPSIS Score"].rank(ascending=False, method="dense")

# Sort the LLMs based on their TOPSIS ranking.
df_ranked_topsis = df_agg.sort_values(by="TOPSIS Rank")

print("TOPSIS-Based Ranking:")
print(df_ranked_topsis)

TOPSIS-Based Ranking:
            Flesch-Kincaid Reading Ease  Flesch-Kincaid Grade Level  \
LLM                                                                   
GPT-4o                         0.943771                    0.857143   
Gemini 2.0                     1.000000                    1.000000   
Llama 3                        0.000000                    0.000000   

            Gunning Fog Score  SMOG Index      BMPS  TOPSIS Score  TOPSIS Rank  
LLM                                                                             
GPT-4o               1.000000         NaN  1.000000           NaN          NaN  
Gemini 2.0           0.000000         NaN  0.993077           NaN          NaN  
Llama 3              0.110169         NaN  0.000000           NaN          NaN  


In [None]:
#@title ABC
# Readability metric columns
higher_is_better = ["Flesch-Kincaid Reading Ease"]  # Higher is good
higher_is_worse = ["Flesch-Kincaid Grade Level", "Gunning Fog Score", "SMOG Index"]  # Higher is bad

df = df_results.copy()

# Invert metrics where higher = worse (so higher is always better)
for col in higher_is_worse:
    max_val = df[col].max()
    min_val = df[col].min()
    df[col] = max_val + min_val - df[col]  # Flip scale

# Normalize readability metrics using Min-Max scaling
def normalize(df, columns):
    for col in columns:
        min_val = df[col].min()
        max_val = df[col].max()
        df[col] = (df[col] - min_val) / (max_val - min_val)  # Normalize between 0 and 1
    return df

# Apply normalization to all metrics
all_metrics = higher_is_better + higher_is_worse
df = normalize(df_results, all_metrics)

# Aggregate scores for each LLM (mean across all publications)
df_agg = df.groupby("LLM")[all_metrics].mean()

# Compute a total score by averaging across all metrics
df_agg["Total Score"] = df_agg.mean(axis=1)

# Rank the LLMs based on Total Score (higher score = better ranking)
df_agg["Rank"] = df_agg["Total Score"].rank(ascending=False, method="dense")

# Sort by rank
df_ranked = df_agg.sort_values(by="Rank")

print(df_ranked)

            Flesch-Kincaid Reading Ease  Flesch-Kincaid Grade Level  \
LLM                                                                   
Original                       0.507168                    0.438770   
GPT-4o                         0.658461                    0.244385   
Llama 3                        0.665883                    0.241711   
Gemini 2.0                     0.830920                    0.136364   

            Gunning Fog Score  SMOG Index  Total Score  Rank  
LLM                                                           
Original             0.423565    0.452116     0.455405   1.0  
GPT-4o               0.216226    0.367725     0.371699   2.0  
Llama 3              0.230854    0.298942     0.359347   3.0  
Gemini 2.0           0.083575    0.310053     0.340228   4.0  


In [None]:
import pandas as pd
import numpy as np
#import ace_tools as tools
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine

# Load CSV
file_path = "/content/dyslexia_friendly_texts_all_20.csv"
df = pd.read_csv(file_path, encoding='latin-1')

# Extract LLM names (all columns except "Article")
llm_names = df.columns[1:]  # "GPT-4o", "Llama 3", "Gemini"

# Load sentence embedding model for BMPS calculation
model = SentenceTransformer('all-MiniLM-L6-v2')

# Semantic similarity function
def semantic_similarity(text1, text2):
    emb1 = model.encode(text1)
    emb2 = model.encode(text2)
    return 1 - cosine(emb1, emb2)  # Cosine similarity

# Compression penalty function
def compression_penalty(original, simplified):
    orig_len = len(original.split())
    simp_len = len(simplified.split())
    return max(0, 1 - (simp_len / orig_len))  # Penalize longer texts

# Balanced Meaning Preservation Score (BMPS)
def balanced_meaning_preservation(original, simplified):
    sim_score = semantic_similarity(original, simplified)
    comp_penalty = compression_penalty(original, simplified)
    return sim_score * (1 - comp_penalty)  # Adjusted score

# Compute readability and BMPS scores for each LLM
results = []

for index, row in df.iterrows():
    original_text = row["Article"]  # The complex version

    for llm in llm_names:
        simplified_text = row[llm]

        # Compute readability metrics
        readability_metrics = {
            "Flesch-Kincaid Reading Ease": textstat.flesch_reading_ease(simplified_text),
            "Flesch-Kincaid Grade Level": textstat.flesch_kincaid_grade(simplified_text),
            "Gunning Fog Score": textstat.gunning_fog(simplified_text),
            "SMOG Index": textstat.smog_index(simplified_text)
        }

        # Compute BMPS
        bmps_score = balanced_meaning_preservation(original_text, simplified_text)

        # Store results
        results.append({
            "Article ID": index + 1,
            "LLM": llm,
            **readability_metrics,
            "BMPS": bmps_score
        })

# Convert results to DataFrame
df_results = pd.DataFrame(results)

# Readability metric columns
higher_is_better = ["Flesch-Kincaid Reading Ease", "BMPS"]  # Higher is better
higher_is_worse = ["Flesch-Kincaid Grade Level", "Gunning Fog Score", "SMOG Index"]  # Higher is worse

# Invert metrics where higher = worse (so higher is always better)
for col in higher_is_worse:
    max_val = df_results[col].max()
    min_val = df_results[col].min()
    df_results[col] = max_val + min_val - df_results[col]  # Flip scale

# Normalize readability metrics using Min-Max scaling
def normalize(df, columns):
    for col in columns:
        min_val = df[col].min()
        max_val = df[col].max()
        df[col] = (df[col] - min_val) / (max_val - min_val)  # Normalize between 0 and 1
    return df

# Apply normalization
all_metrics = higher_is_better + higher_is_worse
df_results = normalize(df_results, all_metrics)

# Aggregate scores for each LLM (mean across all articles)
df_agg = df_results.groupby("LLM")[all_metrics].mean()

# Compute a total score by averaging across all metrics (including BMPS)
df_agg["Total Score"] = df_agg.mean(axis=1)

# Rank the LLMs based on Total Score (higher score = better ranking)
df_agg["Rank"] = df_agg["Total Score"].rank(ascending=False, method="dense")

# Sort by rank
df_ranked = df_agg.sort_values(by="Rank")

# Display results in Jupyter Notebook
#tools.display_dataframe_to_user(name="LLM Rankings with BMPS", dataframe=df_ranked)
print(df_ranked)

            Flesch-Kincaid Reading Ease      BMPS  Flesch-Kincaid Grade Level  \
LLM                                                                             
GPT-4o                         0.943771  1.000000                    0.857143   
Gemini 2.0                     1.000000  0.998215                    1.000000   
Llama 3                        0.000000  0.000000                    0.000000   

            Gunning Fog Score  SMOG Index  Total Score  Rank  
LLM                                                           
GPT-4o               1.000000         NaN     0.950228   1.0  
Gemini 2.0           0.000000         NaN     0.749554   2.0  
Llama 3              0.110169         NaN     0.027542   3.0  


In [None]:
# Re-load the CSV file since execution state was reset
import pandas as pd
import textstat

# File path
file_path = "/content/dyslexia_friendly_texts_all_20.csv"

# Load the CSV file
df = pd.read_csv(file_path)

# Extract only the original articles
df_originals = df[['Article']].copy()

# Function to calculate readability metrics
def calculate_readability_metrics(text):x
    return {
        "Flesch-Kincaid Reading Ease": textstat.flesch_reading_ease(text),
        "Flesch-Kincaid Grade Level": textstat.flesch_kincaid_grade(text),
        "Gunning Fog Score": textstat.gunning_fog(text),
        "SMOG Index": textstat.smog_index(text)
    }

# Compute readability metrics for each original article
readability_results = []
for index, row in df_originals.iterrows():
    metrics = calculate_readability_metrics(row['Article'])
    metrics['Article Index'] = index + 1  # Keep track of the article number
    readability_results.append(metrics)

# Convert results into a DataFrame
df_readability = pd.DataFrame(readability_results)

# Normalize the readability scores for fair comparison
def normalize(df, columns):
    for col in columns:
        min_val = df[col].min()
        max_val = df[col].max()
        df[col] = (df[col] - min_val) / (max_val - min_val)  # Normalize between 0 and 1
    return df

# Invert metrics where lower = better (so all higher scores mean "easier for dyslexic readers")
harder_is_worse = ["Flesch-Kincaid Grade Level", "Gunning Fog Score", "SMOG Index"]
for col in harder_is_worse:
    max_val = df_readability[col].max()
    min_val = df_readability[col].min()
    df_readability[col] = max_val + min_val - df_readability[col]

# Normalize all metrics
all_metrics = ["Flesch-Kincaid Reading Ease"] + harder_is_worse
df_readability = normalize(df_readability, all_metrics)

# Compute an overall dyslexia-friendliness score (higher = easier to read)
df_readability["Dyslexia-Friendliness Score"] = df_readability[all_metrics].mean(axis=1)

# Rank articles based on their readability (higher score = more dyslexia-friendly)
df_readability["Rank"] = df_readability["Dyslexia-Friendliness Score"].rank(ascending=False, method="dense")

# Sort articles by rank
df_readability_sorted = df_readability.sort_values(by="Rank")

print(df_readability_sorted)
# Display results
#import ace_tools as tools
#tools.display_dataframe_to_user(name="Original Articles Readability Analysis", dataframe=df_readability_sorted)


IndentationError: unexpected indent (<ipython-input-9-bdfe26965f3d>, line 16)

In [None]:
import textstat

def count_words(text):
    return textstat.lexicon_count(text)

df['Word Count'] = df['Article'].apply(count_words)

# Create 'Article Index' column in df to match df_readability_sorted
df['Article Index'] = df.index + 1

df_readability_sorted = pd.merge(df_readability_sorted, df[['Article Index', 'Word Count']], on='Article Index', how='left')

print(df_readability_sorted)

NameError: name 'df_readability_sorted' is not defined