In [None]:
import os
import sys
from modelendpoints import query
import openai
import pandas as pd

from scipy.stats import scoreatpercentile,tmean,mode
from scipy.stats import spearmanr,pearsonr
from scipy.stats import scoreatpercentile

import json
from tqdm import tqdm

In [None]:
# Location of the parquet table that holds the clusters to be rated.
INPUT_PATH = "INPUT_FILE"

# Load the table and preview its first rows as the cell output.
decompose_df = pd.read_parquet(INPUT_PATH)
decompose_df.head()

In [None]:
# Prompt template for the LLM judge. It asks for a single 1-10 cohesion score
# for one group of claims; the `{claim_list}` placeholder is filled in with
# str.format() when the per-row requests are built further down in this cell.
# The text is runtime data sent to the model, so it is kept verbatim here.
# NOTE(review): the Cohesion definition reads "for a cohesive set of claims"
# (probably meant "form"), and the "Example:" heading introduces the actual
# claim list rather than an example. Both are left untouched because editing
# the prompt changes what the judge is sent - confirm before fixing.
clustering_prompt_llm_judge="""
You will be given a short group of atomic claims (3-10 sentences) about a topic. 

Your task is to rate the group for the degree to which the sentences in the group form a cohesive cluster. Cohesion is defined in terms of each sentence in a Claim List conveying the same information

Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.

Cohesion (1-10) - cohesion is the degree to which the sentences in the Claim List for a cohesive set of claims. The most cohesive set of claims would be all sentences saying exactly the same thing. The least cohesive would be no sentences saying anything in common. Something in the middle (a 5) would be about half of the sentences conveying at least one thing in common.
**Important:** Repetitions that are near exact match indicate very high cohesion.

Evaluation Steps:

1. Read the Claim List carefully.
2. Compare each sentence in the Claim List and try to determine how similar each sentence is in terms of the distinct, atomic information that they convey
3. Assess the degree to which the Claim List is cohesive

Example:

Claim List: 

{claim_list}

Evaluation Form (score ONLY): -"""

# Request settings.
NUM_DECISIONS = 8  # multiplier for the total printed below; also passed as `n` to the batch call
ROLE = "system"    # role attached to the single message of every request

# One single-message conversation per table row, keyed "1", "2", ... in row
# order. Each row's cluster is converted to a plain list and substituted into
# the prompt template (so the prompt contains the list's Python repr).
# NOTE(review): list() assumes each cell is a sequence of sentences; a plain
# string would be split into characters - confirm the column's type.
KEYS_TO_MESSAGES = {
    str(request_key): [
        {
            "role": ROLE,
            "content": clustering_prompt_llm_judge.format(claim_list=list(cluster_sentences)),
        }
    ]
    for request_key, cluster_sentences in enumerate(
        decompose_df["Cluster Text (To Rate)"], start=1
    )
}
# Next unused key, exactly where the original running counter ended up.
KEYS_COUNTER = len(KEYS_TO_MESSAGES) + 1

# Number of prompts, followed by prompts x NUM_DECISIONS.
print(len(KEYS_TO_MESSAGES), len(KEYS_TO_MESSAGES) * NUM_DECISIONS)

# Read the API key from the environment; as before, an unset variable yields
# an empty string rather than raising here.
OPENAI_KEY = os.environ.get("OPENAI_API_KEY", "")
# Never echo the secret itself: cell output is stored inside the notebook file
# and would expose the key to anyone the notebook is shared with. Report only
# whether a key was found.
print("OPENAI_API_KEY is set:", bool(OPENAI_KEY))
client = openai.OpenAI(api_key=OPENAI_KEY)

# Keyword options handed to the batch helper, in the same order as before.
# NOTE(review): how query.openai_batch interprets these (presumably forwarding
# them to the OpenAI API) is defined in modelendpoints - confirm there.
batch_options = dict(
    model="gpt-5",
    reasoning_effort='minimal',
    temperature=1,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    stop=None,
    n=NUM_DECISIONS,
)

# Submit every prompt in one batch; the result is looked up by request key
# in the code that follows.
decompose_output = query.openai_batch(
    client,
    keys_to_messages=KEYS_TO_MESSAGES,
    **batch_options,
)

# Persist the raw judge output first, so that a failed completeness check
# below can never discard the results of the batch call.
with open("OUTPUT_FILE", "w") as f:
    # indent=1 writes the same one-space indentation that indent=True produced
    # (True was being treated as the integer 1).
    json.dump(decompose_output, f, indent=1)

# Number of submitted prompts vs. number of returned entries.
print(len(KEYS_TO_MESSAGES), len(decompose_output))

# Verify that every submitted key came back. Looping over
# range(len(decompose_output)) only checked as many keys as were returned, so
# results missing from the end of the sequence went unnoticed.
missing_keys = [key for key in KEYS_TO_MESSAGES if key not in decompose_output]
assert not missing_keys, (
    f"{len(missing_keys)} request key(s) missing from the batch output, "
    f"e.g. {missing_keys[:10]}"
)

In [None]:
def _parse_rating(raw):
    """Turn one raw judge reply into a halved score, or None when unusable.

    Recognised shapes are a bare integer ("7"), a labelled score
    ("Score: 7" or "[Score: 7]") and a dash-prefixed score ("- 7").
    The parsed number is divided by 2 and floored at 1.

    Args:
        raw: One reply taken from the judge output (expected to be a string).

    Returns:
        The halved score (never below 1), or None if `raw` is not a string,
        matches none of the recognised shapes, or its numeric part cannot be
        converted with float().
    """
    if not isinstance(raw, str):
        return None
    text = raw.strip()
    if text.isdigit():
        candidate = text
    elif ":" in text:
        # Keep what follows the last colon and shed any enclosing brackets,
        # wherever they sit, instead of assuming "]" is the final character.
        candidate = text.split(":")[-1].strip().strip("[]").strip()
    elif "-" in text:
        # Replies such as "- 9": keep what follows the last dash.
        candidate = text.split("-")[-1].strip()
    else:
        return None
    try:
        return max(1, float(candidate) / 2)
    except ValueError:
        return None


# Sentinel stored for rows that end up with no usable rating. Real scores are
# always >= 1, so the sentinel can never collide with one.
MISSING_SCORE = -1.0

ill_formatted = 0       # individual replies that could not be parsed
ill_formatted_rows = 0  # rows left without a single usable reply
unrated_positions = []  # 0-based positions of those rows
row_scores = []         # one entry per row, in table order

for position in range(decompose_df.shape[0]):
    # Request keys are the 1-based row positions rendered as strings.
    # NOTE(review): each entry is expected to expose its sampled replies as a
    # list under 'text' - shape defined by modelendpoints.query; confirm.
    replies = decompose_output.get(str(position + 1), {}).get('text', []) or []
    ratings = []
    for reply in replies:
        rating = _parse_rating(reply)
        if rating is None:
            # Counts unrecognised formats as well as failed conversions;
            # previously unrecognised formats were dropped without a trace.
            ill_formatted += 1
        else:
            ratings.append(rating)
    if ratings:
        row_scores.append(float(tmean(ratings)))
    else:
        row_scores.append(MISSING_SCORE)
        unrated_positions.append(position)
        ill_formatted_rows += 1

# Assign the whole column by position. Writing cell-by-cell with .at[i, ...]
# is label-based and, on a non-default index, hits the wrong row or appends
# new rows.
decompose_df["GEVAL_GPT5_8"] = pd.Series(
    row_scores, index=decompose_df.index, dtype=float
)

if unrated_positions:
    print("ROWS WITHOUT USABLE RATING", unrated_positions)
print("ILL FORMATTED OUTPUT",ill_formatted)
print("ILL FORMATTED ROWS",ill_formatted_rows)

# Summarise rated rows only, so the sentinel does not drag the statistics down.
rated_scores = decompose_df.loc[
    decompose_df["GEVAL_GPT5_8"] != MISSING_SCORE, "GEVAL_GPT5_8"
]
if len(rated_scores):
    print(tmean(rated_scores), scoreatpercentile(rated_scores, 50), mode(rated_scores))
else:
    print("No row received a usable rating; summary statistics skipped.")

decompose_df.to_parquet("LLM_judge_OUTPUT_FILE")