In [None]:
import os
import sys
from modelendpoints import query
import openai
import pandas as pd

import json
from tqdm import tqdm
from scipy.stats import tmean
from scipy.stats import scoreatpercentile, mode

In [None]:
# Load the decomposition table; later cells read its "Chunk", "Claim List"
# and "Topic" columns. The trailing expression previews the first five rows.
decompose_df = pd.read_parquet(path="INPUT_FILE")
decompose_df.head(n=5)

In [None]:
# Prompt template for the LLM judge that reports how many atomic claims from a
# paragraph are missing in an extracted claim list.
# The {topic}, {paragraph_chunk} and {claim_list} placeholders are filled with
# str.format() when the per-row messages are built, so any literal brace added
# to this text would have to be doubled ("{{" / "}}").
missing_prompt_llm_judge="""
You will be given a short paragraph (3-5 sentences) of text about a topic, and a list of atomic claims extracted from it. 
Your task is to report the number of atomic claims present in the paragraph that are missing from the list. 

*Important:* You DO NOT have to return the text  of missing claims, JUST REPORT THE MISSING COUNT.
Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.

Evaluation Criteria:

0: Every single claim that could possibly be inferred from the original paragraph is present in the Claim List
1: At least 1 claim is missing from the Claim List
2: At least 2 claims are missing from the Claim List
3: At least 3 claims are missing from the Claim List
4: At least 4 claims are missing from the Claim List
5+: The same as above, for any number greater than or equal to 5

Evaluation Steps:

1. Read the paragraph and the claim list carefully.
2. Compare the provided list of atomic claims to the original paragraph.
3. Count all the atomic claims in the paragraph that are not included in the list. 

Example:
Topic: 

{topic}

Paragraph: 

{paragraph_chunk}

Claim List: 

{claim_list}


Evaluation Form (score ONLY): -"""

# Request bookkeeping: each DataFrame row becomes one single-message chat
# request, stored under a 1-based string key ("1", "2", ...).
KEYS_TO_MESSAGES = {}
KEYS_COUNTER = 1
NUM_DECISIONS = 8 ## for gpt-5 this is the max allowed.
ROLE = "system"

# Walk the three input columns in lockstep (row order == key order).
for chunk_text, raw_claims, topic_text in zip(
    decompose_df["Chunk"], decompose_df["Claim List"], decompose_df["Topic"]
):
    # One claim per line, each stripped of surrounding whitespace.
    claims_block = "\n".join(item.strip() for item in raw_claims)
    filled_prompt = missing_prompt_llm_judge.format(
        paragraph_chunk=chunk_text.strip(),
        claim_list=claims_block,
        topic=topic_text.strip(),
    )
    KEYS_TO_MESSAGES[str(KEYS_COUNTER)] = [{"role": ROLE, "content": filled_prompt}]
    KEYS_COUNTER += 1

# Sanity output: number of requests, number of expected decisions, and the
# first fully rendered prompt.
print(len(KEYS_TO_MESSAGES),len(KEYS_TO_MESSAGES)*NUM_DECISIONS)
print(KEYS_TO_MESSAGES["1"][0]["content"])


# Read the API key from the environment. The key itself is never printed:
# notebook outputs are saved with the file and would leak the secret to anyone
# the notebook is shared with. Only report whether a key was found.
OPENAI_KEY = os.environ.get("OPENAI_API_KEY", "")
print("OPENAI_API_KEY is set:", bool(OPENAI_KEY))
client = openai.OpenAI(api_key=OPENAI_KEY)

# Send every prompt to the judge model through the project's batch helper,
# requesting NUM_DECISIONS samples per prompt.
# NOTE(review): the result is used below and in the next cell as a mapping
# from request key to a dict with a 'text' list - confirm against
# modelendpoints.query.openai_batch.
decompose_output=query.openai_batch(client,keys_to_messages=KEYS_TO_MESSAGES,model="gpt-5",
                                reasoning_effort='minimal',
                                temperature=1,
                                top_p=1,
                                frequency_penalty=0,
                                presence_penalty=0,
                                stop=None,
                                n=NUM_DECISIONS
                                )

# Persist the raw output before validating it, so a failed completeness check
# does not throw away the results of the batch job.
with open("OUTPUT_FILE","w") as f:
    json.dump(decompose_output,f,indent=True)

print(len(KEYS_TO_MESSAGES),len(decompose_output))

# Every submitted key must come back. Checking against the request keys (not
# against the size of the output) also catches responses missing at the end.
missing_keys = [key for key in KEYS_TO_MESSAGES if key not in decompose_output]
assert not missing_keys, (
    f"batch output is missing {len(missing_keys)} request key(s), "
    f"e.g. {missing_keys[:10]}"
)

In [None]:
def _parse_missing_count(raw_rating):
    """Convert one raw judge reply into a numeric missing-claim count.

    Accepts a plain non-negative integer, optionally surrounded by whitespace
    and optionally followed by "+" or "." (e.g. "3", " 3\n", "5+", "4.").

    Returns:
        The count as a float, or None when the reply is not a string or does
        not match that shape (e.g. "3.5", "three", "").
    """
    if not isinstance(raw_rating, str):
        return None
    # Only strip the known suffixes; dropping an arbitrary last character
    # would turn replies like "3.5" or "3a" into 3.
    digits = raw_rating.strip().rstrip("+.").strip()
    if digits.isdecimal():
        return float(digits)
    return None


MISSING_SCORE = -1.0  # sentinel stored for rows without any usable rating

num_rows = decompose_df.shape[0]
row_scores = []
ill_formatted = 0       # individual replies that could not be parsed
ill_formatted_rows = 0  # rows for which no reply could be parsed
for i in range(num_rows):
    # Request keys are 1-based strings, in row order.
    claim_rating_llm = decompose_output.get(str(i+1),{}).get('text',[]) or []
    parsed_ratings = [_parse_missing_count(rating) for rating in claim_rating_llm]
    claim_rating_llm_new = [value for value in parsed_ratings if value is not None]
    ill_formatted += len(parsed_ratings) - len(claim_rating_llm_new)
    if claim_rating_llm_new:
        # tmean returns a numpy scalar; convert to a plain float.
        row_scores.append(float(tmean(claim_rating_llm_new)))
    else:
        print(i)
        ill_formatted_rows+=1
        row_scores.append(MISSING_SCORE)

# Assign by position: the prompts were built in positional row order, so a
# label-based write would misplace scores on a non-default index.
decompose_df["GEVAL_GPT5_8"] = row_scores

print("ILL FORMATTED OUTPUT",ill_formatted)
print("ILL FORMATTED ROWS",ill_formatted_rows)

# Summary statistics (mean, median, mode) over scored rows only; including the
# -1 sentinel would bias all three.
valid_scores = [score for score in row_scores if score != MISSING_SCORE]
if valid_scores:
    print(tmean(valid_scores),scoreatpercentile(valid_scores,50),mode(valid_scores))
else:
    print("NO VALID SCORES")
decompose_df.to_parquet("LLM_judge_OUTPUT_FILE")