In [None]:
import os
import sys
from modelendpoints import query
import openai
import pandas as pd

import json
from tqdm import tqdm

from scipy.stats import tmean
from scipy.stats import scoreatpercentile, mode

In [None]:
# Load the decomposition table. Later cells read the "Chunk", "Topic" and
# "Claim List" columns from this frame.
decompose_df = pd.read_parquet(path="INPUT_FILE")
# Preview the first rows as the cell output.
decompose_df.head(n=5)

In [None]:
# Prompt template for the LLM judge. It is filled with str.format() using three
# placeholders: {topic} (used twice), {paragraph_chunk} and {atomic_claim}.
# The judge is asked for a single 1-5 Likert score describing how well the
# atomic claim is represented in the paragraph.
# NOTE(review): the text below is sent to the model verbatim, so it is left
# untouched here. It contains typos ("explictly", "without needed"), refers to
# "Notes provided above" although no notes section exists, and labels the real
# input with an "Example:" heading. Fixing these changes the judge's input and
# therefore possibly its scores -- decide deliberately and re-run if changed.
decompose_prompt_llm_judge = """
You will be given a short paragraph (3-5 sentences) of text about the {topic} as a topic, and a single atomic claim obtained from that paragraph. 
Your task is to evaluate to what degree the claim is represented in the original short piece of text on a Likert scale.
**Important**: The claim should be directly and explictly derived from the text without needed any extra external knowledge.

**Important**: If the claim is partially but not fully represented in the paragraph, assign a score accordingly, rather than defaulting to extremes.

Please make sure you read and understand these instructions carefully. Please keep this document open while reviewing, and refer to it as needed.

Evaluation Criteria:

1 - The claim is totally irrelevant to the original paragraph OR does not explicitly talk about the provided TOPIC
2 - The claim is incomplete or somewhat included in the original paragraph and missing clearly important context
3 - The claim is included in the original paragraph but missing some potentially important context. This includes factoids which could be inferred from the original context but aren't explicitly stated (e.g., "The lifecycle of plastic includes production." being inferred from "It addresses the entire lifecycle of plastic, from production and consumption to disposal and recycling.")
4 - The claim is included in the original paragraph and is missing only unimportant context (the most important information is represented)
5 - The claim is included in the original paragraph and no context is missing

Evaluation Steps:

1. Read the paragraph carefully and the claim carefully. 
2. Assign a score for the claim based on the Evaluation Criteria and the Notes provided above. Please only respond with a single digit indicating the score.


Example:

Topic: 

{topic}

Paragraph: 

{paragraph_chunk}

Atomic claim: 

{atomic_claim}

Evaluation Form (score ONLY): -"""


# Request configuration.
NUM_DECISIONS = 8   # completions requested per claim (passed as `n` to the batch call)
ROLE = "system"     # chat role under which the filled-in prompt is sent

# Maps a 1-based request id (as a string) to the chat messages for one claim.
# Ids are handed out in row order, then in claim order within each row.
KEYS_TO_MESSAGES = {}
KEYS_COUNTER = 1

for row_pos in range(len(decompose_df)):
    record = decompose_df.iloc[row_pos]
    paragraph_text = record["Chunk"].strip()
    topic_text = record["Topic"].strip()
    for raw_claim in record["Claim List"]:
        filled_prompt = decompose_prompt_llm_judge.format(
            topic=topic_text,
            paragraph_chunk=paragraph_text,
            atomic_claim=raw_claim.strip(),
        )
        KEYS_TO_MESSAGES[str(KEYS_COUNTER)] = [{"role": ROLE, "content": filled_prompt}]
        KEYS_COUNTER += 1

# Report the number of requests and the total number of completions expected.
num_requests = len(KEYS_TO_MESSAGES)
print(num_requests, num_requests * NUM_DECISIONS)

# Sanity check: request ids form the contiguous range "1".."num_requests".
assert all(str(request_id) in KEYS_TO_MESSAGES for request_id in range(1, num_requests + 1))

# Read the API key from the environment. The key is deliberately NOT printed:
# cell output is stored in the notebook file and would leak the secret to
# anyone the notebook is shared with or committed for.
OPENAI_KEY = os.environ.get("OPENAI_API_KEY", "")
if not OPENAI_KEY:
    # Fail here rather than constructing the client with an empty key.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it in the environment before running this cell."
    )
client = openai.OpenAI(api_key=OPENAI_KEY)

# Submit every prompt to the judge model, requesting NUM_DECISIONS completions
# per prompt. The result is indexed by the same string ids as KEYS_TO_MESSAGES.
# NOTE(review): the parsing cell reads decompose_output[key]['text'] as an
# iterable of raw completions -- confirm against query.openai_batch.
decompose_output = query.openai_batch(
    client,
    keys_to_messages=KEYS_TO_MESSAGES,
    model="gpt-5",
    reasoning_effort='minimal',
    temperature=1,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    stop=None,
    n=NUM_DECISIONS,
)

# Persist the raw responses BEFORE validating them, so that a failed check
# below does not prevent the (costly) batch result from being saved.
with open("OUTPUT_FILE", "w") as f:
    json.dump(decompose_output, f, indent=True)

print(len(KEYS_TO_MESSAGES), len(decompose_output))

# Every submitted request id must have a response. Checking against the
# submitted ids (rather than range(len(decompose_output))) also catches
# responses that are missing from the end of the id range.
missing_keys = [key for key in KEYS_TO_MESSAGES if key not in decompose_output]
assert not missing_keys, (
    f"{len(missing_keys)} request(s) have no response, e.g. keys {missing_keys[:10]}"
)

In [None]:
# Turn the raw judge completions into one mean score per claim and store them,
# per row, in the "GEVAL_GPT5_8" column:
#   * a row with claims gets a list with one float per claim (mean of the
#     completions that parsed as a number, or -1 if none parsed);
#   * a row without claims gets the scalar sentinel -1.
l = decompose_df.shape[0]
ill_formatted = 0       # completions that could not be parsed as a number
ill_formatted_rows = 0  # rows that produced no per-claim scores at all
OUTPUT_COUNTER = 1      # 1-based response id; advances in the same row/claim order used to build the requests

geval_scores = []  # one entry per row, in positional order
for i in range(l):
    num_claims = len(decompose_df.iloc[i]["Claim List"])
    row_scores = []
    for _ in range(num_claims):
        # A missing response id or a missing 'text' field yields no samples,
        # which makes the claim score fall back to -1 below.
        samples = decompose_output.get(str(OUTPUT_COUNTER), {}).get('text', [])
        OUTPUT_COUNTER += 1

        parsed_samples = []
        for sample in samples:
            try:
                parsed_samples.append(float(sample))
            except (TypeError, ValueError):
                # Completion is not a bare number; count it and skip it.
                ill_formatted += 1

        if parsed_samples:
            # tmean without limits is the plain arithmetic mean; it returns a
            # numpy scalar, so convert to a builtin float.
            row_scores.append(float(tmean(parsed_samples)))
        else:
            row_scores.append(-1)

    if row_scores:
        geval_scores.append(row_scores)
    else:
        ill_formatted_rows += 1
        geval_scores.append(-1)

# Assign the whole column at once, aligned to the frame's own index. Rows are
# read positionally above, so writing them back by label (.at[i, ...]) would
# hit the wrong row, or add a new one, whenever the index is not 0..n-1.
# dtype=object keeps each per-row list as a single cell value.
decompose_df["GEVAL_GPT5_8"] = pd.Series(geval_scores, index=decompose_df.index, dtype=object)

print("ILL FORMATTED OUTPUT",ill_formatted)
print("ILL FORMATTED ROWS",ill_formatted_rows)

# NOTE(review): a column mixing lists with the scalar -1 sentinel may not be
# serialisable by the parquet engine -- verify if any row has an empty claim list.
decompose_df.to_parquet("LLM_judge_OUTPUT_FILE")