In [1]:
import json
import os
import time
import warnings

import ir_datasets as irds
import ir_measures
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

In [2]:
# Load variables from a local .env file into the process environment
# (presumably this is where OPENAI_API_KEY, read via os.environ later on,
# comes from — confirm). Kept as a bare expression so the cell displays the
# call's return value.
load_dotenv()

True

In [3]:
def umbrela(query: str, passage: str) -> int:
    """Ask gpt-4o-mini to grade how relevant ``passage`` is to ``query``.

    The request goes through the OpenAI Responses API with a strict JSON
    schema, so the reply is a JSON object with a single integer field
    ``final_score``.

    Args:
        query: Text of the search query.
        passage: Text of the passage to assess.

    Returns:
        The relevance grade, an integer from 0 to 3 as defined in the prompt.

    Raises:
        ValueError: If the returned score is outside the 0-3 scale.
        Exception: Errors raised by the OpenAI client or by JSON decoding
            propagate unchanged.
    """
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    response = client.responses.create(
        model="gpt-4o-mini",
        instructions="""You are a TREC assessor.""",
        input=f"""Given a query and a passage, you must provide a score on an integer scale of 0 to 3 with the following meanings: 
        0 = represent that the passage has nothing to do with the query, 
        1 = represents that the passage seems related to the query but does not answer it, 
        2 = represents that the passage has some answer for the query, but the answer may be a bit unclear, or hidden amongst extraneous 
        information and
        3 = represents that the passage is dedicated to the query and contains the exact answer.
        
        Important Instruction: Assign category 1 if the passage is somewhat related to the topic but not completely, category 2 if 
        passage presents something very important related to the entire topic but also has some extra information and category 3 if the
        passage only and entirely refers to the topic. If none of the above satisfies give it category 0.
        
        Query: {query}
        Passage: {passage}
        
        Split this problem into steps:
        Consider the underlying intent of the search.
        Measure how well the content matches a likely intent of the query (M). Measure how trustworthy the passage is (T). 
        Consider the aspects above and the relative importance of each, and decide on a final score (O). Final score must be an integer 
        value only. Do not provide any code in result. Provide each score as a single integer on the scale of 0 to 3 and nothing else
        without providing any reasoning.
        Output Final Score: """,
        text={
            "format": {
                "type": "json_schema",
                "name": "umbrela_score",
                "strict": True,
                "schema": {
                    "type": "object",
                    "properties": {
                        "final_score": {
                            "type": "integer",
                        }
                    },
                    "required": ["final_score"],
                    "additionalProperties": False
                },
            },
        },
    )

    score = json.loads(response.output_text)['final_score']
    # The schema only guarantees an integer, not the 0-3 range, so reject
    # anything off the scale instead of letting it reach the judgements file.
    if score not in (0, 1, 2, 3):
        raise ValueError(f"final_score {score!r} is outside the 0-3 relevance scale")
    return score

In [4]:
def safe_umbrela_with_retry(query_text, doc_text, retries=5, delay=0.5):
    """Call ``umbrela`` up to ``retries`` times, falling back to a score of 0.

    Args:
        query_text: Query text passed through to ``umbrela``.
        doc_text: Passage text passed through to ``umbrela``.
        retries: Maximum number of attempts.
        delay: Seconds to sleep between consecutive attempts.

    Returns:
        The score from the first successful ``umbrela`` call, or 0 when every
        attempt raised (or when ``retries`` is not positive). A
        ``RuntimeWarning`` is emitted on fallback so that a fabricated 0 can be
        told apart from a real judgement.
    """
    last_error = None
    for attempt in range(retries):
        try:
            return umbrela(query_text, doc_text)
        except Exception as exc:  # best-effort: any failure triggers a retry
            last_error = exc
            # No point sleeping after the final attempt.
            if attempt < retries - 1:
                time.sleep(delay)
    warnings.warn(
        f"umbrela failed after {retries} attempt(s) ({last_error!r}); "
        "falling back to relevance 0",
        RuntimeWarning,
        stacklevel=2,
    )
    return 0  # fallback if all retries fail

In [5]:
# Existing judgements, stored as a tab-separated file (read_table defaults to
# a tab separator).
judged = pd.read_table('data/all_umbrela_judgements.tsv')

Unnamed: 0,query_id,doc_id,relevance,query_text,doc_text
0,1000000,1857541,2,where does real insulin come from,A nurse in 1938 checks the amount of insulin i...
1,1000000,1882294,2,where does real insulin come from,Insulin for injection used to come strictly fr...
2,1000000,1975289,3,where does real insulin come from,Insulin chemistry and etymology. Insulin is a ...
3,1000000,2877264,2,where does real insulin come from,Insulin can be made from the pancreas of pigs ...
4,1000000,3639919,3,where does real insulin come from,Insulin is produced in the islets of Langerhan...
...,...,...,...,...,...
651047,243139,351720,1,how long can you keep frozen meat in freezer,A fully stocked freezer will usually keep food...
651048,149142,7407503,1,"difference between vitamin a, c, e",Two types of vitamins are essential for proper...
651049,991952,28859,1,who did tennessee russell marry,Lisa Edelstein Marries Robert Russell. Move ov...
651050,185276,3780754,3,fastest car in the world motor tr,1 Hennessey Venom GT: 270.49mph (435.3km/h) As...


In [8]:
# (query_id, doc_id) pairs that still need a relevance judgement.
# NOTE(review): the recorded cell output lists 306 pairs (index 0-305), but
# only the first pair is present in this literal — restore the full list
# before re-running.
to_judge_data = [[706487, 8336833]]
to_judge = pd.DataFrame(to_judge_data, columns=['query_id', 'doc_id'])

Unnamed: 0,query_id,doc_id
0,706487,8336833
1,1069131,5105601
2,1011630,3676130
3,135081,8110489
4,1082270,4143149
...,...,...
302,446952,937835
303,154970,214710
304,1039681,8029975
305,482800,3056636


In [9]:
dataset = irds.load("msmarco-passage/dev/judged")
# Build the id -> text lookups straight from the record iterators. This gives
# the same dictionaries as going through a DataFrame, without first holding
# every record in an intermediate frame.
docs = {doc.doc_id: doc.text for doc in dataset.docs_iter()}
queries = {query.query_id: query.text for query in dataset.queries_iter()}

In [10]:
# Attach the query and passage text to every pair. Ids are cast to str before
# the lookup (presumably the lookup dicts are keyed by string ids — confirm).
to_judge = to_judge.assign(
    query_text=to_judge['query_id'].astype(str).map(queries),
    doc_text=to_judge['doc_id'].astype(str).map(docs),
)

In [11]:
# Grade each (query, passage) pair in row order; one call per row.
to_judge['relevance'] = [
    safe_umbrela_with_retry(query_text, doc_text)
    for query_text, doc_text in zip(to_judge['query_text'], to_judge['doc_text'])
]

In [13]:
# Append the new judgements to the existing ones. Existing rows come first,
# so drop_duplicates (which keeps the first occurrence) lets an earlier
# judgement win for a repeated (query_id, doc_id) pair.
all_judged = (
    pd.concat([judged, to_judge])
    .drop_duplicates(subset=['query_id', 'doc_id'])
)
# Written back to the same tab-separated file, without the index.
all_judged.to_csv('data/all_umbrela_judgements.tsv', sep='\t', index=False)