In [1]:
import os  
# Set LOG_LEVEL=warning in this process's environment before the project modules
# in the next cell are imported (presumably they read it on import — TODO confirm).
os.environ["LOG_LEVEL"] = "warning"

In [2]:
import ast
import textwrap
from typing import List
import pandas as pd
from tqdm import tqdm
from glob import glob
from time import time


from openai import AzureOpenAI

from services.opensearch_index import OpenSearchService
from utils.path_utils import ensure_dir

In [3]:
# Load the reference question/answer set (tab-separated); the first column becomes the index.
correct_qa_df = pd.read_csv(
    "../data/live_rag_questions/dmds_combined_questions_1000.tsv", sep="\t", index_col=0
)
# convert strings of lists to actual lists
# (ast.literal_eval only parses Python literals; it does not execute code)
correct_qa_df[["document_ids", "context"]] = correct_qa_df[
    ["document_ids", "context"]
].map(ast.literal_eval)
correct_qa_df


Unnamed: 0_level_0,question,answer,context,document_ids,question_factuality,question_premise,question_phrasing,question_linguistic_variation,question_multi-doc,user_expertise-categorization,generation_timestamp,question_length,answer_length,context_length
qid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,exocrine pancreatic insufficiency symptoms cau...,Exocrine pancreatic insufficiency (EPI) is a c...,[Get inside Wall Street with StreetInsider Pre...,[<urn:uuid:3b710308-c2a0-45cc-a851-d7cf8689c7c...,factoid,with-premise,long-search-query,similar-to-document,three-doc,novice,2025-05-01T22:47:46.423955,8,65,1459
2,traditional kofte ingredients food poisoning r...,Traditional k√∂fte is made with minced beef or ...,[True bastions of Turkish cuisine: there‚Äôs a f...,[<urn:uuid:001884ec-61a4-42f0-a44b-46bd41f5331...,open-ended,direct,short-search-query,similar-to-document,multi-aspect,novice,2025-05-01T22:47:46.423955,6,98,2742
3,I'm interested in environmental organizations ...,The Commission for Environmental Cooperation (...,[Richard A. Morgan announced as new Executive ...,[<urn:uuid:dd1b1a2e-de6c-4333-a242-62b476f0231a>],open-ended,with-premise,verbose-and-natural,similar-to-document,single-doc,novice,2025-05-01T22:47:46.423955,25,69,528
4,As a mental health professional concerned with...,The patient journey typically begins with an a...,[Psychiatric Emergency Services (PES)\nThe Psy...,[<urn:uuid:396e3028-f0a8-40ba-bd3c-bee87fa30fb...,open-ended,with-premise,verbose-and-natural,similar-to-document,three-doc,expert,2025-05-01T22:47:46.423955,43,137,3404
5,As a culinary historian exploring ancient past...,The early cultures had distinct approaches to ...,"[Pasta is a universally enjoyed food, and almo...",[<urn:uuid:239fecec-95de-40d2-b243-4d9ba9bd53c...,factoid,with-premise,verbose-and-natural,distant-from-document,three-doc,expert,2025-05-01T22:47:46.423955,33,112,3750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
996,My uncle keeps saying he doesn't have a gambli...,"Such a meeting is called an intervention, and ...",[Although people most commonly associate inter...,[<urn:uuid:3dec898b-52e0-40a5-8507-16f518617d22>],open-ended,with-premise,verbose-and-natural,distant-from-document,single-doc,novice,2025-05-02T11:45:30.494741,36,91,583
997,ways export modified 3d object surface design ...,"For three-dimensional problems, surface plots ...",[How to Reuse a Deformed Shape as a Geometry I...,[<urn:uuid:4a723482-c3db-427c-b1c8-144f6217f60b>],open-ended,direct,long-search-query,distant-from-document,single-doc,novice,2025-05-02T11:45:30.494741,11,56,1420
998,I'm researching global environmental threats -...,Both CO2 absorption and chemical contamination...,[A third of the planet's agricultural land is ...,[<urn:uuid:6846b2ea-5334-4991-8c7b-a862537d742...,factoid,with-premise,verbose-and-natural,distant-from-document,comparison,expert,2025-05-02T11:45:30.494741,24,79,2313
999,benefits drawbacks peer mediated instruction d...,Peer-mediated interventions are beneficial for...,[This chapter provides a descriptive review of...,[<urn:uuid:ab80ee62-79aa-4e8b-860d-247f948e6ad...,factoid,with-premise,long-search-query,distant-from-document,multi-aspect,expert,2025-05-02T11:45:30.494741,8,78,2683


In [4]:
# Keep only the columns needed downstream and emit one row per context passage:
# explode() repeats question/document_ids for every element of the "context" list,
# and reset_index(drop=False) turns the former index into a regular column.
# NOTE(review): this rebinds correct_qa_df, so re-running this cell without
# re-running the load cell operates on the already-exploded frame.
correct_qa_df = (
    correct_qa_df[["question", "context", "document_ids"]].explode("context").reset_index(drop=False)
)
# Store the reference ids as sets for the membership tests done later.
correct_qa_df['document_ids'] = correct_qa_df['document_ids'].apply(set)
correct_qa_df

Unnamed: 0,qid,question,context,document_ids
0,1,exocrine pancreatic insufficiency symptoms cau...,Get inside Wall Street with StreetInsider Prem...,{<urn:uuid:9848238d-ca6c-4a85-8a99-55ac51736f1...
1,1,exocrine pancreatic insufficiency symptoms cau...,The Digestive Process: What Is the Role of You...,{<urn:uuid:9848238d-ca6c-4a85-8a99-55ac51736f1...
2,2,traditional kofte ingredients food poisoning r...,True bastions of Turkish cuisine: there‚Äôs a fe...,{<urn:uuid:001884ec-61a4-42f0-a44b-46bd41f5331...
3,2,traditional kofte ingredients food poisoning r...,While grocery shopping for the Father‚Äôs Day we...,{<urn:uuid:001884ec-61a4-42f0-a44b-46bd41f5331...
4,3,I'm interested in environmental organizations ...,Richard A. Morgan announced as new Executive D...,{<urn:uuid:dd1b1a2e-de6c-4333-a242-62b476f0231a>}
...,...,...,...,...
1564,998,I'm researching global environmental threats -...,A third of the planet's agricultural land is a...,{<urn:uuid:51ca6f21-f0f4-4343-8b14-a7f059f0fe6...
1565,998,I'm researching global environmental threats -...,What is Ocean Acidification?\nSince the beginn...,{<urn:uuid:51ca6f21-f0f4-4343-8b14-a7f059f0fe6...
1566,999,benefits drawbacks peer mediated instruction d...,This chapter provides a descriptive review of ...,{<urn:uuid:c1de7ed9-c63a-42ea-a367-6c7ffc1a20b...
1567,999,benefits drawbacks peer mediated instruction d...,STRATEGY: Ensure school-level expulsion and su...,{<urn:uuid:c1de7ed9-c63a-42ea-a367-6c7ffc1a20b...


In [5]:
# Sanity check: show the value at row 0, column position 3 (a set of document ids per the output).
correct_qa_df.iloc[0,3]

{'<urn:uuid:3b710308-c2a0-45cc-a851-d7cf8689c7c4>',
 '<urn:uuid:9848238d-ca6c-4a85-8a99-55ac51736f1c>'}

In [6]:
def run_search(
    queries: List[str],
    top_k: int = 100,
):
    """Query OpenSearch once per question and flatten the hits into a table.

    Args:
        queries: The question strings to search for, one request per entry.
        top_k: Number of hits requested for each question.

    Returns:
        A DataFrame with one row per hit and the columns ``question``,
        ``doc_text``, ``doc_id`` and ``chunk_id`` (the part of the hit id after
        the last ``::``). The same table is also written to a TSV file under
        ``../data/live_rag_questions/``.
    """
    # The original author noted that retrieving top_k documents for every
    # question took about 9 minutes.
    search_service = OpenSearchService()
    raw_results = [
        search_service.query_opensearch(query=question, top_k=top_k)
        for question in tqdm(queries)
    ]

    # One row per question holding its list of hits, then one row per hit.
    hits_df = pd.DataFrame(raw_results)[["hits"]].assign(question=queries)
    hits_df = hits_df.explode("hits").reset_index(drop=True)

    hits = hits_df["hits"]
    res_df = hits_df.assign(
        doc_text=hits.apply(lambda hit: hit.source.text),
        doc_id=hits.apply(lambda hit: hit.source.doc_id),
        chunk_id=hits.apply(lambda hit: hit.id.split('::')[-1]),
    ).drop(columns=["hits"])

    output_path = f"../data/live_rag_questions/dmds_combined_questions_{len(queries)}_opensearch_{top_k}-results_.tsv"
    res_df.to_csv(output_path, sep="\t", index=False)

    return res_df


# The live search is disabled; the results of an earlier run are loaded from disk instead.
# NOTE(review): run_search() writes a file name ending in "-results_.tsv", while the
# file read below ends in "-results.tsv" — confirm the cached file is the intended one.
# res_df = run_search(
#     queries=correct_qa_df["question"].unique(),
#     top_k=1000,
# )

res_df = pd.read_csv(
    "../data/live_rag_questions/dmds_combined_questions_1000_opensearch_1000-results.tsv",
    sep="\t",
)
res_df

Unnamed: 0,question,doc_text,doc_id
0,exocrine pancreatic insufficiency symptoms cau...,Exocrine pancreatic insufficiency (EPI) is cha...,<urn:uuid:a4c85ebf-b52e-4fa1-859d-eafa27d6a7da>
1,exocrine pancreatic insufficiency symptoms cau...,Biliary dyskinesia)\n- Pancreatic: Pancreatiti...,<urn:uuid:7e4d0cb7-233c-441c-95a8-da5fa32dfba3>
2,exocrine pancreatic insufficiency symptoms cau...,Different theories have been proposed why diab...,<urn:uuid:3f7da26e-b111-42b3-a34a-24329d8275a8>
3,exocrine pancreatic insufficiency symptoms cau...,Exocrine pancreatic insufficiency syndrome in ...,<urn:uuid:751de634-e494-428b-bf85-0569826de5cb>
4,exocrine pancreatic insufficiency symptoms cau...,The pancreas is a small pink organ located und...,<urn:uuid:d72602f5-aa76-410d-a9ce-970f470d2ec2>
...,...,...,...
999995,wga strike members representation negotiations,Posted BY: Justin O. Smith\nA railway strike h...,<urn:uuid:c7b3ba80-3ebc-420f-87b0-2d0a04e74bb0>
999996,wga strike members representation negotiations,"""These allegations are without merit, and this...",<urn:uuid:c5f468ba-d540-486f-ab8d-3ae4ec82bff0>
999997,wga strike members representation negotiations,"Moreover, this protocol details a robust and r...",<urn:uuid:450830e8-e063-4fc5-9b4f-995899ce138a>
999998,wga strike members representation negotiations,"Medicine, Issue 85, Transgenic mice, breast ca...",<urn:uuid:505d2f80-7355-4b6c-8627-c08a43728a0c>


In [7]:
def remove_relevant_docs_from_candidates(candidate_df, correct_df, nth=0, search_results_df=None):
    """Swap out candidate documents that are known reference documents.

    A candidate whose ``doc_id`` is among the reference ``document_ids`` of its
    question is replaced by the first search result for the same question, at
    position ``nth`` or later, that is not a reference document. If no such
    result exists the candidate's ``doc_id`` stays empty (``None``).

    Args:
        candidate_df: Frame with ``question``, ``doc_id`` and ``doc_text``
            columns. It is modified in place and also returned.
        correct_df: Frame with ``question`` and ``document_ids`` columns; the
            first row found for a question supplies its reference ids.
        nth: Position within a question's search results from which
            replacements are looked for.
        search_results_df: Frame with ``question``, ``doc_id`` and ``doc_text``
            columns used as the replacement pool. When omitted, the
            module-level ``res_df`` is used, which is what the function always
            did before this parameter existed.

    Returns:
        ``candidate_df`` (the same object that was passed in).
    """
    if search_results_df is None:
        # Backward-compatible fallback on the notebook-level search results.
        search_results_df = res_df

    # Build the question -> reference ids lookup once instead of filtering
    # correct_df for every candidate row. keep="first" mirrors the previous
    # `.values[0]` behaviour; questions absent from correct_df simply have no
    # reference documents instead of raising IndexError.
    first_rows = correct_df.drop_duplicates(subset="question", keep="first")
    reference_ids = dict(zip(first_rows["question"], first_rows["document_ids"]))

    # Remember the original ids so the log below can name what was replaced.
    flagged = {
        row.Index: row.doc_id
        for row in candidate_df.itertuples()
        if row.doc_id in reference_ids.get(row.question, ())
    }
    for index in flagged:
        candidate_df.loc[index, 'doc_id'] = None
    count = len(flagged)
    print(f"Number of reference documents: {count}")

    if count > 0:
        for row in candidate_df[candidate_df["doc_id"].isna()].itertuples():
            known_relevant = reference_ids.get(row.question, ())
            same_question = search_results_df[search_results_df["question"] == row.question]
            for _row in same_question.iloc[nth:].itertuples():
                if _row.doc_id in known_relevant:
                    continue
                old_doc_id = flagged.get(row.Index, row.doc_id)
                print(f"For question:\n {row.question},\n replacing {old_doc_id} with {_row.doc_id}")
                print(f"Replacing {row.Index} with {_row.Index}")
                candidate_df.loc[row.Index, 'doc_id'] = _row.doc_id
                candidate_df.loc[row.Index, 'doc_text'] = _row.doc_text
                print(f"Replacing:\n{row.doc_text}")
                print(f"with:\n{_row.doc_text}")
                count -= 1
                break
    print(f"Number of reference documents in candidates: {count}")
    return candidate_df

# candidate_qa_df = res_df.groupby("question").nth(100).reset_index(drop=True)
# candidate_qa_df = remove_relevant_docs_from_candidates(candidate_qa_df, correct_qa_df)
# candidate_qa_df

In [8]:
def get_azure_openai_client():
    """Build an Azure OpenAI client that authenticates with an API key.

    The endpoint is taken from the ``ENDPOINT_URL`` environment variable
    (with a project default) and the key from ``AZURE_OPENAI_API_KEY``.

    Returns:
        A configured ``AzureOpenAI`` client.
    """
    return AzureOpenAI(
        azure_endpoint=os.getenv(
            "ENDPOINT_URL", "https://relevance-judgments.openai.azure.com/"
        ),
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        api_version="2025-01-01-preview",
    )


def prepare_chat_prompt(user_prompt, system_instructions=None):
    """Build the chat ``messages`` list for a single-turn request.

    Args:
        user_prompt: Text of the user message.
        system_instructions: Optional text of a system message. When given
            (and non-empty) it is sent as a separate message before the user
            message.

    Returns:
        A list of message dicts, each with a ``role`` and a ``content`` list
        holding one ``{"type": "text", "text": ...}`` part.
    """
    chat_prompt = []
    if system_instructions:
        # The system message must be its own dict: previously both messages
        # were written into one dict literal, so the duplicate "role"/"content"
        # keys silently discarded the system instructions.
        chat_prompt.append(
            {
                "role": "system",
                "content": [{"type": "text", "text": system_instructions}],
            }
        )
    chat_prompt.append(
        {"role": "user", "content": [{"type": "text", "text": user_prompt}]}
    )
    return chat_prompt


def get_chat_response(client, messages, model="gpt-4o"):
    """Request one non-streaming chat completion and return it unmodified.

    Args:
        client: Object exposing ``chat.completions.create``.
        messages: Chat messages to send.
        model: Name of the model (deployment) to call.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    request_options = {
        "model": model,
        "messages": messages,
        "max_tokens": 50,  # the judge is asked for a single number only
        "temperature": 0,  # no sampling randomness
        "top_p": 1,
        "frequency_penalty": 0.5,
        "stop": None,
        "stream": False,
    }
    return client.chat.completions.create(**request_options)

In [9]:
# Prompt template for the relevance judge. It is filled with str.format() using the
# {query} and {passage} placeholders and asks for a single grade from 0 to 3.
user_instructions_prompt = """Please read the question and the passage below and indicate how relevant the passage is to the query. 
Use the following scale: 

* 3 for perfectly relevant: The passage is dedicated to the query and contains the exact answer. 
* 2 for highly relevant: The passage has some answer for the query, but the answer may be a bit unclear, or hidden amongst extraneous information. 
* 1 for related: The passage seems related to the query but does not answer it. 
* 0 for irrelevant: The passage has nothing to do with the query. 

Question: 
{query} 

Passage: 
{passage} 

Indicate how relevant the passage is, using the scale above. 
Give only a number, do not give any explanation."""


In [10]:
def judge_qa_pairs(df, model, output_dir=None, iteration=0):
    """Ask the judge model to grade every (question, passage) row of ``df``.

    For each row the prompt template is filled with ``question`` and
    ``doc_text`` and sent to the model. The raw completion is stored as
    ``<row index>-<iteration>-<model>.json`` in the output directory; if the
    request fails, the error text is stored in a ``.error`` file instead and
    the row gets no response.

    Args:
        df: Frame with ``question`` and ``doc_text`` columns. It is not
            modified; a copy is annotated and returned.
        model: Name of the model (deployment) to call; also used in file names
            and in the name of the added column.
        output_dir: Base directory for the output files. A sub-directory named
            after the model is used. Defaults to
            ``../data/live_rag_questions/azure_openai_responses``.
        iteration: Run counter that is embedded in the output file names.

    Returns:
        A copy of ``df`` with an added ``{model}_response`` column holding the
        text of the first choice, or ``None`` where no response was obtained.
        The same frame is written as a TSV file to the output directory.
    """
    if output_dir is None:
        _dir = ensure_dir(f"../data/live_rag_questions/azure_openai_responses/{model}")
    else:
        _dir = ensure_dir(output_dir + f"/{model}")

    # Initialize the OpenAI client
    client = get_azure_openai_client()

    _df = df.copy()

    responses = []
    for row in tqdm(_df.itertuples(), total=len(_df)):
        query = row.question
        passage = row.doc_text

        # Prepare the chat prompt
        chat_prompt = prepare_chat_prompt(
            user_prompt=user_instructions_prompt.format(query=query, passage=passage)
        )

        # Stays None unless this row's own request succeeds and can be decoded,
        # so a failed request can never pick up an earlier row's completion.
        response_text = None
        try:
            # Get the chat response
            completion = get_chat_response(client, chat_prompt, model=model)
        except Exception as e:
            # Best effort: record the failure and continue with the next row.
            print(f"Error: {e}")
            print(f"Row: {row.Index}")
            print(f"Query: {query}")
            print(f"Passage: {passage}")
            with open(f"{_dir}/{row.Index}-{iteration}-{model}.error", "w") as f:
                f.write(str(e))
        else:
            # save the response to a json file
            with open(f"{_dir}/{row.Index}-{iteration}-{model}.json", "w") as f:
                f.write(completion.to_json())

            # cast the response to a dict
            try:
                completion_dict = completion.to_dict()
            except Exception as e:
                print(f"Error: {e}")
                print(f"Row: {row.Index}")
                completion_dict = {}

            # Extract the text of the first choice, if there is one
            choices = completion_dict.get("choices")
            if choices:
                message = choices[0].get("message")
                if message:
                    response_text = message.get("content")

        # Append the decoded response to the list
        responses.append(response_text)

    _df[f"{model}_response"] = responses
    # time must be called: interpolating the bare function put
    # "<built-in function time>" into the file name instead of a timestamp.
    timestamp = int(time())
    _df.to_csv(
        f"{_dir}/dmds_combined_questions_{len(_df)}_opensearch_results_azure_openai_{model}_responses-{iteration}_{timestamp}.tsv",
        sep="\t",
        index=False,
    )
    return _df


# # examine the responses
# def examine_responses(df):
#     for idx, row in df[df["gpt-4o_response"].fillna("7").astype(int) > 0].iterrows():
#         print(f"Row: {idx}")
#         print(f"Query: {textwrap.fill(row.question, 100)}\n")
#         print(f"Passage: {textwrap.fill(row.doc_text, 120)}\n")
#         print(f"Response: {row['gpt-4o_response']}")
#         print("-" * 50)

In [11]:
def select_negative_samples(
    res_df,
    correct_qa_df,
    max_iterations=10,
    start_document_rank=45,
    model="gpt-4.1-mini",
):
    """Iteratively collect passages that the judge model grades as 0 (irrelevant).

    One candidate passage per question is taken from ``res_df`` at position
    ``start_document_rank`` within the question's rows, judged, and kept as a
    negative sample if its grade is 0. For questions whose candidate got a
    grade above 0, a new candidate is taken further down (position
    ``start_document_rank * (i + 1)`` in iteration ``i``) and the process repeats.

    Args:
        res_df: Search results with ``question``, ``doc_text`` and ``doc_id``
            columns (presumably ordered by rank within each question — TODO confirm).
        correct_qa_df: Reference data passed to
            ``remove_relevant_docs_from_candidates``.
        max_iterations: Maximum number of judging rounds.
        start_document_rank: Position of the first candidate within each
            question's results; also the step factor for later rounds.
        model: Judge model name; also used in column and file names.

    Returns:
        The negative samples as a DataFrame, which is also written to a TSV file.
        NOTE(review): if fewer than 10 negatives have been collected after a
        round, the function returns early with a ``(negative_samples_df, _df)``
        tuple and writes no file — callers expecting a DataFrame should confirm
        this is intended.
    """
    negative_samples_df = pd.DataFrame()

    # get the first candidates
    n_questions = res_df["question"].nunique()
    _df = res_df.groupby("question").nth(start_document_rank).reset_index(drop=True)
    # NOTE(review): the helper draws replacements from the module-level `res_df`,
    # not from the `res_df` argument passed to this function.
    _df = remove_relevant_docs_from_candidates(
        _df, correct_qa_df, nth=start_document_rank
    )

    for i in range(1, max_iterations + 1):

        # judge the candidates
        _df = judge_qa_pairs(_df, model=model, iteration=i, output_dir="../data/live_rag_questions/azure_openai_responses_negative_runs")
        # Missing responses become the sentinel 7, i.e. they are treated as "not negative".
        # astype(int) raises if the model answered with anything but an integer.
        _df[f"{model}_response"] = _df[f"{model}_response"].fillna("7").astype(int)

        print(
            f"Iteration {i}, judging results: {_df.value_counts(f'{model}_response')}"
        )

        # Keep the candidates graded 0 as negative samples.
        negative_samples_df = pd.concat(
            [
                negative_samples_df,
                _df.loc[_df[f"{model}_response"] == 0],
            ],
            ignore_index=True,
        )
        print(f"Iteration {i}: {len(negative_samples_df)} negative samples")

        # Early exit with a different return shape (tuple) and without saving.
        if len(negative_samples_df) < 10:
            return negative_samples_df, _df

        if len(negative_samples_df) >= n_questions:
            print(f"Found {len(negative_samples_df)} negative samples, stopping.")
            break

        # Questions whose candidate was not graded 0 get a new candidate.
        next_candidates = _df.loc[_df[f"{model}_response"] > 0].copy()

        # set the next document rank to start from
        _nth = start_document_rank * (i + 1)

        next_candidates = (
            res_df.loc[res_df["question"].isin(next_candidates["question"])]
            .groupby("question")
            .nth(_nth)
            .reset_index(drop=True)
        )

        next_candidates = remove_relevant_docs_from_candidates(
            next_candidates, correct_qa_df, nth=_nth
        )

        _df = next_candidates.copy()
    else:
        # Runs only when the loop finished without `break`. At this point `_df`
        # holds the candidates prepared for a further round, which have not been
        # judged; they are appended to the negatives as they are.
        print(f"Found {len(negative_samples_df)} negative samples, stopping.")
        print(f"Concatenating all last samples with the negative samples")
        negative_samples_df = pd.concat(
            [
                negative_samples_df,
                _df,
            ],
            ignore_index=True,
        )

    # save the negative samples
    negative_samples_df.to_csv(
        f"../data/live_rag_questions/negative_samples_{len(negative_samples_df)}-{model}.tsv",
        sep="\t",
        index=False,
    )
    return negative_samples_df


# The selection run is disabled; negative samples saved by an earlier
# gpt-4.1-mini run are loaded from disk instead.
# negative_samples_df_1 = select_negative_samples(
#     res_df,
#     correct_qa_df,
#     max_iterations=15,
#     start_document_rank=65,
#     model="gpt-4.1-mini",
# )
negative_samples_df_1 = pd.read_csv(
    "../data/live_rag_questions/negative_samples_954-gpt-4.1-mini.tsv", sep="\t"
)
negative_samples_df_1

Unnamed: 0,question,doc_text,doc_id,gpt-4.1-mini_response
0,exocrine pancreatic insufficiency symptoms cau...,About 94% of pancreatic cancers are classified...,<urn:uuid:2713c691-cd16-42dd-a9ee-55c819f37976>,0.0
1,traditional kofte ingredients food poisoning r...,"THURSDAY, Jan. 8, 2015 (HealthDay News) -- The...",<urn:uuid:fbd1d404-24b9-48f0-8d7b-f233e6af2c0e>,0.0
2,I'm interested in environmental organizations ...,There are lots of organizations out there that...,<urn:uuid:9ffe53ea-c8c6-42e1-973e-7a09fdf84629>,0.0
3,Where was the Christmas message filmed?,We've just launched the MyHeritage Christmas c...,<urn:uuid:2526619d-b41f-4e63-bff4-73a7647fccbb>,0.0
4,I'm researching teacher training programs and ...,DOEE offers a suite of trainings that help Dis...,<urn:uuid:92f510ca-e614-4810-8a1c-f49832ebd5a0>,0.0
...,...,...,...,...
949,color space conversion between rgb cmyk differ...,Volume Pricing (Log In to see Member Pricing)\...,<urn:uuid:89f65520-1518-4da7-99e5-9a2272cd6a16>,0.0
950,nft digital art market impact benefits artists...,One of the primary features of the platform is...,<urn:uuid:8cee8941-3c6f-4420-baf8-f828a8d4b560>,0.0
951,body temperature comparison hypothermia frostb...,Fever Treatment : Fever is caused due to an in...,<urn:uuid:11177517-2317-4704-85b0-5cd3038f1333>,0.0
952,I'm managing a team and want to know - what ar...,Four steps to a sound electrical safety cultur...,<urn:uuid:2da7feda-8587-4745-8fff-948efaf89d2e>,0.0


In [12]:
# Same as the previous cell, but for a gpt-4o run (start_document_rank=65);
# the saved result is loaded instead of re-running the selection.
# negative_samples_df_2 = select_negative_samples(
#     res_df,
#     correct_qa_df,
#     max_iterations=15,
#     start_document_rank=65,
#     model="gpt-4o",
# )
negative_samples_df_2 = pd.read_csv(
    "../data/live_rag_questions/negative_samples_991-gpt-4o.tsv", sep="\t"
)
negative_samples_df_2

Unnamed: 0,question,doc_text,doc_id,gpt-4o_response
0,exocrine pancreatic insufficiency symptoms cau...,About 94% of pancreatic cancers are classified...,<urn:uuid:2713c691-cd16-42dd-a9ee-55c819f37976>,0.0
1,traditional kofte ingredients food poisoning r...,"THURSDAY, Jan. 8, 2015 (HealthDay News) -- The...",<urn:uuid:fbd1d404-24b9-48f0-8d7b-f233e6af2c0e>,0.0
2,I'm interested in environmental organizations ...,There are lots of organizations out there that...,<urn:uuid:9ffe53ea-c8c6-42e1-973e-7a09fdf84629>,0.0
3,As a culinary historian exploring ancient past...,"ravioli and xiaolongbao\nseafood and tobiko, m...",<urn:uuid:8d0b135b-6322-40bc-8472-166a5be3c7ec>,0.0
4,Where was the Christmas message filmed?,We've just launched the MyHeritage Christmas c...,<urn:uuid:2526619d-b41f-4e63-bff4-73a7647fccbb>,0.0
...,...,...,...,...
986,inattentive adhd anxiety disorder physical sym...,Study: Methylphenidate is Effective Long-Term ...,<urn:uuid:ef1b5868-0d7a-4387-b864-7246ca1e945a>,0.0
987,ways start discussion cyberbullying teens,Free discussion questions are available for bo...,<urn:uuid:a9b9b431-6985-483a-89a2-94e8563f7e39>,0.0
988,preferred structures habitat fish locations,Monarch butterfly habitat focus of new environ...,<urn:uuid:12b54b80-b4d3-4140-aa70-bc02ebc45e7b>,0.0
989,ramsar wetlands environmental threats sustaina...,It will also assess how WFO could set up a pre...,<urn:uuid:acd7eb18-cdd8-4a88-bdcb-b465151f9322>,0.0


In [13]:

# Second gpt-4o run, starting one position earlier (start_document_rank=64);
# the saved result is loaded instead of re-running the selection.
# negative_samples_df_3 = select_negative_samples(
#     res_df,
#     correct_qa_df,
#     max_iterations=15,
#     start_document_rank=64,
#     model="gpt-4o",
# )
negative_samples_df_3 = pd.read_csv(
    "../data/live_rag_questions/negative_samples_995-gpt-4o.tsv", sep="\t"
)
negative_samples_df_3

Unnamed: 0,question,doc_text,doc_id,gpt-4o_response
0,I'm interested in environmental organizations ...,"In the first part of this article, the author ...",<urn:uuid:8b4989f8-1859-42a2-b91b-c6a0e1abaf71>,0.0
1,As a culinary historian exploring ancient past...,The year was 1908 and a small trattoria in Rom...,<urn:uuid:af2e0c3d-1996-4682-8eed-5315970c68d6>,0.0
2,doctor measured my blood pressure today got nu...,My father was being treated for HBP for a coup...,<urn:uuid:c155f296-fe57-47d2-bd02-c349f21cfda7>,0.0
3,I'm researching teacher training programs and ...,"No catches, no fine print just unadulterated b...",<urn:uuid:a558795d-4182-4114-8d87-6add30fe28fd>,0.0
4,What factors are important for successful cabb...,"Just a matter of finely shredding the cabbage,...",<urn:uuid:4399f5ff-2935-4048-8275-56e3f58fafbf>,0.0
...,...,...,...,...
990,"Being interested in distributed systems, I'd l...",The attacks were clearly planned well in advan...,<urn:uuid:df8b61d7-3843-4f70-941e-5b7d86608362>,0.0
991,security private cloud public cloud difference...,Navigating Data Security in an Era of Hybrid W...,<urn:uuid:9395cd26-0714-469e-a032-62ba545eeec4>,0.0
992,marine animal food chain pollution effects,All precautions and rules are already being fo...,<urn:uuid:72b741fa-5d81-4b98-8f60-fe7b5eff80e5>,0.0
993,looking for camera with good screen what is lc...,You can also play your movies and slideshows o...,<urn:uuid:0340acea-146d-4bd5-9c18-a00789e5ac04>,0.0


In [14]:
# Side-by-side view of the gpt-4.1-mini and gpt-4o negative samples per question.
# The outer join keeps questions that appear in only one of the two sets; pandas
# adds the _x / _y suffixes to the columns both frames share (doc_text, doc_id).
negative_samples_df_1.set_index("question").merge(
    negative_samples_df_2.set_index("question"),
    how="outer",
    left_index=True,
    right_index=True,
)

Unnamed: 0_level_0,doc_text_x,doc_id_x,gpt-4.1-mini_response,doc_text_y,doc_id_y,gpt-4o_response
question,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Are social media and advocacy approaches similar between Hispanic and Native heritage celebrations?,Chiquita's top leader spoke of resilience and ...,<urn:uuid:1d70d876-017c-4a74-ad3c-37fb67c2f8de>,0.0,I embrace my roots and enlighten others about ...,<urn:uuid:5e13681f-078c-4c6e-a9f5-e0ae128d4c2f>,0.0
Are solar pumps good for small farms?,The city has been a leader in sustainable prac...,<urn:uuid:f602598b-6564-4e0f-bd4f-6f74e1697233>,0.0,Tag Archives: NJ\nNew Jersey is turning eyesor...,<urn:uuid:c2971b9d-d32b-4401-a812-1c81626d4869>,0.0
Are ticks or raccoons more dangerous for disease transmission?,,,,Items 1-24 of 106\nFlea & Tick Treatments For ...,<urn:uuid:4ab73bd0-cce1-4645-b3ab-c213a29a36e5>,0.0
"As a GIS specialist interested in data modeling comparisons, I'm curious about the similarities between GIS data models and BIM - do both approaches focus on representing physical objects with detailed attributes throughout their lifecycle?",The certificate will be awarded when a student...,<urn:uuid:cf1a889c-b045-4d97-9e4e-bd09d8b60a2a>,0.0,The certificate will be awarded when a student...,<urn:uuid:cf1a889c-b045-4d97-9e4e-bd09d8b60a2a>,0.0
"As a breast cancer researcher studying viral links, how does MMTV transmission and treatment differ from BRCA-related cancers?",In findings that are fundamentally reshaping t...,<urn:uuid:d8c8d2ca-007c-477e-9ed2-9372e897a526>,0.0,From Our 2007 Archives\nBreast Cancer Genes Do...,<urn:uuid:a3034fa2-a7e0-417d-b27f-11cc89127857>,0.0
...,...,...,...,...,...,...
young criminal monitoring options effectiveness,Your doctor may want to test your liver functi...,<urn:uuid:32635e61-774c-426b-bd89-84c17457cdaa>,0.0,Your doctor may want to test your liver functi...,<urn:uuid:32635e61-774c-426b-bd89-84c17457cdaa>,0.0
young people mental health stress climate change comparison,"Community Mental Health Journal\nR.E., Essock,...",<urn:uuid:310edc27-e4a1-475b-a74a-48d86633db84>,0.0,From the Young Lawyers Division newsletter‚Ä¶.\n...,<urn:uuid:662c009c-c85e-4e6e-ad4a-c803cdccd621>,0.0
youth group leader need quiet fun movement activities tips indoor spaces small area suggestions silent group games,Our mission is to provide youth a place where ...,<urn:uuid:0ee5dd30-c373-4bca-b139-5c41e981ebd0>,0.0,Our mission is to provide youth a place where ...,<urn:uuid:0ee5dd30-c373-4bca-b139-5c41e981ebd0>,0.0
zebra mussels lake winnipeg first detection,4oz Jar of Eelsnot to protect the hull and the...,<urn:uuid:83447a59-a495-4a6a-9d13-a89fb5baa01f>,0.0,Aquatic invasive species (AIS) are an economic...,<urn:uuid:f30eb84a-f6b9-490e-8ded-93c658a09c4a>,0.0


In [15]:
# Collect every per-iteration judgment TSV of each model and stack them into one frame.
# NOTE(review): the patterns look directly under ../data/live_rag_questions/, whereas
# judge_qa_pairs() writes into a per-model sub-directory — confirm the files were moved here.
# pd.concat raises ValueError when a pattern matches no file.
gpt41_judgments_files = glob(
    "../data/live_rag_questions/dmds_combined_questions_*_opensearch_results_azure_openai_gpt-4.1-mini_responses-*.tsv"
)
print(f"Found {len(gpt41_judgments_files)} GPT-4.1 negative files")
gpt41_judgments_df = pd.concat(
    [pd.read_csv(f, sep="\t") for f in gpt41_judgments_files],
    ignore_index=True,
)
# This bare expression is not displayed: only the last expression of a cell is rendered.
gpt41_judgments_df

gpt4o_judgments_files = glob(
    "../data/live_rag_questions/dmds_combined_questions_*_opensearch_results_azure_openai_gpt-4o_responses-*.tsv"
)
print(f"Found {len(gpt4o_judgments_files)} GPT-4o negative files")
gpt4o_judgments_df = pd.concat(
    [pd.read_csv(f, sep="\t") for f in gpt4o_judgments_files],
    ignore_index=True,
)
gpt4o_judgments_df

Found 29 GPT-4.1 negative files
Found 15 GPT-4o negative files


Unnamed: 0,question,doc_text,doc_id,gpt-4o_response
0,How can companies protect their intellectual p...,A technology industry organization says digita...,<urn:uuid:84538267-159d-4436-b306-62ec8f0f6a2f>,2.0
1,color space conversion between rgb cmyk differ...,This is a great online color tester and design...,<urn:uuid:91159e73-0b04-4022-9e25-dda1cfb288bf>,1.0
2,cbt vs cft techniques treating mental health,"I‚Äôm especially not ok with this, because as a ...",<urn:uuid:d2b904f8-ea05-4c25-a0d0-99078ae1ce33>,1.0
3,ways start discussion cyberbullying teens,Free discussion questions are available for bo...,<urn:uuid:a9b9b431-6985-483a-89a2-94e8563f7e39>,0.0
4,manufacturing process monitoring tools applica...,The Remsdaq SCADA systems division designs and...,<urn:uuid:889621be-3fa1-4465-b192-4c56ef0a831d>,1.0
...,...,...,...,...
1971,long term dental bridge 4 implants outcomes,All-on-4 Dental Implants Specialist in Anaheim...,<urn:uuid:8f2f502f-4d53-41f1-82d9-96d33e1d57f7>,1.0
1972,I'm comparing water conservation methods: buck...,: I am building a new home in U.K. and I would...,<urn:uuid:c1544db3-d4f1-4b98-aab2-4bd9c977e9bd>,1.0
1973,prevention techniques treatment options torn a...,"Receiving Jefferson Barracks, MO Sports Therap...",<urn:uuid:243391d3-7bd1-44bd-a1b4-9d0dbe57635d>,2.0
1974,reef tourism business environmental effects,"For example, in the center of the below photo,...",<urn:uuid:5de2164a-0948-424d-bd7f-7575f8d50471>,1.0


In [16]:
# Total of all gpt-4.1-mini grades per question, highest total first.
gpt41_judgments_df.groupby("question")["gpt-4.1-mini_response"].sum().sort_values(ascending=False)

question
real illusion photography lighting techniques                                                                                                                                                                  32.0
How can team members help each other work better together?                                                                                                                                                     29.0
How can I make my garden plants grow healthy?                                                                                                                                                                  27.0
intellectual property protection business contract conditions                                                                                                                                                  27.0
mental activity physical exercise dementia prevention                                                                                          

In [17]:
# Total of all gpt-4o grades per question, highest total first.
gpt4o_judgments_df.groupby("question")["gpt-4o_response"].sum().sort_values(ascending=False)

question
How can I make my garden plants grow healthy?                                                      29.0
How can team members help each other work better together?                                         29.0
manufacturing process monitoring tools applications                                                23.0
How can companies protect their intellectual property?                                             23.0
forest conservation measures extinction prevention biological diversity preservation approaches    22.0
                                                                                                   ... 
What defined Teddy Roosevelt's understanding of international relations?                            0.0
What defines a tree in graph theory terms?                                                          0.0
What did Padmasambhava teach about Vajrakƒ´laya meditation?                                          0.0
What different observations require a thorough sputum 

In [18]:
# Align the judgments of both models on (question, doc_text); the outer join keeps
# pairs that only one of the models judged.
combined_judgments_df = gpt41_judgments_df.set_index(["question", "doc_text"]).merge(
    gpt4o_judgments_df.set_index(["question", "doc_text"]),
    left_index=True,
    right_index=True,
    how="outer",
)

# Despite the name, this is the row-wise MAXIMUM over the numeric columns;
# a missing judgment counts as 0 because of fillna(0).
_scores_sum = combined_judgments_df.fillna(0).max(1, numeric_only=True).sort_values(
    ascending=False
)
# Pairs whose highest grade is below 2.
not_relevant_pairs_df = _scores_sum.loc[_scores_sum < 2]
# index[0] is the first (question, doc_text) tuple, so this prints one example pair:
# its question, then its passage.
for i in not_relevant_pairs_df.index[0]:
    print(textwrap.fill(i, 100))
    print("-" * 50)
    
# write the not relevant pairs to a file
# NOTE(review): the number in the file name is hard-coded, not derived from the data.
not_relevant_pairs_df.index.to_frame().to_json(
    "../data/live_rag_questions/not_relevant_pairs-1358.json",
    orient="records",
    lines=True,
)
not_relevant_pairs_df

what ancient building inspired construction design taj mahal india
--------------------------------------------------
It was during the birth of their last child called Gauhara Begum in 1631, while accompanying her
husband on a military campaign in Burhanpur, central India, that Mumtaz Mahal died at the age of 38.
In 1631, Shah Jahan was three years into his reign and fighting against a rebellion led by Khan
Jahan Lodi. Mumtaz Mahal often joined her husband on military campaigns and was also at his side
during this one despite being heavily pregnant. During the war, she gave birth to a healthy baby
girl and although things looked fine at first, her condition soon worsened and she died from
complications. Mumtaz Mahal died one day after the birth of her daughter in the arms of Shah Jahan.
Some reports say that the emperor cried for 8 days without stopping. Mumtaz Mahal was temporarily
buried at the encampment in Burbanpur until Shah Jahan was able to defeat the rebellion. She was
then t

question                                                                                                                                                                   doc_text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

In [19]:
# Here the row-wise SUM over the numeric columns is used (the previous cell used the
# maximum); missing judgments count as 0. This rebinds _scores_sum.
_scores_sum = combined_judgments_df.fillna(0).sum(1, numeric_only=True).sort_values(
    ascending=False
)
# Pairs whose summed grade is above 2.
relevant_pairs_df = _scores_sum.loc[_scores_sum > 2]
relevant_pairs_df

question                                                                                                                                                           doc_text                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             

In [20]:
def select_positive_samples(
    res_df,
    correct_qa_df,
    max_iterations=10,
    start_document_rank=0,
    model="gpt-4.1-mini",
):
    """Collect judged-positive (question, document) pairs by walking down the ranking.

    The document at position ``start_document_rank`` of every question group in
    ``res_df`` is sent to ``judge_qa_pairs``. Pairs graded above 1 are kept as
    positive samples. Every question whose candidate was graded 1 or lower
    (missing grades count as 0) gets the document at the next position judged
    in the following iteration.

    The loop ends when there are at least as many positives as distinct
    questions, when no question has a further document to try, or after
    ``max_iterations`` rounds. The positives are then written to
    ``../data/live_rag_questions/positive_samples_<count>-<model>.tsv``.

    Args:
        res_df: Retrieval results with at least a ``question`` column. Rows are
            presumably ordered by rank within each question - verify upstream.
        correct_qa_df: Unused; kept so existing calls keep working.
        max_iterations: Maximum number of judging rounds.
        start_document_rank: Zero-based position, within each question group,
            of the first candidate document.
        model: Judge model name; also the prefix of the ``<model>_response``
            column read from the judged frame.

    Returns:
        The positive samples as a DataFrame. If fewer than 10 positives have
        been collected after an iteration, the function instead returns the
        tuple ``(positive_samples_df, last_judged_df)`` right away and does
        not write the file.
    """
    response_col = f"{model}_response"
    n_questions = res_df["question"].nunique()

    # get the first candidates
    candidates_df = (
        res_df.groupby("question").nth(start_document_rank).reset_index(drop=True)
    )

    # Per-iteration positives are gathered in a list so that nothing is ever
    # concatenated onto an empty placeholder frame.
    positive_batches = []
    positive_samples_df = pd.DataFrame()

    for i in range(1, max_iterations + 1):

        # judge the candidates
        judged_df = judge_qa_pairs(
            candidates_df,
            model=model,
            iteration=i,
            output_dir="../data/live_rag_questions/azure_openai_responses_positive_runs",
        )
        # A missing response is treated as grade 0, i.e. not relevant.
        judged_df[response_col] = judged_df[response_col].fillna("0").astype(int)

        print(
            f"Iteration {i}, judging results: {judged_df.value_counts(response_col)}"
        )

        positive_batches.append(judged_df.loc[judged_df[response_col] > 1])
        positive_samples_df = pd.concat(positive_batches, ignore_index=True)
        print(f"Iteration {i}: {len(positive_samples_df)} positive samples")

        # Bail out with both frames when almost nothing was judged positive.
        # NOTE(review): looks like a debugging aid for a failed judging run -
        # callers must be prepared for the tuple return.
        if len(positive_samples_df) < 10:
            return positive_samples_df, judged_df

        if len(positive_samples_df) >= n_questions:
            print(f"Found {len(positive_samples_df)} positive samples, stopping.")
            break

        # Questions that still lack a positive document move on to the next rank.
        unresolved_questions = judged_df.loc[judged_df[response_col] <= 1, "question"]
        candidates_df = (
            res_df.loc[res_df["question"].isin(unresolved_questions)]
            .groupby("question")
            .nth(start_document_rank + i)
            .reset_index(drop=True)
        )

        # Without this guard the remaining iterations would keep sending an
        # empty frame to the judge.
        if candidates_df.empty:
            print(f"Iteration {i}: no further candidate documents to judge, stopping.")
            break
    else:
        print(f"Reached the maximum number of iterations: {max_iterations}")
        print(f"Found {len(positive_samples_df)} positive samples, stopping.")
        print("Saving the file and returning the dataframe")

    # save the positive samples
    positive_samples_df.to_csv(
        f"../data/live_rag_questions/positive_samples_{len(positive_samples_df)}-{model}.tsv",
        sep="\t",
        index=False,
    )
    return positive_samples_df


# The selection run is kept (commented out) for provenance. Its result -
# presumably the file written by that run - is loaded from disk instead of
# calling the judge again.
# positive_samples_df_1 = select_positive_samples(
#     res_df,
#     correct_qa_df,
#     max_iterations=15,
#     start_document_rank=0,
#     model="gpt-4.1-mini",
# )
_positive_samples_path_1 = (
    "../data/live_rag_questions/positive_samples_874-gpt-4.1-mini.tsv"
)
positive_samples_df_1 = pd.read_csv(_positive_samples_path_1, sep="\t")
positive_samples_df_1

Unnamed: 0,question,doc_text,doc_id,gpt-4.1-mini_response
0,exocrine pancreatic insufficiency symptoms cau...,Exocrine pancreatic insufficiency (EPI) is cha...,<urn:uuid:a4c85ebf-b52e-4fa1-859d-eafa27d6a7da>,2
1,traditional kofte ingredients food poisoning r...,Yummy Food Mexican Peanuts üç≤Kofte or Chana Sal...,<urn:uuid:842d2b2e-a668-410b-a670-13138fff29d7>,2
2,I'm interested in environmental organizations ...,CEC spearheading North American pilot to measu...,<urn:uuid:9ec5ad0b-6de4-47c8-8fd9-eb7c04b69993>,3
3,As a mental health professional concerned with...,With the increase in awareness about mental he...,<urn:uuid:a5449e48-6d36-4694-b41d-4704a47b53d9>,2
4,As a culinary historian exploring ancient past...,"Pasta is a universally enjoyed food, and almos...",<urn:uuid:239fecec-95de-40d2-b243-4d9ba9bd53c5>,2
...,...,...,...,...
869,What are the main symptoms someone should look...,Diarrhoea in Infants\nDiarrhoea is the rapid m...,<urn:uuid:b6cf29f5-41fc-4bbf-ae36-72f876efc032>,3
870,How did ancient leaders die while defending th...,Erdogan grows more radical\nAuthor: Kadri Gurs...,<urn:uuid:6db8ef91-186d-4c43-8946-00da8885e0d3>,2
871,What happened to the stock market crashes afte...,That‚Äôs been true after the 1929 crash and the ...,<urn:uuid:889c9b61-0066-45d6-9ef8-f0789e2a04ed>,2
872,How fast can an E. coli bacteria swim?,Understanding The Behavior Of Predatory Bacter...,<urn:uuid:5048cd83-9888-4e8e-a60e-92b53c2042a6>,2


In [21]:
# Same selection as above but judged with gpt-4o, starting at the first
# document of each question. The run itself stays commented out; only its
# stored result is loaded.
# positive_samples_df_2 = select_positive_samples(
#     res_df, correct_qa_df, max_iterations=15, start_document_rank=0, model="gpt-4o"
# )
_positive_samples_path_2 = "../data/live_rag_questions/positive_samples_792-gpt-4o.tsv"
positive_samples_df_2 = pd.read_csv(_positive_samples_path_2, sep="\t")
positive_samples_df_2

Unnamed: 0,question,doc_text,doc_id,gpt-4o_response
0,I'm interested in environmental organizations ...,CEC spearheading North American pilot to measu...,<urn:uuid:9ec5ad0b-6de4-47c8-8fd9-eb7c04b69993>,3
1,As a mental health professional concerned with...,With the increase in awareness about mental he...,<urn:uuid:a5449e48-6d36-4694-b41d-4704a47b53d9>,2
2,doctor measured my blood pressure today got nu...,We've already noted how many tech companies ar...,<urn:uuid:35259052-5c38-4e25-888b-741ed4f6657f>,3
3,ancient modern ocean pH changes coral impacts,"When any change in temperature, light or nutri...",<urn:uuid:f8600bdc-b13a-4424-b54a-63f2ae368526>,2
4,nose blockage and yellow mucus not going away ...,Yellow mucus results from a draining infection...,<urn:uuid:e8c9503e-9253-4e91-ac6f-8a1d5aabc526>,3
...,...,...,...,...
787,I've been studying different philosophical met...,"In 1784, Kant (1724 ‚Äì 1804) wrote a short essa...",<urn:uuid:bbc6d9dc-c4fb-4d1e-a3ac-0ff0911f30b1>,2
788,How fast can an E. coli bacteria swim?,Understanding The Behavior Of Predatory Bacter...,<urn:uuid:5048cd83-9888-4e8e-a60e-92b53c2042a6>,2
789,environmental social implications waterway tra...,"Professor, Civil and Environmental Engineering...",<urn:uuid:2886fe33-a78c-4223-b2a5-9dd4ff5c55e5>,2
790,real illusion photography lighting techniques,Learn the concepts of Black Art with 9 new ill...,<urn:uuid:4be67136-aca6-4450-a71d-f38581759dda>,2


In [22]:
# gpt-4o selection that starts one position further down each question's
# document list (start_document_rank=1). The run itself stays commented out;
# only its stored result is loaded.
# positive_samples_df_3 = select_positive_samples(
#     res_df, correct_qa_df, max_iterations=15, start_document_rank=1, model="gpt-4o"
# )
_positive_samples_path_3 = "../data/live_rag_questions/positive_samples_727-gpt-4o.tsv"
positive_samples_df_3 = pd.read_csv(_positive_samples_path_3, sep="\t")
positive_samples_df_3

Unnamed: 0,question,doc_text,doc_id,gpt-4o_response
0,traditional kofte ingredients food poisoning r...,Sadlly the supermarket has run out of lentils ...,<urn:uuid:32f4585a-b002-4df3-ac2f-a9d57877a7a0>,2
1,I'm interested in environmental organizations ...,About the CEC\nThe Commission for Environmenta...,<urn:uuid:5ceeb4fa-6cb8-4bba-a642-9e590013b1f8>,3
2,As a mental health professional concerned with...,Helping to provide the appropriate level of ca...,<urn:uuid:6cea4668-5e61-4b3b-a28d-78f0b85aa51e>,2
3,doctor measured my blood pressure today got nu...,It‚Äôs May Measure Month - a health campaign aim...,<urn:uuid:425afb56-a823-421f-bb99-57f486f0ebaf>,3
4,Where was the Christmas message filmed?,In her annual Christmas Day speech the Queen t...,<urn:uuid:71b1ab0c-3c0b-4db7-997d-79bf84b00ca1>,3
...,...,...,...,...
722,indigenous runners achievements preservation d...,"He passed away on January 9, 1949.\nIn 1951, h...",<urn:uuid:ff4c9f2a-2e1d-42e4-b885-1db77d539114>,2
723,What's the key difference between how ground p...,NIOSHTIC-2 Publications Search\nElectromagneti...,<urn:uuid:86e7fc91-23eb-4b08-af87-b659a53a89f1>,2
724,What makes gophers come into people's yards an...,"To get rid of gophers using mothballs, locate ...",<urn:uuid:e2286f18-d9b7-43a2-8db9-d6024dd88047>,3
725,dishwasher pipe making squealing sound fix,If your business has a commercial dishwasher o...,<urn:uuid:12518fe4-25c7-425e-b8a7-0cb99c0acf75>,2


In [23]:
def _load_model_judgments(model_name, label):
    """Find and load every judgment TSV written for one judge model.

    Files are looked up in two places below ``../data/live_rag_questions``:
    two directory levels down, and directly in that folder.

    Args:
        model_name: Model identifier exactly as it appears in the file names.
        label: Short model name used in the "Found N ... files" progress lines.

    Returns:
        Tuple ``(nested_files, top_level_files, judgments_df)`` where
        ``judgments_df`` is the concatenation of all files with every row
        containing a missing value removed.
    """
    file_pattern = (
        "dmds_combined_questions_*_opensearch_results_azure_openai_"
        f"{model_name}_responses-*.tsv"
    )

    # glob() returns paths in a filesystem-dependent order; sorting makes the
    # row order of the concatenated frame reproducible.
    nested_files = sorted(glob(f"../data/live_rag_questions/*/*/{file_pattern}"))
    print(f"Found {len(nested_files)} {label} files")

    top_level_files = sorted(glob(f"../data/live_rag_questions/{file_pattern}"))
    print(f"Found {len(top_level_files)} {label} files")

    judgments_df = pd.concat(
        [pd.read_csv(path, sep="\t") for path in nested_files + top_level_files],
        ignore_index=True,
    ).dropna()
    return nested_files, top_level_files, judgments_df


(
    gpt41_judgments_files_1,
    gpt41_judgments_files_2,
    gpt41_judgments_df,
) = _load_model_judgments("gpt-4.1-mini", "GPT-4.1")

(
    gpt4o_judgments_files_1,
    gpt4o_judgments_files_2,
    gpt4o_judgments_df,
) = _load_model_judgments("gpt-4o", "GPT-4o")

# How often each (question, document) pair was judged by each model.
print(gpt4o_judgments_df.value_counts(["question", "doc_text"]).describe())
print(gpt41_judgments_df.value_counts(["question", "doc_text"]).describe())

Found 15 GPT-4.1 files
Found 29 GPT-4.1 files
Found 44 GPT-4o files
Found 15 GPT-4o files
count    10464.000000
mean         1.390959
std          0.494410
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max          4.000000
Name: count, dtype: float64
count    6450.000000
mean        1.576124
std         0.501683
min         1.000000
25%         1.000000
50%         2.000000
75%         2.000000
max         4.000000
Name: count, dtype: float64


In [None]:
# Align both judges' grades on the (question, doc_text) pair. The outer join
# keeps pairs that only one of the two models judged.
_pair_keys = ["question", "doc_text"]
_merged_judgments_df = (
    gpt41_judgments_df.set_index(_pair_keys)
    .merge(
        gpt4o_judgments_df.set_index(_pair_keys),
        left_index=True,
        right_index=True,
        how="outer",
    )
    .reset_index()
)

combined_judgments_df = _merged_judgments_df.assign(
    # Conservative score: the lower of the two grades. NaN is skipped, so a pair
    # judged by a single model keeps that model's grade.
    score=_merged_judgments_df[["gpt-4.1-mini_response", "gpt-4o_response"]].min(
        axis=1
    ),
    # Take the doc_id from the left frame, falling back to the right one.
    doc_id=_merged_judgments_df["doc_id_x"].fillna(_merged_judgments_df["doc_id_y"]),
)

# for any duplicate question-doc_text pairs, keep the lowest score
# `as_index=False` keeps `question` and `doc_text` as columns; with the default
# they become the index and the final `reset_index(drop=True)` would discard them.
combined_judgments_df = (
    combined_judgments_df[["question", "doc_text", "doc_id", "score"]]
    .groupby(_pair_keys, as_index=False)
    .min()
    .sort_values(
        by=["question", "score", "doc_text"],
        ascending=[True, False, True],
    )
    .reset_index(drop=True)
)
combined_judgments_df

Unnamed: 0,index,question,doc_text,doc_id,score
0,1,Are social media and advocacy approaches simil...,Calendar of Events\nAmigos Latinx Gala\nThe Am...,<urn:uuid:bc646e14-8ab6-4f32-bd11-65a9c5d9f0da>,1.0
1,4,Are social media and advocacy approaches simil...,"For this reason, people who are Hispanic may v...",<urn:uuid:26fba4c7-cba7-4b05-bac5-db8287a20720>,1.0
2,6,Are social media and advocacy approaches simil...,Hispanic Heritage Month\nHispanic Heritage Mon...,<urn:uuid:58425b70-9a34-4f06-bf7c-16f82eb3a776>,1.0
3,10,Are social media and advocacy approaches simil...,Increase the overall brand awareness of TLMODA...,<urn:uuid:ccde4cc2-c792-450d-990f-51eef46bc00d>,1.0
4,15,Are social media and advocacy approaches simil...,Providing Cultural Competency Education\nThea ...,<urn:uuid:21adf8af-538a-40ff-93bf-0387c3223227>,1.0
...,...,...,...,...,...
13096,13095,zebrafish genetic studies disease applications...,"Overall, systematically identifying and analyz...",<urn:uuid:476d1202-06d9-4393-8e40-ad25d33d6252>,1.0
13097,13100,zebrafish genetic studies disease applications...,We recently identified thrombocyte micropartic...,<urn:uuid:4b25f14f-4865-4d8f-8a2b-87491a51d683>,1.0
13098,13086,zebrafish genetic studies disease applications...,- Methodology article\n- Open Access\nMolecula...,<urn:uuid:c1bc28ef-c88b-496c-88bc-9d257dc0f644>,0.0
13099,13088,zebrafish genetic studies disease applications...,Click to have a closer look\nAbout this book\n...,<urn:uuid:927a2272-c402-4436-93d2-dac171e1f49b>,0.0


In [25]:
# Stack the relevant and non-relevant pairs into one frame with a `score` column.
_all_pairs = pd.concat([relevant_pairs_df, not_relevant_pairs_df], axis=0)
qrels_df = _all_pairs.rename("score").reset_index()

# Binarise the score: 1 when it is at least 2, otherwise 0.
qrels_df["score"] = qrels_df["score"].ge(2).astype("int64")

In [26]:
# Distribution of the number of relevant documents (score == 1) per question.
_sorted_qrels = qrels_df.sort_values(
    by=["question", "score", "doc_text"], ascending=[True, False, True]
)
_relevant_per_question = _sorted_qrels.groupby("question")["score"].sum()
_relevant_per_question.value_counts()

score
0     686
1     229
2      36
3      24
4       8
5       4
10      3
6       3
8       3
12      1
14      1
9       1
7       1
Name: count, dtype: int64