In [1]:
import pandas as pd
import minsearch_xtra as minsearch
import os
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
from openai import OpenAI
import json
from tqdm.auto import tqdm
import pickle

# Ingestion

In [2]:
# Lift the per-cell character limit so long text values are displayed in full
# instead of being truncated with an ellipsis.
pd.set_option("display.max_colwidth", None)

# Load the document corpus and expose the row index as a leading `id` column.
df = pd.read_csv("../data/stoic_zen_document.csv")
df.insert(loc=0, column="id", value=df.index)

In [3]:
# Turn the frame into a list with one plain dict per row (orient="records").
documents = df.to_dict(orient="records")
print("lenght of the documents:", len(documents))

# Register `category`, `question` and `answer` as text fields and `id` and
# `ideology` as keyword fields of the index.
# NOTE(review): presumably text fields are searched as free text and keyword
# fields are used for exact-match filtering — confirm against minsearch_xtra.
# NOTE(review): none of the later cells shown here reads `index`; check whether
# it is still needed in this notebook.
index = minsearch.Index(
    text_fields=["category", "question", "answer"],
    keyword_fields=['id',"ideology"]
)

# Last expression of the cell, so whatever fit() returns is displayed as the
# cell output.
index.fit(documents)

lenght of the documents: 820


<minsearch.Index at 0x7f2e8e75f810>

# RAG Flow: Generating Ground-Truth Questions

In [4]:
# Chat-completions client shared by the cells below. No credentials are passed
# explicitly. NOTE(review): presumably the client picks up OPENAI_API_KEY from
# the environment — confirm.
client = OpenAI()

# Prompt used by generate_questions(): the {category}, {question} and {answer}
# placeholders are filled via str.format from one document record, and the
# doubled braces around the JSON example are format escapes that render as
# literal `{` / `}`. The leading and trailing blank lines are removed by
# .strip().
# NOTE(review): the text says "answer and quote" although no quote field is
# interpolated, and "Use as fewer words" reads like a typo for "as few words".
# Left untouched here because any wording change alters what the model is sent.
prompt_template = """
You emulate a user of our philosophy assistant application.
Formulate 5 questions this user might ask based on a provided answer and quote.
Make the questions specific to this answer.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:

category: {category}
question: {question}
answer: {answer}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

def generate_questions(doc):
    """Ask the chat model for user-style questions about one document.

    The fields of ``doc`` are substituted into ``prompt_template`` and the
    resulting prompt is sent as a single user message.

    Args:
        doc: Mapping that supplies every placeholder used by
            ``prompt_template`` (``category``, ``question``, ``answer``).
            Additional keys are ignored by ``str.format``.

    Returns:
        The message content of the first choice, as returned by the API. It is
        expected to be a JSON object string of the form
        ``{"questions": [...]}``; parsing is left to the caller.

    Raises:
        KeyError: If ``doc`` lacks one of the template placeholders.
    """
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}],
        # JSON mode: the caller feeds the reply straight into json.loads, so ask
        # the API for a syntactically valid JSON object instead of relying only
        # on the prompt wording (models sometimes wrap output in code fences).
        # JSON mode requires the word "JSON" to appear in the messages, which
        # the prompt template already satisfies.
        response_format={"type": "json_object"},
    )

    return response.choices[0].message.content

In [5]:
# Generate the ground-truth questions for every document.
#
# Each document costs one paid API call, so a single unusable reply must not
# abort the whole run: a reply that is not valid JSON, lacks the "questions"
# key, or does not hold a list is retried a bounded number of times. Documents
# that still fail are collected in `failed_doc_ids` and reported at the end
# instead of raising part-way through.
MAX_ATTEMPTS = 3

results = {}
failed_doc_ids = []

for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id in results:
        # An earlier record with the same id has already been processed.
        continue

    for _attempt in range(MAX_ATTEMPTS):
        questions_raw = generate_questions(doc)
        try:
            questions = json.loads(questions_raw)['questions']
        except (json.JSONDecodeError, KeyError, TypeError):
            # Malformed JSON, missing key, or a non-object / empty reply.
            continue
        if isinstance(questions, list):
            results[doc_id] = questions
            break
        # Any other type (e.g. a bare string) would be iterated character by
        # character downstream, so treat it as a failed attempt as well.
    else:
        failed_doc_ids.append(doc_id)

if failed_doc_ids:
    print(
        f"No usable questions for {len(failed_doc_ids)} document(s) after "
        f"{MAX_ATTEMPTS} attempts each: {failed_doc_ids}"
    )

  0%|          | 0/820 [00:00<?, ?it/s]

In [6]:
# Flatten {doc_id: [question, ...]} into one (doc_id, question) pair per
# generated question, keeping the iteration order of `results`.
final_results = [
    (doc_id, question_text)
    for doc_id, generated in results.items()
    for question_text in generated
]

In [7]:
# Two-column table: the source document id and one generated question per row.
df_results = pd.DataFrame(data=final_results, columns=["id", "question"])

# Persist the pairs as CSV; index=False keeps the row index out of the file.
df_results.to_csv(path_or_buf="../data/ground_truth_retrieval.csv", index=False)