# Loading the FAQ source data (the basis for the ground truth)

In [3]:
import requests
import pandas as pd

# Read the FAQ export; the path is resolved against the current working directory.
data = pd.read_csv('Data_prep/data.csv')

# Peek at the first rows to confirm the file parsed as expected.
data.head()

Unnamed: 0,Category,Question,Answer
0,General Information,What is syndicated research?,Syndicated research is a type of market resear...
1,General Information,How often is the data updated?,The data is typically updated on a monthly bas...
2,General Information,What FMCG categories are covered in this resea...,The research covers a wide range of FMCG categ...
3,General Information,Who can benefit from this syndicated research?,This research is valuable for FMCG manufacture...
4,General Information,What geographic regions are covered in this re...,"The research includes data from North America,..."


# Adding a unique ID for each document

In [4]:
import pandas as pd
import hashlib


# Function to create a hash for each row
def create_doc_id(row):
    """Build a short, deterministic identifier from the contents of one row.

    Every value in ``row`` is cast to ``str`` and the pieces are concatenated
    with no separator; which values take part is decided by what the caller
    passes in. The identifier is the first 8 hex characters of the SHA-256
    digest of that concatenated string (a shortened hash, not the full one).

    Parameters
    ----------
    row : pandas.Series
        A single row, as handed over by ``DataFrame.apply(..., axis=1)``.
        Anything exposing ``astype(str)`` that iterates over strings works.

    Returns
    -------
    str
        An 8-character lowercase hexadecimal string.
    """
    as_text = row.astype(str)
    fingerprint = hashlib.sha256(''.join(as_text).encode())
    return fingerprint.hexdigest()[:8]

# Apply the function to each row to create a 'doc_id' column
# Hash only the content columns. Dropping any existing 'doc_id' first keeps
# this cell idempotent: without it, re-running the cell would feed the
# previous IDs into the hash and silently change every doc_id. On a first
# run there is no 'doc_id' column yet, so the IDs are the same as before.
data['doc_id'] = data.drop(columns='doc_id', errors='ignore').apply(create_doc_id, axis=1)

data.head()



Unnamed: 0,Category,Question,Answer,doc_id
0,General Information,What is syndicated research?,Syndicated research is a type of market resear...,2dd9200a
1,General Information,How often is the data updated?,The data is typically updated on a monthly bas...,1f0a30a7
2,General Information,What FMCG categories are covered in this resea...,The research covers a wide range of FMCG categ...,a899e27a
3,General Information,Who can benefit from this syndicated research?,This research is valuable for FMCG manufacture...,219af693
4,General Information,What geographic regions are covered in this re...,"The research includes data from North America,...",d7826b0e


In [5]:
# Convert the frame into a list of per-row dicts (column name -> value);
# each dict is later unpacked into the prompt via format(**doc).
final_data = data.to_dict(orient='records')
# Show the first record as a sanity check.
final_data[0]

{'Category': 'General Information',
 'Question': 'What is syndicated research?',
 'Answer': 'Syndicated research is a type of market research where data and findings are collected and compiled by a research agency and then sold to multiple clients. It provides insights into consumer behavior, market trends, and product performance across various FMCG categories.',
 'doc_id': '2dd9200a'}

In [6]:
# Prompt that turns one FAQ record into five paraphrased client questions.
# str.format() fills {Category}, {Question} and {Answer} from a record dict
# (extra keys such as doc_id are ignored). The model is asked to answer with
# a bare JSON array so the reply can be handed straight to json.loads.
prompt_template = """
You emulate a client who wants to know more about the syndicated research coverage by a company.
Formulate 5 questions this client might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as few words as possible from the record.

The record:

category: {Category}
question: {Question}
answer: {Answer}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [7]:
# Render the prompt for the first record to eyeball the final text sent to the model.
prompt_template.format(**final_data[0])

'You emulate a client who wants to know more about the syndicated research coverage by a compant.\nFormulate 5 questions this client might ask based on a FAQ record. The record\nshould contain the answer to the questions, and the questions should be complete and not too short.\nIf possible, use as fewer words as possible from the record. \n\nThe record:\n\ncategory: General Information\nquestion: What is syndicated research?\nanswer: Syndicated research is a type of market research where data and findings are collected and compiled by a research agency and then sold to multiple clients. It provides insights into consumer behavior, market trends, and product performance across various FMCG categories.\n\nProvide the output in parsable JSON without using code blocks:\n\n["question1", "question2", ..., "question5"]'

In [8]:
# Using OpenAI to generate 5 questions for each record

In [9]:
import os

# Look the key up in the environment (None when the variable is absent) and
# report only whether it is present, never the value itself.
api_key = os.getenv("OPENAI_API_KEY")

print("OpenAI API key is available." if api_key else "OpenAI API key is not set.")


OpenAI API key is available.


In [2]:
from openai import OpenAI
# Shared client used by generate_questions. No key is passed explicitly, so it
# presumably picks up OPENAI_API_KEY from the environment - confirm against the openai docs.
client = OpenAI()

In [10]:
def generate_questions(doc):
    """Ask the chat model for client-style questions about one FAQ record.

    The module-level ``prompt_template`` is filled from ``doc`` via
    ``format(**doc)`` and sent as a single user message to ``gpt-4o-mini``
    through the module-level ``client``.

    Parameters
    ----------
    doc : dict
        A record providing at least the keys the template references.

    Returns
    -------
    str
        The text content of the first choice, returned unchanged. The prompt
        asks for a JSON array, but nothing here parses or validates it.
    """
    user_message = {"role": "user", "content": prompt_template.format(**doc)}
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )
    return completion.choices[0].message.content

In [11]:
from tqdm.auto import tqdm


In [17]:
import os
import pickle

RESULTS_PATH = 'results.bin'

# Resume from whatever an earlier run already produced instead of starting
# from an empty dict: the API is rate-limited, so a run can stop part-way and
# every regenerated document costs another request.
# NOTE: pickle.load can execute arbitrary code - only load a results.bin that
# this notebook wrote itself.
if os.path.exists(RESULTS_PATH):
    with open(RESULTS_PATH, 'rb') as f_in:
        results = pickle.load(f_in)
else:
    results = {}

try:
    for doc in tqdm(final_data):
        doc_id = doc['doc_id']
        # Skip documents that already have generated questions.
        if doc_id in results:
            continue

        results[doc_id] = generate_questions(doc)
finally:
    # Persist progress even when the loop is cut short (rate limit, interrupt);
    # the exception still propagates after the file is written.
    with open(RESULTS_PATH, 'wb') as f_out:
        pickle.dump(results, f_out)

  0%|          | 0/260 [00:00<?, ?it/s]

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-ydfmbhxCIOvRJShMYHPLx27R on requests per day (RPD): Limit 200, Used 200, Requested 1. Please try again in 7m12s. Visit https://platform.openai.com/account/rate-limits to learn more. You can increase your rate limit by adding a payment method to your account at https://platform.openai.com/account/billing.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}

In [19]:
# Number of documents that currently have generated questions.
len(results)

1

In [None]:
import pickle
# Reload previously saved generation output from disk; results.bin must already exist.
# Judging by the parsing step that consumes it, the pickle holds a dict of
# doc_id -> raw model response string.
# NOTE: pickle.load can execute arbitrary code - only open a results.bin you created yourself.
# This rebinds `results`, replacing whatever the generation loop left in memory.
with open('results.bin', 'rb') as f_in:
    results = pickle.load(f_in)

In [None]:
import json  # needed here: json is not imported anywhere earlier in the notebook

# Decode each raw model response (a JSON array of question strings) per doc_id.
parsed_resulst = {}

for doc_id, json_questions in results.items():
    try:
        parsed_resulst[doc_id] = json.loads(json_questions)
    except json.JSONDecodeError as err:
        # Model output is not guaranteed to be valid JSON; say which document
        # failed so it can be fixed or regenerated. ValueError is the parent
        # class of JSONDecodeError, so the exception type stays compatible.
        raise ValueError(
            f"Response for doc_id {doc_id!r} is not valid JSON: {err}"
        ) from err

In [None]:
# Map each doc_id to its full record so generated questions can be joined
# back to their source document. If two records share a doc_id, the later one wins.
doc_index = {d['doc_id']: d for d in final_data}


In [None]:
# Flatten to one (question, category, doc_id) tuple per generated question.
final_results = []

for doc_id, questions in parsed_resulst.items():
    # The records carry 'Category' (there is no 'course' key, which raised
    # KeyError), so that is the per-document label attached to each question.
    category = doc_index[doc_id]['Category']
    for q in questions:
        final_results.append((q, category, doc_id))

In [None]:
import pandas as pd
# One row per generated question. Column order follows the tuples in
# final_results: question text, the per-document label (stored under the
# column name 'course'), and the source doc_id (stored as 'document').
df = pd.DataFrame(final_results, columns=['question', 'course', 'document'])
# index=False keeps the positional index out of the CSV.
df.to_csv('ground_truth_data.csv', index=False)


In [None]:
!head ground-truth-data.csv
