# Loading the source FAQ data for ground-truth generation

In [1]:
import requests
import pandas as pd

# Load the FAQ records. A comma is already the default separator for
# read_csv, so it is not spelled out.
data = pd.read_csv('data.csv')

data.head()

Unnamed: 0,Category,Question,Answer
0,General Information,What is syndicated research?,Syndicated research is a type of market resear...
1,General Information,How often is the data updated?,The data is typically updated on a monthly bas...
2,General Information,What FMCG categories are covered in this resea...,The research covers a wide range of FMCG categ...
3,General Information,Who can benefit from this syndicated research?,This research is valuable for FMCG manufacture...
4,General Information,What geographic regions are covered in this re...,"The research includes data from North America,..."


# Adding unique ID for each document

In [2]:
import pandas as pd
import hashlib


# Function to create a hash for each row
def create_doc_id(row, sep='', length=8):
    """Build a short, deterministic document id from the contents of a row.

    Every value in ``row`` is converted to ``str``, the pieces are joined
    with ``sep``, and the SHA-256 hex digest of the UTF-8 encoded result is
    truncated to ``length`` characters.

    Parameters
    ----------
    row : pandas.Series
        One record, e.g. as handed over by ``DataFrame.apply(..., axis=1)``.
    sep : str, default ''
        String placed between the values before hashing. The default (no
        separator) reproduces the ids generated so far; a non-empty separator
        avoids ambiguous concatenations such as ``'ab' + 'c'`` vs
        ``'a' + 'bc'``.
    length : int or None, default 8
        Number of leading hex characters to keep. ``None`` keeps the full
        64-character digest. Shorter ids are easier to read but collide more
        easily.

    Returns
    -------
    str
        The (possibly truncated) hexadecimal digest.

    Raises
    ------
    ValueError
        If ``length`` is given but smaller than 1.
    """
    if length is not None and length < 1:
        raise ValueError("length must be a positive integer or None")

    # Concatenate the row data into a single string. To hash only selected
    # columns, pass a sub-selection of the row/frame to this function.
    row_string = sep.join(row.astype(str))

    # Hash the concatenated string and keep only the leading characters
    # for a shorter id.
    digest = hashlib.sha256(row_string.encode('utf-8')).hexdigest()
    return digest if length is None else digest[:length]

# Apply the function to each row to create a 'doc_id' column.
# Any existing 'doc_id' column is dropped before hashing so this cell is
# idempotent: without that, re-running the cell would feed the previous id
# into the hash and silently change every document id.
data['doc_id'] = (
    data
    .drop(columns=['doc_id'], errors='ignore')
    .apply(create_doc_id, axis=1)
)

data.head()



Unnamed: 0,Category,Question,Answer,doc_id
0,General Information,What is syndicated research?,Syndicated research is a type of market resear...,2dd9200a
1,General Information,How often is the data updated?,The data is typically updated on a monthly bas...,1f0a30a7
2,General Information,What FMCG categories are covered in this resea...,The research covers a wide range of FMCG categ...,a899e27a
3,General Information,Who can benefit from this syndicated research?,This research is valuable for FMCG manufacture...,219af693
4,General Information,What geographic regions are covered in this re...,"The research includes data from North America,...",d7826b0e


In [3]:
# Turn the frame into a list with one plain dict per FAQ record, then show
# the first record as a quick check.
final_data = data.to_dict('records')
final_data[0]

{'Category': 'General Information',
 'Question': 'What is syndicated research?',
 'Answer': 'Syndicated research is a type of market research where data and findings are collected and compiled by a research agency and then sold to multiple clients. It provides insights into consumer behavior, market trends, and product performance across various FMCG categories.',
 'doc_id': '2dd9200a'}

In [4]:
# Prompt sent to the model for every FAQ record. The {Category}, {Question}
# and {Answer} placeholders are filled from a record dict with
# prompt_template.format(**record). The model is asked to answer with a plain
# JSON array of five questions so the reply can be decoded with json.loads.
prompt_template = """
You emulate a client who wants to know more about the syndicated research coverage by a company.
Formulate 5 questions this client might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

category: {Category}
question: {Question}
answer: {Answer}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [5]:
# Preview the fully rendered prompt for the first record.
prompt_template.format(**final_data[0])

'You emulate a client who wants to know more about the syndicated research coverage by a company.\nFormulate 5 questions this client might ask based on a FAQ record. The record\nshould contain the answer to the questions, and the questions should be complete and not too short.\nIf possible, use as fewer words as possible from the record. \n\nThe record:\n\ncategory: General Information\nquestion: What is syndicated research?\nanswer: Syndicated research is a type of market research where data and findings are collected and compiled by a research agency and then sold to multiple clients. It provides insights into consumer behavior, market trends, and product performance across various FMCG categories.\n\nProvide the output in parsable JSON without using code blocks:\n\n["question1", "question2", ..., "question5"]'

In [12]:
# Show the first record again for reference (same dict as displayed above).
final_data[0]

{'Category': 'General Information',
 'Question': 'What is syndicated research?',
 'Answer': 'Syndicated research is a type of market research where data and findings are collected and compiled by a research agency and then sold to multiple clients. It provides insights into consumer behavior, market trends, and product performance across various FMCG categories.',
 'doc_id': '2dd9200a'}

# Using Azure OpenAI to generate 5 questions for each record

In [6]:
#pip install typing_extensions


import os
import time
import json
import requests
from pprint import pprint
from typing import Iterator
from openai import AzureOpenAI

  
# The endpoint and API key are read from the environment and must be set
# before the notebook is started (e.g. exported in the shell). Assigning them
# here would overwrite a valid configuration and invites saving real secrets
# together with the notebook.
_missing_settings = [
    name
    for name in ("AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_API_KEY")
    if not os.environ.get(name)
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

# Value passed as `model=` on every chat completion request
# (presumably the Azure deployment name - confirm against the Azure resource).
model = "MSRP_GPT4_0Mini"

client = AzureOpenAI(
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_key=os.environ["AZURE_OPENAI_API_KEY"],
    api_version="2024-05-01-preview",
)



In [13]:
def generate_questions(doc):
    """Ask the chat model for questions about one FAQ record.

    ``doc`` is a record dict whose keys fill the placeholders of
    ``prompt_template``. The rendered prompt is sent as a single user message
    and the text of the first returned choice is handed back unparsed.
    """
    user_message = {"role": "user", "content": prompt_template.format(**doc)}

    completion = client.chat.completions.create(
        model=model,
        messages=[user_message],
    )

    # The raw message text is returned as-is; decoding it is left to the caller.
    return completion.choices[0].message.content

#### Testing on one record

In [15]:
# The record used for the single-record test below.
final_data[0]

{'Category': 'General Information',
 'Question': 'What is syndicated research?',
 'Answer': 'Syndicated research is a type of market research where data and findings are collected and compiled by a research agency and then sold to multiple clients. It provides insights into consumer behavior, market trends, and product performance across various FMCG categories.',
 'doc_id': '2dd9200a'}

In [16]:
# Smoke test on one record. The result is the raw response text
# (a JSON array encoded as a string), not yet parsed.
generate_questions(final_data[0])

'[\n    "Can you explain what syndicated research entails?",\n    "How is the data in syndicated research collected and compiled?",\n    "Who typically benefits from syndicated research services?",\n    "What kind of insights can I expect from syndicated research?",\n    "In which categories does syndicated research provide market analysis?"\n]'

In [None]:
# Now generating for all records

In [9]:
from tqdm.auto import tqdm

In [17]:
# Raw model response per document id. Records whose id is already present
# (i.e. rows that hash to the same id) are not sent to the model again.
results = {}

for record in tqdm(final_data):
    record_id = record['doc_id']
    if record_id not in results:
        results[record_id] = generate_questions(record)

  0%|          | 0/260 [00:00<?, ?it/s]

In [18]:
# Number of distinct document ids that received a response
# (260 in this run, matching the 260 input records).
len(results)

260

In [20]:
import pickle

# Persist the raw model responses (doc_id -> response text) to disk so the
# slow, API-backed generation loop does not have to be repeated after a
# kernel restart.
with open('results.bin', 'wb') as f_out:
    pickle.dump(results, f_out)


In [21]:
#import pickle
# Reload the cached responses written by the previous cell.
# NOTE(review): pickle.load can execute arbitrary code - only load a
# results.bin that was produced by this notebook, never an untrusted file.
with open('results.bin', 'rb') as f_in:
    results = pickle.load(f_in)

In [25]:
# Decode every raw response (a JSON document stored as text) into a Python
# object, keyed by the same document id.
parsed_results = {}

for record_id, raw_response in results.items():
    decoded = json.loads(raw_response)
    parsed_results[record_id] = decoded

In [28]:
# Inspect the decoded questions for the first FAQ record.
parsed_results['2dd9200a']

['Can you explain what syndicated research entails and how it is conducted?',
 'Who typically benefits from syndicated research findings within the market?',
 'What kind of insights can one expect to gain from syndicated research?',
 'Is syndicated research limited to specific industries or product categories?',
 'How does syndicated research differ from custom research tailored for a single client?']

In [29]:
# Lookup table from document id to its full FAQ record.
doc_index = dict((record['doc_id'], record) for record in final_data)

In [33]:
# Check that the lookup returns the original record for a known id.
doc_index['2dd9200a']

{'Category': 'General Information',
 'Question': 'What is syndicated research?',
 'Answer': 'Syndicated research is a type of market research where data and findings are collected and compiled by a research agency and then sold to multiple clients. It provides insights into consumer behavior, market trends, and product performance across various FMCG categories.',
 'doc_id': '2dd9200a'}

In [36]:
# Flatten to one (question, category, doc_id) tuple per generated question.
final_results = []

for doc_id, questions in parsed_results.items():
    # The category comes from the source record the questions were built from.
    category = doc_index[doc_id]['Category']
    final_results.extend((question, category, doc_id) for question in questions)

In [46]:
import pandas as pd
# One row per generated question; 'Document' holds the id of the FAQ record
# the question was generated from.
ground_truth_columns = ['Question', 'Category', 'Document']
df = pd.DataFrame(data=final_results, columns=ground_truth_columns)

df.head()

Unnamed: 0,Question,Category,Document
0,Can you explain what syndicated research entai...,General Information,2dd9200a
1,Who typically benefits from syndicated researc...,General Information,2dd9200a
2,What kind of insights can one expect to gain f...,General Information,2dd9200a
3,Is syndicated research limited to specific ind...,General Information,2dd9200a
4,How does syndicated research differ from custo...,General Information,2dd9200a


In [47]:
# Row count check: in this run 260 records x 5 questions each = 1300 rows.
df.shape

(1300, 3)

In [48]:
# Write the ground-truth dataset (Question, Category, Document) to CSV;
# index=False keeps the DataFrame's row index out of the file.
df.to_csv('ground_truth_data.csv', index=False)


#!head ground_truth_data.csv


Question,Category,Document
Can you explain what syndicated research entails and how it is conducted?,General Information,2dd9200a
Who typically benefits from syndicated research findings within the market?,General Information,2dd9200a
What kind of insights can one expect to gain from syndicated research?,General Information,2dd9200a
Is syndicated research limited to specific industries or product categories?,General Information,2dd9200a
How does syndicated research differ from custom research tailored for a single client?,General Information,2dd9200a
What is the frequency of data updates for the research coverage?,General Information,1f0a30a7
Are there any quarterly or annual summaries provided in addition to the monthly updates?,General Information,1f0a30a7
Can you explain if real-time updates are part of any subscription options?,General Information,1f0a30a7
Does the subscription level affect how often the data is refreshed?,General Information,1f0a30a7
