In [1]:
# llama-parse is async-first; running its sync code inside a notebook requires nest_asyncio.
# NOTE(review): no llama-parse call is visible in this part of the notebook — confirm this
# cell is still needed before keeping it.
import nest_asyncio
nest_asyncio.apply()

In [2]:
# Standard library
import json
import os

# Third-party
import pandas as pd
from dotenv import load_dotenv
from llama_index.core.llama_dataset.generator import RagDatasetGenerator
from llama_index.core.schema import TextNode

# Load variables from the local .env file so later cells can read the API keys
# through os.environ. Kept as the last expression so the cell shows its result.
load_dotenv()

True

In [3]:
from openai import OpenAI

# Two clients built on the OpenAI SDK, each pointed at a different provider endpoint.
# Keys are read from the environment; a missing variable raises KeyError right here.
clientDS = OpenAI(
    base_url="https://api.deepseek.com",
    api_key=os.environ['DEEPSEEK_API_KEY'],
)
clientNV = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=os.environ['NVIDIA_API_KEY'],
)

In [4]:
# Folder containing the semantically chunked documents (JSON files).
json_dir = './S02_SemanticChunkedJson/'
# os.listdir returns entries in arbitrary order. Sort them so that a positional
# index into `lsjson` picks the same document on every machine and every run.
lsjson = sorted(f for f in os.listdir(json_dir) if f.endswith('.json'))
lsjson

['Industrial_Gas_Turbines_AMY_Razak.json',
 'msd_servo_drive.json',
 'pcs7_compendium_part_a_en-US_en-US.json',
 'pcs7_compendium_part_b_en-US_en-US.json']

In [None]:
# Generate question/answer pairs for every chunk (node) of ONE chunked document.
#
# For each node the chat model is asked for 10 questions with 5 answers each; the
# result is written to ./S03_QuestionGeneration/<document>/<node index>_<node hash>.csv.
# The run is resumable: a node whose CSV already exists is skipped, and a node whose
# reply cannot be used is reported and left without a CSV so the next run retries it.

# Position in `lsjson` of the document to process in this run.
i=2

jsonName = lsjson[i]
# READ JSON
jsonPath = os.path.join(json_dir, jsonName)
with open(jsonPath) as f:
    nodes_json = json.load(f)
# CREATE FOLDER IF NOT EXISTS
# splitext removes only the trailing extension (str.replace would strip every '.json').
folderName = os.path.splitext(jsonName)[0]
out_dir = f"./S03_QuestionGeneration/{folderName}/"
os.makedirs(out_dir, exist_ok=True)
# CONVERT TO LLAMA NODES
nodes = [TextNode.from_dict(node_dict) for node_dict in nodes_json]

# BUILD PROMPT
# The prompt does not depend on the node, so it is built once instead of per iteration.
# NOTE(review): the text is kept exactly as before so new CSVs stay consistent with the
# ones already generated; the example block uses single quotes, which is not strict JSON.
system_prompt = """
        ### TASK ###
        User will provide chunked text from semantic chunking from the documents.
        Generate 10 diverse questions (factual, inferential, or analytical) based on the input text.
        For each question, provide 5 correct answers with different answering styles (e.g., concise, detailed, rephrased, or structured differently).
        Ensure all answers are factually accurate and derived from the input text.
        Try to reference the book name if it makes sense to do so, note that there could be multiple books for reference, 
        so try to be specific if generating something about the book/publisher/inside the book

        ### BAD QUESTION EXAMPLE ###
        Bad Question1: What rights does the author assert in the book?
        Correct way1: What rights does the author assert in the book '--bookname--'?
        Bad Question2: What does the CD-ROM included with the book contain?
        Correct way2: What does the CD-ROM included with the book '--bookname--' contain?
        Bad Question3: What type of information sources does the book use?
        Correct way3: What type of information sources does the book '--bookname--' use?

        """
system_prompt += '''
        ### EXAMPLE JSON OUTPUT ###
        {
            'OUT1':{'Q':'generated Question',
                    'A1':'generated Answer Style1',
                    'A2':'generated Answer Style2',
                    'A3':'generated Answer Style3',
                    'A4':'generated Answer Style4',
                    'A5':'generated Answer Style5'},
            'OUT2':{'Q':'generated Question',
                    'A1':'generated Answer Style1',
                    'A2':'generated Answer Style2',
                    'A3':'generated Answer Style3',
                    'A4':'generated Answer Style4',
                    'A5':'generated Answer Style5'},
            ...
            'OUT10':{'Q':'generated Question',
                    'A1':'generated Answer Style1',
                    'A2':'generated Answer Style2',
                    'A3':'generated Answer Style3',
                    'A4':'generated Answer Style4',
                    'A5':'generated Answer Style5'}
        }
        '''

# Keys every generated entry must carry; they become the Q/A columns of the CSV.
qa_columns = ['Q','A1','A2','A3','A4','A5']


def _has_expected_shape(payload):
    """Return True when `payload` is a non-empty dict whose values are dicts holding every Q/A key."""
    return (
        isinstance(payload, dict)
        and len(payload) > 0
        and all(isinstance(entry, dict) and set(qa_columns) <= set(entry)
                for entry in payload.values())
    )


# Indices of nodes whose reply could not be used in this run.
failed_nodes = []

# LOOP GEN QUESTION
for j, nodej in enumerate(nodes):
    # PARSE
    node_file_name = nodej.metadata['file_name']
    node_hash = nodej.hash
    node_text = nodej.text
    print(jsonName,j,len(nodes))
    # ONLY WORK IF NOT ALREADY EXISTS
    csv_path = f"{out_dir}{j}_{node_hash}.csv"
    if os.path.isfile(csv_path):
        continue
    user_text = node_text
    messages = [{"role": "system", "content": system_prompt},
                {"role": "user", "content": user_text}]
    # CALL API
    # API errors (auth, rate limit, network) are deliberately not caught: they should stop the run.
    response = clientDS.chat.completions.create(model="deepseek-chat",
                                                messages=messages,
                                                response_format={'type': 'json_object'})
    choice = response.choices[0]
    # The reply can be empty or cut off mid-JSON; treat that as a per-node failure
    # instead of aborting the remaining nodes.
    try:
        response_json = json.loads(choice.message.content or '')
    except json.JSONDecodeError as err:
        print(f"  node {j} skipped: reply is not valid JSON "
              f"(finish_reason={choice.finish_reason}): {err}")
        failed_nodes.append(j)
        continue
    if not _has_expected_shape(response_json):
        print(f"  node {j} skipped: reply does not have the OUTn -> {qa_columns} structure")
        failed_nodes.append(j)
        continue
    # STORE IN DF
    df = pd.DataFrame.from_dict(response_json, orient='index').reset_index(drop=True)
    df['NODE_FILE_NAME'] = node_file_name
    df['NODE_NUMBER'] = j
    df['NODE_HASH'] = node_hash
    df['NODE_TEXT'] = node_text
    df = df[['NODE_FILE_NAME','NODE_NUMBER','NODE_HASH','NODE_TEXT'] + qa_columns]
    # Write to a temporary name and move it into place: an interrupted write must not
    # leave a partial CSV that the "already exists" check would later accept as done.
    tmp_path = csv_path + '.tmp'
    df.to_csv(tmp_path)
    os.replace(tmp_path, csv_path)

if failed_nodes:
    print(f"{len(failed_nodes)} node(s) produced no CSV and will be retried on the next run: {failed_nodes}")
    


pcs7_compendium_part_a_en-US_en-US.json 0 116


In [None]:
# messages = [{"role": "user", "content": response_json['OUT1']['Q']},
#             {"role": "assistant", "content": response_json['OUT1']['A1']}]
# response = clientNV.chat.completions.create(model="nvidia/nemotron-4-340b-reward", messages=messages)
# # response.choices[0].logprobs.content
# def get_scores_from_response(openai_response_template):
#     logprobs = openai_response_template.choices[0].logprobs.content
#     score_dict = {}
#     for score in logprobs:
#         score_dict[score.token] = score.logprob
#     return score_dict
# get_scores_from_response(response)

[{'role': 'user',
  'content': "What are the main components of an industrial gas turbine as mentioned in the book 'Industrial Gas Turbines: Performance and Operability' by A. M. Y Razak?"},
 {'role': 'assistant',
  'content': 'The main components include a heat exchanger, combustor, compressor, gas supply, and heat sinks.'}]

In [None]:
# def get_response_and_scores(client, model, question, response_content):
#     messages = [
#         {
#             "role": "user",
#             "content": question
#         },
#         {
#             "role": "assistant",
#             "content": response_content
#         },
#     ]

#     response = client.chat.completions.create(
#         model=model,
#         messages=messages,
#     )

#     scores = get_scores_from_response(response)
#     return scores