In [9]:
# llama-index's synchronous helpers run async code under the hood; inside a notebook (which already has a running event loop) that requires nest_asyncio
import nest_asyncio
nest_asyncio.apply()

from dotenv import load_dotenv
load_dotenv()

import os
import json
from llama_index.core.schema import TextNode
from llama_index.core.llama_dataset.generator import RagDatasetGenerator

In [37]:
# Alternative OpenAI-compatible clients (DeepSeek / NVIDIA), kept for reference only:
# from openai import OpenAI
# clientDS  = OpenAI(api_key=os.environ['DEEPSEEK_API_KEY'], base_url="https://api.deepseek.com")
# clientNV  = OpenAI(api_key=os.environ['NVIDIA_API_KEY'], base_url="https://integrate.api.nvidia.com/v1")
from llama_index.llms.openai import OpenAI

# LLM used for question generation.
# NOTE: the variable name is historical -- the configured model is gpt-4o-mini.
gpt_35_llm = OpenAI(
    temperature=0.3,
    model="gpt-4o-mini",
)

In [49]:
# Directory containing the chunked-node JSON files read by the question-generation cell.
json_dir = './S02_SemanticChunkedJson/'
# os.listdir() returns entries in arbitrary order. Sort them so that the
# positional index used to pick a file refers to the same file on every run
# and on every machine.
lsjson = sorted(f for f in os.listdir(json_dir) if f.endswith('.json'))
lsjson

['Industrial_Gas_Turbines_AMY_Razak.json',
 'msd_servo_drive.json',
 'pcs7_compendium_part_a_en-US_en-US.json',
 'pcs7_compendium_part_b_en-US_en-US.json']

In [None]:
def _build_question_prompt(num_questions):
    """Return the instruction text used as the generator's question-generation query.

    Args:
        num_questions: Number of questions the LLM is asked to formulate.

    Returns:
        The prompt string with ``num_questions`` interpolated.
    """
    return f"""
        You are a Teacher/ Professor. Your task is to setup a quiz/examination.
        Using the provided context, formulate {num_questions} questions that each capture an important fact from the context.
        You MUST obey the following criteria:
        - Restrict the question to the context information provided.
        - Do NOT create a question that cannot be answered from the context.
        - Phrase the question so that it does NOT refer to specific context. For instance, do NOT put phrases like "given provided context" or "in this work" in the question, because if the question is asked elsewhere it wouldn't be provided specific context. Replace these terms with specific details.

        BAD questions:
        - What did the author do in his childhood
        - What were the main findings in this report

        GOOD questions:
        - What did Barack Obama do in his childhood
        - What were the main findings in the original Transformers paper by Vaswani et al.

        Skills Focus: Consider questions that test:
        - Application of Knowledge: How can students use what they have learned in practical scenarios?
        - Critical Thinking: Can they analyze or evaluate different concepts?
        - Technical Skills: Can they demonstrate proficiency with tools, systems, or frameworks covered in the content?

        Generate the questions below:
        """


# Number of questions requested from the LLM for each chunk.
NUM_QUESTIONS_PER_CHUNK = 20

# Index into lsjson selecting which chunked document this run processes.
i = 0

jsonName = lsjson[i]
# READ JSON
# Explicit UTF-8 so the result does not depend on the platform's default encoding.
jsonPath = os.path.join(json_dir, jsonName)
with open(jsonPath, encoding="utf-8") as f:
    nodes_json = json.load(f)
# CONVERT TO LLAMA NODES
nodes = [TextNode.from_dict(node_dict) for node_dict in nodes_json]
# CREATE FOLDER IF NOT EXISTS
# lsjson only contains names ending in '.json', so splitext strips exactly that suffix.
folderName = os.path.splitext(jsonName)[0]
output_dir = f"./S03_QuestionGeneration/{folderName}/"
os.makedirs(output_dir, exist_ok=True)
# The prompt is identical for every node, so build it once outside the loop.
str_query = _build_question_prompt(NUM_QUESTIONS_PER_CHUNK)
# LOOP GEN QUESTION
for j, nodej in enumerate(nodes):
    # PARSE
    node_file_name = nodej.metadata['file_name']
    node_hash = nodej.hash
    print(jsonName,j,len(nodes))
    # One CSV per node, keyed by position and content hash; an existing file
    # means the node was already processed, which makes the cell resumable.
    csv_path = os.path.join(output_dir, f"{j}_{node_hash}.csv")
    # ONLY WORK IF NOT ALREADY EXISTS
    if os.path.isfile(csv_path):
        continue
    # INIT DATASET GENERATOR
    # NOTE(review): from_documents may apply the globally configured
    # transformations (e.g. a text splitter) to the node before generating --
    # confirm that this re-chunking is intended.
    dataset_generator = RagDatasetGenerator.from_documents(
        documents=[nodej],
        llm=gpt_35_llm,
        num_questions_per_chunk=NUM_QUESTIONS_PER_CHUNK,
        show_progress=False,
    )
    # REPLACE QUERY
    dataset_generator.question_gen_query = str_query
    # GENERATE QUESTIONS
    rag_dataset = dataset_generator.generate_dataset_from_nodes()
    # SAVE DATASET
    # Tag every generated row with the node it came from, keep only the
    # provenance columns plus the question/answer pair, then write the CSV.
    df = rag_dataset.to_pandas()
    df['NODE_FILE_NAME'] = node_file_name
    df['NODE_NUMBER'] = j
    df['NODE_HASH'] = node_hash
    df = df[['NODE_FILE_NAME', 'NODE_NUMBER', 'NODE_HASH', 'query', 'reference_answer']]
    df = df.rename(columns={'query': 'QUESTION', 'reference_answer': 'ANSWER'})
    df.to_csv(csv_path, index=False)



Industrial_Gas_Turbines_AMY_Razak.json 0 381
Industrial_Gas_Turbines_AMY_Razak.json 1 381
Industrial_Gas_Turbines_AMY_Razak.json 2 381
Industrial_Gas_Turbines_AMY_Razak.json 3 381


In [None]:
# def get_response_and_scores(client, model, question, response_content):
#     messages = [
#         {
#             "role": "user",
#             "content": question
#         },
#         {
#             "role": "assistant",
#             "content": response_content
#         },
#     ]

#     response = client.chat.completions.create(
#         model=model,
#         messages=messages,
#     )

#     scores = get_scores_from_response(response)
#     return scores