## PDF and Elements Setting

In [1]:
from unstructured.partition.pdf import partition_pdf


def extract_pdf_elements(filepath):
    """Partition a single PDF into chunked elements via ``partition_pdf``.

    Args:
        filepath: Path of the PDF; forwarded to ``partition_pdf`` as ``filename``.

    Returns:
        Whatever ``partition_pdf`` returns for the options assembled below.
    """
    # Parsing options: image extraction and table-structure inference are switched off.
    parsing_options = {
        "strategy": "fast",
        "extract_images_in_pdf": False,
        "infer_table_structure": False,
    }
    # Chunking options: the "by_title" strategy together with its character limits.
    chunking_options = {
        "chunking_strategy": "by_title",
        "max_characters": 4000,
        "new_after_n_chars": 3800,
        "combine_text_under_n_chars": 2000,
    }
    return partition_pdf(filename=filepath, **parsing_options, **chunking_options)

In [2]:
# Every PDF sits in one folder and repeats the folder name as its file-name prefix,
# so each path is assembled from that shared name plus a per-document topic.
_adnd_program = "Application_Development_and_Delivery"
_adnd_topics = (
    "Admission_Requirements",
    "Awards_and_Scholarships",
    "CO_OP_Practicum_Information",
    "Computer_Laptop_Requirements",
    "Courses_and_Descriptions",
    "Employment_Potential",
    "English_Language_Assessments",
    "Graduation_Requirements",
    "Locations_Dates_and_Fees",
    "Overview",
    "Recognition_of_Prior_Learning",
    "Who_Should_Enrol",
    "Print_Friendly",
)

adnd_pdf_files = [
    f"{_adnd_program}/{_adnd_program}_{topic}.pdf" for topic in _adnd_topics
]

In [3]:
import os

# Partition every listed PDF and publish the result as a module-level variable named
# "<pdf file name without .pdf>_elements"; later cells look those variables up by name.
for filepath in adnd_pdf_files:
    # Guard clause: report a missing file and move on to the next path.
    if not os.path.exists(filepath):
        print(f"Not Found: {filepath}")
        continue

    try:
        elements = extract_pdf_elements(filepath)
        pdf_basename = os.path.basename(filepath)
        var_name = pdf_basename.replace(".pdf", "_elements")
        globals()[var_name] = elements
        print(f"Saved: {var_name}")
    except Exception as exc:
        # Best effort: a failure on one file is printed and does not stop the others.
        print(f"Error: {filepath}: {exc}")

Saved: Application_Development_and_Delivery_Admission_Requirements_elements
Saved: Application_Development_and_Delivery_Awards_and_Scholarships_elements
Saved: Application_Development_and_Delivery_CO_OP_Practicum_Information_elements
Saved: Application_Development_and_Delivery_Computer_Laptop_Requirements_elements
Saved: Application_Development_and_Delivery_Courses_and_Descriptions_elements
Saved: Application_Development_and_Delivery_Employment_Potential_elements
Saved: Application_Development_and_Delivery_English_Language_Assessments_elements
Saved: Application_Development_and_Delivery_Graduation_Requirements_elements
Saved: Application_Development_and_Delivery_Locations_Dates_and_Fees_elements
Saved: Application_Development_and_Delivery_Overview_elements
Saved: Application_Development_and_Delivery_Recognition_of_Prior_Learning_elements
Saved: Application_Development_and_Delivery_Who_Should_Enrol_elements
Saved: Application_Development_and_Delivery_Print_Friendly_elements


In [4]:
# Collect the names of all module-level variables that hold extracted elements, i.e. names
# shaped like "Application_Development_and_Delivery_<topic>_elements".
# The scan finishes before `elements_vars` is bound, so globals() is not modified mid-iteration.
elements_vars = list(
    filter(
        lambda name: name.startswith("Application_Development_and_Delivery_")
        and name.endswith("_elements"),
        globals(),
    )
)

# Report how many elements each document produced.
for var_name in elements_vars:
    elements = globals()[var_name]
    print(f"{var_name}: {len(elements)} elements")

Application_Development_and_Delivery_Admission_Requirements_elements: 3 elements
Application_Development_and_Delivery_Awards_and_Scholarships_elements: 25 elements
Application_Development_and_Delivery_CO_OP_Practicum_Information_elements: 2 elements
Application_Development_and_Delivery_Computer_Laptop_Requirements_elements: 1 elements
Application_Development_and_Delivery_Courses_and_Descriptions_elements: 9 elements
Application_Development_and_Delivery_Employment_Potential_elements: 1 elements
Application_Development_and_Delivery_English_Language_Assessments_elements: 2 elements
Application_Development_and_Delivery_Graduation_Requirements_elements: 1 elements
Application_Development_and_Delivery_Locations_Dates_and_Fees_elements: 1 elements
Application_Development_and_Delivery_Overview_elements: 2 elements
Application_Development_and_Delivery_Recognition_of_Prior_Learning_elements: 1 elements
Application_Development_and_Delivery_Who_Should_Enrol_elements: 1 elements
Application_Development_and_Delivery_Print_Friendly_elements: 14 elements

In [5]:
# For each document: record the name of the dynamically created variable, then bind that
# same name explicitly by looking it up in globals(). A lookup raises KeyError when the
# corresponding variable was never created.

Admission_Requirements_var_name = "Application_Development_and_Delivery_Admission_Requirements_elements"
Application_Development_and_Delivery_Admission_Requirements_elements = globals()[Admission_Requirements_var_name]

Awards_and_Scholarships_var_name = "Application_Development_and_Delivery_Awards_and_Scholarships_elements"
Application_Development_and_Delivery_Awards_and_Scholarships_elements = globals()[Awards_and_Scholarships_var_name]

CO_OP_Practicum_Information_var_name = "Application_Development_and_Delivery_CO_OP_Practicum_Information_elements"
Application_Development_and_Delivery_CO_OP_Practicum_Information_elements = globals()[CO_OP_Practicum_Information_var_name]

Computer_Laptop_Requirements_var_name = "Application_Development_and_Delivery_Computer_Laptop_Requirements_elements"
Application_Development_and_Delivery_Computer_Laptop_Requirements_elements = globals()[Computer_Laptop_Requirements_var_name]

Courses_and_Descriptions_var_name = "Application_Development_and_Delivery_Courses_and_Descriptions_elements"
Application_Development_and_Delivery_Courses_and_Descriptions_elements = globals()[Courses_and_Descriptions_var_name]

Employment_Potential_var_name = "Application_Development_and_Delivery_Employment_Potential_elements"
Application_Development_and_Delivery_Employment_Potential_elements = globals()[Employment_Potential_var_name]

English_Language_Assessments_var_name = "Application_Development_and_Delivery_English_Language_Assessments_elements"
Application_Development_and_Delivery_English_Language_Assessments_elements = globals()[English_Language_Assessments_var_name]

Graduation_Requirements_var_name = "Application_Development_and_Delivery_Graduation_Requirements_elements"
Application_Development_and_Delivery_Graduation_Requirements_elements = globals()[Graduation_Requirements_var_name]

Locations_Dates_and_Fees_var_name = "Application_Development_and_Delivery_Locations_Dates_and_Fees_elements"
Application_Development_and_Delivery_Locations_Dates_and_Fees_elements = globals()[Locations_Dates_and_Fees_var_name]

Overview_var_name = "Application_Development_and_Delivery_Overview_elements"
Application_Development_and_Delivery_Overview_elements = globals()[Overview_var_name]

Recognition_of_Prior_Learning_var_name = "Application_Development_and_Delivery_Recognition_of_Prior_Learning_elements"
Application_Development_and_Delivery_Recognition_of_Prior_Learning_elements = globals()[Recognition_of_Prior_Learning_var_name]

Who_Should_Enrol_var_name = "Application_Development_and_Delivery_Who_Should_Enrol_elements"
Application_Development_and_Delivery_Who_Should_Enrol_elements = globals()[Who_Should_Enrol_var_name]

Print_Friendly_var_name = "Application_Development_and_Delivery_Print_Friendly_elements"
Application_Development_and_Delivery_Print_Friendly_elements = globals()[Print_Friendly_var_name]

In [6]:
# Print the element count of every document, one number per line, in listing order.
print(
    *map(
        len,
        (
            Application_Development_and_Delivery_Admission_Requirements_elements,
            Application_Development_and_Delivery_Awards_and_Scholarships_elements,
            Application_Development_and_Delivery_CO_OP_Practicum_Information_elements,
            Application_Development_and_Delivery_Computer_Laptop_Requirements_elements,
            Application_Development_and_Delivery_Courses_and_Descriptions_elements,
            Application_Development_and_Delivery_Employment_Potential_elements,
            Application_Development_and_Delivery_English_Language_Assessments_elements,
            Application_Development_and_Delivery_Graduation_Requirements_elements,
            Application_Development_and_Delivery_Locations_Dates_and_Fees_elements,
            Application_Development_and_Delivery_Overview_elements,
            Application_Development_and_Delivery_Recognition_of_Prior_Learning_elements,
            Application_Development_and_Delivery_Who_Should_Enrol_elements,
            Application_Development_and_Delivery_Print_Friendly_elements,
        ),
    ),
    sep="\n",
)

3
25
2
1
9
1
2
1
1
2
1
1
14


## Generator

In [7]:
from langchain_core.prompts import PromptTemplate

# Question/answer generation prompt.
# Template variables: {context}, {domain}, {num_questions}, {program_name}.
# Doubled braces ({{ and }}) are escapes that render as literal braces in the JSON examples.
# The key names requested in the instructions match the examples and the upper-case
# "QUESTION"/"ANSWER" keys that custom_json_parser requires; objects using other keys
# are discarded by that parser.
prompt = PromptTemplate.from_template(
    """Context information is below. You are only aware of this context and nothing else.
---------------------

{context}

---------------------
Given this context, generate only questions based on the below query.
You are a Teacher/Professor in {domain}.
Your task is to provide exactly **{num_questions}** question(s) for an upcoming quiz/examination.
Each question must contain the phrase `{program_name} program`.
You are not to provide more or less than this number of questions.
The question(s) should be diverse in nature across the document.
The purpose of question(s) is to test the understanding of the students on the context information provided.
You must also provide the answer to each question. The answer should be based on the context information provided only.

Restrict the question(s) to the context information provided only.
QUESTION and ANSWER should be written in English. Respond in JSON format, where each object contains the keys `QUESTION` and `ANSWER`.
DO NOT USE List in JSON format.
ANSWER should be a complete sentence.

#Format:
```json
{{
    "QUESTION": "Where is the campus located for Application Development and Delivery program?",
    "ANSWER": "Application Development and Delivery program is held at the Exchange District Campus in Winnipeg, Manitoba."
}},
{{
    "QUESTION": "What is the focus of COMP-1334 Design Thinking and Innovation in the Application Development and Delivery program?",
    "ANSWER": "In the Application Development and Delivery program, COMP-1334 teaches students how to use iterative design thinking to empathize with users, redefine problems, develop and test prototypes, and pitch innovative solutions."
}},
{{
    "QUESTION": "Who can apply under Mature Student admission requirements for Application Development and Delivery program?",
    "ANSWER": "If you are 19 years of age or older and have been out of high school for a minimum of one year at time of application, and you do not meet the regular admission requirements, you may apply under the Mature Student admission requirements."
}}
```
"""
)

In [8]:
import json
import re

def custom_json_parser(response):
    """Extract question/answer objects from a model response.

    The response text is ``response.content`` when that attribute exists, otherwise
    ``str(response)``. If the text contains one or more ```` ```json ```` fenced blocks,
    every block is scanned; otherwise the whole text is scanned. The text does not have
    to be one valid JSON document: each JSON object found in it (for example several
    objects separated by commas) is decoded on its own.

    Decoding uses ``json.JSONDecoder.raw_decode``, so braces that appear inside string
    values do not confuse the object boundaries.

    Args:
        response: A message-like object with a ``content`` attribute, or any object
            whose ``str()`` is the response text.

    Returns:
        A list of dicts, in order of appearance, keeping only objects that contain both
        the ``QUESTION`` and ``ANSWER`` keys. Empty when nothing could be parsed.
    """
    raw = str(response.content) if hasattr(response, 'content') else str(response)

    code_blocks = re.findall(r"```json\s*(.*?)\s*```", raw, re.DOTALL)
    segments = code_blocks if code_blocks else [raw]

    decoder = json.JSONDecoder()
    json_objects = []

    for segment in segments:
        pos = segment.find('{')
        while pos != -1:
            try:
                parsed_obj, end = decoder.raw_decode(segment, pos)
            except json.JSONDecodeError:
                # Not a valid object starting here; retry from the next opening brace.
                pos = segment.find('{', pos + 1)
                continue

            if isinstance(parsed_obj, dict) and 'QUESTION' in parsed_obj and 'ANSWER' in parsed_obj:
                json_objects.append(parsed_obj)

            # Resume after the decoded object so nested objects are not reported twice.
            pos = segment.find('{', end)

    return json_objects

In [9]:
from langchain_core.runnables import RunnableLambda
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

# Load environment settings via python-dotenv before the API key is read below.
load_dotenv()

# Wrap the response parser so it can be used as the last stage of the pipeline.
parser_runnable = RunnableLambda(custom_json_parser)

# Chat model configuration; the API key comes from the OPENAI_API_KEY environment variable.
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0,
    streaming=True,
    api_key=os.getenv("OPENAI_API_KEY"),
)

# prompt -> model -> parser
chain = prompt | llm | parser_runnable

In [10]:
# Generate QA pairs for every element of the selected document.
selected_elements = Application_Development_and_Delivery_Print_Friendly_elements
element_name = "Application_Development_and_Delivery_Print_Friendly_elements"

qa_pairs = []
failed_indices = []  # positions of elements whose chain call raised

for element_index, element in enumerate(selected_elements):
    try:
        result = chain.invoke({
            "context": element.text,
            "domain": "Application Development and Delivery Program and Course Catalogue",
            "num_questions": "20",
            "program_name": "Application Development and Delivery"
        })
    except Exception as exc:
        # Best effort: keep processing the remaining elements, but make the failure
        # visible instead of silently dropping it.
        failed_indices.append(element_index)
        print(f"Error: element {element_index} of {element_name}: {exc!r}")
        continue

    if result:
        qa_pairs.extend(result)
    else:
        # The parser returns an empty list when no QUESTION/ANSWER object was found.
        print(f"Warning: element {element_index} of {element_name} yielded no QA pairs")

print(
    f"Generated {len(qa_pairs)} QA pairs from {len(selected_elements)} elements "
    f"({len(failed_indices)} failed)"
)

In [11]:
import json
from datetime import datetime

# Export the QA pairs as JSON Lines: one {"instruction", "input", "output"} record per line.
# `timestamp` is kept as a module-level name because the JSON export further down reuses it.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
jsonl_path = f"jsonl/{element_name}_{timestamp}.jsonl"

# Create the output folder on first use so the write does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(jsonl_path), exist_ok=True)

with open(jsonl_path, "w", encoding="utf-8") as f:
    for qa in qa_pairs:
        qa_modified = {
            "instruction": qa["QUESTION"],
            "input": "",
            "output": qa["ANSWER"],
        }
        f.write(json.dumps(qa_modified, ensure_ascii=False) + "\n")

# Report the path that was actually written (the old message said "json1/" instead of "jsonl/").
print(f"QA pairs saved to {jsonl_path}")

QA pairs saved to json1/Application_Development_and_Delivery_Print_Friendly_elements_20250601_191827.jsonl


In [12]:
def save_qa_to_json_simple(qa_pairs, filename=None):
    """Write ``qa_pairs`` to ``filename`` as an indented UTF-8 JSON document.

    Args:
        qa_pairs: JSON-serialisable object to store (the notebook passes a list of dicts).
        filename: Destination path. Missing parent directories are created. When omitted,
            ``qa_pairs_<YYYYmmdd_HHMMSS>.json`` in the current directory is used.

    Returns:
        The path that was written, or ``None`` when serialising or writing failed; in
        that case the error is printed instead of raised.
    """
    if filename is None:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"qa_pairs_{stamp}.json"

    try:
        # Serialise before touching the file system so a non-serialisable payload
        # never leaves a truncated file behind.
        payload = json.dumps(qa_pairs, ensure_ascii=False, indent=2)

        parent_dir = os.path.dirname(filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(filename, 'w', encoding='utf-8') as f:
            f.write(payload)

        print(f"Filename: {filename}")
        print(f"Questions: {len(qa_pairs)}")
        print("Saved")
        return filename

    except (OSError, TypeError, ValueError) as e:
        # OSError: file-system problems; TypeError/ValueError: payload cannot be serialised.
        print(f"Error: {e}")
        return None

In [13]:
# Store the raw QA pairs as one JSON document next to the JSONL export.
dataset_path = f"dataset/{element_name}_{timestamp}.json"
saved_file = save_qa_to_json_simple(qa_pairs, dataset_path)

# save_qa_to_json_simple returns None on failure, so only report completion on success.
if saved_file is None:
    print(f"Failed: {element_name}")
else:
    print(f"Completed: {element_name}")

Filename: dataset/Application_Development_and_Delivery_Print_Friendly_elements_20250601_191827.json
Questions: 280
Saved
Completed: Application_Development_and_Delivery_Print_Friendly_elements
