## PDF and Element Settings

In [1]:
from unstructured.partition.pdf import partition_pdf


def extract_pdf_elements(filepath):
    """Partition a single PDF into title-based chunks.

    Args:
        filepath: Path of the PDF; forwarded to ``partition_pdf`` as ``filename``.

    Returns:
        Whatever ``partition_pdf`` returns for the options below. The rest of the
        notebook takes ``len()`` of the result and reads ``.text`` from each item.
    """
    partition_options = {
        # Text-only extraction: no image extraction, no table structure inference.
        "strategy": "fast",
        "extract_images_in_pdf": False,
        "infer_table_structure": False,
        # Chunking thresholds, in characters (see unstructured's chunking options).
        "chunking_strategy": "by_title",
        "max_characters": 4000,
        "new_after_n_chars": 3800,
        "combine_text_under_n_chars": 2000,
    }
    return partition_pdf(filename=filepath, **partition_options)

In [2]:
import os

# Directory that holds the source PDFs (a path relative to the working directory).
pdf_dir = "RRC"

# os.listdir() returns entries in arbitrary order, so the names are sorted to make
# the processing order identical on every run and on every machine.
adnd_pdf_files = [
    os.path.join(pdf_dir, name)
    for name in sorted(os.listdir(pdf_dir))
    if name.endswith(".pdf")
]

In [3]:
import os

# Partition every discovered PDF and publish the result as a notebook global named
# "<file name without extension>_elements". Later cells look these globals up by
# name, so the naming scheme must stay as it is.
for filepath in adnd_pdf_files:
    if not os.path.exists(filepath):
        print(f"Not Found: {filepath}")
        continue
    try:
        elements = extract_pdf_elements(filepath)
        # splitext() removes only the trailing extension; str.replace(".pdf", ...)
        # would also rewrite a ".pdf" that happens to occur inside the file name.
        var_name = os.path.splitext(os.path.basename(filepath))[0] + "_elements"
        globals()[var_name] = elements
        print(f"Saved: {var_name}")
    except Exception as e:
        # Best effort: one unreadable PDF must not stop the remaining files.
        print(f"Error: {filepath}: {e}")

Saved: ADD_Print_Friendly_elements
Saved: ADMASSI_Print_Friendly_elements
Saved: ADVMANU_Print_Friendly_elements
Saved: AEROMANU_Print_Friendly_elements
Saved: AIRMANUEN_Print_Friendly_elements
Saved: ANIMI_Print_Friendly_elements
Saved: APPACC_Print_Friendly_elements
Saved: BUSADM_Print_Friendly_elements
Saved: COMMDES_Print_Friendly_elements
Saved: CREATCOM_Print_Friendly_elements
Saved: CULIA_Print_Friendly_elements
Saved: CYBER_Print_Friendly_elements
Saved: DFMP_Print_Friendly_elements
Saved: DMDDD_Print_Friendly_elements
Saved: DMDMG_Print_Friendly_elements
Saved: DMD_Print_Friendly_elements
Saved: DSML_Print_Friendly_elements
Saved: ECEW_Print_Friendly_elements
Saved: EET_Print_Friendly_elements
Saved: ENTPRO_Print_Friendly_elements
Saved: FULLSTACK_Print_Friendly_elements
Saved: GAMEDEVA_Print_Friendly_elements
Saved: GAMEDEVP_Print_Friendly_elements
Saved: GP_Print_Friendly_elements
Saved: HOSTM_Print_Friendly_elements
Saved: INFOSEC_Print_Friendly_elements
Saved: INTROEET_Pri

Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P4' is an invalid float value
Cannot set gray stroke color because /'P5' is an invalid float value
Cannot set gray stroke color because /'P6' is an invalid float value


Saved: rrc-viewbook-v1qros_Print_Friendly_elements
Saved: SOCICDEV_Print_Friendly_elements


In [4]:
# Take a snapshot of the global names first, then keep those that carry the
# "_Print_Friendly_elements" suffix produced by the extraction loop.
elements_vars = list(
    filter(
        lambda name: name.endswith("_Print_Friendly_elements"),
        list(globals()),
    )
)

# Report how many chunks each matching global holds.
for var_name in elements_vars:
    elements = globals()[var_name]
    print(f"{var_name}: {len(elements)} elements")

ADD_Print_Friendly_elements: 14 elements
ADMASSI_Print_Friendly_elements: 9 elements
ADVMANU_Print_Friendly_elements: 9 elements
AEROMANU_Print_Friendly_elements: 7 elements
AIRMANUEN_Print_Friendly_elements: 10 elements
ANIMI_Print_Friendly_elements: 12 elements
APPACC_Print_Friendly_elements: 9 elements
BUSADM_Print_Friendly_elements: 9 elements
COMMDES_Print_Friendly_elements: 9 elements
CREATCOM_Print_Friendly_elements: 20 elements
CULIA_Print_Friendly_elements: 14 elements
CYBER_Print_Friendly_elements: 11 elements
DFMP_Print_Friendly_elements: 9 elements
DMDDD_Print_Friendly_elements: 8 elements
DMDMG_Print_Friendly_elements: 7 elements
DMD_Print_Friendly_elements: 28 elements
DSML_Print_Friendly_elements: 12 elements
ECEW_Print_Friendly_elements: 16 elements
EET_Print_Friendly_elements: 10 elements
ENTPRO_Print_Friendly_elements: 2 elements
FULLSTACK_Print_Friendly_elements: 10 elements
GAMEDEVA_Print_Friendly_elements: 12 elements
GAMEDEVP_Print_Friendly_elements: 12 elements
G

In [6]:
# Program codes whose "<code>_Print_Friendly_elements" global is expected to exist.
program_keys = [
    "ADD", "ADMASSI", "ADVMANU", "AEROMANU", "AIRMANUEN", "ANIMI",
    "APPACC", "BUSADM", "COMMDES", "CREATCOM", "CULIA", "CYBER",
    "DFMP", "DMD", "DMDDD", "DMDMG", "DSML", "ECEW",
    "EET", "ENTPRO", "FULLSTACK", "GAMEDEVA", "GAMEDEVP", "GP",
    "HOSTM", "INFOSEC", "INTROEET", "ITOPS", "NETSYS", "PAITP",
    "PROBP", "PROSSMAR", "SOCICDEV",
]

# Print the chunk count per program, or flag a program whose global is missing.
for key in program_keys:
    var_name = f"{key}_Print_Friendly_elements"
    var = globals().get(var_name)
    if var is None:
        print(f"{key} not found")
        continue
    print(len(var), f"{key} elements")

14 ADD elements
9 ADMASSI elements
9 ADVMANU elements
7 AEROMANU elements
10 AIRMANUEN elements
12 ANIMI elements
9 APPACC elements
9 BUSADM elements
9 COMMDES elements
20 CREATCOM elements
14 CULIA elements
11 CYBER elements
9 DFMP elements
28 DMD elements
8 DMDDD elements
7 DMDMG elements
12 DSML elements
16 ECEW elements
10 EET elements
2 ENTPRO elements
10 FULLSTACK elements
12 GAMEDEVA elements
12 GAMEDEVP elements
15 GP elements
11 HOSTM elements
10 INFOSEC elements
25 INTROEET elements
11 ITOPS elements
8 NETSYS elements
6 PAITP elements
11 PROBP elements
9 PROSSMAR elements
17 SOCICDEV elements


## Generator

In [None]:
from langchain_core.prompts import PromptTemplate

# Prompt for generating question/answer pairs from one chunk of a program PDF.
# Template variables: {context}, {domain}, {num_questions}, {program_name}.
# The key names requested here must stay upper-case ("QUESTION" / "ANSWER"):
# custom_json_parser keeps only objects that contain exactly those keys.
# Literal braces in the JSON example are doubled so they are not treated as
# template variables.
prompt = PromptTemplate.from_template(
    """Context information is below. You are only aware of this context and nothing else.
---------------------

{context}

---------------------
Given this context, generate only questions based on the below query.
You are a Teacher/Professor in {domain}.
Your task is to provide exactly **{num_questions}** question(s) for an upcoming quiz/examination.
Each question must contain the phrase `{program_name} program`.
You are not to provide more or less than this number of questions.
The question(s) should be diverse in nature across the document.
The purpose of question(s) is to test the understanding of the students on the context information provided.
You must also provide the answer to each question. The answer should be based on the context information provided only.

Restrict the question(s) to the context information provided only.
QUESTION and ANSWER should be written in English. Respond in JSON format with the keys `QUESTION` and `ANSWER`.
DO NOT USE List in JSON format.
ANSWER should be a complete sentence.

#Format:
```json
{{
    "QUESTION": "What is the duration of the Administrative Assistant program?",
    "ANSWER": "The Administrative Assistant program is an eighteen-week certificate program that includes a four-week practicum."
}},
{{
    "QUESTION": "Where is the Administrative Assistant program offered?",
    "ANSWER": "The program is offered full-time or part-time at multiple campuses including Notre Dame (Winnipeg), Interlake (Selkirk), Peguis-Fisher River (Peguis), Portage (Portage la Prairie), Steinbach (Steinbach), and Winkler (Winkler)."
}},
{{
    "QUESTION": "What skills are taught in the Customer Service (BUSA-1081) in the Administrative Assistant program?",
    "ANSWER": "The Customer Service course focuses on listening, empathy, customer motivation, communication, conflict resolution, and interpersonal communication skills."
}}
```
"""
)

In [8]:
import json
import re

def custom_json_parser(response):
    """Extract question/answer objects from a model response.

    The text that is searched is the body of the first ```json fenced block when
    the response contains one, otherwise the whole response. The objects do not
    have to form one valid JSON document: the prompt asks for comma-separated
    objects, so each object is decoded on its own.

    Args:
        response: An object with a ``content`` attribute (its ``content`` is
            stringified), or any other value (stringified directly).

    Returns:
        A list of dicts in order of appearance, one per JSON object that has both
        a ``QUESTION`` and an ``ANSWER`` key. Empty when nothing usable is found.
    """
    raw = str(response.content) if hasattr(response, 'content') else str(response)

    code_blocks = re.findall(r"```json\s*(.*?)\s*```", raw, re.DOTALL)
    json_content = code_blocks[0].strip() if code_blocks else raw

    # Let the JSON decoder find where each object ends. Counting braces by hand
    # breaks as soon as a question or answer contains "{" or "}" inside a string,
    # and a single stray "}" would make every later object unreachable.
    decoder = json.JSONDecoder()
    json_objects = []
    pos = json_content.find('{')
    while pos != -1:
        try:
            parsed_obj, end = decoder.raw_decode(json_content, pos)
        except json.JSONDecodeError:
            # Not a decodable object at this brace; try the next opening brace.
            pos = json_content.find('{', pos + 1)
            continue
        if 'QUESTION' in parsed_obj and 'ANSWER' in parsed_obj:
            json_objects.append(parsed_obj)
        # Resume after the decoded object so nested objects are not revisited.
        pos = json_content.find('{', end)

    return json_objects

In [9]:
from langchain_core.runnables import RunnableLambda
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

# Load settings from a .env file (if any) so OPENAI_API_KEY can be read below.
load_dotenv()

# Wrap the plain parsing function so it can be piped after the model.
parser_runnable = RunnableLambda(custom_json_parser)

# prompt -> gpt-4o -> parsed list of QUESTION/ANSWER dicts
chain = prompt | ChatOpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    model="gpt-4o",
    streaming=True,
    temperature=0,
) | parser_runnable

In [None]:
# Name of the elements global to process. The elements themselves are looked up
# from this single name so the data and the label used in the output file names
# can never disagree.
element_name = "ADMASSI_Print_Friendly_elements"
selected_elements = globals()[element_name]

qa_pairs = []

# One model call per chunk. A failing chunk is reported and skipped so the
# remaining chunks are still processed, but the failure is no longer silent.
for index, element in enumerate(selected_elements):
    try:
        result = chain.invoke({
            "context": element.text,
            "domain": "Administrative Assistant Program and Course Catalogue",
            "num_questions": "20",
            "program_name": "Administrative Assistant"
        })
    except Exception as e:
        print(f"Error: {element_name} chunk {index}: {e}")
        continue
    # The parser returns an empty list when no usable object was found.
    if result:
        qa_pairs.extend(result)

In [None]:
import json
from datetime import datetime

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Build the path once: the file that is written and the path that is reported
# are then guaranteed to be the same (the message used to say "json1/").
jsonl_path = f"jsonl/{element_name}_{timestamp}.jsonl"

# Create the output directory on first use instead of failing in open().
os.makedirs(os.path.dirname(jsonl_path), exist_ok=True)

# One JSON object per line, with instruction/input/output keys.
with open(jsonl_path, "w", encoding="utf-8") as f:
    for qa in qa_pairs:
        qa_modified = {
            "instruction": qa["QUESTION"],
            "input": "",
            "output": qa["ANSWER"],
        }
        f.write(json.dumps(qa_modified, ensure_ascii=False) + "\n")
print(f"QA pairs saved to {jsonl_path}")

QA pairs saved to json1/rrc_viewbook_Print_Friendly_elements_20250602_154513.jsonl


In [None]:
def save_qa_to_json_simple(qa_pairs, filename=None):
    """Write ``qa_pairs`` to ``filename`` as indented UTF-8 JSON.

    Args:
        qa_pairs: JSON-serialisable collection; its ``len()`` is reported.
        filename: Destination path. Missing parent directories are created.
            When omitted, ``qa_pairs_<YYYYmmdd_HHMMSS>.json`` in the current
            working directory is used.

    Returns:
        The path that was written, or ``None`` when saving failed (the error is
        printed instead of being raised).
    """
    # The default used to reach open(None) and always fail; give it a real name.
    if filename is None:
        filename = f"qa_pairs_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

    try:
        # Create the target directory so a fresh checkout does not fail in open().
        parent_dir = os.path.dirname(filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(qa_pairs, f, ensure_ascii=False, indent=2)

        print(f"Filename: {filename}")
        print(f"Questions: {len(qa_pairs)}")
        print("Saved")
        return filename

    except Exception as e:
        # Deliberately best-effort: callers receive None rather than an exception.
        print(f"Error: {e}")
        return None

In [None]:
saved_file = save_qa_to_json_simple(qa_pairs, f"dataset/{element_name}_{timestamp}.json")

# save_qa_to_json_simple returns None on failure; do not report success then.
if saved_file is not None:
    print(f"Completed: {element_name}")
else:
    print(f"Failed: {element_name}")

Filename: dataset/rrc_viewbook_Print_Friendly_elements_20250602_154513.json
Questions: 740
Saved
Completed: rrc_viewbook_Print_Friendly_elements
