## PDF and Elements Setting

In [8]:
from unstructured.partition.pdf import partition_pdf


def extract_pdf_elements(filepath):
    """Partition a single PDF with ``partition_pdf`` using fixed settings.

    Args:
        filepath: Path of the PDF; forwarded to ``partition_pdf`` as ``filename``.

    Returns:
        The value returned by ``partition_pdf`` for the options below.
    """
    # All tuning knobs live in one mapping so they can be read at a glance.
    partition_options = {
        "strategy": "fast",
        "extract_images_in_pdf": False,
        "infer_table_structure": False,
        "chunking_strategy": "by_title",
        "max_characters": 4000,
        "new_after_n_chars": 3800,
        "combine_text_under_n_chars": 2000,
    }
    return partition_pdf(filename=filepath, **partition_options)

In [9]:
# PDF inputs to partition; every path has the form "RRC/<stem>_Print_Friendly.pdf".
adnd_pdf_files = [
    f"RRC/{stem}_Print_Friendly.pdf"
    for stem in (
        "ADD",
        "DSML",
        "FULLSTACK",
        "INFOSEC",
        "CYBER",
        "ITOPS",
        "NETSYS",
        "rrc-viewbook-v1qros",
    )
]

In [10]:
import os

# Partition every configured PDF and publish the result as a notebook-level
# variable named "<file stem>_elements".
#
# The variable name is derived from the file name, so it is not always a valid
# Python identifier (e.g. a stem containing "-"); such entries can only be read
# back through globals()[name].
for filepath in adnd_pdf_files:
    if not os.path.exists(filepath):
        print(f"Not Found: {filepath}")
        continue

    try:
        elements = extract_pdf_elements(filepath)
    except Exception as e:
        # Best effort: report the failure and keep processing the other files.
        print(f"Error: {filepath}: {e}")
        continue

    # splitext strips only the trailing extension; str.replace(".pdf", ...)
    # would rewrite every ".pdf" occurrence and skip the suffix entirely for
    # other extensions such as ".PDF".
    var_name = os.path.splitext(os.path.basename(filepath))[0] + "_elements"
    globals()[var_name] = elements
    print(f"Saved: {var_name}")

Saved: ADD_Print_Friendly_elements
Saved: DSML_Print_Friendly_elements
Saved: FULLSTACK_Print_Friendly_elements
Saved: INFOSEC_Print_Friendly_elements
Saved: CYBER_Print_Friendly_elements
Saved: ITOPS_Print_Friendly_elements
Saved: NETSYS_Print_Friendly_elements


Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P0' is an invalid float value
Cannot set gray non-stroke color because /'P1' is an invalid float value
Cannot set gray non-stroke color because /'P2' is an invalid float value
Cannot set gray non-stroke color because /'P3' is an invalid float value
Cannot set gray non-stroke color because /'P4' is an invalid float value
Cannot set gray stroke color because /'P5' is an invalid float value
Cannot set gray stroke color because /'P6' is an invalid float value


Saved: rrc-viewbook-v1qros_Print_Friendly_elements


In [11]:
# Names of all notebook-level variables that hold partitioned PDF elements.
# The lookup is purely by suffix, so any global whose name ends with
# "_Print_Friendly_elements" is reported.
elements_vars = [
    global_name
    for global_name in tuple(globals())
    if global_name.endswith("_Print_Friendly_elements")
]

# Report how many elements each of those variables contains.
for var_name in elements_vars:
    elements = globals()[var_name]
    print(f"{var_name}: {len(elements)} elements")

ADD_Print_Friendly_elements: 14 elements
DSML_Print_Friendly_elements: 12 elements
FULLSTACK_Print_Friendly_elements: 10 elements
INFOSEC_Print_Friendly_elements: 10 elements
CYBER_Print_Friendly_elements: 11 elements
ITOPS_Print_Friendly_elements: 11 elements
NETSYS_Print_Friendly_elements: 8 elements
rrc-viewbook-v1qros_Print_Friendly_elements: 37 elements
rrc_viewbook_Print_Friendly_elements: 37 elements


In [12]:
# Global-namespace keys under which the partitioned elements were stored.
# The viewbook key contains "-" and therefore is not a valid identifier.
(
    ADD_var_name,
    DSML_var_name,
    FULLSTACK_var_name,
    INFOSEC_var_name,
    CYBER_var_name,
    ITOPS_var_name,
    NETSYS_var_name,
    rrc_viewbook_var_name,
) = (
    f"{stem}_Print_Friendly_elements"
    for stem in (
        "ADD",
        "DSML",
        "FULLSTACK",
        "INFOSEC",
        "CYBER",
        "ITOPS",
        "NETSYS",
        "rrc-viewbook-v1qros",
    )
)

# Bind each stored object to a regular variable name. For the first seven the
# key equals the variable name; the viewbook entry gets an identifier-safe alias.
(
    ADD_Print_Friendly_elements,
    DSML_Print_Friendly_elements,
    FULLSTACK_Print_Friendly_elements,
    INFOSEC_Print_Friendly_elements,
    CYBER_Print_Friendly_elements,
    ITOPS_Print_Friendly_elements,
    NETSYS_Print_Friendly_elements,
    rrc_viewbook_Print_Friendly_elements,
) = (
    globals()[key]
    for key in (
        ADD_var_name,
        DSML_var_name,
        FULLSTACK_var_name,
        INFOSEC_var_name,
        CYBER_var_name,
        ITOPS_var_name,
        NETSYS_var_name,
        rrc_viewbook_var_name,
    )
)

In [13]:
# One line per document: "<element count> <label>".
print(
    *(
        f"{len(doc_elements)} {label}"
        for doc_elements, label in (
            (ADD_Print_Friendly_elements, "ADD elements"),
            (DSML_Print_Friendly_elements, "DSML elements"),
            (FULLSTACK_Print_Friendly_elements, "FULLSTACK elements"),
            (INFOSEC_Print_Friendly_elements, "INFOSEC elements"),
            (CYBER_Print_Friendly_elements, "CYBER elements"),
            (ITOPS_Print_Friendly_elements, "ITOPS elements"),
            (NETSYS_Print_Friendly_elements, "NETSYS elements"),
            (rrc_viewbook_Print_Friendly_elements, "RRC Viewbook elements"),
        )
    ),
    sep="\n",
)

14 ADD elements
12 DSML elements
10 FULLSTACK elements
10 INFOSEC elements
11 CYBER elements
11 ITOPS elements
8 NETSYS elements
37 RRC Viewbook elements


## Generator

In [67]:
from langchain_core.prompts import PromptTemplate

# Prompt for generating question/answer pairs from one chunk of text.
# Template variables: {context}, {domain}, {num_questions}.
# Literal braces in the example output are escaped as {{ and }}.
# The key names requested in the instructions are the uppercase QUESTION/ANSWER
# used in the example, so the instructions and the example agree.
prompt = PromptTemplate.from_template(
    """Context information is below. You are only aware of this context and nothing else.
---------------------

{context}

---------------------
Given this context, generate only questions based on the below query.
You are a Teacher/Professor in {domain}. 
Your task is to provide exactly **{num_questions}** question(s) for an upcoming quiz/examination.
You are not to provide more or less than this number of questions. 
The question(s) should be diverse in nature across the document. 
The purpose of question(s) is to test the understanding of the students on the context information provided.
You must also provide the answer to each question. The answer should be based on the context information provided only.

Restrict the question(s) to the context information provided only.
QUESTION and ANSWER should be written in English. Respond in JSON format which contains the `QUESTION` and `ANSWER` keys.
DO NOT USE List in JSON format.
ANSWER should be a complete sentence.

#Format:
```json
{{
    "QUESTION": "Where are the Student Service Centres located at Red River College?",
    "ANSWER": "The Student Service Centres are located at the Notre Dame and Exchange District Campuses of Red River College."
}},
{{
    "QUESTION": "What services does RRC provide for student mental health and accessibility?",
    "ANSWER": "RRC offers counselling for personal, academic, and financial issues, and provides accessibility services like interpreting, note-taking, and adaptive technology for students with diagnosed disabilities."
}},
{{
    "QUESTION": "Where can students access library resources at RRC?",
    "ANSWER": "Libraries are located at both Notre Dame and Exchange District Campuses, offering books, databases, media services, and reciprocal borrowing from U of M and U of W."
}}
```
"""
)

In [68]:
import json
import re

def custom_json_parser(response):
    """Extract question/answer objects from a model response.

    The text is taken from ``response.content`` when that attribute exists,
    otherwise from ``str(response)``. If the text contains fenced ```json
    blocks, the contents of all of them are scanned; otherwise the whole text
    is scanned. Every JSON object found that has both a ``QUESTION`` and an
    ``ANSWER`` key is collected, in order of appearance.

    Objects are located with ``json.JSONDecoder.raw_decode`` rather than by
    counting braces, so ``{`` / ``}`` characters inside string values and
    stray unmatched braces in the surrounding text do not corrupt the result.

    Args:
        response: A message-like object with a ``content`` attribute, or any
            object whose ``str()`` is the response text.

    Returns:
        list[dict]: The parsed objects; empty when nothing usable was found.
    """
    raw = str(response.content) if hasattr(response, "content") else str(response)

    code_blocks = re.findall(r"```json\s*(.*?)\s*```", raw, re.DOTALL)
    # Scan every fenced block, not just the first, so answers spread over
    # several fences are not silently dropped.
    json_content = "\n".join(code_blocks) if code_blocks else raw

    decoder = json.JSONDecoder()
    json_objects = []

    pos = json_content.find("{")
    while pos != -1:
        try:
            parsed_obj, end = decoder.raw_decode(json_content, pos)
        except json.JSONDecodeError:
            # Not a valid object starting here; retry from the next "{".
            pos = json_content.find("{", pos + 1)
            continue

        if (
            isinstance(parsed_obj, dict)
            and "QUESTION" in parsed_obj
            and "ANSWER" in parsed_obj
        ):
            json_objects.append(parsed_obj)

        # Resume after the decoded object so nested objects are not re-read.
        pos = json_content.find("{", end)

    return json_objects

In [69]:
from langchain_core.runnables import RunnableLambda
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

# Read variables from a .env file (if any) before the API key is looked up.
load_dotenv()

# Wrap the plain parsing function so it can be used as the last pipeline step.
parser_runnable = RunnableLambda(custom_json_parser)

# Pipeline: prompt -> chat model -> list of parsed question/answer objects.
chain = (
    prompt
    | ChatOpenAI(
        api_key=os.environ.get("OPENAI_API_KEY"),
        model="gpt-4o",
        streaming=True,
        temperature=0,
    )
    | parser_runnable
)

In [70]:
# Document whose chunks are turned into question/answer pairs, plus the label
# used in the output file names.
selected_elements = rrc_viewbook_Print_Friendly_elements
element_name = "rrc_viewbook_Print_Friendly_elements"

qa_pairs = []
failed_chunks = 0

# One chain call per chunk; each call asks for the same number of questions.
for chunk_index, element in enumerate(selected_elements):
    try:
        result = chain.invoke({
            "context": element.text,
            "domain": "Red River College Viewbook",
            "num_questions": "20",
        })
    except Exception as exc:
        # Still best effort (one bad chunk must not abort the run), but the
        # failure is reported instead of being silently discarded.
        failed_chunks += 1
        print(f"Error: chunk {chunk_index} of {element_name}: {exc}")
        continue

    if result:
        qa_pairs.extend(result)

if failed_chunks:
    print(f"{failed_chunks} of {len(selected_elements)} chunks failed for {element_name}")

In [71]:
import json
from datetime import datetime

# Export the pairs as JSON Lines, one {"instruction", "input", "output"} record
# per line. `timestamp` is kept as a notebook-level name because a later cell
# reuses it for the JSON export.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
jsonl_path = f"jsonl/{element_name}_{timestamp}.jsonl"

# Create the output directory on a fresh checkout instead of failing in open().
os.makedirs(os.path.dirname(jsonl_path), exist_ok=True)

with open(jsonl_path, "w", encoding="utf-8") as f:
    for qa in qa_pairs:
        qa_modified = {
            "instruction": qa["QUESTION"],
            "input": "",
            "output": qa["ANSWER"],
        }
        f.write(json.dumps(qa_modified, ensure_ascii=False) + "\n")

# Report the path that was actually written (the old message said "json1/").
print(f"QA pairs saved to {jsonl_path}")

QA pairs saved to json1/rrc_viewbook_Print_Friendly_elements_20250602_154513.jsonl


In [72]:
def save_qa_to_json_simple(qa_pairs, filename=None):
    """Write ``qa_pairs`` to ``filename`` as indented UTF-8 JSON.

    Args:
        qa_pairs: JSON-serialisable collection to dump.
        filename: Destination path. When omitted, a timestamped
            ``qa_pairs_<YYYYmmdd_HHMMSS>.json`` in the current working
            directory is used. Missing parent directories are created.

    Returns:
        The path written to, or ``None`` when writing failed (the error is
        printed rather than raised).
    """
    if filename is None:
        # The default used to be unusable: open(None) always failed.
        filename = f"qa_pairs_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

    try:
        # Only path-like targets have a parent directory to create.
        if isinstance(filename, (str, bytes, os.PathLike)):
            parent_dir = os.path.dirname(filename)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

        with open(filename, "w", encoding="utf-8") as f:
            json.dump(qa_pairs, f, ensure_ascii=False, indent=2)

        print(f"Filename: {filename}")
        print(f"Questions: {len(qa_pairs)}")
        print("Saved")
        return filename

    except (OSError, TypeError, ValueError) as e:
        # OSError: filesystem problems; TypeError/ValueError: data that
        # json.dump cannot serialise or an unusable filename.
        print(f"Error: {e}")
        return None

In [73]:
# Write the raw QUESTION/ANSWER objects as a single JSON document.
saved_file = save_qa_to_json_simple(qa_pairs, f"dataset/{element_name}_{timestamp}.json")

# save_qa_to_json_simple returns None on failure; do not report success then.
if saved_file is None:
    print(f"Failed: {element_name}")
else:
    print(f"Completed: {element_name}")

Filename: dataset/rrc_viewbook_Print_Friendly_elements_20250602_154513.json
Questions: 740
Saved
Completed: rrc_viewbook_Print_Friendly_elements
