In [24]:
import pandas as pd
import json
from generate_graph import get_propositions, generateEdges, createGraph, get_propositions_nosplit
from refine_graph import refine
from query_graph import QueryGraph
from tqdm import tqdm
tqdm.pandas()

## Dataset

In [25]:
from PyPDF2 import PdfReader
# provide the path of  pdf file/files.
pdfreader = PdfReader('datasets/employeemanual/employeemanual.pdf')

In [26]:
raw_text = ''
for i, page in enumerate(pdfreader.pages):
    content = page.extract_text()
    if content:
        raw_text += content

In [27]:
import tiktoken

# Use GPT-4o encoding
enc = tiktoken.encoding_for_model("gpt-4o-mini")
tokens = enc.encode(raw_text)
print(f"Total tokens: {len(raw_text)}")

Total tokens: 376636


In [5]:
page_texts = []

for page in pdfreader.pages:
    text = page.extract_text() or ""
    lines = text.splitlines()

    page_texts.append("\n".join(lines))
    
print(page_texts)



In [6]:
for i in page_texts:
    print(i)
    print('-------------------')

 
  
-------------------
Title:  
EMPLOYEE MANUAL  Reference No.  OPR – HRD – D – M – 001 (01) 
DEPARTMENT  HUMAN RESOURCES  
This is a controlled document. Photocopying or printing of this document, without the controlled mark, shall make this document an 
uncontrolled copy.  
 
Date of Effectivity  1 August  2023 Control Mark  
 Page 1 of 155 
Supersedes  1 December 2019  
  
 
 
Education that works.  
 
 
 
 
 
EMPLOYEE  
MANUAL  
  
 
 
 
-------------------
Title:  
EMPLOYEE MANUAL  Reference No.  OPR – HRD – D – M – 001 (01) 
DEPARTMENT  HUMAN RESOURCES  
This is a controlled document. Photocopying or printing of this document, without the controlled mark, shall make this document an 
uncontrolled copy.  
 
Date of Effectivity  1 August  2023 Control Mark  
 Page 2 of 155 
Supersedes  1 December 2019  
  
MANUAL VERSION CONTROL LOG  
Version 
Control No.  Description  Date of 
Effectivity  Remarks  
2015 version  Staff Manual    
ADM -HR-D-
M-001(02)  • Renam ing of document fro

In [36]:
# Title:  
# EMPLOYEE MANUAL  Reference No.  OPR – HRD – D – M – 001 (01) 
# DEPARTMENT  HUMAN RESOURCES  
# This is a controlled document. Photocopying or printing of this document, without the controlled mark, shall make this document an 
# uncontrolled copy.  

# Define lists of known header/footer phrases
common_headers = ["Title:", "EMPLOYEE MANUAL  Reference No.", "DEPARTMENT  HUMAN RESOURCES", "This is a controlled document.", "uncontrolled copy."]
common_footers = ["Date of Effectivity", "Page", "Supersedes"]
page_texts = []

for page in pdfreader.pages:
    text = page.extract_text() or ""
    lines = text.splitlines()

    # Filter out lines that contain any header/footer keywords
    filtered = [
        line for line in lines
        if not any(keyword in line for keyword in common_headers + common_footers)
    ]

    page_texts.append("\n".join(filtered))

In [37]:
page_texts[0:10]
# len(page_texts)

[' \n  ',
 ' \n  \n \n \nEducation that works.  \n \n \n \n \n \nEMPLOYEE  \nMANUAL  \n  \n \n \n ',
 ' \n  \nMANUAL VERSION CONTROL LOG  \nVersion \nControl No.  Description  Date of \nEffectivity  Remarks  \n2015 version  Staff Manual    \nADM -HR-D-\nM-001(02)  • Renam ing of document from Staff \nManual to Employee Manual  \n• Revision of the major contents to align \nwith the  2019 Faculty Manual  \n• Revision of annexed HR Forms   \n 1 December 2019  Prepared by \nSolutions \nFoundry, Inc.  \nOPR -HRD-D-\nM-001 • Revisions due to the updating of \ncurrent practices and changes made \nduring & after the COVID -19 \npandemic . \n• Revision of the Organizational \nStructure in reference to the new \nInstitutional Table of Organization \neffective 2nd Term AY 2023 -2024 \n• Revision of the Annexes which forms \npart of the HR Forms  (SPES for Faculty \nand ASP) and the overview of the \ndivisions of National University  \n 1 August  2023   \n \n  ',
 ' \n \nMANUAL APPROVAL CONTROL LO

## Experiments

### Initial Experiment

In [38]:
propositions = []

for context in enumerate(tqdm(page_texts[0:10])):
    get_propositions_nosplit(context, propositions)

  prompt = loads(json.dumps(prompt_object.manifest))
100%|██████████| 10/10 [02:35<00:00, 15.50s/it]


In [43]:
propositions = []

for context in enumerate(tqdm(page_texts)):
    get_propositions_nosplit(context, propositions)

100%|██████████| 156/156 [49:22<00:00, 18.99s/it]


took 49m and 22s

In [44]:
# Open the file in read mode
with open('propositions_employeemanual.txt', 'r') as file:
    # Read all lines and store them in a list
    propositions_from_file = [line.strip() for line in file]

print(propositions_from_file)



In [45]:
len(propositions_from_file)

3909

In [46]:
list_of_edges = generateEdges(propositions_from_file)

100%|██████████| 3909/3909 [3:48:06<00:00,  3.50s/it]   


took 228m and 6.9s

In [47]:
if createGraph(list_of_edges):
    print("Success")
else:
    print("Failed")

Success


took 247m and 11.7s

In [None]:
data = [
    {
        "ID": 1,
        "Question": "What is the effective date of the Employee Manual?",
        "Answer": "The Employee Manual is effective starting August 1, 2023."
    },
    {
        "ID": 2,
        "Question": "Who founded National University and when?",
        "Answer": "National University was founded by Don Mariano Fortunato Jhocson on August 1, 1900."
    },
    {
        "ID": 3,
        "Question": "What is the National University's vision?",
        "Answer": "National University's vision is to be a dynamic private institution committed to nation-building, recognized internationally in education and research."
    },
    {
        "ID": 4,
        "Question": "What are the \"5 Commandments\" for data privacy compliance emphasized by the Commission on Higher Education (CHED) as adopted by National University?",
        "Answer": "The \"5 Commandments\" include: Appointing a Data Protection Officer (DPO), Conducting a Privacy Impact Assessment, Creating a Privacy Management Program, Implementing privacy and data protection measures, and Performing Breach Reporting Procedure."
    },
    {
        "ID": 5,
        "Question": "What happens if an employment contract is not given a written notice of end of contract at least fifteen (15) days before its expiration?",
        "Answer": "In the absence of a written notice of end of contract at least fifteen (15) days before its expiration, the contract is deemed automatically renewed."
    },
    {
        "ID": 6,
        "Question": "How are salaries and benefits paid to employees of National University?",
        "Answer": "The payment of salaries and benefits is made through BDO/China Bank ATM Debit Card, and all employees are required to secure one at the onset of their employment."
    },
    {
        "ID": 7,
        "Question": "Who is responsible for managing all aspects of the educational program and day-to-day operations and governance of the campuses?",
        "Answer": "The Vice President for Operations leads operations for NU Main, and the Executive Director leads operations for other campuses. They manage the educational program, daily operations, and campus governance."
    },
    {
        "ID": 8,
        "Question": "Where should student-related concerns like grades, faculty concerns, discipline, or behavior of other students and faculty members be referred?",
        "Answer": "Student-related concerns should be referred to the Student Development and Activities Office or its equivalent in other campuses."
    },
    {
        "ID": 9,
        "Question": "What are the sanctions for four or more instances of tardiness (and/or undertime) per pay period for three consecutive times within a calendar year?",
        "Answer": "1st offense: Written reprimand and counseling. 2nd offense: Written reprimand with warning of Dismissal. 3rd offense: Dismissal."
    },
    {
        "ID": 10,
        "Question": "What are some of National University's major achievements as of 2022-2023?",
        "Answer": "Major achievements include: 3 QS Stars Rating in 2022, CHEd Autonomous Status in 2019, Center of Excellence in IT Education, 100% of 24 programs accredited (54.16% Level IV), ISO 9001:2015 Certification, UAAP titles, and establishment of 8 new campuses."
    }
]

df = pd.DataFrame(data)