In [121]:
import pandas as pd
import json
from generate_graph import get_propositions, generateEdges, createGraph, get_propositions_nosplit
from refine_graph import refine
from query_graph import QueryGraph
from tqdm import tqdm
tqdm.pandas()

## Dataset

In [3]:
from PyPDF2 import PdfReader
# provide the path of  pdf file/files.
pdfreader = PdfReader('datasets/researchmanual/researchmanual.pdf')

In [4]:
raw_text = ''
for i, page in enumerate(pdfreader.pages):
    content = page.extract_text()
    if content:
        raw_text += content

In [5]:
import tiktoken

# Use GPT-4o encoding
enc = tiktoken.encoding_for_model("gpt-4o-mini")
tokens = enc.encode(raw_text)
print(f"Total tokens: {len(raw_text)}")

Total tokens: 131258


In [6]:
page_texts = []

for page in pdfreader.pages:
    text = page.extract_text() or ""
    lines = text.splitlines()

    page_texts.append("\n".join(lines))
    
print(page_texts)

['  \n \n \n \n \nRESEARCH  MANUAL  \n  ', ' \nTitle:  \nRESEARCH  \n MANUAL  Reference No.  RAD – RS – D – M – 001  \nDEPARTMENT  OFFICE OF THE VICE PRESIDENT  FOR \nRESEARCH AND DEVELOP MENT  \nCopies To  All Concerned Units  \nThis is a controlled document. Photocopying or printing of this document, without the controlled mark, shall make this document an uncontrolled \ncopy.  \n \nDate of Effectivity  4 April 2022  Control Mark  \nPage 2 of 60 \nSupersedes  None  \n \n \nMANUAL VERSION CONTROL LOG  \nVersion Control \nNo. Description  Date  Remarks  \nRAD – RI – D – M \n– 001 Guidelines on the conduct of Research efforts \nwithin National University  November 1, \n2020  Approved on  \n              \n2020 \nRAD – RS – D – M \n– 001 Guidelines on the conduct of Research efforts \nwithin National University  April 4, 2022  Approved on  \n2022  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n ', ' \nTitle:  \nRESEARCH  \n MANUAL  Reference No.  RAD – RS – D – M – 001  \nDEP

In [7]:
for i in page_texts:
    print(i)
    print('-------------------')

  
 
 
 
 
RESEARCH  MANUAL  
  
-------------------
 
Title:  
RESEARCH  
 MANUAL  Reference No.  RAD – RS – D – M – 001  
DEPARTMENT  OFFICE OF THE VICE PRESIDENT  FOR 
RESEARCH AND DEVELOP MENT  
Copies To  All Concerned Units  
This is a controlled document. Photocopying or printing of this document, without the controlled mark, shall make this document an uncontrolled 
copy.  
 
Date of Effectivity  4 April 2022  Control Mark  
Page 2 of 60 
Supersedes  None  
 
 
MANUAL VERSION CONTROL LOG  
Version Control 
No. Description  Date  Remarks  
RAD – RI – D – M 
– 001 Guidelines on the conduct of Research efforts 
within National University  November 1, 
2020  Approved on  
              
2020 
RAD – RS – D – M 
– 001 Guidelines on the conduct of Research efforts 
within National University  April 4, 2022  Approved on  
2022  
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
-------------------
 
Title:  
RESEARCH  
 MANUAL  Reference No.  RAD – RS – D – M – 001  
DEPARTMENT  OFFICE OF THE VI

In [10]:
# Title:  
# RESEARCH  
#  MANUAL  Reference No.  RAD – RS – D – M – 001  
# DEPARTMENT  OFFICE OF THE VICE PRESIDENT  FOR 
# RESEARCH AND DEVELOP MENT  
# Copies To  All Concerned Units  
# This is a controlled document. Photocopying or printing of this document, without the controlled mark, shall make this document an uncontrolled 
# copy.  
 
# Date of Effectivity  4 April 2022  Control Mark  
# Page 3 of 60 
# Supersedes  None   

# Define lists of known header/footer phrases
common_headers = ["Title:", 
                  "RESEARCH", 
                  " MANUAL  Reference No.", 
                  "DEPARTMENT  OFFICE OF THE VICE PRESIDENT", 
                  "RESEARCH AND DEVELOP MENT ",
                  "Copies To  All Concerned Units",
                  "This is a controlled document.", 
                  "copy."]

common_footers = ["Date of Effectivity", "Page", "Supersedes"]

page_texts = []

for page in pdfreader.pages:
    text = page.extract_text() or ""
    lines = text.splitlines()

    # Filter out lines that contain any header/footer keywords
    filtered = [
        line for line in lines
        if not any(keyword in line for keyword in common_headers + common_footers)
    ]

    page_texts.append("\n".join(filtered))

In [11]:
page_texts[0:10]
# len(page_texts)

['  \n \n \n \n \n  ',
 ' \n \n \n \nMANUAL VERSION CONTROL LOG  \nVersion Control \nNo. Description  Date  Remarks  \nRAD – RI – D – M \n– 001 Guidelines on the conduct of Research efforts \nwithin National University  November 1, \n2020  Approved on  \n              \n2020 \nRAD – RS – D – M \n– 001 Guidelines on the conduct of Research efforts \nwithin National University  April 4, 2022  Approved on  \n2022  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n ',
 ' \n \n \n \nMANUAL APPROVAL AND DISTRIBUTION LIST  \nPrepared by:  Checked by:  \n \n \nCenter for Research   \nJohanna G. Minglana , LPT  \nQuality Management Office Director  \nDate:  April 4, 2022  Date:  April 4, 2022  \n \n \nConcurred by:  Approved by:  \n \n  \n \n \nRenato Carlos H. Ermita, Jr., Ph.D.  \nPresident  \nDate:  April 4, 2022  Date:  April 4, 2022  \n \n \nDISTRIBUTION LIST  COPY  RECEIVED  \nQuality Management Office  Original   \nOffice of the President  Copy 1   \nOffice of the Vice -Presiden

## Experiments

### Initial Experiment

In [12]:
propositions = []

for context in enumerate(tqdm(page_texts)):
    get_propositions_nosplit(context, propositions)

100%|██████████| 60/60 [16:38<00:00, 16.64s/it]


took 44m and 22s

In [13]:
# Open the file in read mode
with open('propositions_researchmanual.txt', 'r') as file:
    # Read all lines and store them in a list
    propositions_from_file = [line.strip() for line in file]

print(propositions_from_file)

['The document is a manual version control log.', "The title of the document is 'Version Control'.", 'The log contains entries with a number, description, date, and remarks.', 'The first entry is RAD – RI – D – M – 001.', "The description of the first entry is 'Guidelines on the conduct of Research efforts within National University'.", 'The date of the first entry is November 1, 2020.', 'The remarks for the first entry state that it was approved on 2020.', 'The second entry is RAD – RS – D – M – 001.', "The description of the second entry is 'Guidelines on the conduct of Research efforts within National University'.", 'The date of the second entry is April 4, 2022.', 'The remarks for the second entry state that it was approved on 2022.', 'The manual approval and distribution list was prepared by Johanna G. Minglana.', 'Johanna G. Minglana holds the title of Quality Management Office Director.', 'The manual approval and distribution list was checked by Johanna G. Minglana.', 'The date 

In [14]:
len(propositions_from_file)

1453

In [15]:
list_of_edges = generateEdges(propositions_from_file)

100%|██████████| 1453/1453 [1:04:55<00:00,  2.68s/it]


took 176m and 13.1s

In [16]:
if createGraph(list_of_edges):
    print("Success")
else:
    print("Failed")

Success


# Query Graph

In [122]:
from langchain_ollama import ChatOllama

10 Questions

In [123]:
data = [
    {
        "ID": 1,
        "Question": "What is the primary purpose of the National University Research Manual?",
        "Answer": "The Manual provides an overview of National University's policies, processes, and regulations for the conduct of research, aiming to familiarize constituents with important information about Research and Innovation, and provide guidelines for efficient and ethical research within NU Manila and other NU campuses."
    },
    {
        "ID": 2,
        "Question": "What is the reference number of this Research Manual?",
        "Answer": "The reference number of the Research Manual is RAD-RS-D-M-001."
    },
    {
        "ID": 3,
        "Question": "What are the three university-wide research centers established by National University?",
        "Answer": "The three university-wide research centers are the Center for Research, the Center for Innovation and Entrepreneurship, and the Center for Resilient Philippines."
    },
    {
        "ID": 4,
        "Question": "What is the vision of the Center for Entrepreneurship?",
        "Answer": "The Center for Entrepreneurship is envisioned to be an inclusive, realistic, and collaborative community."
    },
    {
        "ID": 5,
        "Question": "What are some of the core competencies identified by the Center for Resilient Philippines (CRP)?",
        "Answer": "The core competencies of the CRP include disaster resilience from social/political, economic, and physical sciences perspectives; new technology-enabled mechanisms for resilient community planning and design; development of innovative national and local resilience policies; private sector engagement in disaster resilience; capacity building for disaster mitigation and reconstruction; networking with the academic community; and community engagement and participation in reconstruction."
    },
    {
        "ID": 6,
        "Question": "Name at least three research goals of the National University Research Agenda.",
        "Answer": "Some research goals include: to recruit, develop, and retain faculty researchers; to provide proactive service to the academic community on research-related endeavors; to assist faculty researchers in publishing outputs in top international journals; to obtain and manage externally funded research projects; and to foster collaboration with local and international higher education institutions."
    },
    {
        "ID": 7,
        "Question": "What are some of the Research Themes outlined in the National University Research Agenda?",
        "Answer": "Research Themes include Food, Nutrition, and Health; Emerging Industries on the Fourth Industrial Revolution; Development of vaccines and diagnostic kits using indigenous materials; Disaster risk management; Tourism and Pollution control; Climate change; Sports Technology; Education and learning innovations; and Business sophistication."
    },
    {
        "ID": 8,
        "Question": "Which colleges or campuses have their research agenda defined in the manual?",
        "Answer": "The manual defines the research agendas for the College of Education, Arts and Sciences (CEAS), College of Business and Accountancy (CBA), College of Architecture (COA), College of Engineering (COE), College of Computing and Information Technologies (CCIT), College of Allied Health (CAH), College of Dentistry (COD), and College of Tourism and Hospitality Management (CTHM)."
    },
    {
        "ID": 9,
        "Question": "What is the Journal of Sciences, Technology, and Arts (JSTAR)?",
        "Answer": "JSTAR is the official, peer-reviewed, open-access publication of National University (Philippines), published annually by the Center for Research, providing a venue for students, faculty, non-teaching personnel, and industry practitioners to share research works, empirical studies, and theories related to science, technology, and arts."
    },
    {
        "ID": 10,
        "Question": "According to the Authorship Policy, what is the minimum total points an individual needs in the co-authorship scoring system to share authorship?",
        "Answer": "Anyone achieving a total of 25 points in the co-authorship scoring system shares authorship."
    },
    {
        "ID": 11,
        "Question": "What is the current budget allocated for the Center for Innovation and Entrepreneurship for the upcoming fiscal year?",
        "Answer": ""
    },
    {
        "ID": 12,
        "Question": "Who is the specific contact person for submitting research proposals for the College of Allied Health, including their email address and phone number?",
        "Answer": ""
    },
    {
        "ID": 13,
        "Question": "What are the detailed procedures and forms required for intellectual property registration for a patent developed by a faculty member?",
        "Answer": ""
    },
    {
        "ID": 14,
        "Question": "How many research projects were successfully published in international journals last year, broken down by college?",
        "Answer": ""
    },
    {
        "ID": 15,
        "Question": "What is the average duration of a research project from proposal submission to final dissemination of results?",
        "Answer": ""
    },
    {
        "ID": 16,
        "Question": "Are there any specific grants or funding opportunities available exclusively for student-led research initiatives?",
        "Answer": ""
    },
    {
        "ID": 17,
        "Question": "What is the protocol for handling research misconduct cases involving external collaborators?",
        "Answer": ""
    },
    {
        "ID": 18,
        "Question": "Which specific software or tools are recommended by the Office of the Vice President for Research and Development for data analysis in research projects?",
        "Answer": ""
    },
    {
        "ID": 19,
        "Question": "What are the benefits or incentives offered to faculty members who successfully obtain external funding for their research?",
        "Answer": ""
    },
    {
        "ID": 20,
        "Question": "When is the next scheduled workshop or training session for new faculty members on research ethics and compliance?",
        "Answer": ""
    }
]

df = pd.DataFrame(data)

In [124]:
df.head(20)

Unnamed: 0,ID,Question,Answer
0,1,What is the primary purpose of the National Un...,The Manual provides an overview of National Un...
1,2,What is the reference number of this Research ...,The reference number of the Research Manual is...
2,3,What are the three university-wide research ce...,The three university-wide research centers are...
3,4,What is the vision of the Center for Entrepren...,The Center for Entrepreneurship is envisioned ...
4,5,What are some of the core competencies identif...,The core competencies of the CRP include disas...
5,6,Name at least three research goals of the Nati...,"Some research goals include: to recruit, devel..."
6,7,What are some of the Research Themes outlined ...,"Research Themes include Food, Nutrition, and H..."
7,8,Which colleges or campuses have their research...,The manual defines the research agendas for th...
8,9,"What is the Journal of Sciences, Technology, a...","JSTAR is the official, peer-reviewed, open-acc..."
9,10,"According to the Authorship Policy, what is th...",Anyone achieving a total of 25 points in the c...


### Model 1: gemma3:12b-it-qat

In [125]:
ollama_llm = ChatOllama(
    model = "gemma3:12b-it-qat",
    temperature = 0.8,
    num_predict = 256,
)

qg = QueryGraph(lm = ollama_llm)

In [127]:
mylist = []

# Define a function to apply to each row
def print_qa(row):
    
    question = row['Question']
    questionid = row['ID']
    realanswer = row['Answer']
    req = qg.get_requirements(question)
    result = qg.answer_question(question, req.content)
    
    my_dict = {}
    
    my_dict.update({"Question": question})
    my_dict.update({"GroundTruth": realanswer})
    
    if result is not None:
        
        model_answer = result['result']

        if "don't know the answer" in model_answer:
            model_answer = ""
    
        if len(row['Answer']) > 0:
            real_answer = row['Answer']
        else:
            real_answer = "\"\""

        if len(result['intermediate_steps']) > 0:
            cypher_query = result['intermediate_steps'][0]['query']
            context = result['intermediate_steps'][1]['context']
       
            if(len(context) < 1):
                model_answer = "\"\""
                my_dict.update({"Prediction": ""})
            else:
                my_dict.update({"Prediction": model_answer})
                
        with open("researchmanual.txt", "a") as preds:
            preds.write("question: " + question + "\n")
            preds.write("real_answer: " + real_answer + "\n")
            preds.write("model_answer: " + model_answer + "\n")  
            preds.write("cypher_query: " + cypher_query + "\n")
            preds.write("======================" + "\n")  
        preds.close()

    else: 
        my_dict.update({"Prediction": ""})
        
    mylist.append(my_dict)
        

In [129]:
# Apply the function to each row
df.progress_apply(print_qa, axis=1)

print(mylist)

 75%|███████▌  | 15/20 [13:52<05:32, 66.54s/it]

Generated Cypher Statement is not valid
{code: Neo.ClientError.Statement.SyntaxError} {message: Unknown function 'year' (line 16, column 14 (offset: 501))
"    w.year = year(date('now')) - 1"
              ^}


 95%|█████████▌| 19/20 [18:42<01:11, 71.04s/it]

{code: Neo.DatabaseError.Statement.ExecutionFailed} {message: Failed to parse query ````cypher
MATCH (w)-[r1]-(x)
WHERE (
    toLower(r1.metadata) =~ '^.*\\b(software|tools)\\w*\\b.*$' OR
    toLower(r1.description) =~ '^.*\\b(software|tools)\\w*\\b.*$'
)
AND (
    toLower(r1.metadata) =~ '^.*\\b(data analysis)\\w*\\b.*$' OR
    toLower(r1.description) =~ '^.*\\b(data analysis)\\w*\\b.*$'
)
AND (
    toLower(r1.metadata) =~ '^.*\\b(research projects)\\w*\\b.*$' OR
    toLower(r1.description) =~ '^.*\\b(research projects)\\w*\\b.*$'
)
AND (
    toLower(r1.metadata) =~ '^.*\\b(office of the vice president for research and development)\\w*\\b.*$' OR
    toLower(r1.description) =~ '^.*\\b(office of the vice president for research and development)\\w*\\b.*$'
)
`.}


100%|██████████| 20/20 [21:15<00:00, 63.77s/it]

[{'Question': 'What is the primary purpose of the National University Research Manual?', 'GroundTruth': "The Manual provides an overview of National University's policies, processes, and regulations for the conduct of research, aiming to familiarize constituents with important information about Research and Innovation, and provide guidelines for efficient and ethical research within NU Manila and other NU campuses.", 'Prediction': ''}, {'Question': 'What is the reference number of this Research Manual?', 'GroundTruth': 'The reference number of the Research Manual is RAD-RS-D-M-001.', 'Prediction': 'None\n'}, {'Question': 'What is the primary purpose of the National University Research Manual?', 'GroundTruth': "The Manual provides an overview of National University's policies, processes, and regulations for the conduct of research, aiming to familiarize constituents with important information about Research and Innovation, and provide guidelines for efficient and ethical research within




In [119]:
df.head(2)

Unnamed: 0,ID,Question,Answer
0,1,What is the primary purpose of the National Un...,The Manual provides an overview of National Un...
1,2,What is the reference number of this Research ...,The reference number of the Research Manual is...


In [130]:
for i in mylist:
    print(i)
    print('-------------------')

{'Question': 'What is the primary purpose of the National University Research Manual?', 'GroundTruth': "The Manual provides an overview of National University's policies, processes, and regulations for the conduct of research, aiming to familiarize constituents with important information about Research and Innovation, and provide guidelines for efficient and ethical research within NU Manila and other NU campuses.", 'Prediction': ''}
-------------------
{'Question': 'What is the reference number of this Research Manual?', 'GroundTruth': 'The reference number of the Research Manual is RAD-RS-D-M-001.', 'Prediction': 'None\n'}
-------------------
{'Question': 'What is the primary purpose of the National University Research Manual?', 'GroundTruth': "The Manual provides an overview of National University's policies, processes, and regulations for the conduct of research, aiming to familiarize constituents with important information about Research and Innovation, and provide guidelines for 

### Evaluate

In [140]:
import re
import string

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punctuation(text):
        return text.translate(str.maketrans('', '', string.punctuation))

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punctuation(lower(s))))

In [141]:
def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)

In [142]:
# Exact Match 
data = [
    {
        "Question": "Who founded National University and when?",
        "Prediction": "Don Mariano Fortunato Jhocson on August 1, 1900",
        "GroundTruth": "National University was founded by Don Mariano Fortunato Jhocson on August 1, 1900."
    },
    {
        "Question": "What is the National University's vision?",
        "Prediction": "To be a dynamic private institution committed to nation-building.",
        "GroundTruth": "National University's vision is to be a dynamic private institution committed to nation-building, recognized internationally in education and research."
    }
]

# Compute EM score per QA pair
em_scores = [exact_match_score(item["Prediction"], item["GroundTruth"]) for item in mylist]

# Compute overall EM accuracy
em_accuracy = sum(em_scores) / len(em_scores)

print(f"Exact Match Accuracy: {em_accuracy:.2%}")


Exact Match Accuracy: 45.45%


In [71]:
# Rouge L

### Model 2: llama3.2:3b

In [None]:
ollama_llm = ChatOllama(
    model = "llama3.2:3b",
    temperature = 0.8,
    num_predict = 256,
)

qg = QueryGraph(lm = ollama_llm)