In [47]:
import os
import json
import pandas as pd
from langchain_core.prompts import ChatPromptTemplate
from graphs import get_nx_graph, verbalize_neighbors_triples_from_triples, verbalize_neighbors_triples_from_graph
from models import KnowledgeGraphLLM

In [27]:
# load extracted triples
# Each line of the JSONL file is parsed as a JSON object and must provide the
# keys 's', 'p' and 'o'; they are stored as an (s, p, o) tuple.
candidate_triples = []
# Use a context manager so the file handle is closed even if parsing fails.
with open('output/test/step-01.jsonl', 'r') as triples_file:
    for line in triples_file:
        t = json.loads(line)
        candidate_triples.append((t['s'], t['p'], t['o']))
print(f'Loaded {len(candidate_triples)} triples.')
print(candidate_triples[:5])

Loaded 4 triples.
[('OCR post-correction', 'Compare', 'spelling correction'), ('OCR post-correction', 'Part-of', 'sequence-to-sequence model'), ('Neural network models', 'Compare', 'Multi-task learning'), ('Neural network models', 'Evaluate-for', 'Shared layers')]


In [28]:
# loading human-selected concepts
# The file has no header row and uses '|' as the separator; the first column
# becomes the index (the concept id), the second is the concept name.
concepts_frame = pd.read_csv(
    'data/refined_concepts.tsv',
    sep='|',
    header=None,
    names=['id', 'concept'],
    index_col=0,
)
# id -> concept name (names are coerced to str)
id_2_concept = {
    concept_id: str(concept_name)
    for concept_id, concept_name in concepts_frame['concept'].items()
}
# concept name -> id (reverse lookup)
concept_2_id = {name: concept_id for concept_id, name in id_2_concept.items()}
print(id_2_concept[1])

spelling correction


In [29]:
# load text data
# `data` is looked up further down as data[<concept>]['abstracts'].
# The context manager closes the file handle once the JSON has been parsed.
with open('data/concept_abstracts_sample.json', 'r') as abstracts_file:
    data = json.load(abstracts_file)

In [30]:
# load relation types
# `relation_def` maps each relation type name to its definition; the prompt
# cell reads the 'description' entry of every definition.
# The context manager closes the file handle once the JSON has been parsed.
with open('data/relation_types.json') as relation_file:
    relation_def = json.load(relation_file)
relation_types = list(relation_def.keys())
# Ids are the positions of the relation types in `relation_types`.
relation_2_id = {rel_type: rel_id for rel_id, rel_type in enumerate(relation_types)}
id_2_relation = dict(enumerate(relation_types))

In [31]:
# build the prerequisite-of graph
# Every non-blank line of the TSV must hold exactly three tab-separated
# fields: subject, predicate, object.
prerequisite_of_triples = []
with open('data/prerequisite-of_graph.tsv', 'r') as f:
    for line in f:
        fields = line.strip()
        # Skip blank lines (e.g. a trailing empty line at the end of the file)
        # instead of failing on the three-way unpack below.
        if not fields:
            continue
        s, p, o = fields.split('\t')
        prerequisite_of_triples.append((s, p, o))

prerequisite_of_graph = get_nx_graph(prerequisite_of_triples, concept_2_id, relation_2_id)


In [33]:
# initialize the prompt template
# Read the user-message template inside a context manager so the file handle
# is closed right after reading.
with open("prompts/prompt_fusion.txt") as prompt_file:
    prompt_template_txt = prompt_file.read()

# Two-message chat prompt: a fixed system role plus the template text as the
# user message.
prompt_template = ChatPromptTemplate.from_messages([
    ("system", "You are a knowledge graph builder."),
    ("user", prompt_template_txt)
])

In [34]:
# verbalize the candidate triples
# The concept chosen here is reused by the cells below.
candidate_concept = "spelling correction"
candidate_subgraph = verbalize_neighbors_triples_from_triples(
    candidate_triples,
    candidate_concept,
)

candidate_header = f'Candidate subgraph of {candidate_concept}: \n'
print(candidate_header, candidate_subgraph)

Candidate subgraph of spelling correction: 
 (OCR post-correction,Compare,spelling correction)



In [44]:
# verbalize the neighbors of the candidate concept in the prerequisite-of graph
prerequisite_of_graph_subgraph = verbalize_neighbors_triples_from_graph(
    prerequisite_of_graph,
    candidate_concept,
    concept_2_id,
    id_2_concept,
    mode='bidirectional',
)
prerequisite_header = f'Prerequisite-of subgraph of {candidate_concept}: \n'
print(prerequisite_header, prerequisite_of_graph_subgraph)

Prerequisite-of subgraph of spelling correction: 
 (spelling correction,Is-a-Prerequisite-of,linguistics basics)
(spelling correction,Is-a-Prerequisite-of,chinese nlp)
(spelling correction,Is-a-Prerequisite-of,natural language processing intro)



In [45]:
# Join the concept's abstracts into one space-separated background string;
# a concept that is not present in `data` gets an empty background.
if candidate_concept in data:
    abstracts = ' '.join(data[candidate_concept]['abstracts'])
else:
    abstracts = ''


In [48]:
# Read the API key from the local, untracked config file and export it via the
# environment. The context manager closes the secrets file right after parsing.
with open('private_config.json') as config_file:
    os.environ["OPENAI_API_KEY"] = json.load(config_file)['OPENAI_API_KEY']
# init the model
model = KnowledgeGraphLLM(model_name="gpt-3.5-turbo",
                              max_tokens=400)

In [49]:
# One "<relation type>: <description>" line per entry of `relation_def`.
relation_definitions_txt = '\n'.join(
    f"{rel_type}: {rel_data['description']}"
    for rel_type, rel_data in relation_def.items()
)

# Values substituted into the prompt template's placeholders.
prompt_inputs = {
    "concept": candidate_concept,
    "graph1": candidate_subgraph,
    "graph2": prerequisite_of_graph_subgraph,
    "background": abstracts,
    "relation_definitions": relation_definitions_txt,
}
prompt = prompt_template.invoke(prompt_inputs)

# query the model
response = model.invoke(prompt)

In [53]:
# Parse the model response as JSON and print each item's 's', 'p', 'o'
# values as one comma-separated line.
fused_triples = json.loads(response)
for triple in fused_triples:
    print(', '.join(triple[key] for key in ('s', 'p', 'o')))

OCR post-correction, Compare, spelling correction
spelling correction, Is-a-Prerequisite-of, linguistics basics
spelling correction, Is-a-Prerequisite-of, chinese nlp
spelling correction, Is-a-Prerequisite-of, natural language processing intro
