In [35]:
import os
import json
from langchain_core.prompts import ChatPromptTemplate  
from models import KnowledgeGraphLLM

In [5]:
# load sample data
# `data` is indexed by concept name further down (each entry exposes an
# 'abstracts' list); `relation_def` maps each relation type to a record that
# carries a 'description' field.
# Files are opened through context managers so the handles are closed as soon
# as parsing finishes, and decoded explicitly as UTF-8 so the result does not
# depend on the platform's default locale encoding.
with open('data/test/concept_abstracts_sample.json', 'r', encoding='utf-8') as sample_file:
    data = json.load(sample_file)
with open('data/relation_types.json', encoding='utf-8') as relation_file:
    relation_def = json.load(relation_file)
# Names of the relation types, in the order they appear in the file.
relation_types = list(relation_def.keys())

In [43]:
# initialize the prompt template
# Read the user-message template text from disk. The context manager closes
# the file handle right after reading, and the explicit UTF-8 decoding keeps
# the prompt text identical across platforms.
with open("prompts/prompt_step_01.txt", encoding="utf-8") as prompt_file:
    prompt_template_txt = prompt_file.read()

# Two-message chat prompt: a fixed system message followed by the user
# message built from the template text loaded above.
prompt_template = ChatPromptTemplate.from_messages([
    ("system", "You are a knowledge graph builder."),
    ("user", prompt_template_txt)
])

print('Prompt template', prompt_template)

Prompt template input_variables=['abstracts', 'concepts', 'relation_definitions'] messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='You are a knowledge graph builder.')), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['abstracts', 'concepts', 'relation_definitions'], template='### Instruction:\nYou are a domain expert in computer science, natural language processing, and now you are building a knowledge graph in this domain. Given a context (### Content), and a query concept (### Concept), do the following:\n1. Extract the query concept and some in-domain concepts from the context, these concepts should be fine-grained: could be introduced by a lecture slide page, or a whole lecture, or possibly to have a Wikipedia page.\n2. Determine the relationships between the query concept and the extracted concepts from Step 1, in a triplet format: (<head concept>, <relation>, <tail concept>). The relationship should be functional, aiding

In [34]:
# Pick one concept from the sample data and merge its abstracts into a single
# space-separated string.
concept_name = "spelling correction"
print('Sample concept:', concept_name)
abstracts = ' '.join(data[concept_name]['abstracts'])
print('Used abstracts:', abstracts[:500])

# Render every relation type as one "<type>: <description>" line.
relation_definitions_text = '\n'.join(
    f"{rel_type}: {rel_data['description']}"
    for rel_type, rel_data in relation_def.items()
)

# instantiate the prompt template
# Only the first 10000 characters of the merged abstracts are passed in.
prompt_inputs = {
    "abstracts": abstracts[:10000],
    "concepts": [concept_name],
    "relation_definitions": relation_definitions_text,
}
prompt = prompt_template.invoke(prompt_inputs)

Sample concept: spelling correction
Used abstracts: We propose a novel approach to OCR post-correction that exploits repeated texts in large corpora both as a source of noisy target outputs for unsupervised training and as a source of evidence when decoding. A sequence-to-sequence model with attention is applied for single-input correction, and a new decoder with multi-input attention averaging is developed to search for consensus among multiple sequences. We design two ways of training the correction model without human annotation, either traini


In [44]:
# Show the fully rendered prompt (system + user messages) for inspection
# before it is sent to the model.
print('Prompt:', prompt)

Prompt: messages=[SystemMessage(content='You are a knowledge graph builder.'), HumanMessage(content='### Instruction:\nYou are a domain expert in computer science, natural language processing, and now you are building a knowledge graph in this domain. Given a context (### Content), and a query concept (### Concept), do the following:\n1. Extract the query concept and some in-domain concepts from the context, these concepts should be fine-grained: could be introduced by a lecture slide page, or a whole lecture, or possibly to have a Wikipedia page.\n2. Determine the relationships between the query concept and the extracted concepts from Step 1, in a triplet format: (<head concept>, <relation>, <tail concept>). The relationship should be functional, aiding learners in understanding the knowledge. The query concept can be the head concept or tail concept. We define 7 types of the relations:\n    Compare: Represents a relationship between two or more entities where a comparison is being ma

In [36]:
# Load the OpenAI API key from the local private config file and export it
# through the process environment (presumably where KnowledgeGraphLLM picks it
# up - confirm against models.py). The context manager closes the config file
# as soon as the key has been parsed instead of leaving the handle open.
with open('private_config.json', encoding='utf-8') as config_file:
    os.environ["OPENAI_API_KEY"] = json.load(config_file)['OPENAI_API_KEY']

# init the model
# max_tokens=400 is the value handed to the wrapper; a response cut off at
# that limit may not be complete JSON for the parsing step that follows.
model = KnowledgeGraphLLM(model_name="nlp_gpt-3.5-turbo",
                          max_tokens=400)

# query the model
response = model.invoke(prompt)

In [42]:
# Parse the model response as JSON and print each triple on its own line as
# "subject, predicate, object" (read from the 's', 'p' and 'o' keys).
print('Extracted triples:')
for triple in json.loads(response):
    print(', '.join(triple[field] for field in ('s', 'p', 'o')))

Extracted triples:
OCR post-correction, Compare, spelling correction
OCR post-correction, Evaluate-for, training the correction model without human annotation
OCR post-correction, Evaluate-for, bootstrapping from a uniform error model
OCR post-correction, Used-for, cutting the character and word error rates
counseling intervention, Is-a-Prerequisite-of, active collaboration between clients and counselors
counseling intervention, Compare, collaboration process during counseling conversations
counseling intervention, Compare, differences between high-quality and low-quality counseling
counseling intervention, Evaluate-for, examining participants’ turn-by-turn interaction
counseling intervention, Evaluate-for, deriving linguistic features to capture differences
counseling intervention, Used-for, building automatic classifiers to predict counseling quality
Minecraft Collaborative Building Task, Compare, two-player game
Minecraft Collaborative Building Task, Compare, predicting correct acti