In [19]:
import json
import pandas as pd
from graphs import get_nx_graph, get_neighbors, get_2hop_neighbors, verbalize_neighbors_triples_from_graph, verbalize_neighbors_triples_from_triples

In [12]:
# Load sample data: the candidate triples, the concept and relation
# vocabularies, and the gold-standard prerequisite graph.
# Every file is opened through a context manager so its handle is closed
# as soon as reading finishes.

# Candidate graph: one JSON object per line, read via its 's', 'p', 'o' keys.
candidate_triples = []
STEP01_OUTPUT_FILE = 'output/test/step-01.jsonl'
with open(STEP01_OUTPUT_FILE, 'r') as step01_file:
    for line in step01_file:
        if not line.strip():
            continue  # skip blank lines; json.loads would raise on them
        t = json.loads(line)
        candidate_triples.append((t['s'], t['p'], t['o']))

# Concept vocabulary: '|'-separated, no header row; column 0 becomes the
# index (the id) and the remaining column is the concept string.
id_2_concept = pd.read_csv('data/refined_concepts.tsv', sep='|', header=None,
                           names=['id', 'concept'], index_col=0)['concept'].to_dict()
concept_2_id = {c: i for i, c in id_2_concept.items()}

# Relation vocabulary: each relation name is mapped to its position in the
# key order of the JSON object.
with open('data/relation_types.json') as relation_file:
    relation_def = json.load(relation_file)
relation_types = list(relation_def.keys())
relation_2_id = {name: idx for idx, name in enumerate(relation_types)}

# Gold standard annotated graph: one tab-separated (s, p, o) triple per line.
prerequisite_of_triples = []
with open('data/prerequisite-of_graph.tsv', 'r') as f:
    for line in f:
        if not line.strip():
            continue  # skip blank lines; the 3-way unpack would fail on them
        s, p, o = line.strip().split('\t')
        prerequisite_of_triples.append((s, p, o))

In [13]:
# Create the nx graph (presumably a networkx graph — confirm in graphs.py)
# from the gold-standard prerequisite triples. get_nx_graph is also given the
# concept->id and relation->id lookup tables built in the data-loading cell.
graph = get_nx_graph(prerequisite_of_triples, concept_2_id, relation_2_id)

In [14]:
# Print the neighbor list for three lookups, in this order:
#   1. 'named entity recognition' with no mode argument,
#   2. 'natural language processing intro' with mode='outgoing',
#   3. 'named entity recognition' with mode='ingoing'.
for query_concept, mode_kwargs in (
    ('named entity recognition', {}),
    ('natural language processing intro', {'mode': 'outgoing'}),
    ('named entity recognition', {'mode': 'ingoing'}),
):
    print(get_neighbors(graph, query_concept, concept_2_id, id_2_concept, **mode_kwargs))

['linguistics basics', 'natural language processing intro']
['spelling correction', 'word sense disambiguation', 'semantic role labeling', 'chomsky hierarchy', 'named entity recognition', 'shallow parsing', 'grammar checker', 'language identification', 'information extraction', 'dialog systems', 'event detection', 'cky parsing', 'propositional logic', 'automated essay scoring', 'kernels', 'nlp for the humanities', 'semantic parsing', 'shift-reduce parsing', 'knowledge representation', 'entailment', 'machine translation', 'word embedding', 'chinese nlp', 'speech processing', 'discourse analysis', 'parsing', 'regular expressions', 'Sequence to sequence', 'sentence boundary recognition', 'document representation', 'penn treebank', 'lexicography', 'text generation', 'bio text mining', 'recommendation system', 'morphology and lexicon', 'edit distance', 'context free grammars', 'probabilistic context free grammars', 'graph-based nlp', 'sentence simplification', 'relation extraction', 'course

In [16]:
# Bare last expression: the notebook renders the value returned by
# get_2hop_neighbors for 'named entity recognition' (no mode argument is
# passed in this call).
get_2hop_neighbors(graph, 'named entity recognition', concept_2_id, id_2_concept)

['spelling correction',
 'word sense disambiguation',
 'semantic role labeling',
 'chomsky hierarchy',
 'shallow parsing',
 'grammar checker',
 'language identification',
 'information extraction',
 'dialog systems',
 'event detection',
 'cky parsing',
 'propositional logic',
 'automated essay scoring',
 'kernels',
 'nlp for the humanities',
 'semantic parsing',
 'shift-reduce parsing',
 'knowledge representation',
 'entailment',
 'computational phonology',
 'chinese nlp',
 'speech processing',
 'discourse analysis',
 'prosody',
 'Sequence to sequence',
 'sentence simplification',
 'tokenization',
 'machine translation',
 'word embedding',
 'parsing',
 'regular expressions',
 'sentence boundary recognition',
 'document representation',
 'penn treebank',
 'lexicography',
 'text generation',
 'bio text mining',
 'recommendation system',
 'morphology and lexicon',
 'edit distance',
 'context free grammars',
 'probabilistic context free grammars',
 'graph-based nlp',
 'speech synthesis',
 

In [21]:
# Print the verbalized neighbor triples taken from the graph for two lookups:
# first 'natural language processing intro' with mode='outgoing', then
# 'named entity recognition' with no mode argument.
for query_concept, mode_kwargs in (
    ('natural language processing intro', {'mode': 'outgoing'}),
    ('named entity recognition', {}),
):
    print(verbalize_neighbors_triples_from_graph(
        graph, query_concept, concept_2_id, id_2_concept, **mode_kwargs))

(natural language processing intro,Is-a-Prerequisite-of,spelling correction)
(natural language processing intro,Is-a-Prerequisite-of,word sense disambiguation)
(natural language processing intro,Is-a-Prerequisite-of,semantic role labeling)
(natural language processing intro,Is-a-Prerequisite-of,chomsky hierarchy)
(natural language processing intro,Is-a-Prerequisite-of,named entity recognition)
(natural language processing intro,Is-a-Prerequisite-of,shallow parsing)
(natural language processing intro,Is-a-Prerequisite-of,grammar checker)
(natural language processing intro,Is-a-Prerequisite-of,language identification)
(natural language processing intro,Is-a-Prerequisite-of,information extraction)
(natural language processing intro,Is-a-Prerequisite-of,dialog systems)
(natural language processing intro,Is-a-Prerequisite-of,event detection)
(natural language processing intro,Is-a-Prerequisite-of,cky parsing)
(natural language processing intro,Is-a-Prerequisite-of,propositional logic)
(natu

In [20]:
# Verbalize the triples for one concept directly from the candidate triple
# list loaded from STEP01_OUTPUT_FILE; no graph object is passed to this helper.
concept_name = 'OCR post-correction'
# Bare last expression, so the notebook displays the returned value.
verbalize_neighbors_triples_from_triples(candidate_triples, concept_name)

'(OCR post-correction,Compare,spelling correction)\n(OCR post-correction,Part-of,sequence-to-sequence model)\n'