In [1]:
import json
import re
import string

import matplotlib.pyplot as plt
import networkx as nx
import ontospy

# Input locations, gathered in one place so the notebook can be pointed at
# another copy of the data without hunting through the cells.
THESAURUS_PATH = '/home/rbshaffer/Downloads/thesaurus.json'
ONTOLOGY_PATH = '/home/rbshaffer/Downloads/ontology.owl'

# JSON text is UTF-8 by specification; decode it explicitly instead of relying
# on the platform's locale default, which can mangle non-ASCII labels.
with open(THESAURUS_PATH, 'r', encoding='utf-8') as f:
    # The top-level JSON value is indexed with [0]; only that first element is kept.
    thesaurus = json.load(f)[0]

# Parse the OWL file with ontospy; later cells query this object for classes.
constitute = ontospy.Ontospy(ONTOLOGY_PATH)

In [2]:
# some functions to help manipulate the Constitute ontology

def build_constitute_edgelist(node, edgelist=None):
    """Collect ``[parent_label, child_label]`` pairs for every edge below ``node``.

    A label is the last ``/``-separated segment of ``str(node.uri)``. The
    hierarchy is walked depth-first through ``node.children()``, so pairs are
    emitted in pre-order.

    Args:
        node: Object exposing a ``uri`` attribute and a ``children()`` method
            (presumably an ontospy class -- confirm against the caller).
        edgelist: Optional list to append to. When omitted, a fresh list is
            created for this call, so repeated calls do not share state.

    Returns:
        The list of ``[parent_label, child_label]`` pairs. When ``edgelist``
        was supplied, it is that same list, extended in place.
    """
    # A literal ``[]`` default would be created once and reused by every call,
    # silently accumulating duplicate edges each time the cell is re-run.
    if edgelist is None:
        edgelist = []

    parent_lab = str(node.uri).split('/')[-1]

    for child in node.children():
        child_lab = str(child.uri).split('/')[-1]
        edgelist.append([parent_lab, child_lab])
        # The recursive call appends to the same list object.
        build_constitute_edgelist(child, edgelist)

    return edgelist

def build_constitute_nodedic(node, nodedic=None):
    """Map each node label in the hierarchy under ``node`` to that node's URI.

    A label is the last ``/``-separated segment of ``str(node.uri)``. When the
    same label occurs more than once, the first node visited (depth-first,
    parent before children) keeps the entry.

    Args:
        node: Object exposing a ``uri`` attribute and a ``children()`` method
            (presumably an ontospy class -- confirm against the caller).
        nodedic: Optional dict to fill. When omitted, a fresh dict is created
            for this call, so repeated calls do not share state.

    Returns:
        Dict of ``label -> node.uri``. When ``nodedic`` was supplied, it is
        that same dict, updated in place.
    """
    # A literal ``{}`` default would be shared by every call, leaking entries
    # from one traversal into the next.
    if nodedic is None:
        nodedic = {}

    parent_lab = str(node.uri).split('/')[-1]
    if parent_lab not in nodedic:
        nodedic[parent_lab] = node.uri

    for child in node.children():
        # The recursive call fills the same dict object.
        build_constitute_nodedic(child, nodedic)

    return nodedic

In [3]:
# some functions to manipulate the Venice thesaurus
def build_thesaurus_nodedic(node, nodedic=None):
    """Flatten a nested thesaurus dict into ``label -> {'id', 'synonyms'}``.

    Each node is a dict with a ``'nodes'`` list of child dicts and, when it has
    a truthy ``'label'``, ``'id'`` and ``'synonyms'`` entries. ``<IT+>`` and
    ``<IT->`` markers are removed from labels before they are used as keys.
    When two nodes share the same cleaned label, the first one visited
    (depth-first, parent before children) keeps the entry. Nodes whose label is
    missing, empty, or empty after cleaning are skipped, but their children are
    still visited.

    Args:
        node: Thesaurus node dict as described above.
        nodedic: Optional dict to fill. When omitted, a fresh dict is created
            for this call, so repeated calls do not share state.

    Returns:
        Dict keyed by cleaned label. When ``nodedic`` was supplied, it is that
        same dict, updated in place.

    Raises:
        KeyError: If a node lacks ``'nodes'``, or a labelled node lacks ``'id'``
            or ``'synonyms'``.
    """
    # A literal ``{}`` default would be shared by every call, leaking entries
    # from one traversal into the next.
    if nodedic is None:
        nodedic = {}

    parent_lab = node.get('label')

    if parent_lab:
        # Strip the markup *before* the duplicate check, so that '<IT+>foo'
        # cannot overwrite an earlier plain 'foo' entry.
        parent_lab = re.sub(r'<IT[-+]>', '', parent_lab)
        if parent_lab and parent_lab not in nodedic:
            nodedic[parent_lab] = {'id': node['id'], 'synonyms': node['synonyms']}

    for child in node['nodes']:
        # The recursive call fills the same dict object.
        build_thesaurus_nodedic(child, nodedic)

    return nodedic

In [20]:
# class to try out various matching approaches
class Match:
    """Try out matching approaches between a term and a collection of reference strings.

    Args:
        reference: Iterable of reference strings. Passing a dict iterates over
            its keys.
    """

    # Translation table that deletes every character in ``string.punctuation``.
    # A hand-built regex class ``'[' + string.punctuation + ']'`` reads the
    # ``\]`` inside it as an escaped bracket and therefore never strips
    # backslashes; a translation table has no such escaping pitfalls.
    _STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)

    def __init__(self, reference):
        self.ref = reference

    def str_match(self, term):
        """Return the normalised reference strings that equal the normalised ``term``.

        Both sides go through ``_preprocess`` before comparison, and the
        returned values are the normalised forms, not the original reference
        strings.

        Args:
            term: String to look up.

        Returns:
            List of matching normalised reference strings; empty when nothing matches.
        """
        term = self._preprocess(term)
        return [ref for ref in map(self._preprocess, self.ref) if ref == term]

    def _preprocess(self, term):
        """Normalise ``term`` by deleting ASCII punctuation and lower-casing it."""
        return term.translate(self._STRIP_PUNCTUATION).lower()
    

In [18]:
# Start the traversal from the first class ontospy returns for '/Topics'.
constitute_base_node = constitute.getClass('/Topics')[0]
constitute_nodes = build_constitute_nodedic(constitute_base_node)

# The top-level thesaurus element keeps its children under 'data'; move them to
# 'nodes' so the recursive builder can treat every level the same way. The
# membership check keeps the cell safe to re-run after the rename has happened.
if 'data' in thesaurus:
    thesaurus['nodes'] = thesaurus.pop('data')

thesaurus_nodes = build_thesaurus_nodedic(thesaurus)

In [21]:
# Look up every thesaurus label among the Constitute labels and pool the hits.
match = Match(constitute_nodes)

matched_terms = []
for node in thesaurus_nodes:
    hits = match.str_match(node)
    matched_terms.extend(hits)

print('String matches:')
print(matched_terms)



['opinion', 'executive', 'flag', 'elections', 'finance', 'religion', 'press', 'evidence', 'budget', 'motto', 'immunity', 'language', 'quorum']


In [30]:
import gensim

# Load a previously saved Word2Vec model from a path relative to the working directory.
word2vec_model_file = 'word2vec_model_200_no_stem.pydata'
word2vec_model = gensim.models.Word2Vec.load(word2vec_model_file)



In [29]:
# Similarity queries live on the model's KeyedVectors (`.wv`); calling
# `most_similar` directly on the Word2Vec model is deprecated and was removed
# in gensim 4.
word2vec_model.wv.most_similar('quorum')

[('still', 0.4892862141132355),
 ('adjourned', 0.48248758912086487),
 ('sitting', 0.47218817472457886),
 ('meeting', 0.4566667377948761),
 ('tie', 0.4562295079231262),
 ('threequarters', 0.4511076807975769),
 ('ascertains', 0.4487970769405365),
 ('signatories', 0.4469468891620636),
 ('assembled', 0.4467349052429199),
 ('fourfifths', 0.4423031508922577)]