In [1]:
import re
import json
import ontospy
import networkx as nx
import matplotlib.pyplot as plt

# Load the Venice thesaurus. JSON is read as UTF-8 explicitly so the result does
# not depend on the platform's default locale encoding.
with open('data/thesaurus.json', 'r', encoding='utf-8') as f:
    # the root element of the thesaurus is the first item of the top-level value
    thesaurus = json.load(f)[0]

# Load the Constitute ontology from the OWL file.
constitute = ontospy.Ontospy('data/ontology.owl')

In [2]:
# some functions to help manipulate the Constitute ontology

def build_constitute_edgelist(node, edgelist=None):
    """Collect [parent, child] edges for the hierarchy rooted at ``node``.

    Node names are the last '/'-separated segment of ``str(node.uri)``.
    Edges are appended depth-first: an edge to a child is recorded before
    the edges of that child's own subtree.

    Parameters
    ----------
    node : object exposing ``.uri`` and ``.children()``
        Root of the (sub)tree to walk.
    edgelist : list, optional
        Accumulator that is extended in place. When omitted, a new list is
        created for this call (a shared ``[]`` default would keep growing
        across calls and across notebook cell re-runs).

    Returns
    -------
    list
        The accumulator, holding one ``[parent_label, child_label]`` pair
        per parent/child link.
    """
    if edgelist is None:
        edgelist = []

    parent_lab = str(node.uri).split('/')[-1]

    for child in node.children():
        child_lab = str(child.uri).split('/')[-1]
        edgelist.append([parent_lab, child_lab])
        # recursion extends the same list in place
        build_constitute_edgelist(child, edgelist)

    return edgelist

def build_constitute_node_dic(node, node_dic=None):
    """Map node names to URIs for the hierarchy rooted at ``node``.

    The key is the last '/'-separated segment of ``str(node.uri)``; the value
    is ``node.uri`` itself. If two nodes share a name, the first one visited
    (depth-first, parent before children) is kept.

    Parameters
    ----------
    node : object exposing ``.uri`` and ``.children()``
        Root of the (sub)tree to walk.
    node_dic : dict, optional
        Accumulator that is updated in place. When omitted, a new dict is
        created for this call (a shared ``{}`` default would carry entries
        over between calls and notebook cell re-runs).

    Returns
    -------
    dict
        The accumulator, ``{name: uri}``.
    """
    if node_dic is None:
        node_dic = {}

    parent_lab = str(node.uri).split('/')[-1]
    if parent_lab not in node_dic:
        node_dic[parent_lab] = node.uri

    for child in node.children():
        # recursion updates the same dict in place
        build_constitute_node_dic(child, node_dic)

    return node_dic

def build_constitute_label_dic(node, label_dic=None):
    """Map plain-text labels to URIs for the hierarchy rooted at ``node``.

    The label of each node is looked up through the module-level
    ``constitute`` ontology: ``constitute.getEntity(uri=node.uri).bestLabel()``
    with surrounding whitespace stripped. If two nodes share a label, the
    first one visited (depth-first, parent before children) is kept.

    Parameters
    ----------
    node : object exposing ``.uri`` and ``.children()``
        Root of the (sub)tree to walk.
    label_dic : dict, optional
        Accumulator that is updated in place. When omitted, a new dict is
        created for this call (a shared ``{}`` default would carry entries
        over between calls and notebook cell re-runs).

    Returns
    -------
    dict
        The accumulator, ``{label: uri}``.
    """
    if label_dic is None:
        label_dic = {}

    node_label = constitute.getEntity(uri=node.uri).bestLabel().strip()
    if node_label not in label_dic:
        label_dic[node_label] = node.uri

    for child in node.children():
        # recursion updates the same dict in place
        build_constitute_label_dic(child, label_dic)

    return label_dic

In [3]:
# some functions to manipulate the Venice thesaurus
def build_thesaurus_node_dic(node, node_dic=None):
    """Flatten the thesaurus tree rooted at ``node`` into a label lookup.

    Each node that has a non-empty ``'label'`` contributes one entry. The
    ``<IT+>`` / ``<IT->`` markers are removed from the label before it is
    used as a key, and the duplicate check is done on that cleaned label, so
    the first node seen for a given cleaned label is the one that is kept.

    Parameters
    ----------
    node : dict
        Thesaurus node. ``'nodes'`` (list of child nodes) is always read;
        ``'id'`` and ``'synonyms'`` are read when the node has a label.
    node_dic : dict, optional
        Accumulator that is updated in place. When omitted, a new dict is
        created for this call (a shared ``{}`` default would carry entries
        over between calls and notebook cell re-runs).

    Returns
    -------
    dict
        The accumulator, ``{label: {'id': ..., 'synonyms': ...}}``.
    """
    if node_dic is None:
        node_dic = {}

    parent_lab = node.get('label')

    if parent_lab:
        # clean first, then test membership: checking the raw label would let
        # a tagged duplicate overwrite an entry that is already present
        parent_lab = re.sub(r'<IT[-+]>', '', parent_lab)
        if parent_lab not in node_dic:
            node_dic[parent_lab] = {'id': node['id'], 'synonyms': node['synonyms']}

    for child in node['nodes']:
        # recursion updates the same dict in place
        build_thesaurus_node_dic(child, node_dic)

    return node_dic

In [4]:
# class to try out various matching approaches

# base class
class MatchBase:
    """Shared state and text normalisation for the matcher classes.

    Parameters
    ----------
    reference : iterable of str
        Reference terms to match against; stored as ``self.ref_list``.
        Subclasses iterate over it (iterating a dict yields its keys).
    word2vec_dir : str, optional
        Path handed to ``gensim.models.Word2Vec.load``. When omitted or
        falsy, no model is loaded and ``self.word2vec_mod`` is ``None``.
    """

    def __init__(self, reference, word2vec_dir = None):
        self.ref_list = reference

        if word2vec_dir:
            # imported only when a model is actually requested, so plain
            # string matching does not require gensim to be installed
            import gensim
            self.word2vec_mod = gensim.models.Word2Vec.load(word2vec_dir)

        else:
            self.word2vec_mod = None

    def _preprocess(self, term):
        """Return ``term`` lower-cased with all ASCII punctuation removed."""
        import string

        # delete every character listed in string.punctuation
        term = term.translate(str.maketrans('', '', string.punctuation))
        term = term.lower()

        return term

# matching on the node variable name
class NodeMatch(MatchBase):
    """Match a term against reference node names (single-token identifiers)."""

    def str_match(self, word):
        """Return the preprocessed reference names equal to preprocessed ``word``.

        Both sides go through ``_preprocess`` (punctuation removed,
        lower-cased) before the equality test. The returned list holds the
        preprocessed reference strings, one per matching reference.
        """
        word = self._preprocess(word)
        preprocessed_ref = [self._preprocess(ref) for ref in self.ref_list]

        return [ref for ref in preprocessed_ref if ref == word]

    def word2vec_match(self, word, cutoff=0.5):
        """Return ``{preprocessed_ref: similarity}`` for scores above ``cutoff``.

        Similarity comes from ``self.word2vec_mod.similarity(ref, word)`` on
        the preprocessed strings. Pairs for which the model raises
        ``KeyError`` are skipped.
        NOTE(review): KeyError is assumed to be how the loaded gensim model
        signals an out-of-vocabulary word -- confirm for the installed version.

        If no model was loaded, a reminder is printed and an empty dict is
        returned, so callers that test the result for truthiness are unaffected.
        """
        if not self.word2vec_mod:
            print('Be sure to load a word2vec model before using this function!')
            return {}

        word = self._preprocess(word)

        sims = {}
        for ref in self.ref_list:
            ref = self._preprocess(ref)
            try:
                score = self.word2vec_mod.similarity(ref, word)
            except KeyError:
                # no vector for one of the two words; only this error is
                # caught so that Ctrl-C and real bugs are not swallowed
                continue

            # compare the number itself: a truthiness test would also throw
            # away a legitimate similarity of exactly 0.0
            if score > cutoff:
                sims[ref] = score

        return sims
        
# matching on plain-text label
class LabelMatch(MatchBase):
    """Match a term against plain-text (possibly multi-word) reference labels."""

    def _sum_vectors(self, text):
        """Sum the model vectors of the in-vocabulary tokens of ``text``.

        ``text`` is preprocessed and split on whitespace. Returns ``None``
        when none of its tokens is in the model, so callers can tell "no
        vector" apart from a real vector.
        """
        # NOTE(review): `tok in model` / `model[tok]` is the lookup style the
        # original code used on the loaded Word2Vec object -- confirm it is
        # supported by the installed gensim version.
        vectors = [self.word2vec_mod[tok] for tok in self._preprocess(text).split()
                   if tok in self.word2vec_mod]

        if not vectors:
            return None

        # summing word vectors is a crude phrase representation, fine for testing
        return sum(vectors)

    def word2vec_match(self, word, cutoff=0.9):
        """Return ``{reference_label: cosine_similarity}`` for scores above ``cutoff``.

        ``word`` and every reference label are turned into one vector each by
        summing the vectors of their in-vocabulary tokens. Labels with no
        in-vocabulary token are skipped. Keys of the result are the original
        (un-preprocessed) reference labels.

        Returns an empty dict when ``word`` has no in-vocabulary token. If no
        model was loaded, a reminder is printed and an empty dict is returned,
        so callers that test the result for truthiness are unaffected.
        """
        from scipy.spatial.distance import cosine

        if not self.word2vec_mod:
            print('Be sure to load a word2vec model before using this function!')
            return {}

        summed_word_vec = self._sum_vectors(word)
        if summed_word_vec is None:
            return {}

        sims = {}
        for ref in self.ref_list:
            summed_label = self._sum_vectors(ref)
            if summed_label is None:
                continue

            # scipy's cosine() is a distance; convert it to a similarity
            score = 1 - cosine(summed_word_vec, summed_label)
            if score > cutoff:
                sims[ref] = score

        return sims
    

In [5]:
# build the various helper dictionaries
# root class of the Constitute topic hierarchy, then the two lookups built from it
constitute_base_node = constitute.getClass('/Topics')[0]
constitute_nodes = build_constitute_node_dic(constitute_base_node)
constitute_labels = build_constitute_label_dic(constitute_base_node)

# The recursive walker reads children from 'nodes'. The top-level element stores
# them under 'data', so move them across once (nothing to do if already moved).
if 'data' in thesaurus:
    thesaurus['nodes'] = thesaurus.pop('data')

thesaurus_nodes = build_thesaurus_node_dic(thesaurus)

In [6]:
# run the two nodematch approaches
node_match = NodeMatch(constitute_nodes, 'data/word2vec_model_200_no_stem.pydata')

# exact-name hits go into one flat list; word2vec hits are kept per thesaurus term
str_matches = []
word2vec_node_matches = {}
for node in thesaurus_nodes:
    str_matches.extend(node_match.str_match(node))

    word2vec_match = node_match.word2vec_match(node)
    if not word2vec_match:
        # nothing scored above the cutoff for this term
        continue
    word2vec_node_matches[node] = word2vec_match

In [7]:
# Exact string matching on node variable names does find some hits, but it is very limiting
str_matches

['press',
 'quorum',
 'elections',
 'budget',
 'religion',
 'language',
 'motto',
 'immunity',
 'flag',
 'finance',
 'opinion',
 'executive',
 'evidence']

In [8]:
# word2vec on variable names works, but only for names that are in the model
# vocabulary (an opaque name such as "judcrts4" cannot be scored).
# A more broadly trained model would likely perform better.
# TODO: optimise the cutoff value?

print('Number of matches:', len(word2vec_node_matches))
print('--------')
for node, matches in word2vec_node_matches.items():
    print(node, matches)
    print('\n')

Number of matches: 38
--------
Quorum {'quorum': 1.0}


Press {'press': 0.99999999999999978}


Descent {'citizenship': 0.51500973580595866}


Religion {'language': 0.50396666816307301, 'religion': 1.0000000000000002}


Evidence {'evidence': 1.0000000000000002}


Elections {'elections': 1.0000000000000002}


Sources {'solid': 0.52187535536213903, 'artists': 0.50256415038372437, 'income': 0.58208191228157891}


Opinions {'religion': 0.5335791064221963}


Subsidiarity {'artists': 0.58169680661190515, 'leisure': 0.5057359604314462}


Gender {'religion': 0.53510538839770427}


Sessions {'session': 0.72443373048092163}


Equity {'artists': 0.55882544590066585, 'leisure': 0.51139734863407682}


Proportionality {'artists': 0.54255512659428107}


Minors {'artists': 0.63674318019057807, 'cruelty': 0.60806843932744281, 'juvenile': 0.50801678725797683, 'shelter': 0.57942234718381536}


Election {'elections': 0.64403200109000447}


Budget {'budget': 0.99999999999999967}


Universities {'artists': 0

In [9]:
# run the label match approach
label_match = LabelMatch(constitute_labels, 'data/word2vec_model_200_no_stem.pydata')

# keep only the thesaurus terms for which at least one label scored above the cutoff
word2vec_label_matches = {}
for node in thesaurus_nodes:
    word2vec_match = label_match.word2vec_match(node)
    if not word2vec_match:
        continue
    word2vec_label_matches[node] = word2vec_match

In [10]:
# Not bad, though there are still not many matches.
# TODO: again, optimise the cutoff value?

print('Number of matches:', len(word2vec_label_matches))
print('--------')
for label, matches in word2vec_label_matches.items():
    print(label, matches)
    print('\n')

Number of matches: 50
--------
Supreme court {'Supreme court selection': 0.90037642165321397, 'Supreme Court': 1.0000000637617144}


Right to strike {'Right to strike': 0.99999993040035029, 'Right to marry': 0.93834152957881189}


Right to examine witnesses {'Right to examine evidence/ witnesses': 0.95642475482647193}


Right to property {'Right to transfer property': 0.90977353764079061}


Religion {'Religion': 1.0000000508991531}


Campaign financing {'Campaign financing': 0.99999999655654837}


Right to culture {'Right to culture': 0.99999987631324583}


Political parties {'Prohibited political parties': 0.91217097770310263, 'Preferred political parties': 0.98159481404626348, 'Political Parties': 0.99999993103685325}


Right of petition {'Right of petition': 1.0000001293862277}


Right to life {'Right to life': 0.99999993500477657}


Administrative courts {'Administrative Courts': 0.99999996484490739}


Banning of political parties {'Preferred political parties': 0.94257587757580841

In [11]:
# Interestingly, the two word2vec approaches surface fairly different information;
# combined, they cover ~10% of the total.
# The overlap is the number of thesaurus terms matched by both approaches.
print('Number of overlaps:', len(word2vec_node_matches.keys() & word2vec_label_matches.keys()))

Number of overlaps: 5
