In [1]:
import numpy as np
import pickle
import random

In [2]:
# Directory holding the pickled anatomy ontologies; every load below is relative to it.
PATH = "../data/ontologies/anatomy/"

# NOTE(security): pickle.load can execute arbitrary code while deserializing,
# so only run this cell on pickle files that come from a trusted source.
# Later cells index `human` and `mouse` with 'children_of' and 'edges', and
# `entities` with 'alignments', so each file is expected to hold a dict with those keys.
with open(f'{PATH}human.pickle', 'rb') as handle:
    human = pickle.load(handle)
    
with open(f'{PATH}mouse.pickle', 'rb') as handle:
    mouse = pickle.load(handle)

with open(f'{PATH}entities.pickle', 'rb') as handle:
    entities = pickle.load(handle)


In [3]:
def subontology(headnode:int, children_of:dict, nlayers:int=3):
    """Collect the subontology below `headnode`, one layer at a time.

    The result starts with the layers ``[headnode]`` and the direct children
    of `headnode`; `nlayers` further layers are then added by expanding the
    most recent layer.

    Args:
        headnode: ID of the node the subontology is rooted at.
        children_of: maps a node ID to the list of its direct children.
            Leaf nodes may be missing from the mapping.
        nlayers: number of expansion rounds after the direct children
            (default 3, the value that used to be hard-coded).

    Returns:
        A tuple ``(subtree, nodes)``. ``subtree`` is a list of layers (lists
        of node IDs) of length ``nlayers + 2``; ``nodes`` is the set of every
        node that was expanded.

    Raises:
        KeyError: if `headnode` has no entry in `children_of`.

    NOTE(review): a node is added to ``nodes`` only when it is expanded, so
    the nodes of the final layer are present in ``subtree`` but absent from
    ``nodes`` -- confirm this is intended.
    """
    # Copy the first child layer so callers that mutate the returned layers
    # cannot corrupt `children_of`.
    subtree = [[headnode], list(children_of[headnode])]

    # Nodes expanded so far; a real set keeps the membership tests O(1).
    expanded = {headnode}

    for _ in range(nlayers):
        new_children = []

        for node in subtree[-1]:
            # Edges can skip levels, so a node may show up in several layers;
            # it is expanded only the first time it is met.
            if node in expanded:
                continue
            expanded.add(node)

            # Leaf nodes have no entry in children_of.
            new_children.extend(children_of.get(node, []))

        # Drop children that were already expanded and de-duplicate the layer.
        subtree.append(list({child for child in new_children if child not in expanded}))

    return subtree, expanded

# Extract one subontology per species. 11 and 3030 are the head-node IDs passed
# for mouse and human (labelled "limb" / "Limb" by the inline comments).
m_subtree, m_subset = subontology(11, mouse['children_of']) # limb
h_subtree, h_subset = subontology(3030, human['children_of']) # Limb

# Combined node set used to filter alignments and edges in the following cells.
# NOTE(review): the union assumes mouse and human IDs never collide -- confirm.
subset = m_subset.union(h_subset)

In [4]:
def is_duplicated(sub):
    """Report nodes that occur more than once across the layers of `sub`.

    Args:
        sub: iterable of layers, each an iterable of hashable node IDs
            (the first value returned by `subontology`).

    Returns:
        True if at least one node was seen more than once, otherwise False.
        A two-line warning is printed for every repeated occurrence.
    """
    seen = set()
    found_duplicate = False

    for layer in sub:
        for node in layer:
            if node in seen:
                found_duplicate = True
                print(f"The node {node} is already in the tree.")
                print("before proceeding, make sure you got the subontology correct.")
            else:
                seen.add(node)

    return found_duplicate
                
# Sanity check: a warning is printed for every node that occurs more than once
# in the layered subtree of either species.
is_duplicated(h_subtree)
is_duplicated(m_subtree) 

### Get alignments for subontologies

In [5]:
def subontology_alignments(alignments, subset):
    """Return the alignments whose two endpoints both belong to `subset`.

    Args:
        alignments: iterable of pairs; only positions 0 and 1 are inspected.
        subset: container of node IDs supporting the `in` operator.

    Returns:
        A new list with the qualifying alignments in their original order.
    """
    return [pair for pair in alignments if pair[0] in subset and pair[1] in subset]

# Keep only the alignments whose two endpoints both fall inside the combined subset.
sub_alignments = subontology_alignments(entities['alignments'], subset)

### Get edges within each subontology

In [6]:
def set_dict(_dict:dict, key:int, value:int):
    """Append `value` to the list stored under `key`, creating the list if needed.

    `_dict` is modified in place; nothing is returned.
    """
    _dict.setdefault(key, []).append(value)

def subontology_edges(edges, subset):
    """Restrict `edges` to `subset` and index the kept edges both ways.

    Each edge is a pair whose first element is recorded as the parent of the
    second element.

    Args:
        edges: iterable of 2-element edges.
        subset: container of node IDs supporting the `in` operator.

    Returns:
        A tuple ``(sub_edges, parents, children)``: the kept edges as tuples
        in input order, a dict mapping a node to the list of its parents, and
        a dict mapping a node to the list of its children.
    """
    kept_edges = []
    parents_of = {}
    children_of = {}

    for (src, dst) in edges:
        # An edge survives only if both of its endpoints are in the subset.
        if not (src in subset and dst in subset):
            continue

        kept_edges.append((src, dst))
        set_dict(parents_of, dst, src)
        set_dict(children_of, src, dst)

    return kept_edges, parents_of, children_of

# Filter each species' edge list with the combined subset. For each species this
# yields the kept edges plus node -> parents and node -> children lookup dicts.
sub_mouse, m_parents, m_children = subontology_edges(mouse['edges'], subset)
sub_human, h_parents, h_children = subontology_edges(human['edges'], subset)

### Zero-index the subontologies

In [7]:
def zeroize(input_set):
    """Assign consecutive zero-based indices to the items of `input_set`.

    Items are numbered in ascending sorted order. Repeated items (possible
    when a list is passed instead of a set) are collapsed first, so every
    distinct item receives exactly one, unique index.

    Args:
        input_set: iterable of sortable, hashable items.

    Returns:
        A tuple ``(zeroized_dict, zeroized_set)``: the mapping from original
        item to its index, and the set of all indices ``{0, ..., n - 1}``.
    """
    # Deduplicate before numbering; otherwise a repeated key would be
    # renumbered and the next distinct item would reuse the same index.
    ordered_items = sorted(set(input_set))

    zeroized_dict = {item: index for index, item in enumerate(ordered_items)}

    return zeroized_dict, set(range(len(ordered_items)))

# zeroized_dict maps an original node ID to its zero-based ID over the combined subset;
# zero_to_orig is the inverse lookup (zero-based ID -> original ID).
zeroized_dict, zeroized_set = zeroize(subset)
zero_to_orig = dict((v,k) for k,v in zeroized_dict.items())

In [8]:
def zeroize_edges(edges, zd):
    """Translate both endpoints of every edge through `zd`.

    Args:
        edges: iterable of 2-element edges expressed in original indices.
        zd: dictionary that translates original indices to zeroized indices.

    Returns:
        A list of ``(zd[a], zd[b])`` tuples in the same order as `edges`.
    """
    return [(zd[first], zd[second]) for (first, second) in edges]

def zeroize_set(input_set:set, zd):
    """Return the set of zeroized indices for the members of `input_set`.

    `zd` translates original indices to zeroized indices.
    """
    return {zd[member] for member in input_set}

def zeroize_other(fam:dict, zd):
    """Translate a node -> relatives mapping (parents or children) through `zd`.

    Args:
        fam: dict mapping an original node ID to a list of related node IDs.
        zd: dictionary that translates original indices to zeroized indices.

    Returns:
        A new dict with keys and list entries both translated. A key whose
        list is empty does not appear in the result.
    """
    zfam = {}

    for original_key in fam:
        for relative in fam[original_key]:
            set_dict(zfam, zd[original_key], zd[relative])

    return zfam

# NOTE: this cell rebinds the edge lists, alignments, node sets and parent/child
# maps from original IDs to zero-based IDs under the SAME names, so it is not
# idempotent: running it a second time feeds already-translated IDs back into
# the zeroized_dict lookups. Re-run the notebook from the top instead.
sub_mouse = zeroize_edges(sub_mouse, zeroized_dict)
sub_human = zeroize_edges(sub_human, zeroized_dict)
sub_alignments = zeroize_edges(sub_alignments, zeroized_dict)

m_subset = zeroize_set(m_subset, zeroized_dict)
h_subset = zeroize_set(h_subset, zeroized_dict)

m_children = zeroize_other(m_children, zeroized_dict)
h_children = zeroize_other(h_children, zeroized_dict)

m_parents = zeroize_other(m_parents, zeroized_dict)
h_parents = zeroize_other(h_parents, zeroized_dict)

### Train/dev split

In [9]:
# train_split: fraction of the edges within each ontology that goes into the training set.
# 1.0 -> full set of edges in the ontology will be in the training dataset
train_split = 1.0

# alignment_split: fraction of the alignment edges that goes into the training set;
# the remainder is held out as the dev set.
alignment_split = 0.8

# An edge is kept whenever a fresh uniform draw exceeds (1 - train_split).
# Mouse edges are sampled first, then human edges.
tr_m_pos_edges = [edge for edge in sub_mouse if np.random.uniform() > (1-train_split)]
tr_h_pos_edges = [edge for edge in sub_human if np.random.uniform() > (1-train_split)]

# Cut point in the alignment list; an odd cut point is moved up by one so the
# training part always has an even length.
# NOTE(review): presumably this keeps an alignment and its reversed copy in the
# same split -- confirm how sub_alignments is ordered.
split_at = int(np.floor(len(sub_alignments)*alignment_split))
if split_at % 2 == 1:
    split_at += 1

tr_pos_alignments  = sub_alignments[:split_at]
dev_pos_alignments = sub_alignments[split_at:]

print("Number of training edges in the mouse ontology:", len(tr_m_pos_edges))
print("Number of training edges in the human ontology:", len(tr_h_pos_edges))

# All positive training pairs: edges of both ontologies plus the training alignments.
train_positives = tr_m_pos_edges + tr_h_pos_edges + tr_pos_alignments

Number of training edges in the mouse ontology: 227
Number of training edges in the human ontology: 44


### True negative alignments

In [10]:
def get_siblings(parents:dict, children:dict, node:int):
    """Return the siblings of `node`: the other children of each of its parents.

    Args:
        parents: maps a node to the list of its parents.
        children: maps a node to the list of its children.
        node: the node whose siblings are requested.

    Returns:
        A non-empty list of sibling nodes (a sibling shared through several
        parents appears once per parent), or the sentinel ``-1`` when `node`
        has no parents or no siblings. A message is printed in both ``-1``
        cases.
    """
    # There should be only one node without any parents: the root node.
    if node not in parents:
        print("Given node does not have any parents:", node)
        return -1

    siblings = []

    # Go through every parent of the node and gather that parent's other children.
    for p in parents[node]:
        # Filtering (instead of list.remove) drops every occurrence of `node`
        # and does not raise when `children[p]` happens not to list `node`.
        siblings.extend(child for child in children.get(p, []) if child != node)

    if not siblings:
        print("Given node does not have any siblings:", node)
        return -1

    return siblings

def generate_true_neg_alignments(alignments:list, alignment_split:float=0.5, ratio:float=1.0):
    """Generate negative alignments by swapping one endpoint for a sibling.

    A random alignment ``(a, b)`` is drawn and ``b`` is replaced by a randomly
    chosen sibling ``s`` of ``b``. If ``(a, s)`` is neither an existing
    alignment nor already generated, both ``(a, s)`` and ``(s, a)`` are added.

    Sibling lookups read the module-level ``h_parents`` / ``h_children`` and
    ``m_parents`` / ``m_children`` dicts; the human maps are consulted first.

    Args:
        alignments: list of positive alignment pairs.
        alignment_split: fraction of ``len(alignments)`` to generate.
        ratio: additional multiplier on the number of negatives.

    Returns:
        The list of negative pairs. Pairs are added two at a time, so the
        result can exceed ``int(len(alignments) * alignment_split * ratio)``
        by one. Generation stops early after 100 consecutive failed attempts.
    """
    true_negatives = []
    numFailures = 0
    num_samples = int(len(alignments) * alignment_split * ratio)

    while (len(true_negatives) < num_samples) and (numFailures < 100):
        # Select a random alignment within the list of all alignments
        rdm_align = random.choice(alignments)

        # Keep the first node and alter the second node of the chosen alignment
        const_node = rdm_align[0]
        change_node = rdm_align[1]

        # Look up the siblings of the chosen node in the human ontology ...
        if change_node in h_parents:
            siblings = get_siblings(h_parents, h_children, change_node)

        # ... or, failing that, in the mouse ontology
        elif change_node in m_parents:
            siblings = get_siblings(m_parents, m_children, change_node)

        # Only a node without parents (e.g. the root) should end up here.
        # Mark the lookup as failed explicitly; otherwise `siblings` would be
        # unbound on the first pass or silently reuse the previous pass's value.
        else:
            print("Node not found in either Ontology or does not have any parents")
            siblings = -1

        # Typically reached when the chosen node does not have any siblings.
        # Counting it as a failure guarantees the loop terminates even when no
        # alignment can ever yield a negative.
        if siblings == -1:
            print("Error thrown when retrieving siblings")
            numFailures += 1
            continue

        # Choose a random sibling to make the true negative
        negative_alignment = (const_node, random.choice(siblings))

        if negative_alignment in alignments:
            numFailures += 1
            print("Generated negative is an existing alignment:", negative_alignment, "OG random:", rdm_align, siblings)

        elif negative_alignment in true_negatives:
            numFailures += 1
            print("Generated negative already in true_negatives:", negative_alignment)

        # include this negative alignment (in both directions) in the result
        else:
            true_negatives.append(negative_alignment)
            true_negatives.append((negative_alignment[1], const_node))
            numFailures = 0

    return true_negatives

In [11]:
# Training negatives: request len(sub_alignments) * alignment_split sibling-swapped pairs.
# NOTE(review): sampled from all of sub_alignments, not only tr_pos_alignments -- confirm intended.
tr_neg_alignments = generate_true_neg_alignments(sub_alignments, alignment_split)

Generated negative already in true_negatives: (185, 14)
Given node does not have any siblings: 180
Error thrown when retrieving siblings


In [12]:
# Dev negatives: same generator, sized by the held-out share (1 - alignment_split).
# NOTE(review): drawn independently from the same alignment pool as the training
# negatives, so the two sets can share pairs -- confirm this is acceptable.
dev_true_negatives = generate_true_neg_alignments(sub_alignments, (1-alignment_split))

### Generate negatives within ontologies

In [13]:
# Number of random negative pairs generated per positive training pair.
negatives_ratio = 10

# Each negative is a pair of zero-based IDs drawn uniformly with replacement.
# NOTE: the pairs are not filtered, so a sampled pair may coincide with a true
# edge or alignment, or pair a node with itself.
train_negatives = np.random.choice(list(zero_to_orig.keys()), size=(int(negatives_ratio*len(train_positives)), 2))

In [14]:
# Display the sampled negative pairs (one row per pair).
train_negatives

array([[124,  93],
       [155,  35],
       [  6, 103],
       ...,
       [103, 130],
       [177,  22],
       [ 33, 120]])

### Transitive Closure

In [15]:
# Lookup from the first node of an alignment to the list of all nodes it is
# aligned with, built from every alignment in sub_alignments.
alignment_dict = {}

for a in sub_alignments:
    alignment_dict.setdefault(a[0], []).append(a[1])

In [17]:
def get_all_parents(parents_of:dict, node:int):
    """Return every ancestor of `node` (parents, grandparents, ...).

    The graph is walked iteratively with a visited set, so an ancestor that
    is reachable along several paths is processed only once and a cyclic
    graph cannot cause unbounded recursion.

    Args:
        parents_of: maps a node to the list of its direct parents. Nodes
            without parents may be missing from the mapping.
        node: the node whose ancestors are requested.

    Returns:
        A list of the distinct ancestors in unspecified order; empty when
        `node` has no parents. If the graph has a cycle through `node`, the
        node itself is included.
    """
    ancestors = set()
    pending = list(parents_of.get(node, []))

    while pending:
        current = pending.pop()
        if current in ancestors:
            continue
        ancestors.add(current)
        pending.extend(parents_of.get(current, []))

    return list(ancestors)

def get_all_children(children_of:dict, node:int):
    """Return every descendant of `node` (children, grandchildren, ...).

    The graph is walked iteratively with a visited set, so a descendant that
    is reachable along several paths is processed only once and a cyclic
    graph cannot cause unbounded recursion.

    Args:
        children_of: maps a node to the list of its direct children. Leaf
            nodes may be missing from the mapping.
        node: the node whose descendants are requested.

    Returns:
        A list of the distinct descendants in unspecified order; empty when
        `node` has no children. If the graph has a cycle through `node`, the
        node itself is included.
    """
    descendants = set()
    pending = list(children_of.get(node, []))

    while pending:
        current = pending.pop()
        if current in descendants:
            continue
        descendants.add(current)
        pending.extend(children_of.get(current, []))

    return list(descendants)

In [29]:
def nodes_in_tr_align(tr_alignments):
    """Return the set of all nodes that occur in any of the given alignment edges."""
    return {node for edge in tr_alignments for node in edge}

# Every node that is an endpoint of at least one training alignment.
tr_align_set = nodes_in_tr_align(tr_pos_alignments)

In [31]:
def transitive_closure(entity_set:set, h_parents, m_parents, tr_alignments, USE_OWL_THING:bool=False):
    """Build (ancestor, node) edges within and across the two ontologies.

    For each node, its ancestors in its own ontology are collected. Whenever
    one of those ancestors is in `tr_alignments`, the nodes it is aligned with
    (looked up in the module-level ``alignment_dict``) and their ancestors in
    the other ontology are added as well.

    Ontology membership is decided with the module-level ``m_subset`` and
    ``h_subset`` sets; mouse membership is checked first.

    Args:
        entity_set: the nodes to process.
        h_parents: node -> direct parents within the human ontology.
        m_parents: node -> direct parents within the mouse ontology.
        tr_alignments: container of the nodes that take part in a training
            alignment (only used for membership tests).
        USE_OWL_THING: accepted but not used by this function.

    Returns:
        A tuple ``(transitive_edges, mouse_tc, human_tc)``: all generated
        edges, and the same edges split by the ontology of the descendant
        node. Edges are not de-duplicated.

    NOTE(review): ``alignment_dict`` is built from every alignment, not only
    the training ones, so partners from held-out alignments can be bridged as
    long as the ancestor also occurs in `tr_alignments` -- confirm intended.
    """
    transitive_edges = []
    mouse_tc = []
    human_tc = []

    for _node1 in entity_set:

        in_mouse = _node1 in m_subset

        if in_mouse:
            parents = get_all_parents(m_parents, _node1)
            other_parents = h_parents

        elif _node1 in h_subset:
            parents = get_all_parents(h_parents, _node1)
            other_parents = m_parents

        else:
            # The node belongs to neither ontology. Skip it explicitly;
            # otherwise `parents` would be unbound on the first pass or would
            # silently reuse the ancestors of the previously processed node.
            continue

        if not parents:
            continue

        align_parents = []
        # go through all ancestors of the current node (_node1)
        for p in parents:

            # if an ancestor takes part in a training alignment, cross over
            # to the other ontology through that alignment
            if p in tr_alignments:
                # A node can have several alignments, so every aligned node
                # and all of its ancestors are added. .get() covers nodes
                # that only ever occur as the second element of an alignment
                # and therefore have no key in alignment_dict.
                for aligned_node in alignment_dict.get(p, []):
                    align_parents.append(aligned_node)
                    align_parents.extend(get_all_parents(other_parents, aligned_node))

        for _node2 in parents + align_parents:
            edge = (_node2, _node1)
            transitive_edges.append(edge)

            if in_mouse:
                mouse_tc.append(edge)
            else:
                human_tc.append(edge)

    return transitive_edges, mouse_tc, human_tc

# Transitive closure over all zero-based IDs; tr_align_set decides which
# ancestors may be bridged into the other ontology through an alignment.
tc_pos_edges, mouse_tc, human_tc = transitive_closure(zeroized_set, h_parents, m_parents, tr_align_set)

In [57]:
# Every entity of an ontology gets the same value 1/(n-1), where n is the number
# of entities in that ontology.
# NOTE(review): n copies of 1/(n-1) sum to n/(n-1), not 1, and n == 1 divides
# by zero -- confirm that n-1 is the intended denominator.
mouse_probs = [1/(len(m_subset)-1) for i in range(len(m_subset)) ]
human_probs = [1/(len(h_subset)-1) for i in range(len(h_subset)) ]

# Mouse values first, then human values.
# NOTE(review): zero-based IDs were assigned over the sorted union of both
# ontologies, so position i in this list need not belong to entity ID i --
# confirm what the consumer of unary.tsv expects.
unary_probs = mouse_probs + human_probs

In [64]:
# Number of random negative pairs generated per positive transitive-closure edge.
tc_negatives_ratio = 2

# Use the ratio defined for the transitive closure. The previous code used
# `negatives_ratio` (the training-set ratio) here and left `tc_negatives_ratio` unused.
# Pairs are drawn uniformly with replacement and are not filtered against the positives.
tc_neg_edges = np.random.choice(list(zero_to_orig.keys()), size=(int(tc_negatives_ratio*len(tc_pos_edges)), 2))

# One tab-separated integer pair per line; the file name carries the alignment split.
np.savetxt(f'../data/ontologies/anatomy/subset/tr_pos_tc_{alignment_split}.tsv', tc_pos_edges, delimiter='\t', fmt='%1.1d')
np.savetxt(f'../data/ontologies/anatomy/subset/tr_neg_tc_{alignment_split}.tsv', tc_neg_edges, delimiter='\t', fmt='%1.1d')

In [71]:
# Write the training positives/negatives and the dev alignment positives/negatives
# as tab-separated integer pairs; file names carry the alignment split.
# np.savetxt does not create missing directories, so the subset/ folder must exist.
np.savetxt(f'../data/ontologies/anatomy/subset/tr_pos_{alignment_split}.tsv', train_positives, delimiter='\t', fmt='%1.1d')
np.savetxt(f'../data/ontologies/anatomy/subset/tr_neg_{alignment_split}.tsv', train_negatives, delimiter='\t', fmt='%1.1d')
np.savetxt(f'../data/ontologies/anatomy/subset/dev_align_pos_{alignment_split}.tsv', dev_pos_alignments, delimiter='\t', fmt='%1.1d')
np.savetxt(f'../data/ontologies/anatomy/subset/dev_align_neg_{alignment_split}.tsv', dev_true_negatives, delimiter='\t', fmt='%1.1d')

In [74]:
# Write the training alignment positives and their sibling-swapped negatives
# as tab-separated integer pairs.
np.savetxt(f'../data/ontologies/anatomy/subset/tr_align_pos_{alignment_split}.tsv', tr_pos_alignments, delimiter='\t', fmt='%1.1d')
np.savetxt(f'../data/ontologies/anatomy/subset/tr_align_neg_{alignment_split}.tsv', tr_neg_alignments, delimiter='\t', fmt='%1.1d')

In [75]:
# Per-species bundles: zero-indexed edges, transitive-closure edges, the
# parent/child lookups and the entity set of that species.
human_pickle = {
    'edges': sub_human,
    'tc': human_tc,
    'parents_of': h_parents,
    'children_of': h_children,
    'human_entities': h_subset,
}

mouse_pickle = {
    'edges': sub_mouse,
    'tc': mouse_tc,
    'parents_of': m_parents,
    'children_of': m_children,
    'mouse_entities': m_subset,
}

# Cross-ontology bundle: alignments, the combined transitive closure and the
# ID translation tables in both directions.
entity_pickle = {
    'alignments': sub_alignments,
    'all_tc': tc_pos_edges,
    'zero_to_orig': zero_to_orig,
    'orig_to_zero': zeroized_dict,
    'zero_set': zeroized_set,
    'set': subset,
    'align_dict': alignment_dict,
}

# Write the three bundles in the order human, mouse, entities.
for out_path, payload in (
    ('../data/ontologies/anatomy/subset/human_subset.pickle', human_pickle),
    ('../data/ontologies/anatomy/subset/mouse_subset.pickle', mouse_pickle),
    ('../data/ontologies/anatomy/subset/entities_subset.pickle', entity_pickle),
):
    with open(out_path, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [82]:
# Write the per-entity unary values, one value per line: human only, mouse only,
# and the concatenated list (mouse values followed by human values).
np.savetxt('../data/ontologies/anatomy/subset/unary/human_unary.tsv', human_probs, delimiter='\t')
np.savetxt('../data/ontologies/anatomy/subset/unary/mouse_unary.tsv', mouse_probs, delimiter='\t')
np.savetxt('../data/ontologies/anatomy/subset/unary/unary.tsv', unary_probs, delimiter='\t')