This notebook contains a customization of Fengbo's code to obtain lexical features for NCIt. The code is based on: NLS_FCA_LogicalDef_Latest.py

In [3]:
import string
# ASCII letters and digits; checkNotSymbol() accepts a token if it contains at least one of these characters
allowed = string.ascii_letters + string.digits
from tqdm import tqdm
import pandas as pd
from Graph import *
from NonLatticeSubgraphs import *
from tqdm import tqdm
tqdm.pandas()

In [4]:
import csv
import spacy
from spacy.tokenizer import Tokenizer #split by space #default for Tokenizer
# load the large English spaCy pipeline
nlp = spacy.load("en_core_web_lg")
# swap in a bare Tokenizer built only from the vocab (no prefix/suffix/infix rules),
# so labels are split on whitespace only
nlp.tokenizer = Tokenizer(nlp.vocab)

In [5]:
import copy

# Loading graph

In [6]:
# Fengbo's method
def checkNotSymbol(inputToken):
    """Return True when ``inputToken`` contains at least one character from ``allowed``.

    ``allowed`` is the module-level string of accepted characters; a token made
    up only of other characters (or an empty token) yields False.
    """
    for character in inputToken:
        if character in allowed:
            return True
    return False

In [7]:
# input files (the names suggest NCIt release 23.05e; contents are presumed from the file names - TODO confirm)
input_labelsFile_ncit = 'NCIt/NCIt_labels_23_05e.txt'  # concept labels, passed to load_networkx_graph
input_hierarchyFile_ncit = 'NCIt/NCIt_hierarchy_23_05e.txt'  # hierarchy, passed to load_networkx_graph
input_nlsFile_ncit = 'NCIt/NCIT_nonlattice_bottom_up_closure_23-05e.txt'  # non-lattice subgraphs, passed to loadNLSsFromHierarchy

In [8]:
# build the NCIt graph from the labels and hierarchy files
# (load_networkx_graph is not defined in this notebook; presumably it comes from one of the star-imports)
G_ncit = load_networkx_graph(input_labelsFile_ncit, input_hierarchyFile_ncit)

# load the non-lattice subgraphs (NLSs) of the whole graph; the meaning of the third argument (None)
# is defined by loadNLSsFromHierarchy - TODO confirm
nlss_ncit = loadNLSsFromHierarchy(G_ncit, input_nlsFile_ncit, None)

Number of nodes:  180065
Number of edges:  207501
Num NLSs:  14529


In [9]:
# Cache the result of nx.descendants() for every node once, because calling it
# repeatedly inside the later loops is slow.
# key = concept id, value = set of its ancestors
con_ancs = {
    node_id: nx.descendants(G_ncit, node_id)
    for node_id in tqdm(G_ncit.nodes())
}

100%|███████████████████████████████████████████████████████████████████████| 180065/180065 [00:02<00:00, 79052.05it/s]


# Loading non-lattice subgraph (NLS) and the non-related concept-pairs within the NLS

In [8]:
# root concept of the drug subhierarchy
ncit_drug_id = 'C1908' # CORRECT DRUG CODE
#ncit_drug_id = 'C3262'
# restrict the graph to the subhierarchy under the drug root (obtainSubhierarchy is a project helper)
ncit_drug_graph = obtainSubhierarchy(G_ncit, ncit_drug_id)
# NLSs from the same NLS file, loaded against the drug subgraph instead of the full graph
nlss_ncit_drug= loadNLSsFromHierarchy(ncit_drug_graph, input_nlsFile_ncit, None)

Number of nodes in subhierarchy:  26482
Number of edges in subhierarchy:  32217
Num NLSs:  1320


In [9]:
# union of the non-relations reported by every NLS of the drug subhierarchy
ncit_drug_nls_non_rels = {
    non_relation
    for drug_nls in tqdm(nlss_ncit_drug)
    for non_relation in drug_nls.getAllNonRelations()
}

100%|███████████████████████████████████████| 1320/1320 [01:56<00:00, 11.32it/s]


In [10]:
# number of distinct non-relations gathered from the drug-subhierarchy NLSs
len(ncit_drug_nls_non_rels)

1896730

In [10]:
# union of the non-relations reported by every NLS of the full NCIt graph
ncit_all_nls_non_rels = {
    non_relation
    for ncit_nls in tqdm(nlss_ncit)
    for non_relation in ncit_nls.getAllNonRelations()
}

100%|████████████████████████████████████████████████████████████████████████████| 14529/14529 [09:40<00:00, 25.01it/s]


In [12]:
# number of distinct non-relations gathered from all NCIt NLSs
len(ncit_all_nls_non_rels)

7212970

# Obtaining lexical features

In [13]:
# Bag-of-words per concept: lower-cased label, with ',', ':' and '.' detached from the
# preceding word so that they become tokens of their own, then split on whitespace.
for con in G_ncit.nodes():
    node_attrs = G_ncit.nodes[con]
    if 'label' in node_attrs:  # 'thing' does not have a label
        spaced_label = node_attrs['label'].lower()
        for punctuation in (",", ":", "."):
            spaced_label = spaced_label.replace(punctuation, " " + punctuation)
        node_attrs['bow'] = set(spaced_label.split())

In [65]:
# NOTE(review): leftover inspection cell. `usedToken` is only assigned by the linguistic-feature
# loop further down, so running the notebook top to bottom on a fresh kernel raises NameError here.
usedToken

{of, nerve, of, abdomen}

In [16]:
# Lexical features of every labelled concept:
#   * noun chunks of more than one token,
#   * "<head> <root>" pairs for noun-chunk roots whose dependency is pobj or dobj,
#   * every remaining token that was not consumed by such a pair.
# Tokens and chunks without any letter or digit are ignored (checkNotSymbol).
for con in tqdm(G_ncit.nodes()):
    node_attrs = G_ncit.nodes[con]
    if 'label' not in node_attrs:  # 'thing' does not have a label
        continue

    doc = nlp(node_attrs['label'])
    LinguisticFeature = set()
    usedToken = set()  # tokens already represented by a "<head> <root>" pair

    for chunk in doc.noun_chunks:
        # Fengbo's code: multi-token noun chunks become features as a whole
        if len(chunk) > 1 and checkNotSymbol(chunk.text):
            LinguisticFeature.add(chunk.text)

        chunk_root = chunk.root
        if chunk_root.dep_ in ("pobj", "dobj") and checkNotSymbol(chunk_root.text):
            chunk_head = chunk_root.head
            LinguisticFeature.add(chunk_head.text + " " + chunk_root.text)
            usedToken.update((chunk_root, chunk_head))

    # single tokens not covered by a dependency pair
    LinguisticFeature.update(
        token.text
        for token in doc
        if token not in usedToken and checkNotSymbol(token.text)
    )

    G_ncit.nodes[con]['linguistic_features'] = LinguisticFeature

100%|█████████████████████████████████████████████████████████████████████████| 180065/180065 [11:07<00:00, 269.78it/s]


In [17]:
# Enriched features of a concept = its own linguistic features plus those of all its
# ancestors (taken from the pre-computed con_ancs cache instead of nx.descendants()).
for con in tqdm(G_ncit.nodes()):
    # start from an independent copy so the concept's own feature set is left untouched
    enriched = set(G_ncit.nodes[con].get('linguistic_features', ()))

    for anc in con_ancs[con]:
        enriched.update(G_ncit.nodes[anc].get('linguistic_features', ()))

    G_ncit.nodes[con]['enriched_linguistic_features'] = enriched

100%|███████████████████████████████████████████████████████████████████████| 180065/180065 [00:03<00:00, 59656.91it/s]


In [25]:
# writing enriched linguistic features to a file
# One line per concept: <concept id> TAB <features joined by '-##-'>.
enriched_linguistic_features_output = 'Outputs/enriched_linguistic_features.txt'
# explicit UTF-8 so that non-ASCII label text is written identically on every platform
with open(enriched_linguistic_features_output, 'w', encoding='utf-8') as txtfile:
    for con in tqdm(G_ncit.nodes()):
        if 'enriched_linguistic_features' in G_ncit.nodes[con]:
            # sorted: set iteration order differs between runs, which made the file non-reproducible
            features = sorted(G_ncit.nodes[con]['enriched_linguistic_features'])
            txtfile.write(con + '\t' + '-##-'.join(features) + '\n')

100%|██████████| 180064/180064 [00:00<00:00, 267102.55it/s]


# Loading concept statuses (primitive/defined)

In [18]:
# Store each concept's status from the status file on its graph node.
# Every line is "<concept id>\t<status>"; ids that are not in the graph are reported and skipped.
input_conceptStatus_ncit = 'NCIt/NCIt_concept_status_23_05e.txt'
with open(input_conceptStatus_ncit, 'r') as status_file:
    for raw_line in status_file:
        con_id, status = raw_line.strip().split('\t')
        if con_id in G_ncit.nodes():
            G_ncit.nodes[con_id]['status'] = status
        else:
            print('concept not in graph: ', con_id)

C43634 and C63553 are obsolete. TODO: check why they appear in the concept status file but not in the labels file.

# Obtaining logical definitions (attribute relations)

In [21]:
# loading attribute relations from file
# Every line is "<concept id>\t<property id>|<value id>"; the result is
# G_ncit.nodes[concept]['attribute_rels'] = {property id: {value ids}}.
input_attributeFile_ncit = 'NCIt/NCIt_attribute_rels_23_05e.txt'
with open(input_attributeFile_ncit, 'r') as txtfile:
    for line in txtfile:
        con_id, prop_value = line.strip().split('\t')
        # same guard as the association loader: concepts missing from the graph
        # (e.g. obsolete ones) are skipped instead of raising KeyError
        if con_id not in G_ncit.nodes():
            continue
        prop, value = prop_value.split('|')
        attribute_rels = G_ncit.nodes[con_id].setdefault('attribute_rels', {})
        attribute_rels.setdefault(prop, set()).add(value)

In [22]:
# spot check: attribute relations stored for concept C9118 (property id -> set of value ids)
G_ncit.nodes['C9118']['attribute_rels']

{'C43752': {'C12917', 'C12922'},
 'C43750': {'C45714'},
 'C43753': {'C12964'},
 'C43746': {'C36105', 'C36122', 'C53665', 'C53667'},
 'C43756': {'C36115'},
 'C43749': {'C12578'},
 'C43760': {'C12964'}}

In [16]:
# property ids of all attribute relations carried by any concept that occurs in a
# drug NLS candidate pair
drug_con_attr_relationships = set()
for con1, con2 in tqdm(ncit_drug_nls_non_rels):
    for concept in (con1, con2):
        concept_rels = G_ncit.nodes[concept].get('attribute_rels', {})
        drug_con_attr_relationships.update(concept_rels.keys())

100%|█████████████████████████████| 1896730/1896730 [00:04<00:00, 431999.95it/s]


In [19]:
# print the label of every attribute (property) concept found in drug_con_attr_relationships
for attr in drug_con_attr_relationships:
    print(G_ncit.nodes[attr]['label'])

Chemical_Or_Drug_Is_Metabolized_By_Enzyme
Chemical_Or_Drug_Affects_Abnormal_Cell
Chemical_Or_Drug_Has_Physiologic_Effect
Chemical_Or_Drug_Has_Mechanism_Of_Action
Chemical_Or_Drug_Affects_Cell_Type_Or_Tissue
Chemical_Or_Drug_Affects_Gene_Product
Chemical_Or_Drug_Plays_Role_In_Biological_Process


# Obtaining Associations

In [23]:
# Attach NCIt associations to the graph nodes.
# Every line is "<concept id>\t<property id>|<value id>"; the result is
# G_ncit.nodes[concept]['associations'] = {property id: {value ids}}.
# Lines whose concept id is not in the graph are ignored.
input_associationFile_ncit = 'NCIt/NCIt_association_rels_23_05e.txt'
with open(input_associationFile_ncit, 'r') as association_file:
    for raw_line in association_file:
        con_id, prop_value = raw_line.strip().split('\t')
        if con_id in G_ncit.nodes():
            prop, value = prop_value.split('|')
            node_associations = G_ncit.nodes[con_id].setdefault('associations', {})
            node_associations.setdefault(prop, set()).add(value)

In [24]:
# spot check: associations stored for concept C142912 (property id -> set of value ids)
G_ncit.nodes['C142912']['associations']

{'C54538': {'C116977', 'C116978', 'C176424', 'C63923'},
 'C53690': {'C30006'},
 'C49123': {'C71717'}}

# Missing IS-A identification

In [82]:
# check logical definition subsumption - pairwise comparison

def logical_subsumption_0(pot_child, pot_parent):
    """Earlier, stricter variant of the logical-definition subsumption test.

    Returns True only when all of the following hold:

    * every ancestor of ``pot_parent`` is also an ancestor of ``pot_child``;
    * each attribute of the parent is matched by at least one child attribute
      (the same property, or a child property that has the parent property
      among its ancestors);
    * for *every* matching child attribute, each parent value is equal to, or
      an ancestor of, some value of that child attribute.

    Reads the module-level ``G_ncit`` graph ('attribute_rels' node attribute)
    and the ``con_ancs`` ancestor cache.
    """
    # the child's ancestors must be a superset of the parent's ancestors
    if not con_ancs[pot_parent].issubset(con_ancs[pot_child]):
        return False

    for prop_par, value_set_par in G_ncit.nodes[pot_parent]['attribute_rels'].items():
        property_matched = False
        for prop_chi, value_set_chi in G_ncit.nodes[pot_child]['attribute_rels'].items():
            if prop_par != prop_chi and prop_par not in con_ancs[prop_chi]:
                continue  # unrelated attribute type

            property_matched = True
            for val_par in value_set_par:
                value_covered = any(
                    val_par == val_chi or val_par in con_ancs[val_chi]
                    for val_chi in value_set_chi
                )
                if not value_covered:
                    # this child attribute does not cover one of the parent's values
                    return False

        if not property_matched:
            return False

    return True



def general_exists(prop_par, val_par, pot_child):
    """Tell whether ``pot_child`` has an attribute that is at least as specific as (prop_par, val_par).

    A child attribute qualifies when its property equals ``prop_par`` or has
    ``prop_par`` among its ancestors, and one of its values equals ``val_par``
    or has ``val_par`` among its ancestors. Ancestors come from ``con_ancs``;
    the child's attributes from ``G_ncit.nodes[pot_child]['attribute_rels']``.
    """
    child_attributes = G_ncit.nodes[pot_child]['attribute_rels']
    return any(
        (val_par == val_chi) or (val_par in con_ancs[val_chi])
        for prop_chi, value_set_chi in child_attributes.items()
        if (prop_par == prop_chi) or (prop_par in con_ancs[prop_chi])
        for val_chi in value_set_chi
    )
    


def logical_subsumption(pot_child, pot_parent):
    """Check whether ``pot_child``'s logical definition is subsumed by ``pot_parent``'s.

    True when the parent's ancestors are a subset of the child's ancestors and
    every (property, value) pair of the parent has a counterpart in the child
    according to ``general_exists``.
    """
    parent_ancestors = con_ancs[pot_parent]
    # the child's ancestors must be a superset of the parent's ancestors
    if not parent_ancestors.issubset(con_ancs[pot_child]):
        return False

    parent_attributes = G_ncit.nodes[pot_parent]['attribute_rels']
    return all(
        general_exists(prop_par, val_par, pot_child)
        for prop_par, value_set_par in parent_attributes.items()
        for val_par in value_set_par
    )

                    
#logical_subsumption('C2372', 'C179618')
# spot check of the subsumption test on one (child, parent) pair of concept ids
logical_subsumption('C7446', 'C7045')

True

In [86]:
# obtain the attribute relation between two given concepts
def get_attribute_relation_between_cons(con1, con2):
    """Return the id of an attribute that relates the two concepts, or None.

    An attribute of ``con1`` counts when its value set contains ``con2`` or any
    hierarchy descendant of ``con2``; if ``con1`` has none, the same test is
    applied in the other direction (attributes of ``con2`` pointing at
    ``con1`` or one of its descendants). The first matching attribute id is
    returned.

    A concept without an 'attribute_rels' node attribute is treated as having
    no attributes (consistent with ``get_associations_between_cons``) instead
    of raising KeyError.
    """
    # nx.ancestors() is used for hierarchy descendants here, mirroring con_ancs, which uses
    # nx.descendants() for hierarchy ancestors (edges presumably point child -> parent)
    con1_descendants = nx.ancestors(G_ncit, con1)
    con2_descendants = nx.ancestors(G_ncit, con2)

    for source, target, target_descendants in (
        (con1, con2, con2_descendants),
        (con2, con1, con1_descendants),
    ):
        for attr, val_set in G_ncit.nodes[source].get('attribute_rels', {}).items():
            # set-level disjointness test instead of scanning every descendant in Python
            if target in val_set or not target_descendants.isdisjoint(val_set):
                return attr
    return None

In [87]:
# spot check: attribute id that relates these two concepts (None would mean no relation)
get_attribute_relation_between_cons('C155321', 'C18219')

'C43737'

In [88]:
# get associations between concepts (only applies to NCIt which has a special kind of relations called associations)
def get_associations_between_cons(con1, con2):
    """Return the id of an association that directly links the two concepts, or None.

    ``con1``'s associations are searched for ``con2`` first, then ``con2``'s
    associations for ``con1``. A concept without an 'associations' node
    attribute simply contributes nothing.
    """
    for source, target in ((con1, con2), (con2, con1)):
        source_associations = G_ncit.nodes[source].get('associations', {})
        for attr, val_set in source_associations.items():
            if target in val_set:
                return attr
    return None

In [89]:
# spot check: association id that links these two concepts (None would mean no association)
get_associations_between_cons('C142912', 'C176424')

'C54538'

In [114]:
# Candidate missing IS-A relations among the NLS non-related pairs.
# A pair (pot_child, pot_parent) is kept when
#   1. both concepts have attribute relations,
#   2. they are not already hierarchically related,
#   3. the parent's enriched linguistic features are a subset of the child's,
#   4. the child's logical definition is subsumed by the parent's, and
#   5. no attribute relation exists between the two concepts.
# Each result is (child id, child label, parent id, parent label, association label);
# the association label is '' when no association links the two concepts.
# To restrict the search to the drug subhierarchy, iterate over ncit_drug_nls_non_rels instead.
potential_missing_isa = set()
for pot_child, pot_parent in tqdm(ncit_all_nls_non_rels):
    parent_attrs = G_ncit.nodes[pot_parent]
    child_attrs = G_ncit.nodes[pot_child]

    # both concepts need at least one attribute relation to compare logical definitions
    if ('attribute_rels' not in parent_attrs) or ('attribute_rels' not in child_attrs):
        continue

    # a relation exists already (should not happen here, as we are working with NLS non-relations)
    if (pot_parent in con_ancs[pot_child]) or (pot_child in con_ancs[pot_parent]):
        continue

    # lexical evidence: the parent's features must all occur in the child
    if not parent_attrs['enriched_linguistic_features'].issubset(child_attrs['enriched_linguistic_features']):
        continue

    # logical evidence: the child's definition must be subsumed by the parent's
    if not logical_subsumption(pot_child, pot_parent):
        continue

    # an attribute relation between the two concepts rules the pair out
    if get_attribute_relation_between_cons(pot_child, pot_parent) is not None:
        continue

    # an association does not rule the pair out; it is recorded alongside the result
    con1_con2_association = get_associations_between_cons(pot_child, pot_parent)
    if con1_con2_association is None:
        association_label = ''
    else:
        association_label = G_ncit.nodes[con1_con2_association]['label']

    potential_missing_isa.add(
        (pot_child, child_attrs['label'], pot_parent, parent_attrs['label'], association_label)
    )

100%|████████████████████████████████████████████████████████████████████| 7212970/7212970 [00:18<00:00, 386704.35it/s]


In [91]:
# number of candidate missing IS-A relations before redundancy removal
len(potential_missing_isa)

293

In [92]:
def remove_redundant_relations_2(graph, results_set):
    """Drop suggested relations that are implied by the graph plus the other suggestions.

    A result ``(child_id, child_label, parent_id, parent_label, association)``
    is redundant when ``parent_id`` is reachable from ``child_id`` in ``graph``
    extended with the (child_id, parent_id) edges of all *other* results.

    Instead of copying the whole graph once per result, the graph is copied a
    single time with every suggested edge added; the edge under test is taken
    out temporarily while its reachability is checked.

    Parameters
    ----------
    graph : networkx graph
        Hierarchy graph; it is not modified.
    results_set : set of 5-tuples
        Suggested relations in the layout described above.

    Returns
    -------
    set
        The members of ``results_set`` that are not redundant.
    """
    from collections import Counter  # local import keeps the cell self-contained

    directed = graph.is_directed()

    def _edge_key(child_id, parent_id):
        # in an undirected graph (a, b) and (b, a) denote the same edge
        return (child_id, parent_id) if directed else frozenset((child_id, parent_id))

    # how many results contribute each suggested edge
    edge_support = Counter(_edge_key(res[0], res[2]) for res in results_set)

    # one copy of the graph carrying every suggested edge that the graph lacks
    augmented = graph.copy()
    added_keys = set()
    for res in results_set:
        child_id, parent_id = res[0], res[2]
        key = _edge_key(child_id, parent_id)
        if key in added_keys or graph.has_edge(child_id, parent_id):
            continue
        augmented.add_edge(child_id, parent_id)
        added_keys.add(key)

    redundant_rels = set()
    for res in tqdm(results_set):
        child_id, parent_id = res[0], res[2]
        key = _edge_key(child_id, parent_id)
        # the edge disappears only if this result is its sole source; an edge that is in
        # the original graph or is backed by another result stays in place
        only_from_this_result = key in added_keys and edge_support[key] == 1
        if only_from_this_result:
            augmented.remove_edge(child_id, parent_id)
        try:
            if nx.has_path(augmented, child_id, parent_id):
                redundant_rels.add(res)
        finally:
            if only_from_this_result:
                augmented.add_edge(child_id, parent_id)
    return results_set.difference(redundant_rels)

In [93]:
# drop candidates that are already implied by the hierarchy together with the other candidates
potential_missing_isa_nonRedundant = remove_redundant_relations_2(G_ncit, potential_missing_isa)

100%|████████████████████████████████████████████████████████████████████████████████| 293/293 [08:07<00:00,  1.66s/it]


In [94]:
# number of candidates left after redundancy removal
len(potential_missing_isa_nonRedundant)

120

In [95]:
# Keep the non-redundant results whose child concept has the drug root 'C1908'
# among its ancestors, then show how many there are.
drug_subhierarchy_results = {
    result
    for result in potential_missing_isa_nonRedundant
    if 'C1908' in con_ancs[result[0]]
}

len(drug_subhierarchy_results)

38

In [99]:
# Drug-subhierarchy results as a table. The rows are sorted because the results are kept in a
# set, whose iteration order changes between runs; sorting makes the spreadsheet reproducible.
res_drug_potential_df = pd.DataFrame(sorted(drug_subhierarchy_results), columns=['child_id', 'child_label', 'parent_id', 'parent_label', 'association'])
res_drug_potential_df.to_excel('Outputs/NCIT_drug_potentialMissing_associationsMarked_correct_final.xlsx')

In [100]:
# number of drug-subhierarchy candidates with an empty association column (no association between child and parent)
len(res_drug_potential_df[res_drug_potential_df['association']==''])

20

In [101]:
# writing to file: drug-subhierarchy candidates with an empty association column only
res_drug_potential_df[res_drug_potential_df['association']==''].to_excel('Outputs/NCIT_drug_potentialMissing_correct_final.xlsx')

In [96]:
# All non-redundant results as a table. The rows are sorted because the results are kept in a
# set, whose iteration order changes between runs; sorting makes the spreadsheet reproducible.
res_potential_df = pd.DataFrame(sorted(potential_missing_isa_nonRedundant), columns=['child_id', 'child_label', 'parent_id', 'parent_label', 'association'])
res_potential_df.to_excel('Outputs/NCIT_potentialMissing_associationsMarked_correct_final.xlsx')

In [97]:
# number of potential missing is-a relations (whole NCIt) with an empty association column
len(res_potential_df[res_potential_df['association']==''])

102

In [98]:
# writing to file: only the candidates (whole NCIt) with an empty association column
res_potential_df[res_potential_df['association']==''].to_excel('Outputs/NCIT_potentialMissing_correct_final.xlsx')