## Notebook to process all INDRA/REACH output

Steps:
1. Input all predications (after JSON extraction and CUI mapping) from reach_data/
2. Preprocess data
3. Mapping to CURIEs
4. Add prefixes 
5. Create RDF and Networkx graph

TODO:
1. NER on unmapped for all data
2. Metadata dict/file separate from graph

In [25]:
import pandas as pd
# import built-in namespaces
from rdflib.namespace import OWL, RDF, RDFS
import numpy as np
import os
import pickle

### Input data and save counts

In [2]:
##input all TSV files from reach_data folder as pandas dataframes
# get all files in the folder
files = os.listdir('reach_data')
df = pd.DataFrame()
# get only the tsv files
tsv_files = [file for file in files if file.endswith('.tsv')]
for filename in tsv_files:
    # read the file as a pandas dataframe
    dfnp = pd.read_csv('reach_data/' + filename, sep='\t')
    # get the name of the file without the extension
    dfnp['related_common_name'] = filename.split('_')[0]
    dfnp = dfnp.drop_duplicates()
    df = pd.concat([df, dfnp], ignore_index=True)
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63640 entries, 0 to 63639
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   seq                   63640 non-null  int64  
 1   pmid                  63640 non-null  int64  
 2   subject_cui           55344 non-null  object 
 3   subject_name          55344 non-null  object 
 4   subject_type          55344 non-null  object 
 5   subject_source        63403 non-null  object 
 6   subj_map_reach        63516 non-null  object 
 7   predicate             63640 non-null  object 
 8   object_source         63538 non-null  object 
 9   object_cui            56717 non-null  object 
 10  object_name           56717 non-null  object 
 11  object_type           56717 non-null  object 
 12  obj_map_reach         63640 non-null  object 
 13  belief                63640 non-null  float64
 14  sentence              63640 non-null  object 
 15  year               

In [5]:
def save_data_statistics(df):
    nps = df['related_common_name'].unique()
    info = {
        'related_common_name': [],
        'PMIDs': [],
        'statements': []
    }
    for np_name in nps:
        dfnp = df.loc[df['related_common_name'] == np_name]
        info['related_common_name'].append(np_name)
        info['PMIDs'].append(len(dfnp['pmid'].unique()))
        info['statements'].append(len(dfnp))
    df_stats = pd.DataFrame.from_dict(info)
    df_stats.to_csv('reach_data/reach_data_statistics.tsv', sep='\t', index=False)

In [6]:
df = df.fillna('')
save_data_statistics(df)
df.head()

Unnamed: 0,seq,pmid,subject_cui,subject_name,subject_type,subject_source,subj_map_reach,predicate,object_source,object_cui,...,sentence,year,subject_score,object_score,umls_flag,subj_reach_grounding,obj_reach_grounding,pub_type,source_section,related_common_name
0,0,23674609,,,,A-792611,{'TEXT': 'A-792611'},Activation,CYP3A4,C3714798,...,Compared with the 32 and 49% inhibition of CYP...,2013 Aug,,27.26,1,"(None, None)","('HGNC', '2637')",['Journal Article'],,valerian
1,1,17910620,C0057223,Cytochrome_P-450_CYP2D6,"['aapp', 'enzy']",CYP2D6- IC,{'TEXT': 'CYP2D6- IC'},Activation,metabolic process,C0025520,...,St. John’s wort inhibited CYP2D6- IC mediated ...,2007 Nov,25.58,14.64,1,"(None, None)","('GO', 'GO:0008152')",['Journal Article'],,valerian
2,2,33932511,,,,HPG,{'TEXT': 'HPG'},Activation,papF,C0030428,...,"10 μM, 30 μM, and 50 μM of HPG caused outward ...",2021 Aug 10,,12.96,1,"(None, None)","('UP', 'P08408')",['Journal Article'],,valerian
3,3,18331390,C3898062,P-glycoprotein_Inhibitor,['chvf'],P-gp inhibitor,{'TEXT': 'P-gp inhibitor'},Activation,digoxin,C0012265,...,"Verapamil, the classical P-gp inhibitor, dec...",2008 May,5.18,27.26,1,"(None, None)","('CHEBI', 'CHEBI:4551')",['Journal Article'],,valerian
4,4,18331390,C3898062,P-glycoprotein_Inhibitor,['chvf'],P-gp inhibitor,{'TEXT': 'P-gp inhibitor'},Activation,transport,C0005528,...,"Verapamil, the classical P-gp inhibitor, dec...",2008 May,5.18,8.34,1,"(None, None)","('GO', 'GO:0006810')",['Journal Article'],,valerian


### Mappings 
1. REACH existing mappings
2. HPO and GO mappings
3. Existing OntoRunNER
4. New Gilda and OntoRunNER

In [7]:
#fixes reach mappings to OBO (if available) and adds text if no mapping or blank name of entity
for i in range(len(df.index)):
    subj_ground = df.at[i, 'subj_reach_grounding']
    obj_ground = df.at[i, 'obj_reach_grounding']
    subj_map = df.at[i, 'subj_map_reach']
    obj_map = df.at[i, 'obj_map_reach']
    if subj_ground == '(None, None)' or subj_ground == np.nan:
        temp = subj_map.split(',')
        if 'GO' in subj_map or 'CHEBI' in subj_map:
            for item in temp:
                if 'GO' in item or 'CHEBI' in item:
                    df.at[i, 'subj_reach_grounding'] = item
        elif 'UP' in subj_map:
            for item in temp:
                if 'UP' in item:
                    df.at[i, 'subj_reach_grounding'] = item
        elif 'TEXT' in subj_map:
            if df.at[i, 'subject_source'] == '':
                for item in temp:
                    if 'TEXT' in item:
                        df.at[i, 'subject_source'] = item.split(':')[-1]
    elif 'GO' not in subj_ground and 'CHEBI' not in subj_ground and 'UP' not in subj_ground:
        temp = subj_map.split(',')
        if 'GO' in subj_map or 'CHEBI' in subj_map:
            for item in temp:
                if 'GO' in item or 'CHEBI' in item:
                    df.at[i, 'subj_reach_grounding'] = item
        elif 'UP' in subj_map:
            for item in temp:
                if 'UP' in item:
                    df.at[i, 'subj_reach_grounding'] = item
        elif 'TEXT' in subj_map:
            if df.at[i, 'subject_source'] == '':
                for item in temp:
                    if 'TEXT' in item:
                        df.at[i, 'subject_source'] = item.split(':')[-1]
    
    #same for object mapping
    if obj_ground == '(None, None)' or obj_ground == np.nan:
        temp = obj_map.split(',')
        if 'GO' in obj_map or 'CHEBI' in obj_map:
            for item in temp:
                if 'GO' in item or 'CHEBI' in item:
                    df.at[i, 'obj_reach_grounding'] = item
        elif 'UP' in obj_map:
            for item in temp:
                if 'UP' in item:
                    df.at[i, 'obj_reach_grounding'] = item
        elif 'TEXT' in obj_map:
            if df.at[i, 'object_source'] == '':
                for item in temp:
                    if 'TEXT' in item:
                        df.at[i, 'object_source'] = item.split(':')[-1]
    elif 'GO' not in obj_ground and 'CHEBI' not in obj_ground and 'UP' not in obj_ground:
        temp = obj_map.split(',')
        if 'GO' in obj_map or 'CHEBI' in obj_map:
            for item in temp:
                if 'GO' in item or 'CHEBI' in item:
                    df.at[i, 'obj_reach_grounding'] = item
        elif 'UP' in obj_map:
            for item in temp:
                if 'UP' in item:
                    df.at[i, 'obj_reach_grounding'] = item
        elif 'TEXT' in obj_map:
            if df.at[i, 'object_source'] == '':
                for item in temp:
                    if 'TEXT' in item:
                        df.at[i, 'object_source'] = item.split(':')[-1]

In [8]:
df = df[['related_common_name', 'subject_cui', 'subject_name', 'subject_source', 'predicate', 'object_source',
       'object_cui', 'object_name', 'subj_reach_grounding', 'obj_reach_grounding', 'pmid', 
           'year', 'belief', 'sentence', 'pub_type', 'source_section']]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63640 entries, 0 to 63639
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   related_common_name   63640 non-null  object 
 1   subject_cui           63640 non-null  object 
 2   subject_name          63640 non-null  object 
 3   subject_source        63640 non-null  object 
 4   predicate             63640 non-null  object 
 5   object_source         63640 non-null  object 
 6   object_cui            63640 non-null  object 
 7   object_name           63640 non-null  object 
 8   subj_reach_grounding  63640 non-null  object 
 9   obj_reach_grounding   63640 non-null  object 
 10  pmid                  63640 non-null  int64  
 11  year                  63640 non-null  object 
 12  belief                63640 non-null  float64
 13  sentence              63640 non-null  object 
 14  pub_type              63640 non-null  object 
 15  source_section     

In [9]:
df['predicate_obo'] = None
df['subject_obo'] = None
df['object_obo'] = None

In [10]:
#map cui to reach grounding
reach_grounding_dict = {}
for i in range(len(df.index)):
    sub_cui = df.at[i, 'subject_cui']
    sub_source = df.at[i, 'subject_source']
    obj_cui = df.at[i, 'object_cui']
    obj_source = df.at[i, 'object_source']
    sub_ground = df.at[i, 'subj_reach_grounding']
    obj_ground = df.at[i, 'obj_reach_grounding']
    sub_key = i
    obj_key = i
    if sub_cui != '':
        sub_key = sub_cui
    elif sub_source != '':
        sub_key = sub_source
    if obj_cui != '':
        obj_key = obj_cui
    elif obj_source != '':
        obj_key = obj_source
    if sub_key not in reach_grounding_dict:
        reach_grounding_dict[sub_key] = []
    if obj_key not in reach_grounding_dict:
        reach_grounding_dict[obj_key] = []
    if sub_ground != np.nan:
        if 'CHEBI' in sub_ground or 'GO' in sub_ground:
            subject_map = sub_ground.split("'")[-2]
            subject_map = subject_map.replace(':', '_')
            reach_grounding_dict[sub_key].append(subject_map)
        elif 'UP' in sub_ground:
            subject_map = sub_ground.split("'")[-2]
            subject_map = 'PR_' + subject_map
            reach_grounding_dict[sub_key].append(subject_map)
    if obj_ground != np.nan:
        if 'CHEBI' in obj_ground or 'GO' in obj_ground:
            object_map = obj_ground.split("'")[-2]
            object_map = object_map.replace(':', '_')
            reach_grounding_dict[obj_key].append(object_map)
        elif 'UP' in obj_ground:
            object_map = obj_ground.split("'")[-2]
            object_map = 'PR_' + object_map
            reach_grounding_dict[obj_key].append(object_map)
print(len(reach_grounding_dict))

12560


In [13]:
constituent_mapping_dict = {}
dfcons = pd.read_csv('cui_to_ontology_maps/chebi-extensions-constituents-and-NP-20230405.tsv', sep='\t')
dfcons.head()

Unnamed: 0,constituent_name,URI
0,12beta-acetoxycimigenol 3-o-beta-d-xylopyranoside,http://napdi.org/napdi_srs_imports:12beta_acet...
1,2'-o-acetylactein,http://napdi.org/napdi_srs_imports:2_o_acetyla...
2,2'-o-acetylcimicifugoside h1,http://napdi.org/napdi_srs_imports:2_o_acetylc...
3,23-o-acetylshengmanol,http://napdi.org/napdi_srs_imports:23_o_acetyl...
4,23-o-acetylshengmanol-3-o-alpha-l-arabinoside,http://napdi.org/napdi_srs_imports:23_o_acetyl...


In [14]:
##use for mapping after prefixing
for i in range(len(dfcons.index)):
    constituent_name = dfcons.at[i, 'constituent_name']
    uri = dfcons.at[i, 'URI']
    constituent_mapping_dict[constituent_name] = uri
len(constituent_mapping_dict)

629

In [15]:
predMapD = {
    'regulateactivity':'RO_0011002',
    'regulateamount':'RO_0011003',
    'phosphorylation':'RO_0002447',
    'dephosphorylation':'GO_0006470',
    'ubiquitination':'RO_0002480',
    'deubiquitination':'GO_0016579',
    'sumoylation':'RO_0002436',
    'desumoylation':'RO_0002436',
    'hydroxylation':'GO_0018126',
    'dehydroxylation':'RO_0002436',
    'acetylation':'GO_0006473',
    'deacetylation':'GO_0006476',
    'glycosylation':'GO_0006486',
    'deglycosylation':'GO_0006517',
    'farnesylation':'RO_0002436',
    'defarnesylation':'RO_0002436',
    'geranylgeranylation':'RO_0002436',
    'degeranylgeranylation':'RO_0002436',
    'palmitoylation':'RO_0002436',
    'depalmitoylation':'RO_0002436',
    'myristoylation':'RO_0002436',
    'demyristoylation':'RO_0002436',
    'ribosylation':'RO_0002436',
    'deribosylation':'RO_0002436',
    'methylation':'GO_0006479',
    'demethylation':'GO_0006482',
    'activation':'RO_0002448',
    'inhibition':'RO_0002449',
    'increaseamount':'RO_0011009',
    'decreaseamount':'RO_0011010'
}

In [16]:
with open('cui_to_ontology_maps/go_hpo_map_dict.pickle', 'rb') as filep:
    go_hpo_mapping_dict = pickle.load(filep)
len(go_hpo_mapping_dict)

87745

In [22]:
#if reach_grounding has multiple values, currently only first one is taken -- assess if this is the best option
def get_obo_mapping(row, col):
    if col == 'subject':
        cui = row['subject_cui']
        source = row['subject_source']
    elif col == 'object':
        cui = row['object_cui']
        source = row['object_source']
    else:
        print('specify if subject or object mapping required')
        exit(0)
    if cui in go_hpo_mapping_dict:
        if len(go_hpo_mapping_dict):
            return go_hpo_mapping_dict[cui][0]
    if cui in reach_grounding_dict:
        if len(reach_grounding_dict[cui]):
            return reach_grounding_dict[cui][0]
    if source in reach_grounding_dict:
        if len(reach_grounding_dict[source]):
            return reach_grounding_dict[source][0]
    if source.lower() in constituent_mapping_dict:
        return constituent_mapping_dict[source.lower()]
    return ''

In [18]:
def relation_mapping(row):
    rel = row['predicate'].lower()
    if rel in predMapD:
        return predMapD[rel]
    else:
        return ''

In [20]:
df['predicate_obo'] = df.apply(relation_mapping, axis=1)
df.head()

Unnamed: 0,related_common_name,subject_cui,subject_name,subject_source,predicate,object_source,object_cui,object_name,subj_reach_grounding,obj_reach_grounding,pmid,year,belief,sentence,pub_type,source_section,predicate_obo,subject_obo,object_obo
0,valerian,,,A-792611,Activation,CYP3A4,C3714798,Cytochrome_P-450_CYP3A4,"(None, None)",{'UP': 'P08684',23674609,2013 Aug,0.65,Compared with the 32 and 49% inhibition of CYP...,['Journal Article'],,RO_0002448,,
1,valerian,C0057223,Cytochrome_P-450_CYP2D6,CYP2D6- IC,Activation,metabolic process,C0025520,metabolic_aspects,"(None, None)","('GO', 'GO:0008152')",17910620,2007 Nov,0.65,St. John’s wort inhibited CYP2D6- IC mediated ...,['Journal Article'],,RO_0002448,,
2,valerian,,,HPG,Activation,papF,C0030428,Paraguay,"(None, None)","('UP', 'P08408')",33932511,2021 Aug 10,0.65,"10 μM, 30 μM, and 50 μM of HPG caused outward ...",['Journal Article'],,RO_0002448,,
3,valerian,C3898062,P-glycoprotein_Inhibitor,P-gp inhibitor,Activation,digoxin,C0012265,Digoxin,"(None, None)","('CHEBI', 'CHEBI:4551')",18331390,2008 May,0.65,"Verapamil, the classical P-gp inhibitor, dec...",['Journal Article'],,RO_0002448,,
4,valerian,C3898062,P-glycoprotein_Inhibitor,P-gp inhibitor,Activation,transport,C0005528,Biological_Transport,"(None, None)","('GO', 'GO:0006810')",18331390,2008 May,0.65,"Verapamil, the classical P-gp inhibitor, dec...",['Journal Article'],,RO_0002448,,


In [23]:
df['subject_obo'] = df.apply(get_obo_mapping, axis=1, col='subject')
df['object_obo'] = df.apply(get_obo_mapping, axis=1, col='object')
df.head()

Unnamed: 0,related_common_name,subject_cui,subject_name,subject_source,predicate,object_source,object_cui,object_name,subj_reach_grounding,obj_reach_grounding,pmid,year,belief,sentence,pub_type,source_section,predicate_obo,subject_obo,object_obo
0,valerian,,,A-792611,Activation,CYP3A4,C3714798,Cytochrome_P-450_CYP3A4,"(None, None)",{'UP': 'P08684',23674609,2013 Aug,0.65,Compared with the 32 and 49% inhibition of CYP...,['Journal Article'],,RO_0002448,,PR_P08684
1,valerian,C0057223,Cytochrome_P-450_CYP2D6,CYP2D6- IC,Activation,metabolic process,C0025520,metabolic_aspects,"(None, None)","('GO', 'GO:0008152')",17910620,2007 Nov,0.65,St. John’s wort inhibited CYP2D6- IC mediated ...,['Journal Article'],,RO_0002448,PR_P10635,GO_0008152
2,valerian,,,HPG,Activation,papF,C0030428,Paraguay,"(None, None)","('UP', 'P08408')",33932511,2021 Aug 10,0.65,"10 μM, 30 μM, and 50 μM of HPG caused outward ...",['Journal Article'],,RO_0002448,,PR_P08408
3,valerian,C3898062,P-glycoprotein_Inhibitor,P-gp inhibitor,Activation,digoxin,C0012265,Digoxin,"(None, None)","('CHEBI', 'CHEBI:4551')",18331390,2008 May,0.65,"Verapamil, the classical P-gp inhibitor, dec...",['Journal Article'],,RO_0002448,,CHEBI_4551
4,valerian,C3898062,P-glycoprotein_Inhibitor,P-gp inhibitor,Activation,transport,C0005528,Biological_Transport,"(None, None)","('GO', 'GO:0006810')",18331390,2008 May,0.65,"Verapamil, the classical P-gp inhibitor, dec...",['Journal Article'],,RO_0002448,,GO_0006810


In [24]:
import pickle
with open('cui_to_ontology_maps/CUItoOBO_20220505.pickle', 'rb') as filep:
    onto_dict = pickle.load(filep)
len(onto_dict)

3951

In [25]:
#get ontorunner mappings and add to df
def get_obo_onto_mapping(row, col):
    if col == 'subject':
        cui = row['subject_cui']
        mapping = row['subject_obo']
    elif col == 'object':
        cui = row['object_cui']
        mapping = row['object_obo']
    else:
        print('specify if subject or object mapping required')
        exit(0)
    if mapping == '':
        if cui in onto_dict:
            return onto_dict[cui]
    else:
        return mapping

In [26]:
##for unmapped - try gilda and run ontorunner again
df['subject_obo'] = df.apply(get_obo_onto_mapping, axis=1, col='subject')
df['object_obo'] = df.apply(get_obo_onto_mapping, axis=1, col='object')

In [33]:
df = df.fillna('')

''

In [34]:
##finish mappings
##get unmapped terms
dfsubj_unmap = df.loc[df['subject_obo'] == '']
dfsubj_unmap.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9302 entries, 0 to 63639
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   related_common_name   9302 non-null   object 
 1   subject_cui           9302 non-null   object 
 2   subject_name          9302 non-null   object 
 3   subject_source        9302 non-null   object 
 4   predicate             9302 non-null   object 
 5   object_source         9302 non-null   object 
 6   object_cui            9302 non-null   object 
 7   object_name           9302 non-null   object 
 8   subj_reach_grounding  9302 non-null   object 
 9   obj_reach_grounding   9302 non-null   object 
 10  pmid                  9302 non-null   int64  
 11  year                  9302 non-null   object 
 12  belief                9302 non-null   float64
 13  sentence              9302 non-null   object 
 14  pub_type              9302 non-null   object 
 15  source_section      

In [35]:
dfobj_unmap = df.loc[df['object_obo'] == '']
dfobj_unmap.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8520 entries, 6 to 63623
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   related_common_name   8520 non-null   object 
 1   subject_cui           8520 non-null   object 
 2   subject_name          8520 non-null   object 
 3   subject_source        8520 non-null   object 
 4   predicate             8520 non-null   object 
 5   object_source         8520 non-null   object 
 6   object_cui            8520 non-null   object 
 7   object_name           8520 non-null   object 
 8   subj_reach_grounding  8520 non-null   object 
 9   obj_reach_grounding   8520 non-null   object 
 10  pmid                  8520 non-null   int64  
 11  year                  8520 non-null   object 
 12  belief                8520 non-null   float64
 13  sentence              8520 non-null   object 
 14  pub_type              8520 non-null   object 
 15  source_section      

In [37]:
dfsubj_unmap = dfsubj_unmap[['subject_cui', 'subject_source', 'subject_name']]
dfsubj_unmap = dfsubj_unmap.rename(columns=
{'subject_cui':'cui', 'subject_source':'source', 'subject_name':'pref_name'})
dfobj_unmap = dfobj_unmap[['object_cui', 'object_source', 'object_name']]
dfobj_unmap = dfobj_unmap.rename(columns=
{'object_cui':'cui', 'object_source':'source', 'object_name':'pref_name'})

In [40]:
dfunmap = pd.concat([dfsubj_unmap, dfobj_unmap])
dfunmap = dfunmap.drop_duplicates()
dfunmap = dfunmap.reset_index(drop=True)
dfunmap.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5894 entries, 0 to 5893
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   cui        5894 non-null   object
 1   source     5894 non-null   object
 2   pref_name  5894 non-null   object
dtypes: object(3)
memory usage: 138.3+ KB


In [41]:
dfunmap.to_csv('reach_data/unmapped_reach_terms_20230412.tsv', sep='\t', index=False)

In [None]:
##get unmapped subjects and objects and run gilda/ontorunner - add mappings back

In [42]:
df.to_csv('reach_data/reach_all_predications_mapped_20230412.tsv', sep='\t', index=False)

### Add prefixes and process - run from here after mapping

In [44]:
def add_prefix(row, col):
    ##check for prefix before adding

    obo_prefix = 'http://purl.obolibrary.org/obo/'
    if col == 'predicate':
        predicate_obo = row['predicate_obo']
        if obo_prefix in predicate_obo:
            return predicate_obo
        else:
            return obo_prefix+predicate_obo
    elif col == 'subject':
        subject_obo = row['subject_obo']
        if isinstance(subject_obo, list) and subject_obo:
            subject_obo = subject_obo[0]
        if subject_obo == '':
            return ''
        subject_obo = subject_obo.replace(']', '')
        subject_obo = subject_obo.replace('[', '')
        subject_obo = subject_obo.replace(')', '')
        subject_obo = subject_obo.replace('(', '')
        if 'http' not in subject_obo:
            return obo_prefix+subject_obo
        else:
            return subject_obo
    elif col == 'object':
        object_obo = row['object_obo']
        if isinstance(object_obo, list) and object_obo:
            object_obo = object_obo[0]
        if object_obo == '':
            return ''
        object_obo = object_obo.replace(']', '')
        object_obo = object_obo.replace('[', '')
        object_obo = object_obo.replace(')', '')
        object_obo = object_obo.replace('(', '')
        if 'http' not in object_obo:
            return obo_prefix+object_obo
        else:
            return object_obo

In [45]:
#add OBO identifiers to the OBO mappings (where not present) - see df
#drop rows with no mappings
df['subject_obo'] = df.apply(add_prefix, axis=1, col='subject')
df['object_obo'] = df.apply(add_prefix, axis=1, col='object')
df['predicate_obo'] = df.apply(add_prefix, axis=1, col='predicate')

In [46]:
##fix napdi identifiers
for i in range(len(df.index)):
    subj = df.at[i, 'subject_obo']
    obj = df.at[i, 'object_obo']
    if 'napdi' in subj:
        subjnew = subj.replace('http://purl.obolibrary.org/obo/', 'http://napdi.org/')
        df.at[i, 'subject_obo'] = subjnew
    if 'napdi' in obj:
        objnew = obj.replace('http://purl.obolibrary.org/obo/', 'http://napdi.org/')
        df.at[i, 'object_obo'] = objnew

In [47]:
df.to_csv('reach_data/reach_all_predications_mapped_prefixes_20230412.tsv', sep='\t', index=False)

In [2]:
df = pd.read_csv('reach_data/reach_all_predications_mapped_prefixes_20230412.tsv', sep='\t')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63640 entries, 0 to 63639
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   related_common_name   63640 non-null  object 
 1   subject_cui           55344 non-null  object 
 2   subject_name          55344 non-null  object 
 3   subject_source        63516 non-null  object 
 4   predicate             63640 non-null  object 
 5   object_source         63640 non-null  object 
 6   object_cui            56717 non-null  object 
 7   object_name           56717 non-null  object 
 8   subj_reach_grounding  63516 non-null  object 
 9   obj_reach_grounding   63640 non-null  object 
 10  pmid                  63640 non-null  int64  
 11  year                  63640 non-null  object 
 12  belief                63640 non-null  float64
 13  sentence              63640 non-null  object 
 14  pub_type              63640 non-null  object 
 15  source_section     

In [8]:
df = df.fillna('')

In [9]:
obo_prefix = 'http://purl.obolibrary.org/obo/'
#remove all rows with blank obo mappings and only prefix rows
df_new = df[df['subject_obo'] != '']
df_new = df_new[df_new['subject_obo'] != obo_prefix]
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54338 entries, 1 to 63638
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   related_common_name   54338 non-null  object 
 1   subject_cui           54338 non-null  object 
 2   subject_name          54338 non-null  object 
 3   subject_source        54338 non-null  object 
 4   predicate             54338 non-null  object 
 5   object_source         54338 non-null  object 
 6   object_cui            54338 non-null  object 
 7   object_name           54338 non-null  object 
 8   subj_reach_grounding  54338 non-null  object 
 9   obj_reach_grounding   54338 non-null  object 
 10  pmid                  54338 non-null  int64  
 11  year                  54338 non-null  object 
 12  belief                54338 non-null  float64
 13  sentence              54338 non-null  object 
 14  pub_type              54338 non-null  object 
 15  source_section     

In [10]:
df_new = df_new[df_new['object_obo'] != '']
df_new = df_new[df_new['object_obo'] != obo_prefix]
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 47206 entries, 1 to 63638
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   related_common_name   47206 non-null  object 
 1   subject_cui           47206 non-null  object 
 2   subject_name          47206 non-null  object 
 3   subject_source        47206 non-null  object 
 4   predicate             47206 non-null  object 
 5   object_source         47206 non-null  object 
 6   object_cui            47206 non-null  object 
 7   object_name           47206 non-null  object 
 8   subj_reach_grounding  47206 non-null  object 
 9   obj_reach_grounding   47206 non-null  object 
 10  pmid                  47206 non-null  int64  
 11  year                  47206 non-null  object 
 12  belief                47206 non-null  float64
 13  sentence              47206 non-null  object 
 14  pub_type              47206 non-null  object 
 15  source_section     

In [11]:
df_new = df_new.drop_duplicates()
df_new = df_new.reset_index(drop=True)
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47125 entries, 0 to 47124
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   related_common_name   47125 non-null  object 
 1   subject_cui           47125 non-null  object 
 2   subject_name          47125 non-null  object 
 3   subject_source        47125 non-null  object 
 4   predicate             47125 non-null  object 
 5   object_source         47125 non-null  object 
 6   object_cui            47125 non-null  object 
 7   object_name           47125 non-null  object 
 8   subj_reach_grounding  47125 non-null  object 
 9   obj_reach_grounding   47125 non-null  object 
 10  pmid                  47125 non-null  int64  
 11  year                  47125 non-null  object 
 12  belief                47125 non-null  float64
 13  sentence              47125 non-null  object 
 14  pub_type              47125 non-null  object 
 15  source_section     

In [12]:
##new mappings not included yet - 74% mapped
df_new.to_csv('reach_data/reach_all_predicates_mapped_processed_20230412.tsv', sep='\t', index=False)

In [26]:
import pandas as pd
df_new = pd.read_csv('reach_data/reach_all_predicates_mapped_processed_20230412.tsv', sep='\t')
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47125 entries, 0 to 47124
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   related_common_name   47125 non-null  object 
 1   subject_cui           44264 non-null  object 
 2   subject_name          44264 non-null  object 
 3   subject_source        47125 non-null  object 
 4   predicate             47125 non-null  object 
 5   object_source         47125 non-null  object 
 6   object_cui            44729 non-null  object 
 7   object_name           44729 non-null  object 
 8   subj_reach_grounding  47125 non-null  object 
 9   obj_reach_grounding   47125 non-null  object 
 10  pmid                  47125 non-null  int64  
 11  year                  47125 non-null  object 
 12  belief                47125 non-null  float64
 13  sentence              47125 non-null  object 
 14  pub_type              47125 non-null  object 
 15  source_section     

  interactivity=interactivity, compiler=compiler, result=result)


### Create graph

In [27]:
#Create networkx graph from triples
import glob
import hashlib
import json
import networkx as nx  # type: ignore
import os
import os.path

from collections import Counter  # type: ignore
from more_itertools import unique_everseen  # type: ignore
from rdflib import BNode, Graph, Literal, Namespace, URIRef  # type: ignore
from rdflib.namespace import OWL, RDF, RDFS  # type: ignore
from rdflib.plugins.serializers.nt import _quoteLiteral  # type: ignore
import subprocess

from tqdm import tqdm  # type: ignore
from typing import Dict, List, Optional, Set, Tuple, Union

# set-up environment variables
obo = Namespace('http://purl.obolibrary.org/obo/')
oboinowl = Namespace('http://www.geneontology.org/formats/oboInOwl#')
schema = Namespace('http://www.w3.org/2001/XMLSchema#')

In [3]:
dfres = df_new[['subject_obo', 'predicate_obo', 'object_obo']]
dfres = dfres.drop_duplicates()
dfres = dfres.reset_index(drop=True)
dfres.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40245 entries, 0 to 40244
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   subject_obo    40245 non-null  object
 1   predicate_obo  40245 non-null  object
 2   object_obo     40245 non-null  object
dtypes: object(3)
memory usage: 943.4+ KB


In [4]:
#N = 40245
dfres.to_csv('reach_data/reach_pmid_all_predicates_processed_triples_20230412.tsv', sep='\t', index=False)

In [5]:
df_new['predicate'].value_counts()

Activation           21974
Inhibition           16506
IncreaseAmount        3838
DecreaseAmount        2785
Phosphorylation        725
Hydroxylation          462
Dephosphorylation      286
Dehydroxylation        160
Demethylation          145
Methylation            104
Glycosylation           42
Acetylation             40
Deacetylation           20
Ubiquitination          17
Deglycosylation          8
Deubiquitination         8
Ribosylation             3
Sumoylation              2
Name: predicate, dtype: int64

In [6]:
df_new['predicate_obo'].value_counts()

http://purl.obolibrary.org/obo/RO_0002448    21974
http://purl.obolibrary.org/obo/RO_0002449    16506
http://purl.obolibrary.org/obo/RO_0011009     3838
http://purl.obolibrary.org/obo/RO_0011010     2785
http://purl.obolibrary.org/obo/RO_0002447      725
http://purl.obolibrary.org/obo/GO_0018126      462
http://purl.obolibrary.org/obo/GO_0006470      286
http://purl.obolibrary.org/obo/RO_0002436      165
http://purl.obolibrary.org/obo/GO_0006482      145
http://purl.obolibrary.org/obo/GO_0006479      104
http://purl.obolibrary.org/obo/GO_0006486       42
http://purl.obolibrary.org/obo/GO_0006473       40
http://purl.obolibrary.org/obo/GO_0006476       20
http://purl.obolibrary.org/obo/RO_0002480       17
http://purl.obolibrary.org/obo/GO_0016579        8
http://purl.obolibrary.org/obo/GO_0006517        8
Name: predicate_obo, dtype: int64

In [28]:
import ast


#### Fix prefixes and nodes

In [29]:
##get unique prefixes for subjects and objects
prefixes = []
for i in range(len(df_new.index)):
    subj = df_new.at[i, 'subject_obo']
    obj = df_new.at[i, 'object_obo']
    prefixes.append(subj.split('/')[-1].split('_')[0])
    prefixes.append(obj.split('/')[-1].split('_')[0])
prefixes = list(set(prefixes))
len(prefixes)

14

In [30]:
prefixes

['PR',
 'SO',
 'OBO',
 'GOT',
 'napdi',
 'NCBITaxon',
 'GO',
 'CHEBI',
 'MONDO',
 'UBERON',
 'DOID',
 'CL',
 'HP',
 'SGOT']

'Journal Article'

In [31]:
problems = ['OBO', 'SGOT', 'GOT']
for i in range(len(df_new.index)):
    subj = df_new.at[i, 'subject_obo']
    obj = df_new.at[i, 'object_obo']
    subjpref = subj.split('/')[-1].split('_')[0]
    objpref = obj.split('/')[-1].split('_')[0]
    if subjpref in problems:
        if subjpref == 'OBO':
            df_new.at[i, 'subject_obo'] = subj.replace('OBO_', '')
        elif subjpref == 'SGOT':
            df_new.at[i, 'subject_obo'] = subj.replace('SGOT', 'PR_000008153')
        elif subjpref == 'GOT':
            df_new.at[i, 'subject_obo'] = subj.replace('GOT', 'PR_000008153')
    if objpref in problems:
        if objpref == 'OBO':
            df_new.at[i, 'object_obo'] = obj.replace('OBO_', '')
        elif objpref == 'SGOT':
            df_new.at[i, 'object_obo'] = obj.replace('SGOT', 'PR_000008153')
        elif objpref == 'GOT':
            df_new.at[i, 'object_obo'] = obj.replace('GOT', 'PR_000008153')

In [32]:
df_new.to_csv('reach_data/reach_all_predicates_mapped_processed_new_20230423.tsv', sep='\t', index=False)

In [33]:
import pandas as pd
df_new = pd.read_csv('reach_data/reach_all_predicates_mapped_processed_new_20230423.tsv', sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [15]:
typedict = {
    'CHEBI': 'chemical',
    'PR': 'protein',
    'GO': 'GO_process',
    'DOID': 'disease',
    'HP': 'phenotype',
    'UBERON': 'anatomical_entity',
    'SO': 'sequence',
    'CL': 'cell',
    'NCBITaxon': 'organism',
    'MONDO': 'disease',
    'napdi': 'natural_product',
    'OAE': 'adverse_event',
    'NCBIGene': 'gene'
}

In [7]:
#check prefixes
prefixes = []
for i in range(len(df_new.index)):
    subj = df_new.at[i, 'subject_obo']
    obj = df_new.at[i, 'object_obo']
    prefixes.append(subj.split('/')[-1].split('_')[0])
    prefixes.append(obj.split('/')[-1].split('_')[0])
prefixes = list(set(prefixes))
prefixes

['PR',
 'SO',
 'OAE',
 'napdi',
 'NCBITaxon',
 'GO',
 'CHEBI',
 'MONDO',
 'UBERON',
 'DOID',
 'CL',
 'HP']

In [8]:
#create rdflib graph from dataframe triples and serialize as ntriples file
graph  = Graph()
pred_label = URIRef("http://www.w3.org/2000/01/rdf-schema#label")
for i in range(len(df_new.index)):
    subj = df_new.at[i, 'subject_obo']
    obj = df_new.at[i, 'object_obo']
    pred = df_new.at[i, 'predicate_obo']
    subj_node = URIRef(subj)
    obj_node = URIRef(obj)
    predicate = URIRef(pred)
    subj_name = df_new.at[i, 'subject_name']
    obj_name = df_new.at[i, 'object_name']
    graph.add((subj_node, predicate, obj_node))
    graph.add((subj_node, pred_label, Literal(subj_name)))
    graph.add((obj_node, pred_label, Literal(obj_name)))

In [10]:
graph.serialize('output_graphs/machinread_reach_version1.nt', format='nt')

In [9]:
len(graph)

47285

In [11]:
def n3(node: Union[URIRef, BNode, Literal]) -> str:
    """Method takes an RDFLib node of type BNode, URIRef, or Literal and serializes it to meet the RDF 1.1 NTriples
    format.
    Src: https://github.com/RDFLib/rdflib/blob/c11f7b503b50b7c3cdeec0f36261fa09b0615380/rdflib/plugins/serializers/nt.py
    Args:
        node: An RDFLib
    Returns:
        serialized_node: A string containing the serialized
    """
    if isinstance(node, Literal): serialized_node = "%s" % _quoteLiteral(node)
    else: serialized_node = "%s" % node.n3()
    return serialized_node

In [12]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47125 entries, 0 to 47124
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   related_common_name   47125 non-null  object 
 1   subject_cui           44264 non-null  object 
 2   subject_name          44264 non-null  object 
 3   subject_source        47125 non-null  object 
 4   predicate             47125 non-null  object 
 5   object_source         47125 non-null  object 
 6   object_cui            44729 non-null  object 
 7   object_name           44729 non-null  object 
 8   subj_reach_grounding  47125 non-null  object 
 9   obj_reach_grounding   47125 non-null  object 
 10  pmid                  47125 non-null  int64  
 11  year                  47125 non-null  object 
 12  belief                47125 non-null  float64
 13  sentence              47125 non-null  object 
 14  pub_type              47125 non-null  object 
 15  source_section     

In [None]:
##Creating edge metadata dict of format:
#'CHEBI_xx-NCBITaxon_xx':
# {
#    'type': 'chemical_substance-organism',
#    'relations': {
#       'r1': {
#          'source': [],
#          'year': [],
#          'pmid': [],
#          'sentence': [],
#          'belief': [],
#          'weight': [],
#          'pubtype': [],
#          'source_section': [],
# }
#       'r2': {
# }
#}

In [13]:
df_new = df_new.fillna('')

In [16]:
#convert rdflib graph to multidigraph - code borrowed from PheKnowLator: kg_utils.py
#use the pred key to also create a dictionary with metadata about the edge - 
#pub_year, pmid, source graph, belief
nx_mdg = nx.MultiDiGraph()
edge_metadict = {}

for s, p, o in tqdm(graph):
    #do not save label predicate to gpickle
    subj = str(s)
    obj = str(o)
    pred = str(p)
    if pred == 'http://www.w3.org/2000/01/rdf-schema#label':
        continue
    else:
        ##defining metadata dict for edge type
        subj_type = typedict[subj.split('/')[-1].split('_')[0]]
        obj_type = typedict[obj.split('/')[-1].split('_')[0]]
        subj_curie = subj.split('/')[-1]
        obj_curie = obj.split('/')[-1]
        edge_key = subj_curie + '-' + obj_curie
        if edge_key not in edge_metadict:
            edge_metadict[edge_key] = {
                'type': subj_type + '-' + obj_type,
                'relations': {}
            }
        pred_curie = pred.split('/')[-1]
        pred_key = hashlib.md5('{}{}{}'.format(n3(s), n3(p), n3(o)).encode()).hexdigest()
        pmid = str(df_new.loc[(df_new['subject_obo'] == subj) & (df_new['object_obo'] == obj)  & 
                       (df_new['predicate_obo'] == pred)]['pmid'].values[0])
        timestamp = str(df_new.loc[(df_new['subject_obo'] == subj) & (df_new['object_obo'] == obj)  & 
                                   (df_new['predicate_obo'] == pred)]['year'].values[0])
        belief_score = df_new.loc[(df_new['subject_obo'] == subj) & (df_new['object_obo'] == obj)  & 
                                   (df_new['predicate_obo'] == pred)]['belief'].values[0]
        sentence = df_new.loc[(df_new['subject_obo'] == subj) & (df_new['object_obo'] == obj)  & 
                                   (df_new['predicate_obo'] == pred)]['sentence'].values[0]
        source_section = df_new.loc[(df_new['subject_obo'] == subj) & (df_new['object_obo'] == obj)  
                                & (df_new['predicate_obo'] == pred)]['source_section'].values[0]
        pub_type_list = df_new.loc[(df_new['subject_obo'] == subj) & (df_new['object_obo'] == obj)
                                & (df_new['predicate_obo'] == pred)]['pub_type'].values[0]
        pub_type = ast.literal_eval(pub_type_list)
        pub_type = ' '.join(pub_type)
        #add edge to graph
        nx_mdg.add_node(s, key=n3(s))
        nx_mdg.add_node(o, key=n3(o))
        nx_mdg.add_edge(s, o, **{'key': p, 'predicate_key': pred_key, 'weight':0.0,
                                  'source_graph': 'machine_read'})
        
        #add edge metadata to edge_metadict
        if pred_curie not in edge_metadict[edge_key]['relations']:
            edge_metadict[edge_key]['relations'][pred_curie] = {
                'source': [],
                'year': [],
                'pmid': [],
                'sentence': [],
                'belief': [],
                'weight': [],
                'pubtype': [],
                'source_section': [],
            }
        edge_metadict[edge_key]['relations'][pred_curie]['source'].append('reach')
        edge_metadict[edge_key]['relations'][pred_curie]['year'].append(timestamp)
        edge_metadict[edge_key]['relations'][pred_curie]['pmid'].append(pmid)
        edge_metadict[edge_key]['relations'][pred_curie]['sentence'].append(sentence)
        edge_metadict[edge_key]['relations'][pred_curie]['belief'].append(belief_score)
        edge_metadict[edge_key]['relations'][pred_curie]['weight'].append(0.0)
        edge_metadict[edge_key]['relations'][pred_curie]['pubtype'].append(pub_type)
        edge_metadict[edge_key]['relations'][pred_curie]['source_section'].append(source_section)
            
nx.write_gpickle(nx_mdg, "output_graphs/machineread_reach_version1.gpickle")

100%|██████████| 47285/47285 [54:49<00:00, 14.38it/s] 


In [17]:
for k, v in edge_metadict.items():
    print(k)
    print(v)
    break

PR_287-PR_Q13507
{'type': 'protein-protein', 'relations': {'RO_0002448': {'source': ['reach'], 'year': ['2000 Aug'], 'pmid': ['10983988'], 'sentence': ['<RESULTS> Ca21 Release by Activation of RyRs Activates hTrp3 and Icrac The parental HEK293 cells and the HEK 293 cells stably expressing the hTrp3 channels (T3 cells) used in the present study showed a similar Ca21 release in response to stimulation with the RyRs ligand caffeine (Figure 1, A1), which averaged 38.7 6 7.7% (n 5 5) and 37.2 6 3.8% (n 5 7) of that caused by stimulation of the Gq- coupled P2Y receptor with UTP.'], 'belief': [0.86], 'weight': [0.0], 'pubtype': ['Journal Article'], 'source_section': ['']}}}


In [19]:
##write edge metadata to pickle
import pickle
with open('output_graphs/edge_metadata_reach_version1.pickle', 'wb') as filep:
    pickle.dump(edge_metadict, filep)

In [20]:
#this should have less edges than rdflib graph after removing 'labels'
nodes = nx.number_of_nodes(nx_mdg)
edges = nx.number_of_edges(nx_mdg)
density = nx.density(nx_mdg)
avg_deg = float(edges)/nodes
print(nodes, edges, density, avg_deg)

6217 40245 0.0010414059593728245 6.473379443461477


In [21]:
#save node labels as dictionary
#key: URI, value is label
label_dict = {}
##change label for activation and inhibition, and make all lowercase
for i in range(len(df_new.index)):
    subj = str(df_new.at[i, 'subject_obo'])
    obj = str(df_new.at[i, 'object_obo'])
    pred = str(df_new.at[i, 'predicate_obo'])
    if subj not in label_dict:
        label_dict[subj] = {}
        label_dict[subj]['entity_type'] = 'NODES'
        label_dict[subj]['label'] = df_new.at[i, 'subject_name']
        label_dict[subj]['cui'] = df_new.at[i, 'subject_cui']
    if obj not in label_dict:
        label_dict[obj] = {}
        label_dict[obj]['entity_type'] = 'NODES'
        label_dict[obj]['label'] = df_new.at[i, 'object_name']
        label_dict[obj]['cui'] = df_new.at[i, 'object_cui']
    if pred not in label_dict:
        label_dict[pred] = {}
        label_dict[pred]['entity_type'] = 'RELATIONS'
        predlabel = df_new.at[i, 'predicate'].lower()
        label_dict[pred]['label'] = df_new.at[i, 'predicate']
len(label_dict)

6230

In [22]:
import pickle
with open('output_graphs/reach_version1_NodeLabels.pickle', 'wb') as file_p2:
    pickle.dump(label_dict, file_p2)

In [23]:
dfmap = pd.DataFrame.from_dict(label_dict, orient='index')
dfmap = dfmap.rename(columns={"index":"entity_uri"})
dfmap = dfmap.reset_index()
dfmap.head()

Unnamed: 0,index,entity_type,label,cui
0,http://purl.obolibrary.org/obo/PR_P10635,NODES,Cytochrome_P-450_CYP2D6,C0057223
1,http://purl.obolibrary.org/obo/GO_0008152,NODES,metabolic_aspects,C0025520
2,http://purl.obolibrary.org/obo/RO_0002448,RELATIONS,Activation,
3,http://purl.obolibrary.org/obo/CHEBI_77748,NODES,P-glycoprotein_Inhibitor,C3898062
4,http://purl.obolibrary.org/obo/CHEBI_4551,NODES,Digoxin,C0012265


In [24]:
dfmap.to_csv('output_graphs/reach_NodeLabels.tsv', index=False, sep='\t')