## Notebook to process all INDRA/REACH output

Steps:
1. Input all predications (after JSON extraction and CUI mapping) from reach_data/
2. Preprocess data
3. Mapping to CURIEs
4. Add prefixes 
5. Create RDF and Networkx graph

TODO:
1. NER on unmapped for all data
2. Metadata dict/file separate from graph

In [1]:
import pandas as pd
# import built-in namespaces
from rdflib.namespace import OWL, RDF, RDFS
import numpy as np
import os
import pickle

### Input data and save counts

In [2]:
##input all TSV files from reach_data folder as pandas dataframes
# get all files in the folder
files = os.listdir('reach_data')
# get only the tsv files
tsv_files = [file for file in files if file.endswith('.tsv')]
print(tsv_files)
df = pd.DataFrame()
for filename in tsv_files:
    # read the file as a pandas dataframe
    dfnp = pd.read_csv('reach_data/' + filename, sep='\t')
    # get the name of the file without the extension
    dfnp['related_common_name'] = filename.split('_')[0]
    dfnp = dfnp.drop_duplicates()
    df = pd.concat([df, dfnp], ignore_index=True)
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66241 entries, 0 to 66240
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   seq                   66241 non-null  int64  
 1   pmid                  66241 non-null  int64  
 2   subject_cui           57596 non-null  object 
 3   subject_name          57596 non-null  object 
 4   subject_type          57596 non-null  object 
 5   subject_source        65995 non-null  object 
 6   subj_map_reach        66115 non-null  object 
 7   predicate             66241 non-null  object 
 8   object_source         66138 non-null  object 
 9   object_cui            59093 non-null  object 
 10  object_name           59093 non-null  object 
 11  object_type           59093 non-null  object 
 12  obj_map_reach         66241 non-null  object 
 13  belief                66241 non-null  float64
 14  sentence              66241 non-null  object 
 15  year               

In [3]:
def save_data_statistics(df):
    nps = df['related_common_name'].unique()
    info = {
        'related_common_name': [],
        'PMIDs': [],
        'statements': []
    }
    for np_name in nps:
        dfnp = df.loc[df['related_common_name'] == np_name]
        info['related_common_name'].append(np_name)
        info['PMIDs'].append(len(dfnp['pmid'].unique()))
        info['statements'].append(len(dfnp))
    df_stats = pd.DataFrame.from_dict(info)
    df_stats.to_csv('reach_data/reach_data_statistics.tsv', sep='\t', index=False)

In [6]:
df = df.fillna('')
save_data_statistics(df)
df.head()

Unnamed: 0,seq,pmid,subject_cui,subject_name,subject_type,subject_source,subj_map_reach,predicate,object_source,object_cui,...,sentence,year,subject_score,object_score,umls_flag,subj_reach_grounding,obj_reach_grounding,pub_type,source_section,related_common_name
0,0,31092429,C0051767,amsonic_acid,"['orch', 'phsu']",DADS,{'TEXT': 'DADS'},Acetylation,Histone_H3,C0019647,...,DADS increases\nhistone H3 and H4 acetylation...,2019 May,14.64,17.8,1,"(None, None)","('FPLX', 'Histone_H3')",['Journal Article'],,garlic
1,1,31092429,C0051767,amsonic_acid,"['orch', 'phsu']",DADS,{'TEXT': 'DADS'},Acetylation,Histone_H4,,...,DADS increases\nhistone H3 and H4 acetylation...,2019 May,14.64,,1,"(None, None)","('FPLX', 'Histone_H4')",['Journal Article'],,garlic
2,2,21459083,,,,2g1,{'TEXT': '2g1'},Activation,7-hydroxycoumarin,C0049901,...,"In mice, Cyp2a5 and 2g1 produce 7-hydroxycouma...",2011 Jul 15,,14.64,1,"(None, None)","(None, None)","['Journal Article', 'Review']",,garlic
3,3,21459083,,,,2g1,{'TEXT': '2g1'},Activation,o-HPA,C0085355,...,"In mice, Cyp2a5 and 2g1 produce 7-hydroxycouma...",2011 Jul 15,,13.14,1,"(None, None)","(None, None)","['Journal Article', 'Review']",,garlic
4,4,21459083,,,,2g1,{'TEXT': '2g1'},Activation,Mice,C0026809,...,"In mice, Cyp2a5 and 2g1 produce 7-hydroxycouma...",2011 Jul 15,,30.42,1,"(None, None)","('MESH', 'D051379')","['Journal Article', 'Review']",,garlic


### Mappings 
1. REACH existing mappings
2. HPO and GO mappings
3. Existing OntoRunNER
4. New Gilda and OntoRunNER reviewed

In [7]:
#fixes reach mappings to OBO (if available) and adds text if no mapping or blank name of entity
for i in range(len(df.index)):
    subj_ground = df.at[i, 'subj_reach_grounding']
    obj_ground = df.at[i, 'obj_reach_grounding']
    subj_map = df.at[i, 'subj_map_reach']
    obj_map = df.at[i, 'obj_map_reach']
    if subj_ground == '(None, None)' or subj_ground == np.nan:
        temp = subj_map.split(',')
        if 'GO' in subj_map or 'CHEBI' in subj_map:
            for item in temp:
                if 'GO' in item or 'CHEBI' in item:
                    df.at[i, 'subj_reach_grounding'] = item
        elif 'UP' in subj_map:
            for item in temp:
                if 'UP' in item:
                    df.at[i, 'subj_reach_grounding'] = item
        elif 'TEXT' in subj_map:
            if df.at[i, 'subject_source'] == '':
                for item in temp:
                    if 'TEXT' in item:
                        df.at[i, 'subject_source'] = item.split(':')[-1]
    elif 'GO' not in subj_ground and 'CHEBI' not in subj_ground and 'UP' not in subj_ground:
        temp = subj_map.split(',')
        if 'GO' in subj_map or 'CHEBI' in subj_map:
            for item in temp:
                if 'GO' in item or 'CHEBI' in item:
                    df.at[i, 'subj_reach_grounding'] = item
        elif 'UP' in subj_map:
            for item in temp:
                if 'UP' in item:
                    df.at[i, 'subj_reach_grounding'] = item
        elif 'TEXT' in subj_map:
            if df.at[i, 'subject_source'] == '':
                for item in temp:
                    if 'TEXT' in item:
                        df.at[i, 'subject_source'] = item.split(':')[-1]
    
    #same for object mapping
    if obj_ground == '(None, None)' or obj_ground == np.nan:
        temp = obj_map.split(',')
        if 'GO' in obj_map or 'CHEBI' in obj_map:
            for item in temp:
                if 'GO' in item or 'CHEBI' in item:
                    df.at[i, 'obj_reach_grounding'] = item
        elif 'UP' in obj_map:
            for item in temp:
                if 'UP' in item:
                    df.at[i, 'obj_reach_grounding'] = item
        elif 'TEXT' in obj_map:
            if df.at[i, 'object_source'] == '':
                for item in temp:
                    if 'TEXT' in item:
                        df.at[i, 'object_source'] = item.split(':')[-1]
    elif 'GO' not in obj_ground and 'CHEBI' not in obj_ground and 'UP' not in obj_ground:
        temp = obj_map.split(',')
        if 'GO' in obj_map or 'CHEBI' in obj_map:
            for item in temp:
                if 'GO' in item or 'CHEBI' in item:
                    df.at[i, 'obj_reach_grounding'] = item
        elif 'UP' in obj_map:
            for item in temp:
                if 'UP' in item:
                    df.at[i, 'obj_reach_grounding'] = item
        elif 'TEXT' in obj_map:
            if df.at[i, 'object_source'] == '':
                for item in temp:
                    if 'TEXT' in item:
                        df.at[i, 'object_source'] = item.split(':')[-1]

In [8]:
df = df[['related_common_name', 'subject_cui', 'subject_name', 'subject_source', 'predicate', 'object_source',
       'object_cui', 'object_name', 'subj_reach_grounding', 'obj_reach_grounding', 'pmid', 
           'year', 'belief', 'sentence', 'pub_type', 'source_section']]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66241 entries, 0 to 66240
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   related_common_name   66241 non-null  object 
 1   subject_cui           66241 non-null  object 
 2   subject_name          66241 non-null  object 
 3   subject_source        66241 non-null  object 
 4   predicate             66241 non-null  object 
 5   object_source         66241 non-null  object 
 6   object_cui            66241 non-null  object 
 7   object_name           66241 non-null  object 
 8   subj_reach_grounding  66241 non-null  object 
 9   obj_reach_grounding   66241 non-null  object 
 10  pmid                  66241 non-null  int64  
 11  year                  66241 non-null  object 
 12  belief                66241 non-null  float64
 13  sentence              66241 non-null  object 
 14  pub_type              66241 non-null  object 
 15  source_section     

In [9]:
df['predicate_obo'] = None
df['subject_obo'] = None
df['object_obo'] = None

In [10]:
#map cui to reach grounding
reach_grounding_dict = {}
for i in range(len(df.index)):
    sub_cui = df.at[i, 'subject_cui']
    sub_source = df.at[i, 'subject_source']
    obj_cui = df.at[i, 'object_cui']
    obj_source = df.at[i, 'object_source']
    sub_ground = df.at[i, 'subj_reach_grounding']
    obj_ground = df.at[i, 'obj_reach_grounding']
    sub_key = i
    obj_key = i
    if sub_cui != '':
        sub_key = sub_cui
    elif sub_source != '':
        sub_key = sub_source
    if obj_cui != '':
        obj_key = obj_cui
    elif obj_source != '':
        obj_key = obj_source
    if sub_key not in reach_grounding_dict:
        reach_grounding_dict[sub_key] = []
    if obj_key not in reach_grounding_dict:
        reach_grounding_dict[obj_key] = []
    if sub_ground != np.nan:
        if 'CHEBI' in sub_ground or 'GO' in sub_ground:
            subject_map = sub_ground.split("'")[-2]
            subject_map = subject_map.replace(':', '_')
            reach_grounding_dict[sub_key].append(subject_map)
        elif 'UP' in sub_ground:
            subject_map = sub_ground.split("'")[-2]
            subject_map = 'PR_' + subject_map
            reach_grounding_dict[sub_key].append(subject_map)
    if obj_ground != np.nan:
        if 'CHEBI' in obj_ground or 'GO' in obj_ground:
            object_map = obj_ground.split("'")[-2]
            object_map = object_map.replace(':', '_')
            reach_grounding_dict[obj_key].append(object_map)
        elif 'UP' in obj_ground:
            object_map = obj_ground.split("'")[-2]
            object_map = 'PR_' + object_map
            reach_grounding_dict[obj_key].append(object_map)
print(len(reach_grounding_dict))

12823


In [11]:
constituent_mapping_dict = {}
dfcons = pd.read_csv('cui_to_ontology_maps/chebi-extensions-constituents-NP-20240229.tsv', sep='\t')
dfcons.head()

Unnamed: 0,constituent_name,URI
0,12beta-acetoxycimigenol 3-o-beta-d-xylopyranoside,http://napdi.org/napdi_srs_imports:12beta_acet...
1,2'-o-acetylactein,http://napdi.org/napdi_srs_imports:2_o_acetyla...
2,2'-o-acetylcimicifugoside h1,http://napdi.org/napdi_srs_imports:2_o_acetylc...
3,23-epi-26-deoxyactein,http://purl.obolibrary.org/obo/CHEBI_70243
4,23-o-acetylshengmanol,http://napdi.org/napdi_srs_imports:23_o_acetyl...


In [12]:
##use for mapping after prefixing
for i in range(len(dfcons.index)):
    constituent_name = dfcons.at[i, 'constituent_name']
    uri = dfcons.at[i, 'URI']
    constituent_mapping_dict[constituent_name] = uri
len(constituent_mapping_dict)

671

In [13]:
predMapD = {
    'regulateactivity':'RO_0011002',
    'regulateamount':'RO_0011003',
    'phosphorylation':'RO_0002447',
    'dephosphorylation':'GO_0006470',
    'ubiquitination':'RO_0002480',
    'deubiquitination':'GO_0016579',
    'sumoylation':'RO_0002436',
    'desumoylation':'RO_0002436',
    'hydroxylation':'GO_0018126',
    'dehydroxylation':'RO_0002436',
    'acetylation':'GO_0006473',
    'deacetylation':'GO_0006476',
    'glycosylation':'GO_0006486',
    'deglycosylation':'GO_0006517',
    'farnesylation':'RO_0002436',
    'defarnesylation':'RO_0002436',
    'geranylgeranylation':'RO_0002436',
    'degeranylgeranylation':'RO_0002436',
    'palmitoylation':'RO_0002436',
    'depalmitoylation':'RO_0002436',
    'myristoylation':'RO_0002436',
    'demyristoylation':'RO_0002436',
    'ribosylation':'RO_0002436',
    'deribosylation':'RO_0002436',
    'methylation':'GO_0006479',
    'demethylation':'GO_0006482',
    'activation':'RO_0002448',
    'inhibition':'RO_0002449',
    'increaseamount':'RO_0011009',
    'decreaseamount':'RO_0011010'
}

In [14]:
with open('cui_to_ontology_maps/go_hpo_map_dict.pickle', 'rb') as filep:
    go_hpo_mapping_dict = pickle.load(filep)
len(go_hpo_mapping_dict)

87745

In [15]:
#if reach_grounding has multiple values, currently only first one is taken -- assess if this is the best option
def get_obo_mapping(row, col):
    if col == 'subject':
        cui = row['subject_cui']
        source = row['subject_source']
    elif col == 'object':
        cui = row['object_cui']
        source = row['object_source']
    else:
        print('specify if subject or object mapping required')
        exit(0)
    if cui in go_hpo_mapping_dict:
        if len(go_hpo_mapping_dict):
            return go_hpo_mapping_dict[cui][0]
    if cui in reach_grounding_dict:
        if len(reach_grounding_dict[cui]):
            return reach_grounding_dict[cui][0]
    if source in reach_grounding_dict:
        if len(reach_grounding_dict[source]):
            return reach_grounding_dict[source][0]
    if source.lower() in constituent_mapping_dict:
        return constituent_mapping_dict[source.lower()]
    return ''

In [16]:
def relation_mapping(row):
    rel = row['predicate'].lower()
    if rel in predMapD:
        return predMapD[rel]
    else:
        return ''

In [17]:
df['predicate_obo'] = df.apply(relation_mapping, axis=1)
df.head()

Unnamed: 0,related_common_name,subject_cui,subject_name,subject_source,predicate,object_source,object_cui,object_name,subj_reach_grounding,obj_reach_grounding,pmid,year,belief,sentence,pub_type,source_section,predicate_obo,subject_obo,object_obo
0,garlic,C0051767,amsonic_acid,DADS,Acetylation,Histone_H3,C0019647,Histone_H3,"(None, None)","('FPLX', 'Histone_H3')",31092429,2019 May,0.65,DADS increases\nhistone H3 and H4 acetylation...,['Journal Article'],,GO_0006473,,
1,garlic,C0051767,amsonic_acid,DADS,Acetylation,Histone_H4,,,"(None, None)",'UP': 'P62805',31092429,2019 May,0.65,DADS increases\nhistone H3 and H4 acetylation...,['Journal Article'],,GO_0006473,,
2,garlic,,,2g1,Activation,7-hydroxycoumarin,C0049901,7-hydroxycoumarin,"(None, None)","(None, None)",21459083,2011 Jul 15,0.65,"In mice, Cyp2a5 and 2g1 produce 7-hydroxycouma...","['Journal Article', 'Review']",,RO_0002448,,
3,garlic,,,2g1,Activation,o-HPA,C0085355,Human_Platelet_Antigens,"(None, None)","(None, None)",21459083,2011 Jul 15,0.65,"In mice, Cyp2a5 and 2g1 produce 7-hydroxycouma...","['Journal Article', 'Review']",,RO_0002448,,
4,garlic,,,2g1,Activation,Mice,C0026809,Mus,"(None, None)","('MESH', 'D051379')",21459083,2011 Jul 15,0.65,"In mice, Cyp2a5 and 2g1 produce 7-hydroxycouma...","['Journal Article', 'Review']",,RO_0002448,,


In [18]:
df['subject_obo'] = df.apply(get_obo_mapping, axis=1, col='subject')
df['object_obo'] = df.apply(get_obo_mapping, axis=1, col='object')
df.head()

Unnamed: 0,related_common_name,subject_cui,subject_name,subject_source,predicate,object_source,object_cui,object_name,subj_reach_grounding,obj_reach_grounding,pmid,year,belief,sentence,pub_type,source_section,predicate_obo,subject_obo,object_obo
0,garlic,C0051767,amsonic_acid,DADS,Acetylation,Histone_H3,C0019647,Histone_H3,"(None, None)","('FPLX', 'Histone_H3')",31092429,2019 May,0.65,DADS increases\nhistone H3 and H4 acetylation...,['Journal Article'],,GO_0006473,CHEBI_17364,
1,garlic,C0051767,amsonic_acid,DADS,Acetylation,Histone_H4,,,"(None, None)",'UP': 'P62805',31092429,2019 May,0.65,DADS increases\nhistone H3 and H4 acetylation...,['Journal Article'],,GO_0006473,CHEBI_17364,PR_P62805
2,garlic,,,2g1,Activation,7-hydroxycoumarin,C0049901,7-hydroxycoumarin,"(None, None)","(None, None)",21459083,2011 Jul 15,0.65,"In mice, Cyp2a5 and 2g1 produce 7-hydroxycouma...","['Journal Article', 'Review']",,RO_0002448,,
3,garlic,,,2g1,Activation,o-HPA,C0085355,Human_Platelet_Antigens,"(None, None)","(None, None)",21459083,2011 Jul 15,0.65,"In mice, Cyp2a5 and 2g1 produce 7-hydroxycouma...","['Journal Article', 'Review']",,RO_0002448,,
4,garlic,,,2g1,Activation,Mice,C0026809,Mus,"(None, None)","('MESH', 'D051379')",21459083,2011 Jul 15,0.65,"In mice, Cyp2a5 and 2g1 produce 7-hydroxycouma...","['Journal Article', 'Review']",,RO_0002448,,PR_P31944


In [19]:
import pickle
with open('cui_to_ontology_maps/CUItoOBO_20220505.pickle', 'rb') as filep:
    onto_dict = pickle.load(filep)
len(onto_dict)

3951

New mappings added in 2024 - Gilda and OntoRunNER results with manual review

In [21]:
manual_map = pd.read_csv('cui_to_ontology_maps/semrep_reach_unmapped_terms_nongene_gilda_mapped_reviewed_2023.tsv', sep='\t')
manual_map.head()

Unnamed: 0,cui,source,pref_name,gene_entity,clean_text,identifier,score,match,entity_name
0,C0019240,herb,Herb,False,Herb,,,,
1,C1510467,Wounds and Injuries,trauma_qualifier,False,trauma qualifier,,,,
2,C0007320,Case reports,Case_Reports_Publication_Type,False,Case Reports Publication Type,,,,
3,C1041613,Commi- phora mukul,Phora,False,Phora,,,,
4,C3539649,Lansopra-\nzole enantiomer,Chiral,False,Chiral,,,,


In [26]:
manual_map = manual_map.loc[manual_map['identifier'].notna()]
manual_map = manual_map.reset_index(drop=True)
manual_map.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2923 entries, 0 to 2922
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   cui          2923 non-null   object 
 1   source       192 non-null    object 
 2   pref_name    2923 non-null   object 
 3   gene_entity  2923 non-null   bool   
 4   clean_text   2922 non-null   object 
 5   identifier   2923 non-null   object 
 6   score        2920 non-null   float64
 7   match        2920 non-null   object 
 8   entity_name  2920 non-null   object 
dtypes: bool(1), float64(1), object(7)
memory usage: 185.7+ KB


In [29]:
for i in range(len(manual_map.index)):
    cui = manual_map.at[i, 'cui']
    identifier = manual_map.at[i, 'identifier']
    identifier = identifier.split('/')[-1]
    if cui not in onto_dict:
        onto_dict[cui] = identifier
len(onto_dict)

6820

In [30]:
##save updated dict
with open('cui_to_ontology_maps/CUItoOBO_20240528.pickle', 'wb') as filep:
    pickle.dump(onto_dict, filep)

In [31]:
##load genes results (see npdi-workspace/NER-OBO/process_entities.ipynb for updated gene ontorunner results)
with open('cui_to_ontology_maps/CUItoOBO_with_genes_20240528.pickle', 'rb') as filep:
    onto_dict = pickle.load(filep)
len(onto_dict)

8552

In [33]:
#add mappings to df
def get_obo_onto_mapping(row, col):
    if col == 'subject':
        cui = row['subject_cui']
        mapping = row['subject_obo']
    elif col == 'object':
        cui = row['object_cui']
        mapping = row['object_obo']
    else:
        print('specify if subject or object mapping required')
        exit(0)
    if mapping == '':
        if cui in onto_dict:
            return onto_dict[cui]
    else:
        return mapping

In [34]:
df['subject_obo'] = df.apply(get_obo_onto_mapping, axis=1, col='subject')
df['object_obo'] = df.apply(get_obo_onto_mapping, axis=1, col='object')

In [35]:
df = df.fillna('')

In [36]:
##finish mappings
##get unmapped terms
dfsubj_unmap = df.loc[df['subject_obo'] == '']
dfsubj_unmap.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9240 entries, 2 to 66240
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   related_common_name   9240 non-null   object 
 1   subject_cui           9240 non-null   object 
 2   subject_name          9240 non-null   object 
 3   subject_source        9240 non-null   object 
 4   predicate             9240 non-null   object 
 5   object_source         9240 non-null   object 
 6   object_cui            9240 non-null   object 
 7   object_name           9240 non-null   object 
 8   subj_reach_grounding  9240 non-null   object 
 9   obj_reach_grounding   9240 non-null   object 
 10  pmid                  9240 non-null   int64  
 11  year                  9240 non-null   object 
 12  belief                9240 non-null   float64
 13  sentence              9240 non-null   object 
 14  pub_type              9240 non-null   object 
 15  source_section      

In [38]:
dfobj_unmap = df.loc[df['object_obo'] == '']
dfobj_unmap.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8410 entries, 3 to 66224
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   related_common_name   8410 non-null   object 
 1   subject_cui           8410 non-null   object 
 2   subject_name          8410 non-null   object 
 3   subject_source        8410 non-null   object 
 4   predicate             8410 non-null   object 
 5   object_source         8410 non-null   object 
 6   object_cui            8410 non-null   object 
 7   object_name           8410 non-null   object 
 8   subj_reach_grounding  8410 non-null   object 
 9   obj_reach_grounding   8410 non-null   object 
 10  pmid                  8410 non-null   int64  
 11  year                  8410 non-null   object 
 12  belief                8410 non-null   float64
 13  sentence              8410 non-null   object 
 14  pub_type              8410 non-null   object 
 15  source_section      

In [39]:
dfsubj_unmap = dfsubj_unmap[['subject_cui', 'subject_source', 'subject_name']]
dfsubj_unmap = dfsubj_unmap.rename(columns=
{'subject_cui':'cui', 'subject_source':'source', 'subject_name':'pref_name'})
dfobj_unmap = dfobj_unmap[['object_cui', 'object_source', 'object_name']]
dfobj_unmap = dfobj_unmap.rename(columns=
{'object_cui':'cui', 'object_source':'source', 'object_name':'pref_name'})

In [40]:
dfunmap = pd.concat([dfsubj_unmap, dfobj_unmap])
dfunmap = dfunmap.drop_duplicates()
dfunmap = dfunmap.reset_index(drop=True)
dfunmap.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5806 entries, 0 to 5805
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   cui        5806 non-null   object
 1   source     5806 non-null   object
 2   pref_name  5806 non-null   object
dtypes: object(3)
memory usage: 136.2+ KB


In [41]:
dfunmap.to_csv('reach_data/unmapped_reach_terms_20240528.tsv', sep='\t', index=False)

In [None]:
##get unmapped subjects and objects and run gilda/ontorunner - add mappings back

In [42]:
df.to_csv('reach_data/reach_all_predications_mapped_20240528.tsv', sep='\t', index=False)

In [45]:
df.head()

Unnamed: 0,related_common_name,subject_cui,subject_name,subject_source,predicate,object_source,object_cui,object_name,subj_reach_grounding,obj_reach_grounding,pmid,year,belief,sentence,pub_type,source_section,predicate_obo,subject_obo,object_obo
0,garlic,C0051767,amsonic_acid,DADS,Acetylation,Histone_H3,C0019647,Histone_H3,"(None, None)","('FPLX', 'Histone_H3')",31092429,2019 May,0.65,DADS increases\nhistone H3 and H4 acetylation...,['Journal Article'],,http://purl.obolibrary.org/obo/GO_0006473,http://purl.obolibrary.org/obo/CHEBI_17364,http://purl.obolibrary.org/obo/PR_000027594
1,garlic,C0051767,amsonic_acid,DADS,Acetylation,Histone_H4,,,"(None, None)",'UP': 'P62805',31092429,2019 May,0.65,DADS increases\nhistone H3 and H4 acetylation...,['Journal Article'],,http://purl.obolibrary.org/obo/GO_0006473,http://purl.obolibrary.org/obo/CHEBI_17364,http://purl.obolibrary.org/obo/PR_P62805
2,garlic,,,2g1,Activation,7-hydroxycoumarin,C0049901,7-hydroxycoumarin,"(None, None)","(None, None)",21459083,2011 Jul 15,0.65,"In mice, Cyp2a5 and 2g1 produce 7-hydroxycouma...","['Journal Article', 'Review']",,http://purl.obolibrary.org/obo/RO_0002448,,http://purl.obolibrary.org/obo/CHEBI_27510
3,garlic,,,2g1,Activation,o-HPA,C0085355,Human_Platelet_Antigens,"(None, None)","(None, None)",21459083,2011 Jul 15,0.65,"In mice, Cyp2a5 and 2g1 produce 7-hydroxycouma...","['Journal Article', 'Review']",,http://purl.obolibrary.org/obo/RO_0002448,,
4,garlic,,,2g1,Activation,Mice,C0026809,Mus,"(None, None)","('MESH', 'D051379')",21459083,2011 Jul 15,0.65,"In mice, Cyp2a5 and 2g1 produce 7-hydroxycouma...","['Journal Article', 'Review']",,http://purl.obolibrary.org/obo/RO_0002448,,http://purl.obolibrary.org/obo/PR_P31944


### Add prefixes and process - run from here after mapping

In [43]:
def add_prefix(row, col):
    ##check for prefix before adding

    obo_prefix = 'http://purl.obolibrary.org/obo/'
    if col == 'predicate':
        predicate_obo = row['predicate_obo']
        if obo_prefix in predicate_obo:
            return predicate_obo
        else:
            return obo_prefix+predicate_obo
    elif col == 'subject':
        subject_obo = row['subject_obo']
        if isinstance(subject_obo, list) and subject_obo:
            subject_obo = subject_obo[0]
        if subject_obo == '':
            return ''
        subject_obo = subject_obo.replace(']', '')
        subject_obo = subject_obo.replace('[', '')
        subject_obo = subject_obo.replace(')', '')
        subject_obo = subject_obo.replace('(', '')
        if 'http' not in subject_obo:
            return obo_prefix+subject_obo
        else:
            return subject_obo
    elif col == 'object':
        object_obo = row['object_obo']
        if isinstance(object_obo, list) and object_obo:
            object_obo = object_obo[0]
        if object_obo == '':
            return ''
        object_obo = object_obo.replace(']', '')
        object_obo = object_obo.replace('[', '')
        object_obo = object_obo.replace(')', '')
        object_obo = object_obo.replace('(', '')
        if 'http' not in object_obo:
            return obo_prefix+object_obo
        else:
            return object_obo

In [44]:
#add OBO identifiers to the OBO mappings (where not present) - see df
#drop rows with no mappings
df['subject_obo'] = df.apply(add_prefix, axis=1, col='subject')
df['object_obo'] = df.apply(add_prefix, axis=1, col='object')
df['predicate_obo'] = df.apply(add_prefix, axis=1, col='predicate')

In [51]:
##fix napdi identifiers
for i in range(len(df.index)):
    subj = df.at[i, 'subject_obo']
    obj = df.at[i, 'object_obo']
    if 'napdi' in subj:
        subjnew = subj.replace('http://purl.obolibrary.org/obo/', 'http://napdi.org/')
        df.at[i, 'subject_obo'] = subjnew
    if 'napdi' in obj:
        objnew = obj.replace('http://purl.obolibrary.org/obo/', 'http://napdi.org/')
        df.at[i, 'object_obo'] = objnew

In [52]:
df.to_csv('reach_data/reach_all_predications_mapped_prefixes_20240528.tsv', sep='\t', index=False)

In [2]:
df = pd.read_csv('reach_data/reach_all_predications_mapped_prefixes_20240528.tsv', sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66241 entries, 0 to 66240
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   related_common_name   66241 non-null  object 
 1   subject_cui           57596 non-null  object 
 2   subject_name          57596 non-null  object 
 3   subject_source        66115 non-null  object 
 4   predicate             66241 non-null  object 
 5   object_source         66241 non-null  object 
 6   object_cui            59093 non-null  object 
 7   object_name           59093 non-null  object 
 8   subj_reach_grounding  66115 non-null  object 
 9   obj_reach_grounding   66241 non-null  object 
 10  pmid                  66241 non-null  int64  
 11  year                  66241 non-null  object 
 12  belief                66241 non-null  float64
 13  sentence              66241 non-null  object 
 14  pub_type              66241 non-null  object 
 15  source_section     

In [4]:
df = df.fillna('')

In [5]:
obo_prefix = 'http://purl.obolibrary.org/obo/'
#remove all rows with blank obo mappings and only prefix rows
df_new = df[df['subject_obo'] != '']
df_new = df_new[df_new['subject_obo'] != obo_prefix]
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57001 entries, 0 to 66239
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   related_common_name   57001 non-null  object 
 1   subject_cui           57001 non-null  object 
 2   subject_name          57001 non-null  object 
 3   subject_source        57001 non-null  object 
 4   predicate             57001 non-null  object 
 5   object_source         57001 non-null  object 
 6   object_cui            57001 non-null  object 
 7   object_name           57001 non-null  object 
 8   subj_reach_grounding  57001 non-null  object 
 9   obj_reach_grounding   57001 non-null  object 
 10  pmid                  57001 non-null  int64  
 11  year                  57001 non-null  object 
 12  belief                57001 non-null  float64
 13  sentence              57001 non-null  object 
 14  pub_type              57001 non-null  object 
 15  source_section     

In [6]:
df_new = df_new[df_new['object_obo'] != '']
df_new = df_new[df_new['object_obo'] != obo_prefix]
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49905 entries, 0 to 66239
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   related_common_name   49905 non-null  object 
 1   subject_cui           49905 non-null  object 
 2   subject_name          49905 non-null  object 
 3   subject_source        49905 non-null  object 
 4   predicate             49905 non-null  object 
 5   object_source         49905 non-null  object 
 6   object_cui            49905 non-null  object 
 7   object_name           49905 non-null  object 
 8   subj_reach_grounding  49905 non-null  object 
 9   obj_reach_grounding   49905 non-null  object 
 10  pmid                  49905 non-null  int64  
 11  year                  49905 non-null  object 
 12  belief                49905 non-null  float64
 13  sentence              49905 non-null  object 
 14  pub_type              49905 non-null  object 
 15  source_section     

In [7]:
df_new = df_new.drop_duplicates()
df_new = df_new.reset_index(drop=True)
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49824 entries, 0 to 49823
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   related_common_name   49824 non-null  object 
 1   subject_cui           49824 non-null  object 
 2   subject_name          49824 non-null  object 
 3   subject_source        49824 non-null  object 
 4   predicate             49824 non-null  object 
 5   object_source         49824 non-null  object 
 6   object_cui            49824 non-null  object 
 7   object_name           49824 non-null  object 
 8   subj_reach_grounding  49824 non-null  object 
 9   obj_reach_grounding   49824 non-null  object 
 10  pmid                  49824 non-null  int64  
 11  year                  49824 non-null  object 
 12  belief                49824 non-null  float64
 13  sentence              49824 non-null  object 
 14  pub_type              49824 non-null  object 
 15  source_section     

In [59]:
##new mappings not included yet - 74% mapped
df_new.to_csv('reach_data/reach_all_predicates_mapped_processed_20240528.tsv', sep='\t', index=False)

In [8]:
import pandas as pd
df_new = pd.read_csv('reach_data/reach_all_predicates_mapped_processed_20240528.tsv', sep='\t')
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49824 entries, 0 to 49823
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   related_common_name   49824 non-null  object 
 1   subject_cui           46864 non-null  object 
 2   subject_name          46864 non-null  object 
 3   subject_source        49824 non-null  object 
 4   predicate             49824 non-null  object 
 5   object_source         49824 non-null  object 
 6   object_cui            47341 non-null  object 
 7   object_name           47341 non-null  object 
 8   subj_reach_grounding  49824 non-null  object 
 9   obj_reach_grounding   49824 non-null  object 
 10  pmid                  49824 non-null  int64  
 11  year                  49824 non-null  object 
 12  belief                49824 non-null  float64
 13  sentence              49824 non-null  object 
 14  pub_type              49824 non-null  object 
 15  source_section     

### Create graph

In [9]:
#Create networkx graph from triples
import glob
import hashlib
import json
import networkx as nx  # type: ignore
import os
import os.path

from collections import Counter  # type: ignore
from more_itertools import unique_everseen  # type: ignore
from rdflib import BNode, Graph, Literal, Namespace, URIRef  # type: ignore
from rdflib.namespace import OWL, RDF, RDFS  # type: ignore
from rdflib.plugins.serializers.nt import _quoteLiteral  # type: ignore
import subprocess

from tqdm import tqdm  # type: ignore
from typing import Dict, List, Optional, Set, Tuple, Union

# set-up environment variables
obo = Namespace('http://purl.obolibrary.org/obo/')
oboinowl = Namespace('http://www.geneontology.org/formats/oboInOwl#')
schema = Namespace('http://www.w3.org/2001/XMLSchema#')

In [10]:
dfres = df_new[['subject_obo', 'predicate_obo', 'object_obo']]
dfres = dfres.drop_duplicates()
dfres = dfres.reset_index(drop=True)
dfres.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42178 entries, 0 to 42177
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   subject_obo    42178 non-null  object
 1   predicate_obo  42178 non-null  object
 2   object_obo     42178 non-null  object
dtypes: object(3)
memory usage: 988.7+ KB


In [63]:
#N = 42178 (was 40245 in v2)
dfres.to_csv('reach_data/reach_pmid_all_predicates_processed_triples_20240528.tsv', sep='\t', index=False)

In [11]:
df_new['predicate'].value_counts()

Activation           23193
Inhibition           17691
IncreaseAmount        3966
DecreaseAmount        2865
Phosphorylation        736
Hydroxylation          491
Dephosphorylation      292
Dehydroxylation        171
Demethylation          165
Methylation            109
Acetylation             42
Glycosylation           42
Deacetylation           21
Ubiquitination          18
Deubiquitination         9
Deglycosylation          8
Ribosylation             3
Sumoylation              2
Name: predicate, dtype: int64

In [12]:
df_new['predicate_obo'].value_counts()

http://purl.obolibrary.org/obo/RO_0002448    23193
http://purl.obolibrary.org/obo/RO_0002449    17691
http://purl.obolibrary.org/obo/RO_0011009     3966
http://purl.obolibrary.org/obo/RO_0011010     2865
http://purl.obolibrary.org/obo/RO_0002447      736
http://purl.obolibrary.org/obo/GO_0018126      491
http://purl.obolibrary.org/obo/GO_0006470      292
http://purl.obolibrary.org/obo/RO_0002436      176
http://purl.obolibrary.org/obo/GO_0006482      165
http://purl.obolibrary.org/obo/GO_0006479      109
http://purl.obolibrary.org/obo/GO_0006486       42
http://purl.obolibrary.org/obo/GO_0006473       42
http://purl.obolibrary.org/obo/GO_0006476       21
http://purl.obolibrary.org/obo/RO_0002480       18
http://purl.obolibrary.org/obo/GO_0016579        9
http://purl.obolibrary.org/obo/GO_0006517        8
Name: predicate_obo, dtype: int64

In [13]:
import ast


#### Fix prefixes and nodes

In [14]:
##get unique prefixes for subjects and objects
prefixes = []
for i in range(len(df_new.index)):
    subj = df_new.at[i, 'subject_obo']
    obj = df_new.at[i, 'object_obo']
    prefixes.append(subj.split('/')[-1].split('_')[0])
    prefixes.append(obj.split('/')[-1].split('_')[0])
prefixes = list(set(prefixes))
len(prefixes)

14

In [15]:
prefixes

['GOT',
 'HP',
 'SGOT',
 'UBERON',
 'CL',
 'CHEBI',
 'GO',
 'MONDO',
 'PR',
 'NCBITaxon',
 'OBO',
 'SO',
 'DOID',
 'napdi']

In [16]:
problems = ['OBO', 'SGOT', 'GOT']
for i in range(len(df_new.index)):
    subj = df_new.at[i, 'subject_obo']
    obj = df_new.at[i, 'object_obo']
    subjpref = subj.split('/')[-1].split('_')[0]
    objpref = obj.split('/')[-1].split('_')[0]
    if subjpref in problems:
        if subjpref == 'OBO':
            df_new.at[i, 'subject_obo'] = subj.replace('OBO_', '')
        elif subjpref == 'SGOT':
            df_new.at[i, 'subject_obo'] = subj.replace('SGOT', 'PR_000008153')
        elif subjpref == 'GOT':
            df_new.at[i, 'subject_obo'] = subj.replace('GOT', 'PR_000008153')
    if objpref in problems:
        if objpref == 'OBO':
            df_new.at[i, 'object_obo'] = obj.replace('OBO_', '')
        elif objpref == 'SGOT':
            df_new.at[i, 'object_obo'] = obj.replace('SGOT', 'PR_000008153')
        elif objpref == 'GOT':
            df_new.at[i, 'object_obo'] = obj.replace('GOT', 'PR_000008153')

In [17]:
df_new.to_csv('reach_data/reach_all_predicates_mapped_processed_new_20240528.tsv', sep='\t', index=False)

In [18]:
import pandas as pd
df_new = pd.read_csv('reach_data/reach_all_predicates_mapped_processed_new_20240528.tsv', sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [19]:
typedict = {
    'CHEBI': 'chemical',
    'PR': 'protein',
    'GO': 'process',
    'DOID': 'disease',
    'HP': 'phenotype',
    'UBERON': 'anatomy',
    'SO': 'sequence',
    'CL': 'cell',
    'CLO': 'cell_line',
    'NCBITaxon': 'organism',
    'MONDO': 'disease',
    'napdi': 'natural_product',
    'OAE': 'adverse_event',
    'NCBIGene': 'gene'
}

In [20]:
#check prefixes
prefixes = []
for i in range(len(df_new.index)):
    subj = df_new.at[i, 'subject_obo']
    obj = df_new.at[i, 'object_obo']
    prefixes.append(subj.split('/')[-1].split('_')[0])
    prefixes.append(obj.split('/')[-1].split('_')[0])
prefixes = list(set(prefixes))
prefixes

['HP',
 'UBERON',
 'CL',
 'CHEBI',
 'GO',
 'MONDO',
 'OAE',
 'PR',
 'NCBITaxon',
 'SO',
 'DOID',
 'napdi']

In [21]:
#create rdflib graph from dataframe triples and serialize as ntriples file
graph  = Graph()
pred_label = URIRef("http://www.w3.org/2000/01/rdf-schema#label")
for i in range(len(df_new.index)):
    subj = df_new.at[i, 'subject_obo']
    obj = df_new.at[i, 'object_obo']
    pred = df_new.at[i, 'predicate_obo']
    subj_node = URIRef(subj)
    obj_node = URIRef(obj)
    predicate = URIRef(pred)
    subj_name = df_new.at[i, 'subject_name']
    obj_name = df_new.at[i, 'object_name']
    graph.add((subj_node, predicate, obj_node))
    graph.add((subj_node, pred_label, Literal(subj_name)))
    graph.add((obj_node, pred_label, Literal(obj_name)))

In [22]:
graph.serialize('output_graphs/machineread_reach_version2.nt', format='nt')

In [23]:
len(graph)

49567

In [24]:
def n3(node: Union[URIRef, BNode, Literal]) -> str:
    """Method takes an RDFLib node of type BNode, URIRef, or Literal and serializes it to meet the RDF 1.1 NTriples
    format.
    Src: https://github.com/RDFLib/rdflib/blob/c11f7b503b50b7c3cdeec0f36261fa09b0615380/rdflib/plugins/serializers/nt.py
    Args:
        node: An RDFLib
    Returns:
        serialized_node: A string containing the serialized
    """
    if isinstance(node, Literal): serialized_node = "%s" % _quoteLiteral(node)
    else: serialized_node = "%s" % node.n3()
    return serialized_node

In [25]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49824 entries, 0 to 49823
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   related_common_name   49824 non-null  object 
 1   subject_cui           46864 non-null  object 
 2   subject_name          46864 non-null  object 
 3   subject_source        49824 non-null  object 
 4   predicate             49824 non-null  object 
 5   object_source         49824 non-null  object 
 6   object_cui            47341 non-null  object 
 7   object_name           47341 non-null  object 
 8   subj_reach_grounding  49824 non-null  object 
 9   obj_reach_grounding   49824 non-null  object 
 10  pmid                  49824 non-null  int64  
 11  year                  49824 non-null  object 
 12  belief                49824 non-null  float64
 13  sentence              49824 non-null  object 
 14  pub_type              49824 non-null  object 
 15  source_section     

In [26]:
##Creating edge metadata dict of format:
#'CHEBI_xx-NCBITaxon_xx':
# {
#    'type': 'chemical_substance-organism',
#    'relations': {
#       'r1': {
#          'source': [],
#          'year': [],
#          'pmid': [],
#          'sentence': [],
#          'belief': [],
#          'weight': [],
#          'pubtype': [],
#          'source_section': [],
# }
#       'r2': {
# }
#}

In [27]:
df_new = df_new.fillna('')

In [28]:
##create metadata table with columns subject_uri, object_uri, subject_curie, object_curie, source (reach), edge type, year, pmid, sentence, belief, pubtype, source section
edge_metadata_df = pd.DataFrame(columns=['subject_uri', 'object_uri', 'relation', 'subject_curie', 'object_curie', 'source', 'edge_type', 'year', 'pmid', 'sentence', 'belief', 'pubtype', 'source_section'])

In [29]:
uri_to_curie_map_reach = {}

In [30]:
#convert rdflib graph to multidigraph - code borrowed from PheKnowLator: kg_utils.py
#use the pred key to also create a dictionary with metadata about the edge - 
#pub_year, pmid, source graph, belief
nx_mdg = nx.MultiDiGraph()
edge_metadict = {}

for s, p, o in tqdm(graph):
    #do not save label predicate to gpickle
    subj = str(s)
    obj = str(o)
    pred = str(p)
    if pred == 'http://www.w3.org/2000/01/rdf-schema#label':
        continue
    else:
        ##defining metadata dict for edge type
        subj_type = typedict[subj.split('/')[-1].split('_')[0]]
        obj_type = typedict[obj.split('/')[-1].split('_')[0]]
        subj_curie = subj.split('/')[-1]
        obj_curie = obj.split('/')[-1]
        if subj not in uri_to_curie_map_reach:
            uri_to_curie_map_reach[subj] = subj_curie
        if obj not in uri_to_curie_map_reach:
            uri_to_curie_map_reach[obj] = obj_curie
        edge_key = subj_curie + '-' + obj_curie
        if edge_key not in edge_metadict:
            edge_metadict[edge_key] = {
                'type': subj_type + '-' + obj_type,
                'relations': {}
            }
        pred_curie = pred.split('/')[-1]
        pred_key = hashlib.md5('{}{}{}'.format(n3(s), n3(p), n3(o)).encode()).hexdigest()
        pmid = str(df_new.loc[(df_new['subject_obo'] == subj) & (df_new['object_obo'] == obj)  & 
                       (df_new['predicate_obo'] == pred)]['pmid'].values[0])
        timestamp = str(df_new.loc[(df_new['subject_obo'] == subj) & (df_new['object_obo'] == obj)  & 
                                   (df_new['predicate_obo'] == pred)]['year'].values[0])
        belief_score = df_new.loc[(df_new['subject_obo'] == subj) & (df_new['object_obo'] == obj)  & 
                                   (df_new['predicate_obo'] == pred)]['belief'].values[0]
        sentence = df_new.loc[(df_new['subject_obo'] == subj) & (df_new['object_obo'] == obj)  & 
                                   (df_new['predicate_obo'] == pred)]['sentence'].values[0]
        source_section = df_new.loc[(df_new['subject_obo'] == subj) & (df_new['object_obo'] == obj)  
                                & (df_new['predicate_obo'] == pred)]['source_section'].values[0]
        pub_type_list = df_new.loc[(df_new['subject_obo'] == subj) & (df_new['object_obo'] == obj)
                                & (df_new['predicate_obo'] == pred)]['pub_type'].values[0]
        pub_type = ast.literal_eval(pub_type_list)
        pub_type = ' '.join(pub_type)
        #add edge to graph
        nx_mdg.add_node(s, key=n3(s))
        nx_mdg.add_node(o, key=n3(o))
        nx_mdg.add_edge(s, o, **{'key': p, 'predicate_key': pred_key, 'weight':0.0,
                                  'source_graph': 'machine_read'})
        
        #add edge metadata to edge_metadict and edge_metadata_df
        if pred_curie not in edge_metadict[edge_key]['relations']:
            edge_metadict[edge_key]['relations'][pred_curie] = {
                'source': [],
                'year': [],
                'pmid': [],
                'sentence': [],
                'belief': [],
                'weight': [],
                'pubtype': [],
                'source_section': [],
            }
        edge_metadict[edge_key]['relations'][pred_curie]['source'].append('reach')
        edge_metadict[edge_key]['relations'][pred_curie]['year'].append(timestamp)
        edge_metadict[edge_key]['relations'][pred_curie]['pmid'].append(pmid)
        edge_metadict[edge_key]['relations'][pred_curie]['sentence'].append(sentence)
        edge_metadict[edge_key]['relations'][pred_curie]['belief'].append(belief_score)
        edge_metadict[edge_key]['relations'][pred_curie]['weight'].append(0.0)
        edge_metadict[edge_key]['relations'][pred_curie]['pubtype'].append(pub_type)
        edge_metadict[edge_key]['relations'][pred_curie]['source_section'].append(source_section)

        temp_df = pd.DataFrame([[subj, obj, pred_curie, subj_curie, obj_curie, 'reach', pred_curie, timestamp, pmid, sentence, belief_score, pub_type, source_section]],
                                columns=edge_metadata_df.columns)
        edge_metadata_df = pd.concat([edge_metadata_df, temp_df], ignore_index=True)
            
nx.write_gpickle(nx_mdg, "output_graphs/machineread_reach_version2.gpickle")

  0%|          | 0/49567 [00:00<?, ?it/s]

100%|██████████| 49567/49567 [1:27:44<00:00,  9.41it/s]


In [31]:
for k, v in edge_metadict.items():
    print(k)
    print(v)
    break

PR_P0C797-PR_P03372
{'type': 'protein-protein', 'relations': {'RO_0002448': {'source': ['reach'], 'year': ['2019 Jun 1'], 'pmid': ['30802639'], 'sentence': ['Treatment with the ER stress inducer tunicamycin (Tm) but not 3-NP elevated the ER stress marker GRP78 (Fig. 3A), excluding the possibility that 3-NP induces the ER UPR.'], 'belief': [0.65], 'weight': [0.0], 'pubtype': ['Journal Article'], 'source_section': ['']}, 'RO_0002449': {'source': ['reach'], 'year': ['2019 Jun 15'], 'pmid': ['31085592'], 'sentence': ['In contrast, ER-targeting\nNP-encapsulated curcumin at a lower concentration may induce equivalent\nER calcium depletion and recover NOX2 function without excessive\ndamage to the cell viability (right side of the ﬁgure).'], 'belief': [0.65], 'weight': [0.0], 'pubtype': ['Journal Article'], 'source_section': ['']}, 'RO_0011009': {'source': ['reach'], 'year': ['2019 Jun 15'], 'pmid': ['31085592'], 'sentence': ['Compatible with the results in Fig. 2C and 2D showing the\nthese\n

In [32]:
##write edge metadata to pickle
import pickle
with open('output_graphs/edge_metadata_reach_version2.pickle', 'wb') as filep:
    pickle.dump(edge_metadict, filep)

In [40]:
with open('output_graphs/uri_to_curie_map_reach.pickle', 'wb') as filep2:
    pickle.dump(uri_to_curie_map_reach, filep2)

In [41]:
edge_metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42178 entries, 0 to 42177
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   subject_uri     42178 non-null  object 
 1   object_uri      42178 non-null  object 
 2   relation        42178 non-null  object 
 3   subject_curie   42178 non-null  object 
 4   object_curie    42178 non-null  object 
 5   source          42178 non-null  object 
 6   edge_type       42178 non-null  object 
 7   year            42178 non-null  object 
 8   pmid            42178 non-null  object 
 9   sentence        42178 non-null  object 
 10  belief          42178 non-null  float64
 11  pubtype         42178 non-null  object 
 12  source_section  42178 non-null  object 
dtypes: float64(1), object(12)
memory usage: 4.2+ MB


In [42]:
edge_metadata_df.to_csv('output_graphs/edge_metadata_reach_version2.tsv', sep='\t', index=False)

In [43]:
#this should have less edges than rdflib graph after removing 'labels'
nodes = nx.number_of_nodes(nx_mdg)
edges = nx.number_of_edges(nx_mdg)
density = nx.density(nx_mdg)
avg_deg = float(edges)/nodes
print(nodes, edges, density, avg_deg)

6529 42178 0.0009895988185441124 6.460101087455966


In [44]:
#save node labels as dictionary
#key: URI, value is label
label_dict = {}
##change label for activation and inhibition, and make all lowercase
for i in range(len(df_new.index)):
    subj = str(df_new.at[i, 'subject_obo'])
    obj = str(df_new.at[i, 'object_obo'])
    pred = str(df_new.at[i, 'predicate_obo'])
    if subj not in label_dict:
        label_dict[subj] = {}
        label_dict[subj]['entity_type'] = 'NODES'
        label_dict[subj]['label'] = df_new.at[i, 'subject_name']
        label_dict[subj]['cui'] = df_new.at[i, 'subject_cui']
    if obj not in label_dict:
        label_dict[obj] = {}
        label_dict[obj]['entity_type'] = 'NODES'
        label_dict[obj]['label'] = df_new.at[i, 'object_name']
        label_dict[obj]['cui'] = df_new.at[i, 'object_cui']
    if pred not in label_dict:
        label_dict[pred] = {}
        label_dict[pred]['entity_type'] = 'RELATIONS'
        predlabel = df_new.at[i, 'predicate'].lower()
        label_dict[pred]['label'] = df_new.at[i, 'predicate']
len(label_dict)

6542

In [45]:
import pickle
with open('output_graphs/reach_version2_NodeLabels.pickle', 'wb') as file_p2:
    pickle.dump(label_dict, file_p2)

In [46]:
dfmap = pd.DataFrame.from_dict(label_dict, orient='index')
dfmap = dfmap.rename(columns={"index":"entity_uri"})
dfmap = dfmap.reset_index()
dfmap.head()

Unnamed: 0,index,entity_type,label,cui
0,http://purl.obolibrary.org/obo/CHEBI_17364,NODES,amsonic_acid,C0051767
1,http://purl.obolibrary.org/obo/PR_000027594,NODES,Histone_H3,C0019647
2,http://purl.obolibrary.org/obo/GO_0006473,RELATIONS,Acetylation,
3,http://purl.obolibrary.org/obo/PR_P62805,NODES,,
4,http://purl.obolibrary.org/obo/PR_Q9ZVX2,NODES,ABLEPHARON-MACROSTOMIA_SYNDROME,C1860224


In [47]:
dfmap.to_csv('output_graphs/reach_NodeLabels.tsv', index=False, sep='\t')