## Process SemRep data for all NPs after extraction and CUI mapping

Steps:
1. Read in TSV data
2. Process data
3. Mappings
4. Add prefixes
5. Create graphs

TODO:
1. Add error files
2. Run NER over new unmapped
3. Metadata new file

In [1]:
import pandas as pd
import numpy as np
import pickle
from rdflib.namespace import OWL, RDF, RDFS
import os, re

### Input data

In [4]:
files = os.listdir('semrep_data')
tsv_files = [file for file in files if file.endswith('.tsv')]
len(tsv_files)

58

In [5]:
##input all TSV files from semrep_data folder as pandas dataframes
# get all files in the folder
df = pd.DataFrame()
for filename in tsv_files:
    # read the file as a pandas dataframe
    dfnp = pd.read_csv('semrep_data/' + filename, sep='\t')
    # get the name of the file without the extension
    dfnp['related_common_name'] = filename.split('_')[0]
    dfnp = dfnp.drop_duplicates()
    df = pd.concat([df, dfnp], ignore_index=True)
df = df.drop_duplicates()
df = df.reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 538819 entries, 0 to 538818
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   index                538819 non-null  int64 
 1   pmid                 538819 non-null  int64 
 2   subject_cui          524007 non-null  object
 3   subject_name         524007 non-null  object
 4   subject_type         538819 non-null  object
 5   relation             538819 non-null  object
 6   object_cui           515492 non-null  object
 7   object_name          515492 non-null  object
 8   object_type          538819 non-null  object
 9   year                 538819 non-null  object
 10  sentence             538819 non-null  object
 11  source_section       135119 non-null  object
 12  pub_type             279137 non-null  object
 13  related_common_name  538819 non-null  object
dtypes: int64(2), object(12)
memory usage: 57.6+ MB


In [6]:
def save_data_statistics(df):
    nps = df['related_common_name'].unique()
    info = {
        'related_common_name': [],
        'PMIDs': [],
        'statements': []
    }
    for np_name in nps:
        dfnp = df.loc[df['related_common_name'] == np_name]
        info['related_common_name'].append(np_name)
        info['PMIDs'].append(len(dfnp['pmid'].unique()))
        info['statements'].append(len(dfnp))
    df_stats = pd.DataFrame.from_dict(info)
    df_stats.to_csv('semrep_data/semrep_data_statistics.tsv', sep='\t', index=False)

In [7]:
df = df.fillna('')
save_data_statistics(df)
df.head()

Unnamed: 0,index,pmid,subject_cui,subject_name,subject_type,relation,object_cui,object_name,object_type,year,sentence,source_section,pub_type,related_common_name
0,0,18064444,C0010206,coumarin,"orch,phsu",INTERACTS_WITH,,,"gngm,aapp",2008 Mar,18064444_ascii.txt.tx.1 Abstract Objective To ...,,"['Journal Article', 'Randomized Controlled Tri...",cinnamon
1,1,18064444,C0010206,coumarin,"orch,phsu",compared_with,C0028040,Nicotine,"hops,orch",2008 Mar,18064444_ascii.txt.tx.1 Abstract Objective To ...,,"['Journal Article', 'Randomized Controlled Tri...",cinnamon
2,2,18064444,C0028040,Nicotine,"hops,orch",INTERACTS_WITH,,,"gngm,aapp",2008 Mar,18064444_ascii.txt.tx.1 Abstract Objective To ...,,"['Journal Article', 'Randomized Controlled Tri...",cinnamon
3,3,18064444,C0052430,artemisinine,"orch,phsu",INTERACTS_WITH,,,"gngm,aapp",2008 Mar,18064444_ascii.txt.tx.1 Abstract Objective To ...,,"['Journal Article', 'Randomized Controlled Tri...",cinnamon
4,4,18064444,C1708335,Healthy Volunteers,"popg,humn",LOCATION_OF,,,"gngm,aapp",2008 Mar,18064444_ascii.txt.tx.1 Abstract Objective To ...,,"['Journal Article', 'Randomized Controlled Tri...",cinnamon


### Process data

In [8]:
#use re to fix year as format is not consistent
for i in range(len(df.index)):
    try:
        pub_date = str(df.at[i, 'year'])
        x = re.findall(r'\d+', pub_date)
        if x:
            df.at[i, 'year'] = x[0]
    except Exception as e:
        print('Error at index:', i, e)
        continue
df.head()

Unnamed: 0,index,pmid,subject_cui,subject_name,subject_type,relation,object_cui,object_name,object_type,year,sentence,source_section,pub_type,related_common_name
0,0,18064444,C0010206,coumarin,"orch,phsu",INTERACTS_WITH,,,"gngm,aapp",2008,18064444_ascii.txt.tx.1 Abstract Objective To ...,,"['Journal Article', 'Randomized Controlled Tri...",cinnamon
1,1,18064444,C0010206,coumarin,"orch,phsu",compared_with,C0028040,Nicotine,"hops,orch",2008,18064444_ascii.txt.tx.1 Abstract Objective To ...,,"['Journal Article', 'Randomized Controlled Tri...",cinnamon
2,2,18064444,C0028040,Nicotine,"hops,orch",INTERACTS_WITH,,,"gngm,aapp",2008,18064444_ascii.txt.tx.1 Abstract Objective To ...,,"['Journal Article', 'Randomized Controlled Tri...",cinnamon
3,3,18064444,C0052430,artemisinine,"orch,phsu",INTERACTS_WITH,,,"gngm,aapp",2008,18064444_ascii.txt.tx.1 Abstract Objective To ...,,"['Journal Article', 'Randomized Controlled Tri...",cinnamon
4,4,18064444,C1708335,Healthy Volunteers,"popg,humn",LOCATION_OF,,,"gngm,aapp",2008,18064444_ascii.txt.tx.1 Abstract Objective To ...,,"['Journal Article', 'Randomized Controlled Tri...",cinnamon


In [9]:
preds = ['affects', 
'affects(spec)',
'associated_with',
'associated_with(spec)',
'associated_with(infer)',
'augments',
'augments(spec)',
'causes',
'causes(spec)',
'coexists_with',
'coexists_with(spec)',
'complicates',
'disrupts',
'disrupts(spec)',
'inhibits',
'inhibits(spec)',
'inhibits(infer)',
'interacts_with',
'interacts_with(spec)',
'interacts_with(infer)',
'part_of',
'part_of(spec)',
'precedes',
'precedes(spec)',
'predisposes',
'predisposes(spec)',
'prevents',
'prevents(spec)',
'produces',
'produces(spec)',
'stimulates',
'stimulates(spec)',
'treats',
'treats(spec)',
'treats(infer)'
]

In [10]:
df['predicate'] = df['relation'].str.lower()
df = df.drop(['relation'], axis=1)
dfn = df[df['predicate'].isin(preds)]
dfn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 329060 entries, 0 to 538817
Data columns (total 14 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   index                329060 non-null  int64 
 1   pmid                 329060 non-null  int64 
 2   subject_cui          329060 non-null  object
 3   subject_name         329060 non-null  object
 4   subject_type         329060 non-null  object
 5   object_cui           329060 non-null  object
 6   object_name          329060 non-null  object
 7   object_type          329060 non-null  object
 8   year                 329060 non-null  object
 9   sentence             329060 non-null  object
 10  source_section       329060 non-null  object
 11  pub_type             329060 non-null  object
 12  related_common_name  329060 non-null  object
 13  predicate            329060 non-null  object
dtypes: int64(2), object(12)
memory usage: 37.7+ MB


In [11]:
dfn['predicate'].value_counts()

interacts_with            62960
part_of                   44931
affects                   35633
coexists_with             32715
inhibits                  32125
stimulates                29875
treats                    28055
causes                    15688
disrupts                  13563
augments                  12055
associated_with            7403
produces                   6080
prevents                   2889
predisposes                2359
treats(infer)              1274
precedes                    690
associated_with(infer)       97
interacts_with(infer)        97
interacts_with(spec)         94
treats(spec)                 82
coexists_with(spec)          67
inhibits(spec)               51
complicates                  49
affects(spec)                43
causes(spec)                 38
stimulates(spec)             32
part_of(spec)                23
associated_with(spec)        20
disrupts(spec)               19
augments(spec)               19
prevents(spec)               16
produces

In [12]:
dfn['subject_map'] = None
dfn['object_map'] = None
dfn['predicate_obo'] = None
dfn['subject_obo'] = None
dfn['object_obo'] = None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [13]:
predMapSemRep = {
'affects': 'RO_0002596',
'affects(spec)': 'RO_0002596',
'associated_with': 'RO_0002610',
'associated_with(spec)': 'RO_0002610',
'associated_with(infer)': 'RO_0002610',
'augments': 'RO_0002598',
'augments(spec)': 'RO_0002598',
'causes': 'RO_0002566',
'causes(spec)': 'RO_0002566',
'coexists_with': 'RO_0002490',
'coexists_with(spec)': 'RO_0002490',
'complicates': 'RO_0003309',
'disrupts': 'RO_0002212',
'disrupts(spec)': 'RO_0002212',
'inhibits': 'RO_0002449',
'inhibits(spec)': 'RO_0002449',
'inhibits(infer)': 'RO_0002449',
'interacts_with': 'RO_0002434',
'interacts_with(spec)': 'RO_0002434',
'interacts_with(infer)': 'RO_0002434',
'part_of': 'BFO_0000050',
'part_of(spec)': 'BFO_0000050',
'precedes': 'BFO_0000063',
'precedes(spec)': 'BFO_0000063',
'predisposes': 'RO_0003302',
'predisposes(spec)': 'RO_0003302',
'prevents': 'RO_0002599',
'prevents(spec)': 'RO_0002599',
'produces': 'RO_0003000',
'produces(spec)': 'RO_0003000',
'stimulates': 'RO_0002213',
'stimulates(spec)': 'RO_0002213',
'treats': 'RO_0002606',
'treats(spec)': 'RO_0002606',
'treats(infer)': 'RO_0002606'
}

In [14]:
def relation_mapping(row):
    rel = row['predicate'].lower()
    if rel in predMapSemRep:
        return predMapSemRep[rel]
    else:
        return ''

In [15]:
dfn['predicate_obo'] = dfn.apply(relation_mapping, axis=1)
dfn = dfn.reset_index(drop=True)
dfn.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 329060 entries, 0 to 329059
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   index                329060 non-null  int64 
 1   pmid                 329060 non-null  int64 
 2   subject_cui          329060 non-null  object
 3   subject_name         329060 non-null  object
 4   subject_type         329060 non-null  object
 5   object_cui           329060 non-null  object
 6   object_name          329060 non-null  object
 7   object_type          329060 non-null  object
 8   year                 329060 non-null  object
 9   sentence             329060 non-null  object
 10  source_section       329060 non-null  object
 11  pub_type             329060 non-null  object
 12  related_common_name  329060 non-null  object
 13  predicate            329060 non-null  object
 14  subject_map          0 non-null       object
 15  object_map           0 non-null   

In [16]:
#filter by semantic types here and then start mapping
excluded_semtype = ['acty','bhvr','evnt','gora','mcha','ocac', #Occupational Activity
'clas',
'cnce',
'ftcn',
'grpa',
'idcn',
'inpr',
'lang',
'qlco',
'qnco',
'rnlw',
'spco',
'tmco',
'enty',
'mnob',
'phob',
'bmod',
'ocdi',
'hcro',
'orgt',
'pros',
'shro',
'eehu',
'hcpp']

In [17]:
dfn1 = dfn[~dfn['subject_type'].isin(excluded_semtype)]
dfn2 = dfn1[~dfn1['object_type'].isin(excluded_semtype)]

In [18]:
##exclude all concepts that occur in SemMedDB GENERIC.CONCEPT table
#Get CSV file - https://lhncbc.nlm.nih.gov/ii/tools/SemRep_SemMedDB_SKR/SemMedDB_download.html -- doesn't work
#download from halil covid-19 repository - 
semmed = pd.read_csv('cui_to_ontology_maps/semmedVER43_2020_R_GENERIC_CONCEPT.csv', header=None)
semmed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 259 entries, 0 to 258
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       259 non-null    int64 
 1   1       259 non-null    object
 2   2       259 non-null    object
dtypes: int64(1), object(2)
memory usage: 6.2+ KB


In [19]:
semmed = semmed.rename(columns={1: 'CUI', 2: 'concept_name'})
semmed = semmed.drop([0], axis=1)
generic_cui = semmed.CUI.tolist()
dfn3 = dfn2[~dfn2['subject_cui'].isin(generic_cui)]
dfn4 = dfn3[~dfn3['object_cui'].isin(generic_cui)]


In [20]:
#handle subclassof when mapping to obo??

In [21]:
#map from UMLS to GO, HPO
with open('cui_to_ontology_maps/go_hpo_map_dict.pickle', 'rb') as filep:
    go_hpo_mapping_dict = pickle.load(filep)
len(go_hpo_mapping_dict)

87745

In [22]:
##MAPPING #1
def umls_go_hpo_map(row, col):
    if col == 'subject':
        cui = row['subject_cui']
    elif col == 'object':
        cui = row['object_cui']
    else:
        print('specify if subject or object mapping required')
        exit(0)
    if cui in go_hpo_mapping_dict:
        if len(go_hpo_mapping_dict):
            return go_hpo_mapping_dict[cui][0]
    return None

In [23]:
dfn4['subject_obo'] = dfn4.apply(umls_go_hpo_map, axis=1, col='subject')
dfn4['object_obo'] = dfn4.apply(umls_go_hpo_map, axis=1, col='object')
dfn4 = dfn4.reset_index(drop=True)
dfn4.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291682 entries, 0 to 291681
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   index                291682 non-null  int64 
 1   pmid                 291682 non-null  int64 
 2   subject_cui          291682 non-null  object
 3   subject_name         291682 non-null  object
 4   subject_type         291682 non-null  object
 5   object_cui           291682 non-null  object
 6   object_name          291682 non-null  object
 7   object_type          291682 non-null  object
 8   year                 291682 non-null  object
 9   sentence             291682 non-null  object
 10  source_section       291682 non-null  object
 11  pub_type             291682 non-null  object
 12  related_common_name  291682 non-null  object
 13  predicate            291682 non-null  object
 14  subject_map          0 non-null       object
 15  object_map           0 non-null   

In [24]:
dfn_subset = dfn4

In [25]:
##existing ontorunner mapping (updated in 2024 with new gilda and ontorunner mappings (see NER-OBO), updated dictionary in process_reach_data.ipynb)
with open('cui_to_ontology_maps/CUItoOBO_with_genes_20240528.pickle', 'rb') as filep:
    obomap_dict = pickle.load(filep)
len(obomap_dict)

8552

In [26]:
##MAPPING #2
def get_obo_mapping(row, col):
    if col == 'subject':
        cui = row['subject_cui']
        obomap = row['subject_obo']
    elif col == 'object':
        cui = row['object_cui']
        obomap = row['object_obo']
    else:
        print('specify if subject or object mapping required')
        exit(0)
    if isinstance(obomap, str):
        return obomap
    elif cui in obomap_dict:
        return obomap_dict[cui]
    return None

In [27]:
dfn_subset['subject_obo'] = dfn_subset.apply(get_obo_mapping, axis=1, col='subject')
dfn_subset['object_obo'] = dfn_subset.apply(get_obo_mapping, axis=1, col='object')
dfn_subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291682 entries, 0 to 291681
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   index                291682 non-null  int64 
 1   pmid                 291682 non-null  int64 
 2   subject_cui          291682 non-null  object
 3   subject_name         291682 non-null  object
 4   subject_type         291682 non-null  object
 5   object_cui           291682 non-null  object
 6   object_name          291682 non-null  object
 7   object_type          291682 non-null  object
 8   year                 291682 non-null  object
 9   sentence             291682 non-null  object
 10  source_section       291682 non-null  object
 11  pub_type             291682 non-null  object
 12  related_common_name  291682 non-null  object
 13  predicate            291682 non-null  object
 14  subject_map          0 non-null       object
 15  object_map           0 non-null   

In [28]:
##add constituent mappings
constituent_mapping_dict = {}
dfcons = pd.read_csv('cui_to_ontology_maps/chebi-extensions-constituents-NP-20240229.tsv', sep='\t')
dfcons.head()

Unnamed: 0,constituent_name,URI
0,12beta-acetoxycimigenol 3-o-beta-d-xylopyranoside,http://napdi.org/napdi_srs_imports:12beta_acet...
1,2'-o-acetylactein,http://napdi.org/napdi_srs_imports:2_o_acetyla...
2,2'-o-acetylcimicifugoside h1,http://napdi.org/napdi_srs_imports:2_o_acetylc...
3,23-epi-26-deoxyactein,http://purl.obolibrary.org/obo/CHEBI_70243
4,23-o-acetylshengmanol,http://napdi.org/napdi_srs_imports:23_o_acetyl...


In [29]:
##use for mapping after prefixing
for i in range(len(dfcons.index)):
    constituent_name = dfcons.at[i, 'constituent_name']
    uri = dfcons.at[i, 'URI']
    constituent_mapping_dict[constituent_name] = uri
len(constituent_mapping_dict)

671

In [30]:
##MAPPING #3
def map_constituents(row, col):
    if col == 'subject':
        label = row['subject_name'].lower()
        obomap = row['subject_obo']
    elif col == 'object':
        label = row['object_name'].lower()
        obomap = row['object_obo']
    else:
        print('specify if subject or object mapping required')
        exit(0)
    if isinstance(obomap, str):
        if label in constituent_mapping_dict:
            return constituent_mapping_dict[label]
        else:
            return obomap
    else:
        if label in constituent_mapping_dict:
            return constituent_mapping_dict[label]
    return None

In [31]:
dfn_subset['subject_obo'] = dfn_subset.apply(map_constituents, axis=1, col='subject')
dfn_subset['object_obo'] = dfn_subset.apply(map_constituents, axis=1, col='object')
dfn_subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291682 entries, 0 to 291681
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   index                291682 non-null  int64 
 1   pmid                 291682 non-null  int64 
 2   subject_cui          291682 non-null  object
 3   subject_name         291682 non-null  object
 4   subject_type         291682 non-null  object
 5   object_cui           291682 non-null  object
 6   object_name          291682 non-null  object
 7   object_type          291682 non-null  object
 8   year                 291682 non-null  object
 9   sentence             291682 non-null  object
 10  source_section       291682 non-null  object
 11  pub_type             291682 non-null  object
 12  related_common_name  291682 non-null  object
 13  predicate            291682 non-null  object
 14  subject_map          0 non-null       object
 15  object_map           0 non-null   

In [32]:
#unmapped_cui = []
#unmapped_string_umls = []
unmapped_terms_dict = {}
for i in range(len(dfn_subset.index)):
    if not dfn_subset.at[i, 'subject_obo']:
        subcui = dfn_subset.at[i, 'subject_cui']
        if subcui not in unmapped_terms_dict:
            unmapped_terms_dict[subcui] = dfn_subset.at[i, 'subject_name']
    if not dfn_subset.at[i, 'object_obo']:
        objcui = dfn_subset.at[i, 'object_cui']
        if objcui not in unmapped_terms_dict:
            unmapped_terms_dict[objcui] = dfn_subset.at[i, 'object_name']
print(len(unmapped_terms_dict))

9054


In [33]:
##print 10 keys and values of unmapped terms dict
{k: unmapped_terms_dict[k] for k in list(unmapped_terms_dict)[:20]}

{'': '',
 'C0443640': 'Specific antibody',
 'C0301630': 'Reduction (chemical)',
 'C3536832': 'Air',
 'C0026030': 'Microsomes, Liver',
 'C0031591': 'Phosgene',
 'C0600688': 'Toxic effect',
 'C0036492': 'Seals (Animal)',
 'C1505125': 'RRAD protein, human',
 'C0597298': 'Protein Isoforms',
 'C2946261': 'Level',
 'C0150312': 'Present',
 'C0360100': 'Organic solvent product',
 'C0596040': 'adduct',
 'C3887559': 'Recombinant Human Macrophage Inflammatory Protein-1 Beta',
 'C1882726': 'Rat Liver',
 'C1517914': 'Mouse Liver',
 'C0440731': 'Fetal brain',
 'C1828472': 'Cytochrome p450 CYP2A6 enzyme',
 'C0042333': 'Variation (Genetics)'}

In [34]:
##rename columns
dfunmap = pd.DataFrame.from_dict(unmapped_terms_dict, orient='index')
dfunmap = dfunmap.reset_index()
dfunmap = dfunmap.rename(columns={'index': 'CUI', 0: 'concept_name'})
dfunmap.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9054 entries, 0 to 9053
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CUI           9054 non-null   object
 1   concept_name  9054 non-null   object
dtypes: object(2)
memory usage: 141.6+ KB


In [35]:
##save dfunmap as TSV
dfunmap.to_csv('semrep_data/unmapped_semrep_subset_2024.tsv', sep='\t', index=False)

In [36]:
dfn_subset.to_csv('semrep_data/semrep_all_predications_mapped_20240528.tsv', sep='\t', index=False)

### Add prefix

In [37]:
def add_prefix(row, col):
    obo_prefix = 'http://purl.obolibrary.org/obo/'
    napdi_prefix = 'http://napdi.org/'
    if col == 'predicate':
        predicate_obo = row['predicate_obo']
        if isinstance(predicate_obo, str):
            return obo_prefix+predicate_obo
        else:
            return ''
            
    elif col == 'subject':
        
        subject_obo = row['subject_obo']
        if isinstance(subject_obo, str):
            if subject_obo == None:
                return ''
            if 'http' not in subject_obo:
                if 'napdi' in subject_obo:
                    return napdi_prefix+subject_obo
                else:
                    return obo_prefix+subject_obo
            else:
                return subject_obo
        else:
            return ''
    elif col == 'object':
        object_obo = row['object_obo']
        if isinstance(object_obo, str):
            if object_obo == None:
                return ''
            if 'http' not in object_obo:
                if 'napdi' in object_obo:
                    return napdi_prefix+object_obo
                else:
                    return obo_prefix+object_obo
            else:
                return object_obo
        else:
            return ''

In [38]:
dfn_subset['subject_obo'] = dfn_subset.apply(add_prefix, axis=1, col='subject')
dfn_subset['object_obo'] = dfn_subset.apply(add_prefix, axis=1, col='object')
dfn_subset['predicate_obo'] = dfn_subset.apply(add_prefix, axis=1, col='predicate')

In [39]:
dfn_subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291682 entries, 0 to 291681
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   index                291682 non-null  int64 
 1   pmid                 291682 non-null  int64 
 2   subject_cui          291682 non-null  object
 3   subject_name         291682 non-null  object
 4   subject_type         291682 non-null  object
 5   object_cui           291682 non-null  object
 6   object_name          291682 non-null  object
 7   object_type          291682 non-null  object
 8   year                 291682 non-null  object
 9   sentence             291682 non-null  object
 10  source_section       291682 non-null  object
 11  pub_type             291682 non-null  object
 12  related_common_name  291682 non-null  object
 13  predicate            291682 non-null  object
 14  subject_map          0 non-null       object
 15  object_map           0 non-null   

In [40]:
dfn_subset.head()

Unnamed: 0,index,pmid,subject_cui,subject_name,subject_type,object_cui,object_name,object_type,year,sentence,source_section,pub_type,related_common_name,predicate,subject_map,object_map,predicate_obo,subject_obo,object_obo
0,0,18064444,C0010206,coumarin,"orch,phsu",,,"gngm,aapp",2008,18064444_ascii.txt.tx.1 Abstract Objective To ...,,"['Journal Article', 'Randomized Controlled Tri...",cinnamon,interacts_with,,,http://purl.obolibrary.org/obo/RO_0002434,http://purl.obolibrary.org/obo/CHEBI_28794,
1,2,18064444,C0028040,Nicotine,"hops,orch",,,"gngm,aapp",2008,18064444_ascii.txt.tx.1 Abstract Objective To ...,,"['Journal Article', 'Randomized Controlled Tri...",cinnamon,interacts_with,,,http://purl.obolibrary.org/obo/RO_0002434,http://purl.obolibrary.org/obo/CHEBI_18723,
2,3,18064444,C0052430,artemisinine,"orch,phsu",,,"gngm,aapp",2008,18064444_ascii.txt.tx.1 Abstract Objective To ...,,"['Journal Article', 'Randomized Controlled Tri...",cinnamon,interacts_with,,,http://purl.obolibrary.org/obo/RO_0002434,http://purl.obolibrary.org/obo/CHEBI_223316,
3,11,12584152,,,"gngm,aapp",C0086418,Homo sapiens,"grup,humn",2003,12584152_ascii.txt.tx.1 ABSTRACT: The oxidativ...,,['Journal Article'],cinnamon,part_of,,,http://purl.obolibrary.org/obo/BFO_0000050,,http://purl.obolibrary.org/obo/NCBITaxon_9606
4,12,12584152,C0006556,"DNA, Complementary",nnon,,,"gngm,aapp",2003,12584152_ascii.txt.tx.1 ABSTRACT: The oxidativ...,,['Journal Article'],cinnamon,interacts_with,,,http://purl.obolibrary.org/obo/RO_0002434,http://purl.obolibrary.org/obo/SO_0000352,


In [41]:
dfn_subset = dfn_subset.drop(columns=['subject_map', 'object_map'])

In [42]:
dfn_subset.to_csv('semrep_data/semrep_all_predications_mapped_with_prefix_20240528.tsv', sep='\t', index=False)

In [43]:
##drop unmapped concepts - only 37% left so needs mapping
df_new = dfn_subset[dfn_subset['subject_obo'] != '']
df_new = df_new[df_new['object_obo'] != '']
df_new = df_new[df_new['predicate_obo'] != '']
df_new = df_new.drop_duplicates()
df_new = df_new.reset_index(drop=True)
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143641 entries, 0 to 143640
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   index                143641 non-null  int64 
 1   pmid                 143641 non-null  int64 
 2   subject_cui          143641 non-null  object
 3   subject_name         143641 non-null  object
 4   subject_type         143641 non-null  object
 5   object_cui           143641 non-null  object
 6   object_name          143641 non-null  object
 7   object_type          143641 non-null  object
 8   year                 143641 non-null  object
 9   sentence             143641 non-null  object
 10  source_section       143641 non-null  object
 11  pub_type             143641 non-null  object
 12  related_common_name  143641 non-null  object
 13  predicate            143641 non-null  object
 14  predicate_obo        143641 non-null  object
 15  subject_obo          143641 non-nu

In [44]:
df_new['predicate'].value_counts()

interacts_with            28318
part_of                   21489
coexists_with             15912
inhibits                  15091
stimulates                14449
affects                   13770
treats                     7673
disrupts                   6887
causes                     5970
augments                   5546
associated_with            3230
produces                   2531
prevents                   1083
predisposes                 719
treats(infer)               454
precedes                    118
interacts_with(infer)        48
coexists_with(spec)          46
interacts_with(spec)         45
associated_with(infer)       41
inhibits(spec)               35
treats(spec)                 32
affects(spec)                28
causes(spec)                 25
stimulates(spec)             16
complicates                  15
associated_with(spec)        15
disrupts(spec)               14
prevents(spec)               11
part_of(spec)                10
augments(spec)                9
produces

In [45]:
df_new.to_csv('semrep_data/semrep_all_predicates_mapped_processed_20240528.tsv', sep='\t', index=False)

In [2]:
import pandas as pd
df_new = pd.read_csv('semrep_data/semrep_all_predicates_mapped_processed_20240528.tsv', sep='\t')
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143641 entries, 0 to 143640
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   index                143641 non-null  int64 
 1   pmid                 143641 non-null  int64 
 2   subject_cui          143641 non-null  object
 3   subject_name         143641 non-null  object
 4   subject_type         143641 non-null  object
 5   object_cui           143641 non-null  object
 6   object_name          143641 non-null  object
 7   object_type          143641 non-null  object
 8   year                 143641 non-null  int64 
 9   sentence             143641 non-null  object
 10  source_section       34642 non-null   object
 11  pub_type             74612 non-null   object
 12  related_common_name  143641 non-null  object
 13  predicate            143641 non-null  object
 14  predicate_obo        143641 non-null  object
 15  subject_obo          143641 non-nu

### Create graphs

In [2]:
#Create networkx graph from triples
import glob
import hashlib
import json
import networkx as nx  # type: ignore
import os
import os.path

from collections import Counter  # type: ignore
from more_itertools import unique_everseen  # type: ignore
from rdflib import BNode, Graph, Literal, Namespace, URIRef  # type: ignore
from rdflib.namespace import OWL, RDF, RDFS  # type: ignore
from rdflib.plugins.serializers.nt import _quoteLiteral  # type: ignore
import subprocess

from tqdm import tqdm  # type: ignore
from typing import Dict, List, Optional, Set, Tuple, Union
import ast
# set-up environment variables
obo = Namespace('http://purl.obolibrary.org/obo/')
oboinowl = Namespace('http://www.geneontology.org/formats/oboInOwl#')
schema = Namespace('http://www.w3.org/2001/XMLSchema#')
napdi = Namespace('http://napdi.org/napdi_srs_imports:')

In [4]:
dfres = df_new[['subject_obo', 'predicate_obo', 'object_obo']]
dfres = dfres.drop_duplicates()
dfres = dfres.reset_index(drop=True)
dfres.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58810 entries, 0 to 58809
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   subject_obo    58810 non-null  object
 1   predicate_obo  58810 non-null  object
 2   object_obo     58810 non-null  object
dtypes: object(3)
memory usage: 1.3+ MB


In [5]:
dfres.head()

Unnamed: 0,subject_obo,predicate_obo,object_obo
0,http://purl.obolibrary.org/obo/CHEBI_38559,http://purl.obolibrary.org/obo/RO_0002434,http://purl.obolibrary.org/obo/CHEBI_35255
1,http://purl.obolibrary.org/obo/UBERON_0002107,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/NCBITaxon_9606
2,http://purl.obolibrary.org/obo/PR_P05181,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/NCBITaxon_9606
3,http://purl.obolibrary.org/obo/CHEBI_16526,http://purl.obolibrary.org/obo/RO_0002610,http://purl.obolibrary.org/obo/NCBITaxon_1207117
4,http://purl.obolibrary.org/obo/CHEBI_16526,http://purl.obolibrary.org/obo/RO_0002610,http://purl.obolibrary.org/obo/HP_0012418


#### Fix prefixes

In [6]:
##get unique prefixes for subjects and objects
prefixes = []
for i in range(len(df_new.index)):
    subj = df_new.at[i, 'subject_obo']
    obj = df_new.at[i, 'object_obo']
    prefixes.append(subj.split('/')[-1].split('_')[0])
    prefixes.append(obj.split('/')[-1].split('_')[0])
prefixes = list(set(prefixes))
len(prefixes)

18

In [7]:
prefixes

['NCBITaxon',
 'CL',
 'napdi',
 'UBERON',
 'MONDO',
 'PR',
 'GO',
 'OBO',
 'RO',
 'SLC47A1[SYNONYM',
 'CLO',
 'DOID',
 'cellline#human',
 'CHEBI',
 'PW',
 'HP',
 'SO',
 'PATO']

In [8]:
problems = ['SLC47A1[SYNONYM']
for i in range(len(df_new.index)):
    subj = df_new.at[i, 'subject_obo']
    obj = df_new.at[i, 'object_obo']
    subjpref = subj.split('/')[-1].split('_')[0]
    objpref = obj.split('/')[-1].split('_')[0]
    if subjpref in problems:
        print(subj)
        break
    if objpref in problems:
        print(obj)
        break

http://purl.obolibrary.org/obo/SLC47A1[SYNONYM


In [9]:
##drop rows where subject_obo or object_obo = http_//www.ebi.ac.uk/cellline#human_cell_line
df_new = df_new[df_new['subject_obo'] != 'http_//www.ebi.ac.uk/cellline#human_cell_line']
df_new = df_new[df_new['object_obo'] != 'http_//www.ebi.ac.uk/cellline#human_cell_line']
df_new = df_new.reset_index(drop=True)

In [10]:
#drop rows where subject_obo or object_obo prefix = RO
df_new = df_new[df_new['subject_obo'].str.split('/').str[-1].str.split('_').str[0] != 'RO']
df_new = df_new[df_new['object_obo'].str.split('/').str[-1].str.split('_').str[0] != 'RO']
df_new = df_new.reset_index(drop=True)
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143478 entries, 0 to 143477
Data columns (total 17 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   index                143478 non-null  int64 
 1   pmid                 143478 non-null  int64 
 2   subject_cui          143478 non-null  object
 3   subject_name         143478 non-null  object
 4   subject_type         143478 non-null  object
 5   object_cui           143478 non-null  object
 6   object_name          143478 non-null  object
 7   object_type          143478 non-null  object
 8   year                 143478 non-null  int64 
 9   sentence             143478 non-null  object
 10  source_section       34610 non-null   object
 11  pub_type             74534 non-null   object
 12  related_common_name  143478 non-null  object
 13  predicate            143478 non-null  object
 14  predicate_obo        143478 non-null  object
 15  subject_obo          143478 non-nu

In [11]:
problems = ['OBO', 'cellline#human', 'RO', 'SLC47A1[SYNONYM']
for i in range(len(df_new.index)):
    subj = df_new.at[i, 'subject_obo']
    obj = df_new.at[i, 'object_obo']
    subjpref = subj.split('/')[-1].split('_')[0]
    objpref = obj.split('/')[-1].split('_')[0]
    if subjpref in problems:
        if subjpref == 'OBO':
            df_new.at[i, 'subject_obo'] = subj.replace('OBO_', '')
        elif subjpref == 'SLC47A1[SYNONYM':
            df_new.at[i, 'subject_obo'] = 'http://purl.obolibrary.org/obo/PR_000015152'
    if objpref in problems:
        if objpref == 'OBO':
            df_new.at[i, 'object_obo'] = obj.replace('OBO_', '')
        elif objpref == 'SLC47A1[SYNONYM':
            df_new.at[i, 'object_obo'] = 'http://purl.obolibrary.org/obo/PR_000015152'

In [12]:
##check prefix again
prefixes = []
for i in range(len(df_new.index)):
    subj = df_new.at[i, 'subject_obo']
    obj = df_new.at[i, 'object_obo']
    prefixes.append(subj.split('/')[-1].split('_')[0])
    prefixes.append(obj.split('/')[-1].split('_')[0])
prefixes = list(set(prefixes))
prefixes

['NCBITaxon',
 'CL',
 'napdi',
 'PATO',
 'UBERON',
 'MONDO',
 'PR',
 'GO',
 'CLO',
 'DOID',
 'CHEBI',
 'VO',
 'PW',
 'HP',
 'SO',
 'OAE']

In [13]:
##save to file
df_new.to_csv('semrep_data/semrep_all_predicates_mapped_processed_new_20240528.tsv', sep='\t', index=False)

In [3]:
import pandas as pd
df_new = pd.read_csv('semrep_data/semrep_all_predicates_mapped_processed_new_20240528.tsv', sep='\t')

In [4]:
typedict = {
    'CHEBI': 'chemical',
    'PR': 'protein',
    'GO': 'process',
    'DOID': 'disease',
    'HP': 'phenotype',
    'UBERON': 'anatomy',
    'SO': 'sequence',
    'CL': 'cell',
    'VO': 'vaccine',
    'NCBITaxon': 'organism',
    'MONDO': 'disease',
    'napdi': 'natural_product',
    'OAE': 'adverse_event',
    'NCBIGene': 'gene',
    'PW': 'pathway',
    'CLO': 'cell_line',
    'PATO': 'phenotype'
}

In [5]:
#create rdflib graph from dataframe triples and serialize as ntriples file
graph  = Graph()
pred_label = URIRef("http://www.w3.org/2000/01/rdf-schema#label")
for i in range(len(df_new.index)):
    subj = df_new.at[i, 'subject_obo']
    obj = df_new.at[i, 'object_obo']
    pred = df_new.at[i, 'predicate_obo']
    subj_node = URIRef(subj)
    obj_node = URIRef(obj)
    predicate = URIRef(pred)
    subj_name = df_new.at[i, 'subject_name']
    obj_name = df_new.at[i, 'object_name']
    graph.add((subj_node, predicate, obj_node))
    graph.add((subj_node, pred_label, Literal(subj_name)))
    graph.add((obj_node, pred_label, Literal(obj_name)))

In [6]:
graph.serialize('output_graphs/machineread_semrep_version2.nt', format='nt')

In [7]:
len(graph)

67802

In [8]:
import ast

In [9]:
def n3(node: Union[URIRef, BNode, Literal]) -> str:
    """Method takes an RDFLib node of type BNode, URIRef, or Literal and serializes it to meet the RDF 1.1 NTriples
    format.
    Src: https://github.com/RDFLib/rdflib/blob/c11f7b503b50b7c3cdeec0f36261fa09b0615380/rdflib/plugins/serializers/nt.py
    Args:
        node: An RDFLib
    Returns:
        serialized_node: A string containing the serialized
    """
    if isinstance(node, Literal): serialized_node = "%s" % _quoteLiteral(node)
    else: serialized_node = "%s" % node.n3()
    return serialized_node

In [10]:
##read in edge metadata dict
import pickle
with open('output_graphs/edge_metadata_reach_version2.pickle', 'rb') as fp:
    edge_metadata = pickle.load(fp)
len(edge_metadata)

36252

In [11]:
#read in uri_to_curie_map_reach
with open('output_graphs/uri_to_curie_map_reach.pickle', 'rb') as fp:
    uri_to_curie_map = pickle.load(fp)
len(uri_to_curie_map)

6529

In [12]:
df_new = df_new.fillna('')

In [13]:
##create edge metadata df for semrep
edge_metadata_df = pd.DataFrame(columns=['subject_uri', 'object_uri', 'relation', 'subject_curie', 'object_curie', 'source', 'edge_type', 'year', 'pmid', 'sentence', 'belief', 'pubtype', 'source_section'])

In [14]:
#convert rdflib graph to multidigraph - code borrowed from PheKnowLator: kg_utils.py
#use the pred key to also create a dictionary with metadata about the edge - 
#pub_year, pmid, source graph, belief
nx_mdg = nx.MultiDiGraph()

In [15]:
for s, p, o in tqdm(graph):
    #do not save label predicate to gpickle
    subj = str(s)
    obj = str(o)
    pred = str(p)
    if pred == 'http://www.w3.org/2000/01/rdf-schema#label':
        continue
    else:
        ##defining metadata dict for edge type
        subj_type = typedict[subj.split('/')[-1].split('_')[0]]
        obj_type = typedict[obj.split('/')[-1].split('_')[0]]
        subj_curie = subj.split('/')[-1]
        obj_curie = obj.split('/')[-1]
        if subj not in uri_to_curie_map:
            uri_to_curie_map[subj] = subj_curie
        if obj not in uri_to_curie_map:
            uri_to_curie_map[obj] = obj_curie
        edge_key = subj_curie + '-' + obj_curie
        if edge_key not in edge_metadata:
            edge_metadata[edge_key] = {
            'type': subj_type + '-' + obj_type,
            'relations': {}
            }
        pred_curie = pred.split('/')[-1]
        pred_key = hashlib.md5('{}{}{}'.format(n3(s), n3(p), n3(o)).encode()).hexdigest()
        pmid = str(df_new.loc[(df_new['subject_obo'] == subj) & (df_new['object_obo'] == obj) & (df_new['predicate_obo'] == pred)]['pmid'].values[0])
        timestamp = str(df_new.loc[(df_new['subject_obo'] == subj) & (df_new['object_obo'] == obj) & (df_new['predicate_obo'] == pred)]['year'].values[0])
        belief_score = 0.8
        sentence = df_new.loc[(df_new['subject_obo'] == subj) & (df_new['object_obo'] == obj) & (df_new['predicate_obo'] == pred)]['sentence'].values[0]
        source_section = df_new.loc[(df_new['subject_obo'] == subj) & (df_new['object_obo'] == obj) & (df_new['predicate_obo'] == pred)]['source_section'].values[0]
        pub_type_list = df_new.loc[(df_new['subject_obo'] == subj) & (df_new['object_obo'] == obj) & (df_new['predicate_obo'] == pred)]['pub_type'].values[0]
        if pub_type_list == '':
            pub_type = ''
        else:
            pub_type = ast.literal_eval(pub_type_list)
            pub_type = ' '.join(pub_type)
        #add edge to graph
        nx_mdg.add_node(s, key=n3(s))
        nx_mdg.add_node(o, key=n3(o))
        nx_mdg.add_edge(s, o, **{'key': p, 'predicate_key': pred_key, 'weight':0.0, 'source_graph': 'machine_read'})

        #add edge metadata to edge_metadict
        if pred_curie not in edge_metadata[edge_key]['relations']:
            edge_metadata[edge_key]['relations'][pred_curie] = {
                'source': [],
                'year': [],
                'pmid': [],
                'sentence': [],
                'belief': [],
                'weight': [],
                'pubtype': [],
                'source_section': [],
                }
        edge_metadata[edge_key]['relations'][pred_curie]['source'].append('semrep')
        edge_metadata[edge_key]['relations'][pred_curie]['year'].append(timestamp)
        edge_metadata[edge_key]['relations'][pred_curie]['pmid'].append(pmid)
        edge_metadata[edge_key]['relations'][pred_curie]['sentence'].append(sentence)
        edge_metadata[edge_key]['relations'][pred_curie]['belief'].append(belief_score)
        edge_metadata[edge_key]['relations'][pred_curie]['weight'].append(0.0)
        edge_metadata[edge_key]['relations'][pred_curie]['pubtype'].append(pub_type)
        edge_metadata[edge_key]['relations'][pred_curie]['source_section'].append(source_section)

        temp_df = pd.DataFrame([[subj, obj, pred_curie, subj_curie, obj_curie, 'semrep', pred_curie, timestamp, pmid, sentence, belief_score, pub_type, source_section]], columns=edge_metadata_df.columns)
        edge_metadata_df = pd.concat([edge_metadata_df, temp_df], ignore_index=True)
nx.write_gpickle(nx_mdg, 'output_graphs/machineread_semrep_version2.gpickle')

  0%|          | 0/67802 [00:00<?, ?it/s]

100%|██████████| 67802/67802 [4:12:51<00:00,  4.47it/s]  


In [16]:
##write edge metadata to pickle
with open('output_graphs/edge_metadata_reach_semrep_version2.pickle', 'wb') as filep:
    pickle.dump(edge_metadata, filep)

In [17]:
with open('output_graphs/uri_to_curie_map_reach_semrep.pickle', 'wb') as filep2:
    pickle.dump(uri_to_curie_map, filep2)

In [18]:
edge_metadata_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58754 entries, 0 to 58753
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   subject_uri     58754 non-null  object 
 1   object_uri      58754 non-null  object 
 2   relation        58754 non-null  object 
 3   subject_curie   58754 non-null  object 
 4   object_curie    58754 non-null  object 
 5   source          58754 non-null  object 
 6   edge_type       58754 non-null  object 
 7   year            58754 non-null  object 
 8   pmid            58754 non-null  object 
 9   sentence        58754 non-null  object 
 10  belief          58754 non-null  float64
 11  pubtype         58754 non-null  object 
 12  source_section  58754 non-null  object 
dtypes: float64(1), object(12)
memory usage: 5.8+ MB


In [19]:
edge_metadata_df.to_csv('output_graphs/edge_metadata_semrep_version2.tsv', sep='\t', index=False)

In [20]:
triples = len(graph)
nodes = len(set(list(graph.subjects()) + list(graph.objects())))
rels = len(set(list(graph.predicates())))
print(triples, nodes, rels)

67802 17447 17


In [21]:
#this should have less edges than rdflib graph after removing 'labels'
nodes = nx.number_of_nodes(nx_mdg)
edges = nx.number_of_edges(nx_mdg)
density = nx.density(nx_mdg)
avg_deg = float(edges)/nodes
print(nodes, edges, density, avg_deg)

8399 58754 0.0008329788747404438 6.995356590070246


In [22]:
#save node labels as dictionary
#key: URI, value is label
label_dict = {}
for i in range(len(df_new.index)):
    subj = str(df_new.at[i, 'subject_obo'])
    obj = str(df_new.at[i, 'object_obo'])
    pred = str(df_new.at[i, 'predicate_obo'])
    if subj not in label_dict:
        label_dict[subj] = {}
        label_dict[subj]['entity_type'] = 'NODES'
        label_dict[subj]['label'] = df_new.at[i, 'subject_name']
        label_dict[subj]['cui'] = df_new.at[i, 'subject_cui']
    if obj not in label_dict:
        label_dict[obj] = {}
        label_dict[obj]['entity_type'] = 'NODES'
        label_dict[obj]['label'] = df_new.at[i, 'object_name']
        label_dict[obj]['cui'] = df_new.at[i, 'object_cui']
    if pred not in label_dict:
        label_dict[pred] = {}
        label_dict[pred]['entity_type'] = 'RELATIONS'
        label_dict[pred]['label'] = df_new.at[i, 'predicate']
len(label_dict)

8415

In [23]:
with open('output_graphs/machineread_semrep_NodeLabels.pickle', 'wb') as file_p:
    pickle.dump(label_dict, file_p)

In [24]:
dfmap = pd.DataFrame.from_dict(label_dict, orient='index')
dfmap = dfmap.reset_index()
dfmap = dfmap.rename(columns={"index":"entity_uri"})
dfmap.to_csv('output_graphs/machineread_semrep_NodeLabels.tsv', index=False, sep='\t')