## NaPDI machine reading - process SemRep triples, map to OBO and generate machine reading graph with triples

Last run on 2022-04-06 with green tea, kratom and microbiome output

In [1]:
import pandas as pd
import numpy as np
import pickle
from rdflib.namespace import OWL, RDF, RDFS

In [2]:
import os
files = os.listdir('semrep_data/')
#read all files in semrep_data
files[0][-3:]
df = pd.DataFrame(columns=['index', 'pmid', 'subject_cui', 'subject_type', 'relation', 'object_cui', 'object_type', 'year', 'sentence'])
for file in files:
    if file[-3:] == 'tsv':
        print('Loading file: ', file)
        df_temp = pd.read_csv('semrep_data/'+file, sep='\t')
        print(df_temp.info())
        df = pd.concat([df, df_temp], ignore_index=True)
df.info()

Loading file:  greentea_pmid_all_predicates_semrep-errors-fixed.tsv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48957 entries, 0 to 48956
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         48957 non-null  int64 
 1   pmid          48957 non-null  int64 
 2   subject_cui   47969 non-null  object
 3   subject_name  47969 non-null  object
 4   subject_type  48957 non-null  object
 5   relation      48957 non-null  object
 6   object_cui    47668 non-null  object
 7   object_name   47668 non-null  object
 8   object_type   48957 non-null  object
 9   year          48957 non-null  object
 10  sentence      48957 non-null  object
dtypes: int64(2), object(9)
memory usage: 4.1+ MB
None
Loading file:  microbiome_pmid_all_predicates_semrep-errors-fixed.tsv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14893 entries, 0 to 14892
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
-

In [3]:
df.head()

Unnamed: 0,index,pmid,subject_cui,subject_type,relation,object_cui,object_type,year,sentence,subject_name,object_name
0,0,29913456,C1704263,"orch,phsu",AFFECTS,C2349205,fndg,2018,29913456_ascii.txt.tx.1 Abstract Background/Ai...,Green Tea Extract,Normal heart
1,1,29913456,C0014518,dsyn,PROCESS_OF,C0596888,anim,2018,29913456_ascii.txt.tx.2 Methods: The study pop...,Toxic Epidermal Necrolysis,mature animal
2,2,29913456,C0043047,"inch,phsu",ADMINISTERED_TO,C0034693,mamm,2018,\n,Water,Rattus norvegicus
3,3,29913456,C0232164,ortf,PROCESS_OF,C0003062,anim,2018,"29913456_ascii.txt.tx.1 Then, in vivo and ex v...",Cardiac function,Animals
4,4,29913456,C0185115,topp,USES,C0006675,"bacs,elii,phsu",2018,29913456_ascii.txt.tx.2 Results: GTE cardiom...,Extraction,Calcium


In [4]:
#processing TSV
'''
0. Concatenate files and remove duplicates, get stats of unique PMIDs and triples
1. Fix sentence (empty takes value of previous sentence)
2. Extract year from date and save
3. Map from CUI to GO, HPO where available
4. Map with OntoRunNER
'''

'\n0. Concatenate files and remove duplicates, get stats of unique PMIDs and triples\n1. Fix sentence (empty takes value of previous sentence)\n2. Extract year from date and save\n3. Map from CUI to GO, HPO where available\n4. Map with OntoRunNER\n'

In [5]:
df = df.drop_duplicates(ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277812 entries, 0 to 277811
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   index         277812 non-null  object
 1   pmid          277812 non-null  object
 2   subject_cui   272500 non-null  object
 3   subject_type  277812 non-null  object
 4   relation      277812 non-null  object
 5   object_cui    271016 non-null  object
 6   object_type   277812 non-null  object
 7   year          277542 non-null  object
 8   sentence      277812 non-null  object
 9   subject_name  272500 non-null  object
 10  object_name   271016 non-null  object
dtypes: object(11)
memory usage: 23.3+ MB


In [6]:
df = df.fillna('')

In [7]:
len(df.pmid.unique())

775

In [8]:
import re

In [9]:
#use re to fix year as format is not consistent
for i in range(len(df.index)):
    pub_date = df.at[i, 'year']
    x = re.findall(r'\d+', pub_date)
    if x:
        df.at[i, 'year'] = x[0]
df.head()

Unnamed: 0,index,pmid,subject_cui,subject_type,relation,object_cui,object_type,year,sentence,subject_name,object_name
0,0,29913456,C1704263,"orch,phsu",AFFECTS,C2349205,fndg,2018,29913456_ascii.txt.tx.1 Abstract Background/Ai...,Green Tea Extract,Normal heart
1,1,29913456,C0014518,dsyn,PROCESS_OF,C0596888,anim,2018,29913456_ascii.txt.tx.2 Methods: The study pop...,Toxic Epidermal Necrolysis,mature animal
2,2,29913456,C0043047,"inch,phsu",ADMINISTERED_TO,C0034693,mamm,2018,\n,Water,Rattus norvegicus
3,3,29913456,C0232164,ortf,PROCESS_OF,C0003062,anim,2018,"29913456_ascii.txt.tx.1 Then, in vivo and ex v...",Cardiac function,Animals
4,4,29913456,C0185115,topp,USES,C0006675,"bacs,elii,phsu",2018,29913456_ascii.txt.tx.2 Results: GTE cardiom...,Extraction,Calcium


In [10]:
source_sent = df['sentence'].tolist()

In [11]:
len(source_sent)

277812

In [12]:
##fix source sentence
last = len(source_sent)-1
sentences = []
count = 0
for sent in source_sent:
    sent = sent.strip()
    flag = 0
    if sent == '' or '|||' in sent:
        for i in range(count, -1, -1):
            if source_sent[i].strip() == '' or '|||' in source_sent[i].strip():
                continue
            else:
                sentences.append(source_sent[i].strip())
                flag = 1
                break
        if flag == 0:
            sentences.append(sent.strip())
    else:
        sentences.append(sent.strip())
    count += 1
            
len(sentences)

277812

In [13]:
sentences = pd.Series(sentences)

In [14]:
df['source_text'] = sentences
df.head()

Unnamed: 0,index,pmid,subject_cui,subject_type,relation,object_cui,object_type,year,sentence,subject_name,object_name,source_text
0,0,29913456,C1704263,"orch,phsu",AFFECTS,C2349205,fndg,2018,29913456_ascii.txt.tx.1 Abstract Background/Ai...,Green Tea Extract,Normal heart,29913456_ascii.txt.tx.1 Abstract Background/Ai...
1,1,29913456,C0014518,dsyn,PROCESS_OF,C0596888,anim,2018,29913456_ascii.txt.tx.2 Methods: The study pop...,Toxic Epidermal Necrolysis,mature animal,29913456_ascii.txt.tx.2 Methods: The study pop...
2,2,29913456,C0043047,"inch,phsu",ADMINISTERED_TO,C0034693,mamm,2018,\n,Water,Rattus norvegicus,29913456_ascii.txt.tx.2 Methods: The study pop...
3,3,29913456,C0232164,ortf,PROCESS_OF,C0003062,anim,2018,"29913456_ascii.txt.tx.1 Then, in vivo and ex v...",Cardiac function,Animals,"29913456_ascii.txt.tx.1 Then, in vivo and ex v..."
4,4,29913456,C0185115,topp,USES,C0006675,"bacs,elii,phsu",2018,29913456_ascii.txt.tx.2 Results: GTE cardiom...,Extraction,Calcium,29913456_ascii.txt.tx.2 Results: GTE cardiom...


In [15]:
df = df.drop(['sentence', 'index'], axis=1)

In [16]:
df.head()

Unnamed: 0,pmid,subject_cui,subject_type,relation,object_cui,object_type,year,subject_name,object_name,source_text
0,29913456,C1704263,"orch,phsu",AFFECTS,C2349205,fndg,2018,Green Tea Extract,Normal heart,29913456_ascii.txt.tx.1 Abstract Background/Ai...
1,29913456,C0014518,dsyn,PROCESS_OF,C0596888,anim,2018,Toxic Epidermal Necrolysis,mature animal,29913456_ascii.txt.tx.2 Methods: The study pop...
2,29913456,C0043047,"inch,phsu",ADMINISTERED_TO,C0034693,mamm,2018,Water,Rattus norvegicus,29913456_ascii.txt.tx.2 Methods: The study pop...
3,29913456,C0232164,ortf,PROCESS_OF,C0003062,anim,2018,Cardiac function,Animals,"29913456_ascii.txt.tx.1 Then, in vivo and ex v..."
4,29913456,C0185115,topp,USES,C0006675,"bacs,elii,phsu",2018,Extraction,Calcium,29913456_ascii.txt.tx.2 Results: GTE cardiom...


In [17]:
preds = ['affects',
'associated_with',
'augments',
'causes',
'coexists_with',
'complicates',
'disrupts',
'inhibits',
'interacts_with',
'part_of',
'precedes',
'predisposes',
'prevents',
'produces',
'stimulates',
'treats']

In [18]:
df['predicate'] = df['relation'].str.lower()
df.head()

Unnamed: 0,pmid,subject_cui,subject_type,relation,object_cui,object_type,year,subject_name,object_name,source_text,predicate
0,29913456,C1704263,"orch,phsu",AFFECTS,C2349205,fndg,2018,Green Tea Extract,Normal heart,29913456_ascii.txt.tx.1 Abstract Background/Ai...,affects
1,29913456,C0014518,dsyn,PROCESS_OF,C0596888,anim,2018,Toxic Epidermal Necrolysis,mature animal,29913456_ascii.txt.tx.2 Methods: The study pop...,process_of
2,29913456,C0043047,"inch,phsu",ADMINISTERED_TO,C0034693,mamm,2018,Water,Rattus norvegicus,29913456_ascii.txt.tx.2 Methods: The study pop...,administered_to
3,29913456,C0232164,ortf,PROCESS_OF,C0003062,anim,2018,Cardiac function,Animals,"29913456_ascii.txt.tx.1 Then, in vivo and ex v...",process_of
4,29913456,C0185115,topp,USES,C0006675,"bacs,elii,phsu",2018,Extraction,Calcium,29913456_ascii.txt.tx.2 Results: GTE cardiom...,uses


In [19]:
df = df.drop(['relation'], axis=1)

In [20]:
dfn = df[df['predicate'].isin(preds)]
dfn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158549 entries, 0 to 277811
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   pmid          158549 non-null  int64 
 1   subject_cui   158549 non-null  object
 2   subject_type  158549 non-null  object
 3   object_cui    158549 non-null  object
 4   object_type   158549 non-null  object
 5   year          158549 non-null  object
 6   subject_name  158549 non-null  object
 7   object_name   158549 non-null  object
 8   source_text   158549 non-null  object
 9   predicate     158549 non-null  object
dtypes: int64(1), object(9)
memory usage: 13.3+ MB


In [21]:
dfn['predicate'].value_counts()

interacts_with     26146
affects            18902
part_of            18850
inhibits           15788
coexists_with      15521
stimulates         14490
treats             13062
causes              9540
disrupts            7800
augments            7435
associated_with     3968
produces            3213
prevents            2264
predisposes         1217
precedes             309
complicates           44
Name: predicate, dtype: int64

In [22]:
dfn['subject_map'] = None
dfn['object_map'] = None
dfn['predicate_obo'] = None
dfn['subject_obo'] = None
dfn['object_obo'] = None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [23]:
##add relation ontology mapping
predMapSemRep = {
'affects': 'RO_0002596',
'associated_with': 'RO_0002610',
'augments': 'RO_0002598',
'causes': 'RO_0002566',
'coexists_with': 'RO_0002490',
'complicates': 'RO_0003309',
'disrupts': 'RO_0002212',
'inhibits': 'RO_0002449',
'interacts_with': 'RO_0002434',
'part_of': 'BFO_0000050',
'precedes': 'BFO_0000063',
'predisposes': 'RO_0003302',
'prevents': 'RO_0002599',
'produces': 'RO_0003000',
'stimulates': 'RO_0002213',
'treats': 'RO_0002606'
}

In [24]:
def relation_mapping(row):
    rel = row['predicate'].lower()
    if rel in predMapSemRep:
        return predMapSemRep[rel]
    else:
        return ''

In [25]:
dfn['predicate_obo'] = dfn.apply(relation_mapping, axis=1)
dfn.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,pmid,subject_cui,subject_type,object_cui,object_type,year,subject_name,object_name,source_text,predicate,subject_map,object_map,predicate_obo,subject_obo,object_obo
0,29913456,C1704263,"orch,phsu",C2349205,fndg,2018,Green Tea Extract,Normal heart,29913456_ascii.txt.tx.1 Abstract Background/Ai...,affects,,,RO_0002596,,
5,29913456,,"gngm,aapp",C2744579,"aapp,gngm,enzy",2018,,"ATP8A2 protein, human",29913456_ascii.txt.tx.2 Consistent with functi...,stimulates,,,RO_0002213,,
6,29913456,C0020538,dsyn,C0878544,dsyn,2018,Hypertensive disease,Cardiomyopathies,29913456_ascii.txt.tx.1 Previous experimental ...,coexists_with,,,RO_0002490,,
8,29913456,C0059438,"orch,phsu",C0035126,inpo,2018,epigallocatechin gallate,Reperfusion Injury,29913456_ascii.txt.tx.1 Previous experimental ...,treats,,,RO_0002606,,
9,29913456,C0242973,patf,C0878544,dsyn,2018,Ventricular Dysfunction,Cardiomyopathies,29913456_ascii.txt.tx.1 Previous experimental ...,coexists_with,,,RO_0002490,,


In [26]:
dfn = dfn.reset_index(drop=True)
dfn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 158549 entries, 0 to 158548
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   pmid           158549 non-null  int64 
 1   subject_cui    158549 non-null  object
 2   subject_type   158549 non-null  object
 3   object_cui     158549 non-null  object
 4   object_type    158549 non-null  object
 5   year           158549 non-null  object
 6   subject_name   158549 non-null  object
 7   object_name    158549 non-null  object
 8   source_text    158549 non-null  object
 9   predicate      158549 non-null  object
 10  subject_map    0 non-null       object
 11  object_map     0 non-null       object
 12  predicate_obo  158549 non-null  object
 13  subject_obo    0 non-null       object
 14  object_obo     0 non-null       object
dtypes: int64(1), object(14)
memory usage: 18.1+ MB


In [27]:
#filter by semantic types here and then start mapping
excluded_semtype = ['acty','bhvr','evnt','gora','mcha','ocac', #Occupational Activity
'clas',
'cnce',
'ftcn',
'grpa',
'idcn',
'inpr',
'lang',
'qlco',
'qnco',
'rnlw',
'spco',
'tmco',
'enty',
'mnob',
'phob',
'bmod',
'ocdi',
'hcro',
'orgt',
'pros',
'shro',
'eehu',
'hcpp']

In [28]:
dfn1 = dfn[~dfn['subject_type'].isin(excluded_semtype)]
dfn1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158542 entries, 0 to 158548
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   pmid           158542 non-null  int64 
 1   subject_cui    158542 non-null  object
 2   subject_type   158542 non-null  object
 3   object_cui     158542 non-null  object
 4   object_type    158542 non-null  object
 5   year           158542 non-null  object
 6   subject_name   158542 non-null  object
 7   object_name    158542 non-null  object
 8   source_text    158542 non-null  object
 9   predicate      158542 non-null  object
 10  subject_map    0 non-null       object
 11  object_map     0 non-null       object
 12  predicate_obo  158542 non-null  object
 13  subject_obo    0 non-null       object
 14  object_obo     0 non-null       object
dtypes: int64(1), object(14)
memory usage: 19.4+ MB


In [29]:
dfn2 = dfn1[~dfn1['object_type'].isin(excluded_semtype)]
dfn2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158519 entries, 0 to 158548
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   pmid           158519 non-null  int64 
 1   subject_cui    158519 non-null  object
 2   subject_type   158519 non-null  object
 3   object_cui     158519 non-null  object
 4   object_type    158519 non-null  object
 5   year           158519 non-null  object
 6   subject_name   158519 non-null  object
 7   object_name    158519 non-null  object
 8   source_text    158519 non-null  object
 9   predicate      158519 non-null  object
 10  subject_map    0 non-null       object
 11  object_map     0 non-null       object
 12  predicate_obo  158519 non-null  object
 13  subject_obo    0 non-null       object
 14  object_obo     0 non-null       object
dtypes: int64(1), object(14)
memory usage: 19.4+ MB


In [30]:
##exclude all concepts that occur in SemMedDB GENERIC.CONCEPT table
#Get CSV file - https://lhncbc.nlm.nih.gov/ii/tools/SemRep_SemMedDB_SKR/SemMedDB_download.html -- doesn't work
#download from halil covid-19 repository - 
semmed = pd.read_csv('cui_to_ontology_maps/semmedVER43_2020_R_GENERIC_CONCEPT.csv', header=None)
semmed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 259 entries, 0 to 258
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       259 non-null    int64 
 1   1       259 non-null    object
 2   2       259 non-null    object
dtypes: int64(1), object(2)
memory usage: 6.2+ KB


In [31]:
semmed.head()

Unnamed: 0,0,1,2
0,1983,C0001687,Adverse effects NEC
1,1984,C0002526,"Amino Acids, Peptides, and Proteins"
2,1985,C0003043,Animalia
3,1986,C0003062,Animals
4,1987,C0005515,Biological Factors


In [32]:
semmed = semmed.rename(columns={1: 'CUI', 2: 'concept_name'})
semmed.head()

Unnamed: 0,0,CUI,concept_name
0,1983,C0001687,Adverse effects NEC
1,1984,C0002526,"Amino Acids, Peptides, and Proteins"
2,1985,C0003043,Animalia
3,1986,C0003062,Animals
4,1987,C0005515,Biological Factors


In [33]:
semmed = semmed.drop([0], axis=1)
semmed.head()

Unnamed: 0,CUI,concept_name
0,C0001687,Adverse effects NEC
1,C0002526,"Amino Acids, Peptides, and Proteins"
2,C0003043,Animalia
3,C0003062,Animals
4,C0005515,Biological Factors


In [34]:
#drop rows where subject and object matches generic concepts
generic_cui = semmed.CUI.tolist()
len(generic_cui)

259

In [35]:
dfn2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158519 entries, 0 to 158548
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   pmid           158519 non-null  int64 
 1   subject_cui    158519 non-null  object
 2   subject_type   158519 non-null  object
 3   object_cui     158519 non-null  object
 4   object_type    158519 non-null  object
 5   year           158519 non-null  object
 6   subject_name   158519 non-null  object
 7   object_name    158519 non-null  object
 8   source_text    158519 non-null  object
 9   predicate      158519 non-null  object
 10  subject_map    0 non-null       object
 11  object_map     0 non-null       object
 12  predicate_obo  158519 non-null  object
 13  subject_obo    0 non-null       object
 14  object_obo     0 non-null       object
dtypes: int64(1), object(14)
memory usage: 19.4+ MB


In [36]:
dfn3 = dfn2[~dfn2['subject_cui'].isin(generic_cui)]
dfn3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 152450 entries, 0 to 158548
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   pmid           152450 non-null  int64 
 1   subject_cui    152450 non-null  object
 2   subject_type   152450 non-null  object
 3   object_cui     152450 non-null  object
 4   object_type    152450 non-null  object
 5   year           152450 non-null  object
 6   subject_name   152450 non-null  object
 7   object_name    152450 non-null  object
 8   source_text    152450 non-null  object
 9   predicate      152450 non-null  object
 10  subject_map    0 non-null       object
 11  object_map     0 non-null       object
 12  predicate_obo  152450 non-null  object
 13  subject_obo    0 non-null       object
 14  object_obo     0 non-null       object
dtypes: int64(1), object(14)
memory usage: 18.6+ MB


In [37]:
dfn4 = dfn3[~dfn3['object_cui'].isin(generic_cui)]
dfn4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 145638 entries, 0 to 158548
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   pmid           145638 non-null  int64 
 1   subject_cui    145638 non-null  object
 2   subject_type   145638 non-null  object
 3   object_cui     145638 non-null  object
 4   object_type    145638 non-null  object
 5   year           145638 non-null  object
 6   subject_name   145638 non-null  object
 7   object_name    145638 non-null  object
 8   source_text    145638 non-null  object
 9   predicate      145638 non-null  object
 10  subject_map    0 non-null       object
 11  object_map     0 non-null       object
 12  predicate_obo  145638 non-null  object
 13  subject_obo    0 non-null       object
 14  object_obo     0 non-null       object
dtypes: int64(1), object(14)
memory usage: 17.8+ MB


In [38]:
#handle subclassof when mapping to obo

In [39]:
#map from UMLS to GO, HPO
with open('cui_to_ontology_maps/go_hpo_map_dict.pickle', 'rb') as filep:
    go_hpo_mapping_dict = pickle.load(filep)
len(go_hpo_mapping_dict)

87745

In [40]:
def umls_go_hpo_map(row, col):
    if col == 'subject':
        cui = row['subject_cui']
    elif col == 'object':
        cui = row['object_cui']
    else:
        print('specify if subject or object mapping required')
        exit(0)
    if cui in go_hpo_mapping_dict:
        if len(go_hpo_mapping_dict):
            return go_hpo_mapping_dict[cui][0]
    return None

In [41]:
dfn4['subject_obo'] = dfn4.apply(umls_go_hpo_map, axis=1, col='subject')
dfn4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 145638 entries, 0 to 158548
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   pmid           145638 non-null  int64 
 1   subject_cui    145638 non-null  object
 2   subject_type   145638 non-null  object
 3   object_cui     145638 non-null  object
 4   object_type    145638 non-null  object
 5   year           145638 non-null  object
 6   subject_name   145638 non-null  object
 7   object_name    145638 non-null  object
 8   source_text    145638 non-null  object
 9   predicate      145638 non-null  object
 10  subject_map    0 non-null       object
 11  object_map     0 non-null       object
 12  predicate_obo  145638 non-null  object
 13  subject_obo    7160 non-null    object
 14  object_obo     0 non-null       object
dtypes: int64(1), object(14)
memory usage: 17.8+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [42]:
dfn4['object_obo'] = dfn4.apply(umls_go_hpo_map, axis=1, col='object')
dfn4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 145638 entries, 0 to 158548
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   pmid           145638 non-null  int64 
 1   subject_cui    145638 non-null  object
 2   subject_type   145638 non-null  object
 3   object_cui     145638 non-null  object
 4   object_type    145638 non-null  object
 5   year           145638 non-null  object
 6   subject_name   145638 non-null  object
 7   object_name    145638 non-null  object
 8   source_text    145638 non-null  object
 9   predicate      145638 non-null  object
 10  subject_map    0 non-null       object
 11  object_map     0 non-null       object
 12  predicate_obo  145638 non-null  object
 13  subject_obo    7160 non-null    object
 14  object_obo     29282 non-null   object
dtypes: int64(1), object(14)
memory usage: 17.8+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [43]:
dfn4 = dfn4.reset_index(drop=True)

In [44]:
dfn4.head()

Unnamed: 0,pmid,subject_cui,subject_type,object_cui,object_type,year,subject_name,object_name,source_text,predicate,subject_map,object_map,predicate_obo,subject_obo,object_obo
0,29913456,C1704263,"orch,phsu",C2349205,fndg,2018,Green Tea Extract,Normal heart,29913456_ascii.txt.tx.1 Abstract Background/Ai...,affects,,,RO_0002596,,
1,29913456,,"gngm,aapp",C2744579,"aapp,gngm,enzy",2018,,"ATP8A2 protein, human",29913456_ascii.txt.tx.2 Consistent with functi...,stimulates,,,RO_0002213,,
2,29913456,C0020538,dsyn,C0878544,dsyn,2018,Hypertensive disease,Cardiomyopathies,29913456_ascii.txt.tx.1 Previous experimental ...,coexists_with,,,RO_0002490,HP_0000822,HP_0001638
3,29913456,C0059438,"orch,phsu",C0035126,inpo,2018,epigallocatechin gallate,Reperfusion Injury,29913456_ascii.txt.tx.1 Previous experimental ...,treats,,,RO_0002606,,
4,29913456,C0242973,patf,C0878544,dsyn,2018,Ventricular Dysfunction,Cardiomyopathies,29913456_ascii.txt.tx.1 Previous experimental ...,coexists_with,,,RO_0002490,,HP_0001638


In [45]:
dfn_subset = dfn4

In [46]:
dfn_subset.head()

Unnamed: 0,pmid,subject_cui,subject_type,object_cui,object_type,year,subject_name,object_name,source_text,predicate,subject_map,object_map,predicate_obo,subject_obo,object_obo
0,29913456,C1704263,"orch,phsu",C2349205,fndg,2018,Green Tea Extract,Normal heart,29913456_ascii.txt.tx.1 Abstract Background/Ai...,affects,,,RO_0002596,,
1,29913456,,"gngm,aapp",C2744579,"aapp,gngm,enzy",2018,,"ATP8A2 protein, human",29913456_ascii.txt.tx.2 Consistent with functi...,stimulates,,,RO_0002213,,
2,29913456,C0020538,dsyn,C0878544,dsyn,2018,Hypertensive disease,Cardiomyopathies,29913456_ascii.txt.tx.1 Previous experimental ...,coexists_with,,,RO_0002490,HP_0000822,HP_0001638
3,29913456,C0059438,"orch,phsu",C0035126,inpo,2018,epigallocatechin gallate,Reperfusion Injury,29913456_ascii.txt.tx.1 Previous experimental ...,treats,,,RO_0002606,,
4,29913456,C0242973,patf,C0878544,dsyn,2018,Ventricular Dysfunction,Cardiomyopathies,29913456_ascii.txt.tx.1 Previous experimental ...,coexists_with,,,RO_0002490,,HP_0001638


In [47]:
##run the following cells if running ontorunner on unmapped concepts

In [48]:
unmapped_cui = []
unmapped_string_umls = []
for i in range(len(dfn_subset.index)):
    if not dfn_subset.at[i, 'subject_obo']:
        subcui = dfn_subset.at[i, 'subject_cui']
        if subcui not in unmapped_cui:
            unmapped_cui.append(dfn_subset.at[i, 'subject_cui'])
            unmapped_string_umls.append(dfn_subset.at[i, 'subject_name'])
    if not dfn_subset.at[i, 'object_obo']:
        objcui = dfn_subset.at[i, 'object_cui']
        if objcui not in unmapped_cui:
            unmapped_cui.append(dfn_subset.at[i, 'object_cui'])
            unmapped_string_umls.append(dfn_subset.at[i, 'object_name'])
print(len(unmapped_cui), len(unmapped_string_umls))

6577 6577


In [49]:
unmapped_string_umls[:10]

['Green Tea Extract',
 'Normal heart',
 '',
 'ATP8A2 protein, human',
 'epigallocatechin gallate',
 'Reperfusion Injury',
 'Ventricular Dysfunction',
 'Troponin',
 'DNA Modification Methylases',
 'silence']

In [50]:
unmapped_cui[2]

''

In [58]:
with open('semrep_data/unmapped_semrep_subset.txt', 'w') as fileo:
    for item in unmapped_string_umls:
        fileo.write(item+'\n')

In [59]:
with open('semrep_data/unmapped_semrep_cui_subset.txt', 'w') as fileco:
    for item in unmapped_cui:
        fileco.write(item+'\n')

In [None]:
#add ontorunner mapping

In [52]:
obomap = pd.read_csv('semrep_data/unmapped_semrep_subset_20220406.csv')
obomap.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7183 entries, 0 to 7182
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            7183 non-null   int64 
 1   text          7183 non-null   object
 2   concept_name  3951 non-null   object
 3   CURIE         3951 non-null   object
 4   Unnamed: 4    2 non-null      object
dtypes: int64(1), object(4)
memory usage: 280.7+ KB


In [53]:
obomap.head()

Unnamed: 0,id,text,concept_name,CURIE,Unnamed: 4
0,0,Green Tea Extract,tea,napdi_srs_imports:camellia_sinensis_leaf,
1,1,Normal heart,heart,UBERON:0000948,
2,2,"ATP8A2 protein, human",ATP8A2[SYNONYM_OF:phospholipid-transporting AT...,PR:000029291_SYNONYM,
3,3,epigallocatechin gallate,epigallocatechin gallate,napdi_srs_imports:epigallocatechin_gallate,
4,4,Reperfusion Injury,,,


In [54]:
with open('semrep_data/unmapped_semrep_cui_subset.txt', 'r') as fileco:
    unmapped_cui = fileco.readlines()

In [55]:
unmapped_cui[2].strip()

''

In [56]:
unmapped_cui_new = []
for item in unmapped_cui:
    cui = item.strip()
    if cui != '':
        unmapped_cui_new.append(cui)
len(unmapped_cui_new)

7183

In [57]:
obomap['CUI'] = unmapped_cui_new
obomap.head()

Unnamed: 0,id,text,concept_name,CURIE,Unnamed: 4,CUI
0,0,Green Tea Extract,tea,napdi_srs_imports:camellia_sinensis_leaf,,C1704263
1,1,Normal heart,heart,UBERON:0000948,,C2349205
2,2,"ATP8A2 protein, human",ATP8A2[SYNONYM_OF:phospholipid-transporting AT...,PR:000029291_SYNONYM,,C2744579
3,3,epigallocatechin gallate,epigallocatechin gallate,napdi_srs_imports:epigallocatechin_gallate,,C0059438
4,4,Reperfusion Injury,,,,C0035126


In [56]:
##clean CURIEs to remove 'SYNONYM' and add underscore. Also find appropriate prefix
##napdi - http://napdi.org/napdi_srs_imports:7_hydroxy_mitragynine

In [58]:
obomap = obomap.fillna('')
obomap.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7183 entries, 0 to 7182
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            7183 non-null   int64 
 1   text          7183 non-null   object
 2   concept_name  7183 non-null   object
 3   CURIE         7183 non-null   object
 4   Unnamed: 4    7183 non-null   object
 5   CUI           7183 non-null   object
dtypes: int64(1), object(5)
memory usage: 336.8+ KB


In [59]:
x = 'PR:00000_SYNONYM'
x.replace(':', '_')

'PR_00000_SYNONYM'

In [60]:
def process_CURIE(row):
    curie = row['CURIE']
    if curie == '':
        return curie
    elif 'napdi' in curie:
        return curie
    elif 'SYNONYM' in curie:
        temp = curie.split('_')[0]
        temp = temp.replace(':', '_')
        return temp
    else:
        temp = curie.replace(':', '_')
        return temp

In [61]:
obomap['CURIE_new'] = obomap.apply(process_CURIE, axis=1)
obomap.head()

Unnamed: 0,id,text,concept_name,CURIE,Unnamed: 4,CUI,CURIE_new
0,0,Green Tea Extract,tea,napdi_srs_imports:camellia_sinensis_leaf,,C1704263,napdi_srs_imports:camellia_sinensis_leaf
1,1,Normal heart,heart,UBERON:0000948,,C2349205,UBERON_0000948
2,2,"ATP8A2 protein, human",ATP8A2[SYNONYM_OF:phospholipid-transporting AT...,PR:000029291_SYNONYM,,C2744579,PR_000029291
3,3,epigallocatechin gallate,epigallocatechin gallate,napdi_srs_imports:epigallocatechin_gallate,,C0059438,napdi_srs_imports:epigallocatechin_gallate
4,4,Reperfusion Injury,,,,C0035126,


In [62]:
obomap.at[436, 'CURIE_new'] = 'MONDO_0001060'

In [63]:
obomap.iloc[436]

id                                                            436
text                                         Early gastric cancer
concept_name                                 Early gastric cancer
CURIE           early gastric cancer[SYNONYM_OF:microinvasive ...
Unnamed: 4                                  MONDO:0001060_SYNONYM
CUI                                                      C0349530
CURIE_new                                           MONDO_0001060
Name: 436, dtype: object

In [64]:
obosub = obomap.loc[obomap['CURIE_new'] != '']
obosub.head()

Unnamed: 0,id,text,concept_name,CURIE,Unnamed: 4,CUI,CURIE_new
0,0,Green Tea Extract,tea,napdi_srs_imports:camellia_sinensis_leaf,,C1704263,napdi_srs_imports:camellia_sinensis_leaf
1,1,Normal heart,heart,UBERON:0000948,,C2349205,UBERON_0000948
2,2,"ATP8A2 protein, human",ATP8A2[SYNONYM_OF:phospholipid-transporting AT...,PR:000029291_SYNONYM,,C2744579,PR_000029291
3,3,epigallocatechin gallate,epigallocatechin gallate,napdi_srs_imports:epigallocatechin_gallate,,C0059438,napdi_srs_imports:epigallocatechin_gallate
7,7,DNA Modification Methylases,modification methylase[SYNONYM_OF:DNA restrict...,PR:000050198_SYNONYM,,C0012873,PR_000050198


In [65]:
obosub = obosub.reset_index(drop=True)
obosub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3951 entries, 0 to 3950
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            3951 non-null   int64 
 1   text          3951 non-null   object
 2   concept_name  3951 non-null   object
 3   CURIE         3951 non-null   object
 4   Unnamed: 4    3951 non-null   object
 5   CUI           3951 non-null   object
 6   CURIE_new     3951 non-null   object
dtypes: int64(1), object(6)
memory usage: 216.2+ KB


In [66]:
obomap.iloc[436]

id                                                            436
text                                         Early gastric cancer
concept_name                                 Early gastric cancer
CURIE           early gastric cancer[SYNONYM_OF:microinvasive ...
Unnamed: 4                                  MONDO:0001060_SYNONYM
CUI                                                      C0349530
CURIE_new                                           MONDO_0001060
Name: 436, dtype: object

In [67]:
obosub.to_csv('cui_to_ontology_maps/mapped_semrep_subset_20220406.csv', index=False)

In [68]:
##create mapping dictionary
obomap_dict = {}
for i in range(len(obosub.index)):
    cui = obosub.at[i, 'CUI']
    curie = obosub.at[i, 'CURIE_new']
    obomap_dict[cui] = curie
len(obomap_dict)

3951

In [69]:
with open('cui_to_ontology_maps/CUItoOBO_20220406.pickle', 'wb') as filep:
    pickle.dump(obomap_dict, filep)

In [70]:
def get_obo_mapping(row, col):
    if col == 'subject':
        cui = row['subject_cui']
    elif col == 'object':
        cui = row['object_cui']
    else:
        print('specify if subject or object mapping required')
        exit(0)
    if cui in obomap_dict:
        return obomap_dict[cui]
    return None

In [71]:
dfn_subset.head()

Unnamed: 0,pmid,subject_cui,subject_type,object_cui,object_type,year,subject_name,object_name,source_text,predicate,subject_map,object_map,predicate_obo,subject_obo,object_obo
0,29913456,C1704263,"orch,phsu",C2349205,fndg,2018,Green Tea Extract,Normal heart,29913456_ascii.txt.tx.1 Abstract Background/Ai...,affects,,,RO_0002596,,
1,29913456,,"gngm,aapp",C2744579,"aapp,gngm,enzy",2018,,"ATP8A2 protein, human",29913456_ascii.txt.tx.2 Consistent with functi...,stimulates,,,RO_0002213,,
2,29913456,C0020538,dsyn,C0878544,dsyn,2018,Hypertensive disease,Cardiomyopathies,29913456_ascii.txt.tx.1 Previous experimental ...,coexists_with,,,RO_0002490,HP_0000822,HP_0001638
3,29913456,C0059438,"orch,phsu",C0035126,inpo,2018,epigallocatechin gallate,Reperfusion Injury,29913456_ascii.txt.tx.1 Previous experimental ...,treats,,,RO_0002606,,
4,29913456,C0242973,patf,C0878544,dsyn,2018,Ventricular Dysfunction,Cardiomyopathies,29913456_ascii.txt.tx.1 Previous experimental ...,coexists_with,,,RO_0002490,,HP_0001638


In [72]:
#MAPPING
dfn_subset['subject_obo'] = dfn_subset.apply(get_obo_mapping, axis=1, col='subject')
dfn_subset.head()

Unnamed: 0,pmid,subject_cui,subject_type,object_cui,object_type,year,subject_name,object_name,source_text,predicate,subject_map,object_map,predicate_obo,subject_obo,object_obo
0,29913456,C1704263,"orch,phsu",C2349205,fndg,2018,Green Tea Extract,Normal heart,29913456_ascii.txt.tx.1 Abstract Background/Ai...,affects,,,RO_0002596,napdi_srs_imports:camellia_sinensis_leaf,
1,29913456,,"gngm,aapp",C2744579,"aapp,gngm,enzy",2018,,"ATP8A2 protein, human",29913456_ascii.txt.tx.2 Consistent with functi...,stimulates,,,RO_0002213,,
2,29913456,C0020538,dsyn,C0878544,dsyn,2018,Hypertensive disease,Cardiomyopathies,29913456_ascii.txt.tx.1 Previous experimental ...,coexists_with,,,RO_0002490,,HP_0001638
3,29913456,C0059438,"orch,phsu",C0035126,inpo,2018,epigallocatechin gallate,Reperfusion Injury,29913456_ascii.txt.tx.1 Previous experimental ...,treats,,,RO_0002606,napdi_srs_imports:epigallocatechin_gallate,
4,29913456,C0242973,patf,C0878544,dsyn,2018,Ventricular Dysfunction,Cardiomyopathies,29913456_ascii.txt.tx.1 Previous experimental ...,coexists_with,,,RO_0002490,,HP_0001638


In [73]:
dfn_subset['object_obo'] = dfn_subset.apply(get_obo_mapping, axis=1, col='object')
dfn_subset.head()

Unnamed: 0,pmid,subject_cui,subject_type,object_cui,object_type,year,subject_name,object_name,source_text,predicate,subject_map,object_map,predicate_obo,subject_obo,object_obo
0,29913456,C1704263,"orch,phsu",C2349205,fndg,2018,Green Tea Extract,Normal heart,29913456_ascii.txt.tx.1 Abstract Background/Ai...,affects,,,RO_0002596,napdi_srs_imports:camellia_sinensis_leaf,UBERON_0000948
1,29913456,,"gngm,aapp",C2744579,"aapp,gngm,enzy",2018,,"ATP8A2 protein, human",29913456_ascii.txt.tx.2 Consistent with functi...,stimulates,,,RO_0002213,,PR_000029291
2,29913456,C0020538,dsyn,C0878544,dsyn,2018,Hypertensive disease,Cardiomyopathies,29913456_ascii.txt.tx.1 Previous experimental ...,coexists_with,,,RO_0002490,,
3,29913456,C0059438,"orch,phsu",C0035126,inpo,2018,epigallocatechin gallate,Reperfusion Injury,29913456_ascii.txt.tx.1 Previous experimental ...,treats,,,RO_0002606,napdi_srs_imports:epigallocatechin_gallate,
4,29913456,C0242973,patf,C0878544,dsyn,2018,Ventricular Dysfunction,Cardiomyopathies,29913456_ascii.txt.tx.1 Previous experimental ...,coexists_with,,,RO_0002490,,


In [74]:
dfn_subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145638 entries, 0 to 145637
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   pmid           145638 non-null  int64 
 1   subject_cui    145638 non-null  object
 2   subject_type   145638 non-null  object
 3   object_cui     145638 non-null  object
 4   object_type    145638 non-null  object
 5   year           145638 non-null  object
 6   subject_name   145638 non-null  object
 7   object_name    145638 non-null  object
 8   source_text    145638 non-null  object
 9   predicate      145638 non-null  object
 10  subject_map    0 non-null       object
 11  object_map     0 non-null       object
 12  predicate_obo  145638 non-null  object
 13  subject_obo    103194 non-null  object
 14  object_obo     75713 non-null   object
dtypes: int64(1), object(14)
memory usage: 16.7+ MB


In [93]:
##add labels for all subjects and objects

In [75]:
dfn_subset.to_csv('semrep_data/semrep_all_predications_mapped_20220406.csv', index=False)

In [77]:
##read file after mapping
dfn_subset = pd.read_csv('semrep_data/semrep_all_predications_mapped_20220406.csv')
dfn_subset.head()

Unnamed: 0,pmid,subject_cui,subject_type,object_cui,object_type,year,subject_name,object_name,source_text,predicate,subject_map,object_map,predicate_obo,subject_obo,object_obo
0,29913456,C1704263,"orch,phsu",C2349205,fndg,2018.0,Green Tea Extract,Normal heart,29913456_ascii.txt.tx.1 Abstract Background/Ai...,affects,,,RO_0002596,napdi_srs_imports:camellia_sinensis_leaf,UBERON_0000948
1,29913456,,"gngm,aapp",C2744579,"aapp,gngm,enzy",2018.0,,"ATP8A2 protein, human",29913456_ascii.txt.tx.2 Consistent with functi...,stimulates,,,RO_0002213,,PR_000029291
2,29913456,C0020538,dsyn,C0878544,dsyn,2018.0,Hypertensive disease,Cardiomyopathies,29913456_ascii.txt.tx.1 Previous experimental ...,coexists_with,,,RO_0002490,,
3,29913456,C0059438,"orch,phsu",C0035126,inpo,2018.0,epigallocatechin gallate,Reperfusion Injury,29913456_ascii.txt.tx.1 Previous experimental ...,treats,,,RO_0002606,napdi_srs_imports:epigallocatechin_gallate,
4,29913456,C0242973,patf,C0878544,dsyn,2018.0,Ventricular Dysfunction,Cardiomyopathies,29913456_ascii.txt.tx.1 Previous experimental ...,coexists_with,,,RO_0002490,,


In [78]:
def add_prefix(row, col):
    obo_prefix = 'http://purl.obolibrary.org/obo/'
    napdi_prefix = 'http://napdi.org/'
    if col == 'predicate':
        predicate_obo = row['predicate_obo']
        if isinstance(predicate_obo, str):
            return obo_prefix+predicate_obo
        else:
            return ''
            
    elif col == 'subject':
        
        subject_obo = row['subject_obo']
        if isinstance(subject_obo, str):
            if subject_obo == None:
                return ''
            if 'napdi' in subject_obo:
                return napdi_prefix+subject_obo
            else:
                return obo_prefix+subject_obo
        else:
            return ''
    elif col == 'object':
        object_obo = row['object_obo']
        if isinstance(object_obo, str):
            if object_obo == None:
                return ''
            if 'napdi' in object_obo:
                return napdi_prefix+object_obo
            else:
                return obo_prefix+object_obo
        else:
            return ''

In [79]:
#add OBO identifiers to the OBO mappings (where not present) - see df
#drop rows with no mappings
dfn_subset['subject_obo'] = dfn_subset.apply(add_prefix, axis=1, col='subject')
dfn_subset['object_obo'] = dfn_subset.apply(add_prefix, axis=1, col='object')
dfn_subset['predicate_obo'] = dfn_subset.apply(add_prefix, axis=1, col='predicate')

In [80]:
dfn_subset.head()

Unnamed: 0,pmid,subject_cui,subject_type,object_cui,object_type,year,subject_name,object_name,source_text,predicate,subject_map,object_map,predicate_obo,subject_obo,object_obo
0,29913456,C1704263,"orch,phsu",C2349205,fndg,2018.0,Green Tea Extract,Normal heart,29913456_ascii.txt.tx.1 Abstract Background/Ai...,affects,,,http://purl.obolibrary.org/obo/RO_0002596,http://napdi.org/napdi_srs_imports:camellia_si...,http://purl.obolibrary.org/obo/UBERON_0000948
1,29913456,,"gngm,aapp",C2744579,"aapp,gngm,enzy",2018.0,,"ATP8A2 protein, human",29913456_ascii.txt.tx.2 Consistent with functi...,stimulates,,,http://purl.obolibrary.org/obo/RO_0002213,,http://purl.obolibrary.org/obo/PR_000029291
2,29913456,C0020538,dsyn,C0878544,dsyn,2018.0,Hypertensive disease,Cardiomyopathies,29913456_ascii.txt.tx.1 Previous experimental ...,coexists_with,,,http://purl.obolibrary.org/obo/RO_0002490,,
3,29913456,C0059438,"orch,phsu",C0035126,inpo,2018.0,epigallocatechin gallate,Reperfusion Injury,29913456_ascii.txt.tx.1 Previous experimental ...,treats,,,http://purl.obolibrary.org/obo/RO_0002606,http://napdi.org/napdi_srs_imports:epigallocat...,
4,29913456,C0242973,patf,C0878544,dsyn,2018.0,Ventricular Dysfunction,Cardiomyopathies,29913456_ascii.txt.tx.1 Previous experimental ...,coexists_with,,,http://purl.obolibrary.org/obo/RO_0002490,,


In [81]:
dfn_subset.to_csv('semrep_data/semrep_all_predications_mapped_with_prefix_20220406.csv', index=False)

In [82]:
dfn_subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145638 entries, 0 to 145637
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   pmid           145638 non-null  int64  
 1   subject_cui    141153 non-null  object 
 2   subject_type   145638 non-null  object 
 3   object_cui     141410 non-null  object 
 4   object_type    145638 non-null  object 
 5   year           145492 non-null  float64
 6   subject_name   141153 non-null  object 
 7   object_name    141410 non-null  object 
 8   source_text    145638 non-null  object 
 9   predicate      145638 non-null  object 
 10  subject_map    0 non-null       float64
 11  object_map     0 non-null       float64
 12  predicate_obo  145638 non-null  object 
 13  subject_obo    145638 non-null  object 
 14  object_obo     145638 non-null  object 
dtypes: float64(3), int64(1), object(11)
memory usage: 16.7+ MB


In [83]:
df_new = dfn_subset[dfn_subset['subject_obo'] != '']
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103194 entries, 0 to 145637
Data columns (total 15 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   pmid           103194 non-null  int64  
 1   subject_cui    103194 non-null  object 
 2   subject_type   103194 non-null  object 
 3   object_cui     99735 non-null   object 
 4   object_type    103194 non-null  object 
 5   year           103085 non-null  float64
 6   subject_name   103194 non-null  object 
 7   object_name    99735 non-null   object 
 8   source_text    103194 non-null  object 
 9   predicate      103194 non-null  object 
 10  subject_map    0 non-null       float64
 11  object_map     0 non-null       float64
 12  predicate_obo  103194 non-null  object 
 13  subject_obo    103194 non-null  object 
 14  object_obo     103194 non-null  object 
dtypes: float64(3), int64(1), object(11)
memory usage: 12.6+ MB


In [84]:
df_new = df_new[df_new['object_obo'] != '']
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54677 entries, 0 to 145637
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   pmid           54677 non-null  int64  
 1   subject_cui    54677 non-null  object 
 2   subject_type   54677 non-null  object 
 3   object_cui     54677 non-null  object 
 4   object_type    54677 non-null  object 
 5   year           54603 non-null  float64
 6   subject_name   54677 non-null  object 
 7   object_name    54677 non-null  object 
 8   source_text    54677 non-null  object 
 9   predicate      54677 non-null  object 
 10  subject_map    0 non-null      float64
 11  object_map     0 non-null      float64
 12  predicate_obo  54677 non-null  object 
 13  subject_obo    54677 non-null  object 
 14  object_obo     54677 non-null  object 
dtypes: float64(3), int64(1), object(11)
memory usage: 6.7+ MB


In [85]:
df_new['predicate'].value_counts()

interacts_with     13517
inhibits            8559
stimulates          7942
coexists_with       7404
part_of             6507
treats              3641
affects             1808
augments            1424
disrupts            1215
produces            1114
causes               692
associated_with      516
prevents             225
predisposes          100
complicates            7
precedes               6
Name: predicate, dtype: int64

In [86]:
df_new = df_new[df_new['predicate_obo'] != '']
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54677 entries, 0 to 145637
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   pmid           54677 non-null  int64  
 1   subject_cui    54677 non-null  object 
 2   subject_type   54677 non-null  object 
 3   object_cui     54677 non-null  object 
 4   object_type    54677 non-null  object 
 5   year           54603 non-null  float64
 6   subject_name   54677 non-null  object 
 7   object_name    54677 non-null  object 
 8   source_text    54677 non-null  object 
 9   predicate      54677 non-null  object 
 10  subject_map    0 non-null      float64
 11  object_map     0 non-null      float64
 12  predicate_obo  54677 non-null  object 
 13  subject_obo    54677 non-null  object 
 14  object_obo     54677 non-null  object 
dtypes: float64(3), int64(1), object(11)
memory usage: 6.7+ MB


In [87]:
df_new = df_new.drop_duplicates()
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13676 entries, 0 to 145566
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   pmid           13676 non-null  int64  
 1   subject_cui    13676 non-null  object 
 2   subject_type   13676 non-null  object 
 3   object_cui     13676 non-null  object 
 4   object_type    13676 non-null  object 
 5   year           13602 non-null  float64
 6   subject_name   13676 non-null  object 
 7   object_name    13676 non-null  object 
 8   source_text    13676 non-null  object 
 9   predicate      13676 non-null  object 
 10  subject_map    0 non-null      float64
 11  object_map     0 non-null      float64
 12  predicate_obo  13676 non-null  object 
 13  subject_obo    13676 non-null  object 
 14  object_obo     13676 non-null  object 
dtypes: float64(3), int64(1), object(11)
memory usage: 1.7+ MB


In [88]:
df_new = df_new.reset_index(drop=True)
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13676 entries, 0 to 13675
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   pmid           13676 non-null  int64  
 1   subject_cui    13676 non-null  object 
 2   subject_type   13676 non-null  object 
 3   object_cui     13676 non-null  object 
 4   object_type    13676 non-null  object 
 5   year           13602 non-null  float64
 6   subject_name   13676 non-null  object 
 7   object_name    13676 non-null  object 
 8   source_text    13676 non-null  object 
 9   predicate      13676 non-null  object 
 10  subject_map    0 non-null      float64
 11  object_map     0 non-null      float64
 12  predicate_obo  13676 non-null  object 
 13  subject_obo    13676 non-null  object 
 14  object_obo     13676 non-null  object 
dtypes: float64(3), int64(1), object(11)
memory usage: 1.6+ MB


In [89]:
df_new = df_new.drop(['subject_map', 'object_map'], axis=1)
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13676 entries, 0 to 13675
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   pmid           13676 non-null  int64  
 1   subject_cui    13676 non-null  object 
 2   subject_type   13676 non-null  object 
 3   object_cui     13676 non-null  object 
 4   object_type    13676 non-null  object 
 5   year           13602 non-null  float64
 6   subject_name   13676 non-null  object 
 7   object_name    13676 non-null  object 
 8   source_text    13676 non-null  object 
 9   predicate      13676 non-null  object 
 10  predicate_obo  13676 non-null  object 
 11  subject_obo    13676 non-null  object 
 12  object_obo     13676 non-null  object 
dtypes: float64(1), int64(1), object(11)
memory usage: 1.4+ MB


In [90]:
df_new.to_csv('semrep_data/semrep_predications_mapped_only_20220406.csv', index=False)

In [1]:
import pandas as pd

In [3]:
df = pd.read_csv('semrep_data/semrep_predications_mapped_only_20220406.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13676 entries, 0 to 13675
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   pmid           13676 non-null  int64  
 1   subject_cui    13676 non-null  object 
 2   subject_type   13676 non-null  object 
 3   object_cui     13676 non-null  object 
 4   object_type    13676 non-null  object 
 5   year           13602 non-null  float64
 6   subject_name   13676 non-null  object 
 7   object_name    13676 non-null  object 
 8   source_text    13676 non-null  object 
 9   predicate      13676 non-null  object 
 10  predicate_obo  13676 non-null  object 
 11  subject_obo    13676 non-null  object 
 12  object_obo     13676 non-null  object 
dtypes: float64(1), int64(1), object(11)
memory usage: 1.4+ MB


In [4]:
df['predicate'].value_counts()

interacts_with     3287
inhibits           2014
coexists_with      1945
stimulates         1902
part_of            1846
treats              927
affects             447
augments            334
produces            314
disrupts            242
causes              190
associated_with     135
prevents             59
predisposes          31
complicates           2
precedes              1
Name: predicate, dtype: int64

In [91]:
####create ntriples graph and run closure

In [92]:
#Create networkx graph from triples
import glob
import hashlib
import json
import networkx as nx  # type: ignore
import os
import os.path

from collections import Counter  # type: ignore
from more_itertools import unique_everseen  # type: ignore
from rdflib import BNode, Graph, Literal, Namespace, URIRef  # type: ignore
from rdflib.namespace import OWL, RDF, RDFS  # type: ignore
from rdflib.plugins.serializers.nt import _quoteLiteral  # type: ignore
import subprocess

from tqdm import tqdm  # type: ignore
from typing import Dict, List, Optional, Set, Tuple, Union

# set-up environment variables
obo = Namespace('http://purl.obolibrary.org/obo/')
oboinowl = Namespace('http://www.geneontology.org/formats/oboInOwl#')
schema = Namespace('http://www.w3.org/2001/XMLSchema#')
napdi = Namespace('http://napdi.org/napdi-srs-imports:')

In [93]:
dfres = df_new[['subject_obo', 'predicate_obo', 'object_obo']]
dfres.head()

Unnamed: 0,subject_obo,predicate_obo,object_obo
0,http://napdi.org/napdi_srs_imports:camellia_si...,http://purl.obolibrary.org/obo/RO_0002596,http://purl.obolibrary.org/obo/UBERON_0000948
1,http://purl.obolibrary.org/obo/CL_0000187,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/UBERON_0002082
2,http://purl.obolibrary.org/obo/CL_0000187,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/UBERON_0002082
3,http://purl.obolibrary.org/obo/CHEBI_17234,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CL_0000187
4,http://purl.obolibrary.org/obo/CHEBI_15361,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CL_0000187


In [94]:
df_new.head()

Unnamed: 0,pmid,subject_cui,subject_type,object_cui,object_type,year,subject_name,object_name,source_text,predicate,predicate_obo,subject_obo,object_obo
0,29913456,C1704263,"orch,phsu",C2349205,fndg,2018.0,Green Tea Extract,Normal heart,29913456_ascii.txt.tx.1 Abstract Background/Ai...,affects,http://purl.obolibrary.org/obo/RO_0002596,http://napdi.org/napdi_srs_imports:camellia_si...,http://purl.obolibrary.org/obo/UBERON_0000948
1,29913456,C0596981,cell,C0018827,bpoc,2018.0,Muscle Cells,Heart Ventricle,29913456_ascii.txt.tx.4 From the heart of 6 CT...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CL_0000187,http://purl.obolibrary.org/obo/UBERON_0002082
2,29913456,C0596981,cell,C0018827,bpoc,2018.0,Muscle Cells,Heart Ventricle,29913456_ascii.txt.tx.1 Mechanical properties ...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CL_0000187,http://purl.obolibrary.org/obo/UBERON_0002082
3,29913456,C0017725,"bacs,orch,phsu",C0225828,cell,2018.0,Glucose,"Myocytes, Cardiac",29913456_ascii.txt.tx.4 Mitochondrial respirat...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CHEBI_17234,http://purl.obolibrary.org/obo/CL_0000187
4,29913456,C0034354,orch,C0225828,cell,2018.0,Pyruvates,"Myocytes, Cardiac",29913456_ascii.txt.tx.4 Mitochondrial respirat...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CHEBI_15361,http://purl.obolibrary.org/obo/CL_0000187


In [95]:
dfres.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13676 entries, 0 to 13675
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   subject_obo    13676 non-null  object
 1   predicate_obo  13676 non-null  object
 2   object_obo     13676 non-null  object
dtypes: object(3)
memory usage: 320.7+ KB


In [96]:
dfres = dfres.drop_duplicates()
dfres.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9232 entries, 0 to 13666
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   subject_obo    9232 non-null   object
 1   predicate_obo  9232 non-null   object
 2   object_obo     9232 non-null   object
dtypes: object(3)
memory usage: 288.5+ KB


In [97]:
#create rdflib graph from dataframe triples and serialize as ntriples file
graph  = Graph()
pred_label = URIRef("http://www.w3.org/2000/01/rdf-schema#label")
for i in range(len(df_new.index)):
    subj = df_new.at[i, 'subject_obo']
    obj = df_new.at[i, 'object_obo']
    pred = df_new.at[i, 'predicate_obo']
    subj_node = URIRef(subj)
    obj_node = URIRef(obj)
    predicate = URIRef(pred)
    subj_name = df_new.at[i, 'subject_name']
    obj_name = df_new.at[i, 'object_name']
    graph.add((subj_node, predicate, obj_node))
    graph.add((subj_node, pred_label, Literal(subj_name)))
    graph.add((obj_node, pred_label, Literal(obj_name)))

In [98]:
graph.serialize('output_graphs/machineread_semrep_version3.nt', format='nt')

In [99]:
##save graph as gpickle, nodelabels as TSV also (run closure in spearate notebook)

In [100]:
df_new.head()

Unnamed: 0,pmid,subject_cui,subject_type,object_cui,object_type,year,subject_name,object_name,source_text,predicate,predicate_obo,subject_obo,object_obo
0,29913456,C1704263,"orch,phsu",C2349205,fndg,2018.0,Green Tea Extract,Normal heart,29913456_ascii.txt.tx.1 Abstract Background/Ai...,affects,http://purl.obolibrary.org/obo/RO_0002596,http://napdi.org/napdi_srs_imports:camellia_si...,http://purl.obolibrary.org/obo/UBERON_0000948
1,29913456,C0596981,cell,C0018827,bpoc,2018.0,Muscle Cells,Heart Ventricle,29913456_ascii.txt.tx.4 From the heart of 6 CT...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CL_0000187,http://purl.obolibrary.org/obo/UBERON_0002082
2,29913456,C0596981,cell,C0018827,bpoc,2018.0,Muscle Cells,Heart Ventricle,29913456_ascii.txt.tx.1 Mechanical properties ...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CL_0000187,http://purl.obolibrary.org/obo/UBERON_0002082
3,29913456,C0017725,"bacs,orch,phsu",C0225828,cell,2018.0,Glucose,"Myocytes, Cardiac",29913456_ascii.txt.tx.4 Mitochondrial respirat...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CHEBI_17234,http://purl.obolibrary.org/obo/CL_0000187
4,29913456,C0034354,orch,C0225828,cell,2018.0,Pyruvates,"Myocytes, Cardiac",29913456_ascii.txt.tx.4 Mitochondrial respirat...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CHEBI_15361,http://purl.obolibrary.org/obo/CL_0000187


In [101]:
def n3(node: Union[URIRef, BNode, Literal]) -> str:
    """Method takes an RDFLib node of type BNode, URIRef, or Literal and serializes it to meet the RDF 1.1 NTriples
    format.
    Src: https://github.com/RDFLib/rdflib/blob/c11f7b503b50b7c3cdeec0f36261fa09b0615380/rdflib/plugins/serializers/nt.py
    Args:
        node: An RDFLib
    Returns:
        serialized_node: A string containing the serialized
    """
    if isinstance(node, Literal): serialized_node = "%s" % _quoteLiteral(node)
    else: serialized_node = "%s" % node.n3()
    return serialized_node

In [102]:
df_new.head()

Unnamed: 0,pmid,subject_cui,subject_type,object_cui,object_type,year,subject_name,object_name,source_text,predicate,predicate_obo,subject_obo,object_obo
0,29913456,C1704263,"orch,phsu",C2349205,fndg,2018.0,Green Tea Extract,Normal heart,29913456_ascii.txt.tx.1 Abstract Background/Ai...,affects,http://purl.obolibrary.org/obo/RO_0002596,http://napdi.org/napdi_srs_imports:camellia_si...,http://purl.obolibrary.org/obo/UBERON_0000948
1,29913456,C0596981,cell,C0018827,bpoc,2018.0,Muscle Cells,Heart Ventricle,29913456_ascii.txt.tx.4 From the heart of 6 CT...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CL_0000187,http://purl.obolibrary.org/obo/UBERON_0002082
2,29913456,C0596981,cell,C0018827,bpoc,2018.0,Muscle Cells,Heart Ventricle,29913456_ascii.txt.tx.1 Mechanical properties ...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CL_0000187,http://purl.obolibrary.org/obo/UBERON_0002082
3,29913456,C0017725,"bacs,orch,phsu",C0225828,cell,2018.0,Glucose,"Myocytes, Cardiac",29913456_ascii.txt.tx.4 Mitochondrial respirat...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CHEBI_17234,http://purl.obolibrary.org/obo/CL_0000187
4,29913456,C0034354,orch,C0225828,cell,2018.0,Pyruvates,"Myocytes, Cardiac",29913456_ascii.txt.tx.4 Mitochondrial respirat...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CHEBI_15361,http://purl.obolibrary.org/obo/CL_0000187


In [97]:
x = 'http://purl.obolibrary.org/obo/CL_0000187' 
y = 'http://purl.obolibrary.org/obo/BFO_0000050'
z = 'http://purl.obolibrary.org/obo/UBERON_0002082'

In [98]:
df_new.loc[(df_new['subject_obo'] == x) & (df_new['object_obo'] == z)  & (df_new['predicate_obo'] == y)]

Unnamed: 0,pmid,subject_cui,subject_type,object_cui,object_type,year,subject_name,object_name,source_text,predicate,predicate_obo,subject_obo,object_obo
1,29913456,C0596981,cell,C0018827,bpoc,2018.0,Muscle Cells,Heart Ventricle,29913456_ascii.txt.tx.4 From the heart of 6 CT...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CL_0000187,http://purl.obolibrary.org/obo/UBERON_0002082
2,29913456,C0596981,cell,C0018827,bpoc,2018.0,Muscle Cells,Heart Ventricle,29913456_ascii.txt.tx.1 Mechanical properties ...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CL_0000187,http://purl.obolibrary.org/obo/UBERON_0002082
11,29913456,C0596981,cell,C0018827,bpoc,2018.0,Muscle Cells,Heart Ventricle,29913456_ascii.txt.tx.3 Mean values +/- SEM of...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CL_0000187,http://purl.obolibrary.org/obo/UBERON_0002082
122,23116965,C0596981,cell,C0018827,bpoc,2013.0,Muscle Cells,Heart Ventricle,"23116965_ascii.txt.tx.1 Here, we reported t...",part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CL_0000187,http://purl.obolibrary.org/obo/UBERON_0002082
130,23116965,C0596981,cell,C0018827,bpoc,2013.0,Muscle Cells,Heart Ventricle,23116965_ascii.txt.tx.2 Previous studies in ...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CL_0000187,http://purl.obolibrary.org/obo/UBERON_0002082
1148,20798525,C0225828,cell,C0018827,bpoc,2010.0,"Myocytes, Cardiac",Heart Ventricle,20798525_ascii.txt.tx.3|relation|C0110613|Conn...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CL_0000187,http://purl.obolibrary.org/obo/UBERON_0002082
2059,27306406,C0225828,cell,C0018827,bpoc,2016.0,"Myocytes, Cardiac",Heart Ventricle,27306406_ascii.txt.tx.1 Gap junctions are comp...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CL_0000187,http://purl.obolibrary.org/obo/UBERON_0002082
4348,28768059,C0225828,cell,C0018827,bpoc,2017.0,"Myocytes, Cardiac",Heart Ventricle,28768059_ascii.txt.tx.1 We found that protein ...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CL_0000187,http://purl.obolibrary.org/obo/UBERON_0002082
4677,33228183,C0225828,cell,C2355627,bpoc,2020.0,"Myocytes, Cardiac",Ventricle,"33228183_ascii.txt.tx.2 Furthermore, RA treatm...",part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CL_0000187,http://purl.obolibrary.org/obo/UBERON_0002082
5607,18406205,C0596981,cell,C0018827,bpoc,2008.0,Muscle Cells,Heart Ventricle,18406205_ascii.txt.tx.1 Diastolic cell length ...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CL_0000187,http://purl.obolibrary.org/obo/UBERON_0002082


In [104]:
df_new.loc[(df_new['subject_obo'] == x) & (df_new['object_obo'] == z)  & 
                               (df_new['predicate_obo'] == y)]['pmid'].values[0]

29913456

In [105]:
df_new.head()

Unnamed: 0,pmid,subject_cui,subject_type,object_cui,object_type,year,subject_name,object_name,source_text,predicate,predicate_obo,subject_obo,object_obo
0,29913456,C1704263,"orch,phsu",C2349205,fndg,2018.0,Green Tea Extract,Normal heart,29913456_ascii.txt.tx.1 Abstract Background/Ai...,affects,http://purl.obolibrary.org/obo/RO_0002596,http://napdi.org/napdi_srs_imports:camellia_si...,http://purl.obolibrary.org/obo/UBERON_0000948
1,29913456,C0596981,cell,C0018827,bpoc,2018.0,Muscle Cells,Heart Ventricle,29913456_ascii.txt.tx.4 From the heart of 6 CT...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CL_0000187,http://purl.obolibrary.org/obo/UBERON_0002082
2,29913456,C0596981,cell,C0018827,bpoc,2018.0,Muscle Cells,Heart Ventricle,29913456_ascii.txt.tx.1 Mechanical properties ...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CL_0000187,http://purl.obolibrary.org/obo/UBERON_0002082
3,29913456,C0017725,"bacs,orch,phsu",C0225828,cell,2018.0,Glucose,"Myocytes, Cardiac",29913456_ascii.txt.tx.4 Mitochondrial respirat...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CHEBI_17234,http://purl.obolibrary.org/obo/CL_0000187
4,29913456,C0034354,orch,C0225828,cell,2018.0,Pyruvates,"Myocytes, Cardiac",29913456_ascii.txt.tx.4 Mitochondrial respirat...,part_of,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/CHEBI_15361,http://purl.obolibrary.org/obo/CL_0000187


In [106]:
for s,p,o in tqdm(graph):
    print(s,p,o)
    break

  0%|          | 0/9771 [00:00<?, ?it/s]

http://purl.obolibrary.org/obo/PR_000008555 http://purl.obolibrary.org/obo/RO_0002434 http://purl.obolibrary.org/obo/CHEBI_35610





In [103]:
#convert rdflib graph to multidigraph - code borrowed from PheKnowLator: kg_utils.py
#use the pred key to also create a dictionary with metadata about the edge - 
#pub_year, pmid, source graph, belief
nx_mdg = nx.MultiDiGraph()
for s, p, o in tqdm(graph):
    #do not save label predicate to gpickle
    subj = str(s)
    obj = str(o)
    pred = str(p)
    if pred == 'http://www.w3.org/2000/01/rdf-schema#label':
        continue
    else:

        pred_key = hashlib.md5('{}{}{}'.format(n3(s), n3(p), n3(o)).encode()).hexdigest()
        pmid = str(df_new.loc[(df_new['subject_obo'] == subj) & (df_new['object_obo'] == obj)  & 
                       (df_new['predicate_obo'] == pred)]['pmid'].values[0])
        timestamp = str(df_new.loc[(df_new['subject_obo'] == subj) & (df_new['object_obo'] == obj)  & 
                                   (df_new['predicate_obo'] == pred)]['year'].values[0])
        belief_score = 0.8
        nx_mdg.add_node(s, key=n3(s))
        nx_mdg.add_node(o, key=n3(o))
        nx_mdg.add_edge(s, o, **{'key': p, 'predicate_key': pred_key, 'weight':0.0,
                                 'pmid': pmid, 'timestamp': timestamp, 'source_graph': 'machine_read',
                                'belief': belief_score})
nx.write_gpickle(nx_mdg, "output_graphs/machineread_semrep_version3.gpickle")

100%|██████████| 12079/12079 [01:48<00:00, 111.11it/s]


In [104]:
triples = len(graph)
nodes = len(set(list(graph.subjects()) + list(graph.objects())))
rels = len(set(list(graph.predicates())))
print(triples, nodes, rels)

12079 5557 17


In [105]:
#this should have less edges than rdflib graph after removing 'labels'
nodes = nx.number_of_nodes(nx_mdg)
edges = nx.number_of_edges(nx_mdg)
density = nx.density(nx_mdg)
avg_deg = float(edges)/nodes
print(nodes, edges, density, avg_deg)

2710 9232 0.0012575275254413673 3.4066420664206642


In [None]:
#figure out how to extract information from saved gpickle predicate dictionary - pmid, 

In [104]:
NV = nx_mdg.nodes()
i = 0
for (k,v) in NV.items():
    if i==20:
        break
    print('{}:{}'.format(k,v))
    i=i+1

http://purl.obolibrary.org/obo/CHEBI_36856:{'key': '<http://purl.obolibrary.org/obo/CHEBI_36856>'}
http://purl.obolibrary.org/obo/PR_Q05421:{'key': '<http://purl.obolibrary.org/obo/PR_Q05421>'}
http://purl.obolibrary.org/obo/PR_Q5EG47:{'key': '<http://purl.obolibrary.org/obo/PR_Q5EG47>'}
http://purl.obolibrary.org/obo/GO_0005243:{'key': '<http://purl.obolibrary.org/obo/GO_0005243>'}
http://purl.obolibrary.org/obo/CHEBI_80961:{'key': '<http://purl.obolibrary.org/obo/CHEBI_80961>'}
http://purl.obolibrary.org/obo/CHEBI_16856:{'key': '<http://purl.obolibrary.org/obo/CHEBI_16856>'}
http://purl.obolibrary.org/obo/CHEBI_4911:{'key': '<http://purl.obolibrary.org/obo/CHEBI_4911>'}
http://purl.obolibrary.org/obo/CHEBI_39124:{'key': '<http://purl.obolibrary.org/obo/CHEBI_39124>'}
http://purl.obolibrary.org/obo/CHEBI_27881:{'key': '<http://purl.obolibrary.org/obo/CHEBI_27881>'}
http://purl.obolibrary.org/obo/PR_000001044:{'key': '<http://purl.obolibrary.org/obo/PR_000001044>'}
http://purl.obolibra

In [105]:
i = -1
for (s,o,k,d) in nx_mdg.edges(data=True, keys=True):
    i+=1
    if i==2:
        break
    print(s)
    print(o)
    print(k)
    print(d)

http://purl.obolibrary.org/obo/CHEBI_36856
http://purl.obolibrary.org/obo/PR_Q05421
http://purl.obolibrary.org/obo/RO_0002449
{'predicate_key': '92e324b79c1218fb412629e2cc185585', 'weight': 0.0, 'pmid': '29356593', 'timestamp': '2018 Feb', 'source_graph': 'machine_read'}
http://purl.obolibrary.org/obo/CHEBI_36856
http://purl.obolibrary.org/obo/PR_Q05421
http://purl.obolibrary.org/obo/RO_0002436
{'predicate_key': 'e79d4f1ccf29afa3b87bde913d61c4eb', 'weight': 0.0, 'pmid': '29356593', 'timestamp': '2018 Feb', 'source_graph': 'machine_read'}


In [106]:
#save node labels as dictionary
#key: URI, value is label
label_dict = {}
for i in range(len(df_new.index)):
    subj = str(df_new.at[i, 'subject_obo'])
    obj = str(df_new.at[i, 'object_obo'])
    pred = str(df_new.at[i, 'predicate_obo'])
    if subj not in label_dict:
        label_dict[subj] = {}
        label_dict[subj]['entity_type'] = 'NODES'
        label_dict[subj]['label'] = df_new.at[i, 'subject_name']
        label_dict[subj]['cui'] = df_new.at[i, 'subject_cui']
    if obj not in label_dict:
        label_dict[obj] = {}
        label_dict[obj]['entity_type'] = 'NODES'
        label_dict[obj]['label'] = df_new.at[i, 'object_name']
        label_dict[obj]['cui'] = df_new.at[i, 'object_cui']
    if pred not in label_dict:
        label_dict[pred] = {}
        label_dict[pred]['entity_type'] = 'RELATIONS'
        label_dict[pred]['label'] = df_new.at[i, 'predicate']
len(label_dict)

2726

In [107]:
import pickle
with open('output_graphs/machineread_semrep_version3_NodeLabels.pickle', 'wb') as file_p:
    pickle.dump(label_dict, file_p)

In [108]:
dfmap = pd.DataFrame.from_dict(label_dict, orient='index')
dfmap.head()

Unnamed: 0,entity_type,label,cui
http://napdi.org/napdi_srs_imports:camellia_sinensis_leaf,NODES,Green Tea Extract,C1704263
http://purl.obolibrary.org/obo/UBERON_0000948,NODES,Normal heart,C2349205
http://purl.obolibrary.org/obo/RO_0002596,RELATIONS,affects,
http://purl.obolibrary.org/obo/CL_0000187,NODES,Muscle Cells,C0596981
http://purl.obolibrary.org/obo/UBERON_0002082,NODES,Heart Ventricle,C0018827


In [109]:
dfmap = dfmap.reset_index()
dfmap.head()

Unnamed: 0,index,entity_type,label,cui
0,http://napdi.org/napdi_srs_imports:camellia_si...,NODES,Green Tea Extract,C1704263
1,http://purl.obolibrary.org/obo/UBERON_0000948,NODES,Normal heart,C2349205
2,http://purl.obolibrary.org/obo/RO_0002596,RELATIONS,affects,
3,http://purl.obolibrary.org/obo/CL_0000187,NODES,Muscle Cells,C0596981
4,http://purl.obolibrary.org/obo/UBERON_0002082,NODES,Heart Ventricle,C0018827


In [110]:
dfmap = dfmap.rename(columns={"index":"entity_uri"})
dfmap.head()

Unnamed: 0,entity_uri,entity_type,label,cui
0,http://napdi.org/napdi_srs_imports:camellia_si...,NODES,Green Tea Extract,C1704263
1,http://purl.obolibrary.org/obo/UBERON_0000948,NODES,Normal heart,C2349205
2,http://purl.obolibrary.org/obo/RO_0002596,RELATIONS,affects,
3,http://purl.obolibrary.org/obo/CL_0000187,NODES,Muscle Cells,C0596981
4,http://purl.obolibrary.org/obo/UBERON_0002082,NODES,Heart Ventricle,C0018827


In [115]:
dfmap.to_csv('output_graphs/machineread_semrep_version3_NodeLabels.tsv', index=False, sep='\t')

In [None]:
##run closure in separate notebook - see machine_read_closure.ipynb