## NaPDI machine reading - process SemRep triples, map to OBO and generate machine reading graph with triples for all negation triples.

Last run on 2022-04-07 with green tea, kratom and microbiome output

In [1]:
import pandas as pd
import numpy as np
import pickle
from rdflib.namespace import OWL, RDF, RDFS

In [2]:
import os
files = os.listdir('semrep_data/')
#read all files in semrep_data
files[0][-3:]
df = pd.DataFrame(columns=['index', 'pmid', 'subject_cui', 'subject_type', 'relation', 'object_cui', 'object_type', 'year', 'sentence'])
for file in files:
    if file[-3:] == 'tsv':
        print('Loading file: ', file)
        df_temp = pd.read_csv('semrep_data/'+file, sep='\t')
        print(df_temp.info())
        df = pd.concat([df, df_temp], ignore_index=True)
df.info()

Loading file:  greentea_pmid_all_predicates_semrep-errors-fixed.tsv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48957 entries, 0 to 48956
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   index         48957 non-null  int64 
 1   pmid          48957 non-null  int64 
 2   subject_cui   47969 non-null  object
 3   subject_name  47969 non-null  object
 4   subject_type  48957 non-null  object
 5   relation      48957 non-null  object
 6   object_cui    47668 non-null  object
 7   object_name   47668 non-null  object
 8   object_type   48957 non-null  object
 9   year          48957 non-null  object
 10  sentence      48957 non-null  object
dtypes: int64(2), object(9)
memory usage: 4.1+ MB
None
Loading file:  microbiome_pmid_all_predicates_semrep-errors-fixed.tsv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14893 entries, 0 to 14892
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
-

In [3]:
df.head()

Unnamed: 0,index,pmid,subject_cui,subject_type,relation,object_cui,object_type,year,sentence,subject_name,object_name
0,0,29913456,C1704263,"orch,phsu",AFFECTS,C2349205,fndg,2018,29913456_ascii.txt.tx.1 Abstract Background/Ai...,Green Tea Extract,Normal heart
1,1,29913456,C0014518,dsyn,PROCESS_OF,C0596888,anim,2018,29913456_ascii.txt.tx.2 Methods: The study pop...,Toxic Epidermal Necrolysis,mature animal
2,2,29913456,C0043047,"inch,phsu",ADMINISTERED_TO,C0034693,mamm,2018,\n,Water,Rattus norvegicus
3,3,29913456,C0232164,ortf,PROCESS_OF,C0003062,anim,2018,"29913456_ascii.txt.tx.1 Then, in vivo and ex v...",Cardiac function,Animals
4,4,29913456,C0185115,topp,USES,C0006675,"bacs,elii,phsu",2018,29913456_ascii.txt.tx.2 Results: GTE cardiom...,Extraction,Calcium


In [4]:
#processing TSV
'''
0. Concatenate files and remove duplicates, get stats of unique PMIDs and triples
1. Fix sentence (empty takes value of previous sentence)
2. Extract year from date and save
3. Map from CUI to GO, HPO where available
4. Map with OntoRunNER
'''

'\n0. Concatenate files and remove duplicates, get stats of unique PMIDs and triples\n1. Fix sentence (empty takes value of previous sentence)\n2. Extract year from date and save\n3. Map from CUI to GO, HPO where available\n4. Map with OntoRunNER\n'

In [5]:
df = df.drop_duplicates(ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277812 entries, 0 to 277811
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   index         277812 non-null  object
 1   pmid          277812 non-null  object
 2   subject_cui   272500 non-null  object
 3   subject_type  277812 non-null  object
 4   relation      277812 non-null  object
 5   object_cui    271016 non-null  object
 6   object_type   277812 non-null  object
 7   year          277542 non-null  object
 8   sentence      277812 non-null  object
 9   subject_name  272500 non-null  object
 10  object_name   271016 non-null  object
dtypes: object(11)
memory usage: 23.3+ MB


In [6]:
df = df.fillna('')

In [7]:
len(df.pmid.unique())

775

In [8]:
import re

In [9]:
#use re to fix year as format is not consistent
for i in range(len(df.index)):
    pub_date = df.at[i, 'year']
    x = re.findall(r'\d+', pub_date)
    if x:
        df.at[i, 'year'] = x[0]
df.head()

Unnamed: 0,index,pmid,subject_cui,subject_type,relation,object_cui,object_type,year,sentence,subject_name,object_name
0,0,29913456,C1704263,"orch,phsu",AFFECTS,C2349205,fndg,2018,29913456_ascii.txt.tx.1 Abstract Background/Ai...,Green Tea Extract,Normal heart
1,1,29913456,C0014518,dsyn,PROCESS_OF,C0596888,anim,2018,29913456_ascii.txt.tx.2 Methods: The study pop...,Toxic Epidermal Necrolysis,mature animal
2,2,29913456,C0043047,"inch,phsu",ADMINISTERED_TO,C0034693,mamm,2018,\n,Water,Rattus norvegicus
3,3,29913456,C0232164,ortf,PROCESS_OF,C0003062,anim,2018,"29913456_ascii.txt.tx.1 Then, in vivo and ex v...",Cardiac function,Animals
4,4,29913456,C0185115,topp,USES,C0006675,"bacs,elii,phsu",2018,29913456_ascii.txt.tx.2 Results: GTE cardiom...,Extraction,Calcium


In [10]:
source_sent = df['sentence'].tolist()

In [11]:
len(source_sent)

277812

In [12]:
##fix source sentence
last = len(source_sent)-1
sentences = []
count = 0
for sent in source_sent:
    sent = sent.strip()
    flag = 0
    if sent == '' or '|||' in sent:
        for i in range(count, -1, -1):
            if source_sent[i].strip() == '' or '|||' in source_sent[i].strip():
                continue
            else:
                sentences.append(source_sent[i].strip())
                flag = 1
                break
        if flag == 0:
            sentences.append(sent.strip())
    else:
        sentences.append(sent.strip())
    count += 1
            
len(sentences)

277812

In [13]:
sentences = pd.Series(sentences)

In [14]:
df['source_text'] = sentences
df.head()

Unnamed: 0,index,pmid,subject_cui,subject_type,relation,object_cui,object_type,year,sentence,subject_name,object_name,source_text
0,0,29913456,C1704263,"orch,phsu",AFFECTS,C2349205,fndg,2018,29913456_ascii.txt.tx.1 Abstract Background/Ai...,Green Tea Extract,Normal heart,29913456_ascii.txt.tx.1 Abstract Background/Ai...
1,1,29913456,C0014518,dsyn,PROCESS_OF,C0596888,anim,2018,29913456_ascii.txt.tx.2 Methods: The study pop...,Toxic Epidermal Necrolysis,mature animal,29913456_ascii.txt.tx.2 Methods: The study pop...
2,2,29913456,C0043047,"inch,phsu",ADMINISTERED_TO,C0034693,mamm,2018,\n,Water,Rattus norvegicus,29913456_ascii.txt.tx.2 Methods: The study pop...
3,3,29913456,C0232164,ortf,PROCESS_OF,C0003062,anim,2018,"29913456_ascii.txt.tx.1 Then, in vivo and ex v...",Cardiac function,Animals,"29913456_ascii.txt.tx.1 Then, in vivo and ex v..."
4,4,29913456,C0185115,topp,USES,C0006675,"bacs,elii,phsu",2018,29913456_ascii.txt.tx.2 Results: GTE cardiom...,Extraction,Calcium,29913456_ascii.txt.tx.2 Results: GTE cardiom...


In [15]:
df = df.drop(['sentence', 'index'], axis=1)

In [16]:
df.head()

Unnamed: 0,pmid,subject_cui,subject_type,relation,object_cui,object_type,year,subject_name,object_name,source_text
0,29913456,C1704263,"orch,phsu",AFFECTS,C2349205,fndg,2018,Green Tea Extract,Normal heart,29913456_ascii.txt.tx.1 Abstract Background/Ai...
1,29913456,C0014518,dsyn,PROCESS_OF,C0596888,anim,2018,Toxic Epidermal Necrolysis,mature animal,29913456_ascii.txt.tx.2 Methods: The study pop...
2,29913456,C0043047,"inch,phsu",ADMINISTERED_TO,C0034693,mamm,2018,Water,Rattus norvegicus,29913456_ascii.txt.tx.2 Methods: The study pop...
3,29913456,C0232164,ortf,PROCESS_OF,C0003062,anim,2018,Cardiac function,Animals,"29913456_ascii.txt.tx.1 Then, in vivo and ex v..."
4,29913456,C0185115,topp,USES,C0006675,"bacs,elii,phsu",2018,Extraction,Calcium,29913456_ascii.txt.tx.2 Results: GTE cardiom...


In [18]:
df['predicate'] = df['relation'].str.lower()
df.head()

Unnamed: 0,pmid,subject_cui,subject_type,relation,object_cui,object_type,year,subject_name,object_name,source_text,predicate
0,29913456,C1704263,"orch,phsu",AFFECTS,C2349205,fndg,2018,Green Tea Extract,Normal heart,29913456_ascii.txt.tx.1 Abstract Background/Ai...,affects
1,29913456,C0014518,dsyn,PROCESS_OF,C0596888,anim,2018,Toxic Epidermal Necrolysis,mature animal,29913456_ascii.txt.tx.2 Methods: The study pop...,process_of
2,29913456,C0043047,"inch,phsu",ADMINISTERED_TO,C0034693,mamm,2018,Water,Rattus norvegicus,29913456_ascii.txt.tx.2 Methods: The study pop...,administered_to
3,29913456,C0232164,ortf,PROCESS_OF,C0003062,anim,2018,Cardiac function,Animals,"29913456_ascii.txt.tx.1 Then, in vivo and ex v...",process_of
4,29913456,C0185115,topp,USES,C0006675,"bacs,elii,phsu",2018,Extraction,Calcium,29913456_ascii.txt.tx.2 Results: GTE cardiom...,uses


In [21]:
df['predicate'].unique()

array(['affects', 'process_of', 'administered_to', 'uses', 'stimulates',
       'coexists_with', 'location_of', 'treats', 'part_of', 'augments',
       'isa', 'interacts_with', 'associated_with', 'measures',
       'method_of', 'compared_with', 'higher_than', 'produces', 'causes',
       'prevents', 'inhibits', 'predisposes', 'disrupts', 'lower_than',
       'diagnoses', 'neg_interacts_with', 'treats(infer)', 'treats(spec)',
       'neg_affects', 'neg_location_of', 'neg_inhibits', 'neg_isa',
       'prevents(spec)', 'coexists_with(spec)', 'neg_stimulates',
       'converts_to', 'neg_augments', 'neg_disrupts', 'neg_coexists_with',
       'location_of(spec)', 'predisposes(spec)', 'process_of(spec)',
       'affects(spec)', 'augments(spec)', 'neg_causes',
       'associated_with(spec)', 'interacts_with(spec)', 'neg_same_as',
       'same_as', 'interacts_with(infer)', 'manifestation_of',
       'stimulates(spec)', 'precedes', 'inhibits(spec)', 'neg_part_of',
       'neg_treats', 'part_of(s

In [22]:
preds_neg = []
for item in df['predicate'].unique().tolist():
    if 'neg_' in item:
        preds_neg.append(item)
preds_neg

['neg_interacts_with',
 'neg_affects',
 'neg_location_of',
 'neg_inhibits',
 'neg_isa',
 'neg_stimulates',
 'neg_augments',
 'neg_disrupts',
 'neg_coexists_with',
 'neg_causes',
 'neg_same_as',
 'neg_part_of',
 'neg_treats',
 'neg_treats(spec)',
 'neg_produces',
 'neg_stimulates(spec)',
 'neg_location_of(spec)',
 'neg_uses',
 'neg_measures',
 'neg_affects(spec)',
 'neg_process_of',
 'neg_predisposes',
 'neg_treats(infer)',
 'neg_inhibits(spec)',
 'neg_prevents',
 'neg_higher_than',
 'neg_associated_with',
 'neg_interacts_with(spec)',
 'neg_disrupts(spec)',
 'neg_converts_to',
 'neg_diagnoses',
 'neg_administered_to',
 'neg_coexists_with(spec)',
 'neg_method_of',
 'neg_associated_with(spec)',
 'neg_administered_to(spec)',
 'neg_occurs_in']

In [23]:
df = df.drop(['relation'], axis=1)

In [24]:
dfn = df[df['predicate'].isin(preds_neg)]
dfn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6852 entries, 186 to 277552
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   pmid          6852 non-null   int64 
 1   subject_cui   6852 non-null   object
 2   subject_type  6852 non-null   object
 3   object_cui    6852 non-null   object
 4   object_type   6852 non-null   object
 5   year          6852 non-null   object
 6   subject_name  6852 non-null   object
 7   object_name   6852 non-null   object
 8   source_text   6852 non-null   object
 9   predicate     6852 non-null   object
dtypes: int64(1), object(9)
memory usage: 588.8+ KB


In [26]:
dfpos = df[~df['predicate'].isin(preds_neg)]
dfpos.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 270960 entries, 0 to 277811
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   pmid          270960 non-null  int64 
 1   subject_cui   270960 non-null  object
 2   subject_type  270960 non-null  object
 3   object_cui    270960 non-null  object
 4   object_type   270960 non-null  object
 5   year          270960 non-null  object
 6   subject_name  270960 non-null  object
 7   object_name   270960 non-null  object
 8   source_text   270960 non-null  object
 9   predicate     270960 non-null  object
dtypes: int64(1), object(9)
memory usage: 22.7+ MB


In [27]:
dfpos['predicate'].value_counts()

location_of               45533
interacts_with            26146
affects                   18902
part_of                   18850
isa                       18569
process_of                15880
inhibits                  15788
coexists_with             15521
stimulates                14490
treats                    13062
uses                      11122
causes                     9540
disrupts                   7800
augments                   7435
measures                   4059
associated_with            3968
compared_with              3808
produces                   3213
administered_to            3101
diagnoses                  2615
prevents                   2264
method_of                  2216
predisposes                1217
higher_than                 608
interacts_with(spec)        452
treats(spec)                443
location_of(spec)           435
coexists_with(spec)         390
treats(infer)               381
converts_to                 332
precedes                    309
inhibits

In [28]:
dfn['predicate'].value_counts()

neg_affects                  1544
neg_interacts_with           1529
neg_stimulates                583
neg_location_of               581
neg_inhibits                  427
neg_treats                    411
neg_disrupts                  285
neg_coexists_with             240
neg_augments                  207
neg_causes                    199
neg_process_of                167
neg_part_of                   104
neg_same_as                    80
neg_administered_to            54
neg_isa                        48
neg_predisposes                46
neg_associated_with            44
neg_diagnoses                  44
neg_measures                   42
neg_affects(spec)              32
neg_produces                   31
neg_interacts_with(spec)       29
neg_prevents                   26
neg_higher_than                16
neg_inhibits(spec)             13
neg_stimulates(spec)           10
neg_coexists_with(spec)        10
neg_method_of                  10
neg_disrupts(spec)              9
neg_treats(spe

In [29]:
dfn['predicate_obo'] = None
dfn['subject_obo'] = None
dfn['object_obo'] = None
dfpos['predicate_obo'] = None
dfpos['subject_obo'] = None
dfpos['object_obo'] = None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [30]:
dfn = dfn.reset_index(drop=True)
dfn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6852 entries, 0 to 6851
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   pmid           6852 non-null   int64 
 1   subject_cui    6852 non-null   object
 2   subject_type   6852 non-null   object
 3   object_cui     6852 non-null   object
 4   object_type    6852 non-null   object
 5   year           6852 non-null   object
 6   subject_name   6852 non-null   object
 7   object_name    6852 non-null   object
 8   source_text    6852 non-null   object
 9   predicate      6852 non-null   object
 10  predicate_obo  0 non-null      object
 11  subject_obo    0 non-null      object
 12  object_obo     0 non-null      object
dtypes: int64(1), object(12)
memory usage: 696.0+ KB


In [31]:
dfpos = dfpos.reset_index(drop=True)
dfpos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270960 entries, 0 to 270959
Data columns (total 13 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   pmid           270960 non-null  int64 
 1   subject_cui    270960 non-null  object
 2   subject_type   270960 non-null  object
 3   object_cui     270960 non-null  object
 4   object_type    270960 non-null  object
 5   year           270960 non-null  object
 6   subject_name   270960 non-null  object
 7   object_name    270960 non-null  object
 8   source_text    270960 non-null  object
 9   predicate      270960 non-null  object
 10  predicate_obo  0 non-null       object
 11  subject_obo    0 non-null       object
 12  object_obo     0 non-null       object
dtypes: int64(1), object(12)
memory usage: 26.9+ MB


In [38]:
#handle subclassof when mapping to obo

In [32]:
#map from UMLS to GO, HPO
with open('cui_to_ontology_maps/go_hpo_map_dict.pickle', 'rb') as filep:
    go_hpo_mapping_dict = pickle.load(filep)
len(go_hpo_mapping_dict)

87745

In [33]:
def umls_go_hpo_map(row, col):
    if col == 'subject':
        cui = row['subject_cui']
    elif col == 'object':
        cui = row['object_cui']
    else:
        print('specify if subject or object mapping required')
        exit(0)
    if cui in go_hpo_mapping_dict:
        if len(go_hpo_mapping_dict):
            return go_hpo_mapping_dict[cui][0]
    return None

In [34]:
dfn['subject_obo'] = dfn.apply(umls_go_hpo_map, axis=1, col='subject')
dfn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6852 entries, 0 to 6851
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   pmid           6852 non-null   int64 
 1   subject_cui    6852 non-null   object
 2   subject_type   6852 non-null   object
 3   object_cui     6852 non-null   object
 4   object_type    6852 non-null   object
 5   year           6852 non-null   object
 6   subject_name   6852 non-null   object
 7   object_name    6852 non-null   object
 8   source_text    6852 non-null   object
 9   predicate      6852 non-null   object
 10  predicate_obo  0 non-null      object
 11  subject_obo    210 non-null    object
 12  object_obo     0 non-null      object
dtypes: int64(1), object(12)
memory usage: 696.0+ KB


In [35]:
dfn['object_obo'] = dfn.apply(umls_go_hpo_map, axis=1, col='object')
dfn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6852 entries, 0 to 6851
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   pmid           6852 non-null   int64 
 1   subject_cui    6852 non-null   object
 2   subject_type   6852 non-null   object
 3   object_cui     6852 non-null   object
 4   object_type    6852 non-null   object
 5   year           6852 non-null   object
 6   subject_name   6852 non-null   object
 7   object_name    6852 non-null   object
 8   source_text    6852 non-null   object
 9   predicate      6852 non-null   object
 10  predicate_obo  0 non-null      object
 11  subject_obo    210 non-null    object
 12  object_obo     1260 non-null   object
dtypes: int64(1), object(12)
memory usage: 696.0+ KB


In [36]:
dfpos['subject_obo'] = dfpos.apply(umls_go_hpo_map, axis=1, col='subject')
dfpos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270960 entries, 0 to 270959
Data columns (total 13 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   pmid           270960 non-null  int64 
 1   subject_cui    270960 non-null  object
 2   subject_type   270960 non-null  object
 3   object_cui     270960 non-null  object
 4   object_type    270960 non-null  object
 5   year           270960 non-null  object
 6   subject_name   270960 non-null  object
 7   object_name    270960 non-null  object
 8   source_text    270960 non-null  object
 9   predicate      270960 non-null  object
 10  predicate_obo  0 non-null       object
 11  subject_obo    22985 non-null   object
 12  object_obo     0 non-null       object
dtypes: int64(1), object(12)
memory usage: 26.9+ MB


In [37]:
dfpos['object_obo'] = dfpos.apply(umls_go_hpo_map, axis=1, col='object')
dfpos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270960 entries, 0 to 270959
Data columns (total 13 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   pmid           270960 non-null  int64 
 1   subject_cui    270960 non-null  object
 2   subject_type   270960 non-null  object
 3   object_cui     270960 non-null  object
 4   object_type    270960 non-null  object
 5   year           270960 non-null  object
 6   subject_name   270960 non-null  object
 7   object_name    270960 non-null  object
 8   source_text    270960 non-null  object
 9   predicate      270960 non-null  object
 10  predicate_obo  0 non-null       object
 11  subject_obo    22985 non-null   object
 12  object_obo     38058 non-null   object
dtypes: int64(1), object(12)
memory usage: 26.9+ MB


In [50]:
with open('cui_to_ontology_maps/CUItoOBO_20220406.pickle', 'rb') as filep:
    obomap_dict = pickle.load(filep)

In [51]:
def get_obo_mapping(row, col):
    if col == 'subject':
        cui = row['subject_cui']
    elif col == 'object':
        cui = row['object_cui']
    else:
        print('specify if subject or object mapping required')
        exit(0)
    if cui in obomap_dict:
        return obomap_dict[cui]
    return None

In [52]:
#MAPPING
dfn['subject_obo'] = dfn.apply(get_obo_mapping, axis=1, col='subject')
dfn.head()

Unnamed: 0,pmid,subject_cui,subject_type,object_cui,object_type,year,subject_name,object_name,source_text,predicate,predicate_obo,subject_obo,object_obo
0,19353999,C0059438,"orch,phsu",C1332826,"gngm,aapp",2009,epigallocatechin gallate,CYP1A2 gene,19353999_ascii.txt.tx.1 A recent report has fo...,neg_interacts_with,,napdi_srs_imports:epigallocatechin_gallate,
1,12659723,C0007404,"orch,phsu",C1420188,"gngm,aapp",2003,Catechin,SLC35A2 gene,12659723_ascii.txt.tx.2 Lhoste et al./ Food an...,neg_interacts_with,,CHEBI_23053,
2,12659723,C0022203,npop,C0011135,orgf,2003,Isomerism (chemical phenomenon),Defecation,"12659723_ascii.txt.tx.2 Other polyphenols, e.g...",neg_affects,,,GO_0030421
3,32645157,C0041912,dsyn,C1154978,orgf,2020,Upper Respiratory Infections,cold acclimation,32645157_ascii.txt.tx.1 After undergoing cold ...,neg_affects,,MONDO_0024355,GO_0009631
4,32645157,C1167041,celc,C0120285,"aapp,gngm,irda",2020,vacuolar membrane,Green Fluorescent Proteins,32645157_ascii.txt.tx.1 Our results showed tha...,neg_location_of,,,


In [53]:
dfn['object_obo'] = dfn.apply(get_obo_mapping, axis=1, col='object')
dfn.head()

Unnamed: 0,pmid,subject_cui,subject_type,object_cui,object_type,year,subject_name,object_name,source_text,predicate,predicate_obo,subject_obo,object_obo
0,19353999,C0059438,"orch,phsu",C1332826,"gngm,aapp",2009,epigallocatechin gallate,CYP1A2 gene,19353999_ascii.txt.tx.1 A recent report has fo...,neg_interacts_with,,napdi_srs_imports:epigallocatechin_gallate,PR_000006102
1,12659723,C0007404,"orch,phsu",C1420188,"gngm,aapp",2003,Catechin,SLC35A2 gene,12659723_ascii.txt.tx.2 Lhoste et al./ Food an...,neg_interacts_with,,CHEBI_23053,PR_000015090
2,12659723,C0022203,npop,C0011135,orgf,2003,Isomerism (chemical phenomenon),Defecation,"12659723_ascii.txt.tx.2 Other polyphenols, e.g...",neg_affects,,,
3,32645157,C0041912,dsyn,C1154978,orgf,2020,Upper Respiratory Infections,cold acclimation,32645157_ascii.txt.tx.1 After undergoing cold ...,neg_affects,,MONDO_0024355,
4,32645157,C1167041,celc,C0120285,"aapp,gngm,irda",2020,vacuolar membrane,Green Fluorescent Proteins,32645157_ascii.txt.tx.1 Our results showed tha...,neg_location_of,,,


In [54]:
dfn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6852 entries, 0 to 6851
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   pmid           6852 non-null   int64 
 1   subject_cui    6852 non-null   object
 2   subject_type   6852 non-null   object
 3   object_cui     6852 non-null   object
 4   object_type    6852 non-null   object
 5   year           6852 non-null   object
 6   subject_name   6852 non-null   object
 7   object_name    6852 non-null   object
 8   source_text    6852 non-null   object
 9   predicate      6852 non-null   object
 10  predicate_obo  0 non-null      object
 11  subject_obo    4900 non-null   object
 12  object_obo     3275 non-null   object
dtypes: int64(1), object(12)
memory usage: 696.0+ KB


In [56]:
#MAPPING
dfpos['subject_obo'] = dfpos.apply(get_obo_mapping, axis=1, col='subject')

In [57]:
dfpos['object_obo'] = dfpos.apply(get_obo_mapping, axis=1, col='object')
dfpos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 270960 entries, 0 to 270959
Data columns (total 13 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   pmid           270960 non-null  int64 
 1   subject_cui    270960 non-null  object
 2   subject_type   270960 non-null  object
 3   object_cui     270960 non-null  object
 4   object_type    270960 non-null  object
 5   year           270960 non-null  object
 6   subject_name   270960 non-null  object
 7   object_name    270960 non-null  object
 8   source_text    270960 non-null  object
 9   predicate      270960 non-null  object
 10  predicate_obo  0 non-null       object
 11  subject_obo    156548 non-null  object
 12  object_obo     137577 non-null  object
dtypes: int64(1), object(12)
memory usage: 26.9+ MB


In [93]:
##add labels for all subjects and objects

In [62]:
dfn.to_csv('semrep_data/semrep_negation_predications_mapped_20220407.csv', index=False)

In [64]:
dfpos.to_csv('semrep_data/semrep_positive_predications_mapped_20220407.csv', index=False)

In [61]:
dfn = dfn.drop_duplicates()
dfn = dfn.reset_index(drop=True)
dfn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1796 entries, 0 to 1795
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   pmid           1796 non-null   int64 
 1   subject_cui    1796 non-null   object
 2   subject_type   1796 non-null   object
 3   object_cui     1796 non-null   object
 4   object_type    1796 non-null   object
 5   year           1796 non-null   object
 6   subject_name   1796 non-null   object
 7   object_name    1796 non-null   object
 8   source_text    1796 non-null   object
 9   predicate      1796 non-null   object
 10  predicate_obo  0 non-null      object
 11  subject_obo    1194 non-null   object
 12  object_obo     831 non-null    object
dtypes: int64(1), object(12)
memory usage: 182.5+ KB


In [63]:
dfpos = dfpos.drop_duplicates()
dfpos = dfpos.reset_index(drop=True)
dfpos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75533 entries, 0 to 75532
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   pmid           75533 non-null  int64 
 1   subject_cui    75533 non-null  object
 2   subject_type   75533 non-null  object
 3   object_cui     75533 non-null  object
 4   object_type    75533 non-null  object
 5   year           75533 non-null  object
 6   subject_name   75533 non-null  object
 7   object_name    75533 non-null  object
 8   source_text    75533 non-null  object
 9   predicate      75533 non-null  object
 10  predicate_obo  0 non-null      object
 11  subject_obo    41977 non-null  object
 12  object_obo     37019 non-null  object
dtypes: int64(1), object(12)
memory usage: 7.5+ MB
