Import Libraries

In [1]:
import os

In [2]:
# Switch the working directory to the parent folder before the imports below run.
# NOTE(review): presumably the project modules (ext, norm, relation_finder, config)
# and the 'Data/' folder live there — confirm against the repository layout.
# The path is relative, so executing this cell a second time moves up one more
# level; restart the kernel before re-running the notebook.
home_dir = '../'
os.chdir(home_dir)
from ext import download_pubmed_articles, search_gene_id, download_gene_names
from norm import normalize_genes
from relation_finder import find_sentences

"""
Required: A valid entrez registered email and entrez api-key.
More information: Entrez Programming Utilities (E-utilities)
"""  
from config import entrez_api_key, entrez_email

In [3]:
import pandas as pd
import numpy as np
import re

In [4]:
from Bio import Entrez
import xmltodict

Preprocessing the <b>Elangovan et al. dataset</b><br>
    <a href="https://github.com/elangovana/PPI-typed-relation-extractor">Elangovan GitHub</a>    

In [5]:
# Load the dataset from JSON; the path is relative, so it is resolved against
# the working directory set by os.chdir earlier in the notebook.
df = pd.read_json('Data/elangovan.json')

In [6]:
# Preview the first rows of the raw frame (head() shows five rows by default).
df.head()

Unnamed: 0,isNegative,participants,pubmedId,pubmedTitle,interactionType,interactionId,pubmedabstract
0,False,"[{'uniprotid': 'Q99640', 'alias': [['pmyt1_hum...",10373560,,phosphorylation,172934,The Myt1 protein kinase functions to negativel...
1,False,"[{'uniprotid': 'Q99640', 'alias': [['pmyt1_hum...",10373560,,phosphorylation,172940,The Myt1 protein kinase functions to negativel...
2,False,"[{'uniprotid': 'Q15139', 'alias': [['kpcd1_hum...",10831594,,phosphorylation,1325322,We identified the multifunctional chaperon pro...
3,False,"[{'uniprotid': 'Q15139', 'alias': [['kpcd1_hum...",10831594,,phosphorylation,1325324,We identified the multifunctional chaperon pro...
4,False,"[{'uniprotid': 'Q07021', 'alias': [['c1qbp_hum...",10831594,,phosphorylation,1325326,We identified the multifunctional chaperon pro...


In [7]:
# Report how many rows were loaded.
print('No of entries in Elangovan Dataset:', len(df))

No of entries in Elangovan Dataset: 3504


In [8]:
# Filter datapoint with interactions with two entities
# Each participant record is reduced to its 'alias' value; participants whose
# alias is None are dropped first, so the count below only covers participants
# that carry alias information.
# NOTE: this cell replaces the 'participants' column in place, so it can only be
# run once per load of `df` (a second run would index the alias lists by 'alias').
df['participants'] = df['participants'].apply(
    lambda entries: [entry['alias'] for entry in entries if entry['alias'] is not None]
)
df['nparticipants'] = df['participants'].apply(len)
# Keep only rows with exactly two remaining participants and renumber the rows.
df = df[df['nparticipants'] == 2].reset_index(drop=True)

In [9]:
# Create a dict for genes
# For every row, collect the first element of each alias record of the first
# participant (source) and of the second participant (target).
source_gene_list = df.participants.apply(lambda x: [i[0] for i in x[0]]).to_list()
target_gene_list = df.participants.apply(lambda x: [i[0] for i in x[1]]).to_list()

# The shortest alias is used as the gene name. min() returns the first shortest
# entry, the same element a stable sort by length would place first, without
# sorting every alias list repeatedly.
source_names = [min(aliases, key=len) for aliases in source_gene_list]
target_names = [min(aliases, key=len) for aliases in target_gene_list]
df['source'] = source_names
df['target'] = target_names

# One record per row: lower-cased gene name, a NaN placeholder id, and the full
# alias list the name was chosen from.
source_gene_dict = [
    {'gene': name.lower(), 'id': np.nan, 'aliases': aliases}
    for name, aliases in zip(source_names, source_gene_list)
]
target_gene_dict = [
    {'gene': name.lower(), 'id': np.nan, 'aliases': aliases}
    for name, aliases in zip(target_names, target_gene_list)
]

In [10]:
# Keep the five columns needed downstream, give them short names, and use the
# PubMed id as the row index (the id column itself is removed by set_index).
df = (
    df[['pubmedId', 'source', 'target', 'interactionType', 'pubmedabstract']]
    .rename(columns={
        'pubmedId': 'pmid',
        'interactionType': 'interaction',
        'pubmedabstract': 'abstract',
    })
    .set_index('pmid')
)
df.head()

Unnamed: 0_level_0,source,target,interaction,abstract
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10373560,MYT1,CDK1,phosphorylation,The Myt1 protein kinase functions to negativel...
10373560,MYT1,CDK1,phosphorylation,The Myt1 protein kinase functions to negativel...
10831594,p33,PKD,phosphorylation,We identified the multifunctional chaperon pro...
10393177,jnk-1,JUN,phosphorylation,The c-Jun N-terminal kinase (JNK) of the MAP k...
10393177,ATF2,pmk-1,phosphorylation,The c-Jun N-terminal kinase (JNK) of the MAP k...


In [11]:
# Unique genes
# Combine source and target records and keep one record per gene name; when a
# name occurs more than once, the last record seen for that name is the one kept.
gene_dict = source_gene_dict + target_gene_dict
gene_dict = list({record['gene']: record for record in gene_dict}.values())
print(f'{len(gene_dict)} unique genes found.')

1530 unique genes found.


In [12]:
# Normalize genes
# Pass the gene records plus the per-row source names, target names and abstracts
# to normalize_genes and store its result back in the 'abstract' column.
sources = df['source'].to_list()
targets = df['target'].to_list()
abstracts = df['abstract'].to_list()
df['abstract'] = normalize_genes(gene_dict, sources, targets, abstracts)
df.head()

2885 abstracts to be normalized...
2885 abstracts normalized...
Done!


Unnamed: 0_level_0,source,target,interaction,abstract
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10373560,MYT1,CDK1,phosphorylation,The myt1 protein kinase functions to negativel...
10373560,MYT1,CDK1,phosphorylation,The myt1 protein kinase functions to negativel...
10831594,p33,PKD,phosphorylation,We identified the multifunctional chaperon pro...
10393177,jnk-1,JUN,phosphorylation,The c-jun N-terminal kinase (JNK) of the MAP k...
10393177,ATF2,pmk-1,phosphorylation,The c-Jun N-terminal kinase (JNK) of the MAP k...


In [13]:
# Run the project's sentence finder on the normalized frame and preview the result.
# NOTE(review): judging by the recorded output, the result has one row per
# sentence with added 'sentence', 'n_lines' and 'match' columns — confirm in
# relation_finder.
rel_df = find_sentences(df)
rel_df.head()

2885 abstracts...
22304 sentences found...
1437 sentences have a relation information.


Unnamed: 0_level_0,source,target,interaction,abstract,sentence,n_lines,match
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10373560,MYT1,CDK1,phosphorylation,The myt1 protein kinase functions to negativel...,The myt1 protein kinase functions to negativel...,1205,1
10373560,MYT1,CDK1,phosphorylation,The myt1 protein kinase functions to negativel...,Throughout interphase human myt1 localizes to...,1205,1
10373560,MYT1,CDK1,phosphorylation,The myt1 protein kinase functions to negativel...,Here we report that overproduction of either ...,1205,0
10373560,MYT1,CDK1,phosphorylation,The myt1 protein kinase functions to negativel...,The COOHterminal 63 amino acids of myt1 were ...,1205,1
10373560,MYT1,CDK1,phosphorylation,The myt1 protein kinase functions to negativel...,myt1 mutants lacking this domain no longer bo...,1205,1


In [14]:
# Keep only the columns that are exported; the row index is left untouched.
export_columns = ['source', 'target', 'interaction', 'sentence', 'match']
rel_df = rel_df.loc[:, export_columns]

In [15]:
# Write the selected columns to CSV. The path is relative, so the file lands in
# the current working directory. to_csv writes the row index by default, so the
# index is written as the first CSV column.
rel_df.to_csv('elangovan_norm_rel.csv')