Import Libraries

In [1]:
import os

In [2]:
# Move the working directory one level up; the imports that follow and the
# relative 'Data/...' path used later are resolved from there.
# The directory the kernel started in is remembered on the first execution so
# that re-running this cell does not climb one directory higher every time
# (a bare os.chdir('../') is relative to wherever the previous run left us).
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
home_dir = '../'
os.chdir(os.path.join(NOTEBOOK_DIR, home_dir))
from ext import download_pubmed_articles, search_gene_id, download_gene_names, search_chem_id, download_chem_names
from norm import normalize_chems_genes
from relation_finder import find_sentences

"""
Required: A valid entrez registered email and entrez api-key.
More information: Entrez Programming Utilities (E-utilities)
"""  
from config import entrez_api_key, entrez_email

In [3]:
import pandas as pd
import numpy as np
import re

In [4]:
from Bio import Entrez
import xmltodict

Preprocessing <b>ChemProt</b><br>
<a href="https://biocreative.bioinformatics.udel.edu/news/corpora/chemprot-corpus-biocreative-vi/">Chemical-Protein Interactions</a>

In [5]:
# Load the ChemProt table; the path is relative to the working directory set above.
chemprot = pd.read_csv(filepath_or_buffer='Data/chemprot.csv')

In [6]:
# Index the rows by PMID (as a string), discard rows with any missing value,
# and remove the now-redundant 'pmid' column.
# The guard makes the cell safe to re-run: once 'pmid' has moved into the
# index there is nothing left to do, whereas an unguarded second execution
# fails because the 'pmid' column no longer exists.
if 'pmid' in chemprot.columns:
    chemprot = (
        chemprot
        .dropna()
        .assign(pmid=lambda frame: frame['pmid'].astype('str'))
        .set_index('pmid')
    )
chemprot.head(5)

Unnamed: 0_level_0,source,target,interaction,abstract
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10047461,Tomudex,kinase,ACTIVATOR,Tomudex (ZD1694) is a specific antifolate-base...
10047461,Tomudex,cyclin E,INDIRECT-UPREGULATOR,Tomudex (ZD1694) is a specific antifolate-base...
10047461,Tomudex,cdk2,INDIRECT-UPREGULATOR,Tomudex (ZD1694) is a specific antifolate-base...
10047461,Tomudex,p27(kip1),INDIRECT-DOWNREGULATOR,Tomudex (ZD1694) is a specific antifolate-base...
10047461,Tomudex,cyclin E,ACTIVATOR,Tomudex (ZD1694) is a specific antifolate-base...


In [7]:
# Report how many rows survived the preprocessing step.
n_entries = len(chemprot)
print('No of entries in ChemProt:', n_entries)

No of entries in ChemProt: 15734


In [9]:
# Work on an independent copy: later cells write into `df` in place
# (normalized abstracts, replaced source/target names), and a plain alias
# would silently apply those edits to `chemprot` as well.
df = chemprot.copy()

In [11]:
# Collect the chemical mentions, look up an id for each of them, then
# download the names recorded for those ids.
chem_name_list = df['source'].tolist()
chem_search_res = search_chem_id(entrez_api_key, entrez_email, chem_name_list)
chem_ids = [hit['id'] for hit in chem_search_res]
chem_dict = download_chem_names(entrez_api_key, entrez_email, chem_ids)

3952 unique chemical names.
Searching...
No of unique gene ids: 2380 3952
Downloading from gene db...
Decoding downloaded data...
Parsing to a dict
2375 found.


In [12]:
# Keep only (id, chem) from the downloaded entries and expose the downloaded
# name under the column 'syn'.
chem_df = pd.DataFrame(chem_dict)[['id', 'chem']].rename(columns={'chem': 'syn'})

# Join the search hits with the downloaded names on 'id' (inner join, the
# pandas default) and discard rows that have any missing value.
chem_merged = pd.DataFrame(chem_search_res).merge(chem_df, on='id').dropna()

# From here on `chem_dict` is a list with one dict per merged row.
chem_dict = chem_merged.to_dict('records')

In [15]:
# Look up an id for each distinct gene mention, then download the names
# recorded for those ids.
unique_gene_names = df['target'].unique().tolist()
gene_search_res = search_gene_id(entrez_api_key, entrez_email, unique_gene_names)
gene_ids = [hit['id'] for hit in gene_search_res]
gene_dict = download_gene_names(entrez_api_key, entrez_email, gene_ids)

# Keep only (id, gene) from the download and expose the downloaded name
# under the column 'aliases'.
gene_df = pd.DataFrame(gene_dict)[['id', 'gene']].rename(columns={'gene': 'aliases'})

# Join the search hits with the downloaded names on 'id' (inner join, the
# pandas default) and discard rows that have any missing value.
gene_merged = pd.DataFrame(gene_search_res).merge(gene_df, on='id').dropna()

# From here on `gene_dict` is a list with one dict per merged row.
gene_dict = gene_merged.to_dict('records')

4190 unique gene names...
Searching...
No of unique gene ids: 2493s...
Downloading from gene db...
Decoding downloaded data...
Parsing to a dict
2417 found.


In [16]:
# Replace the abstracts with the output of normalize_chems_genes, which
# receives both lookup lists plus the per-row source, target and abstract.
df['abstract'] = normalize_chems_genes(
    chem_dict,
    gene_dict,
    df['source'].to_list(),
    df['target'].to_list(),
    df['abstract'].to_list(),
)

15734 abstracts to be normalized...
15734 abstracts normalized...
Done!


In [17]:
# Replace each row's chemical ('source') and gene ('target') mention with the
# name stored in the lookup lists ('syn' / 'aliases').
#
# Why this is not a row loop with df.iloc[i]['source'] = ...:
#   * that form is chained assignment - it writes to a temporary row object
#     and is a silent no-op under pandas Copy-on-Write;
#   * it was wrapped in a bare `except: pass`, hiding every failure;
#   * it scanned both lookup lists for every row.
#
# Rules kept from the row loop:
#   * matching is case-insensitive;
#   * when several entries share a name, the first one in the list wins;
#   * a row is updated only when BOTH its chemical and its gene are found,
#     otherwise it is left untouched.


def build_name_lookup(records, name_key, value_key):
    """Map lower-cased ``record[name_key]`` to ``record[value_key]``.

    The first record seen for a given name wins; records whose name is not a
    string are skipped.
    """
    lookup = {}
    for record in records:
        name = record.get(name_key)
        if isinstance(name, str):
            lookup.setdefault(name.lower(), record[value_key])
    return lookup


chem_lookup = build_name_lookup(chem_dict, 'chem', 'syn')
gene_lookup = build_name_lookup(gene_dict, 'gene', 'aliases')

# Non-string cells become NaN after .str.lower() and therefore stay unresolved.
norm_source = df['source'].str.lower().map(chem_lookup)
norm_target = df['target'].str.lower().map(gene_lookup)
resolved = (norm_source.notna() & norm_target.notna()).to_numpy()

# The index (pmid) contains duplicates, so write positionally through plain
# arrays instead of relying on label alignment.
df['source'] = np.where(resolved, norm_source.to_numpy(), df['source'].to_numpy())
df['target'] = np.where(resolved, norm_target.to_numpy(), df['target'].to_numpy())

# Make the skipped rows visible instead of swallowing them silently.
print(f'{int(resolved.sum())} of {len(df)} rows renamed; '
      f'{int((~resolved).sum())} left unchanged (chemical or gene not found).')

In [18]:
# Preview the first five rows after the name replacement.
df.head()

Unnamed: 0_level_0,source,target,interaction,abstract
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10047461,raltitrexed,egfr,ACTIVATOR,raltitrexed (ZD1694) is a specific antifolate-...
10047461,raltitrexed,cyce,INDIRECT-UPREGULATOR,raltitrexed (ZD1694) is a specific antifolate-...
10047461,raltitrexed,cdk2,INDIRECT-UPREGULATOR,raltitrexed (ZD1694) is a specific antifolate-...
10047461,raltitrexed,cdkn1b,INDIRECT-DOWNREGULATOR,raltitrexed (ZD1694) is a specific antifolate-...
10047461,raltitrexed,cyce,ACTIVATOR,raltitrexed (ZD1694) is a specific antifolate-...


In [19]:
# Run find_sentences over the table (per the original note: locate sentences
# with gene and chem mentions) and preview the first five rows of the result.
rel_df = find_sentences(df)
rel_df.head()

15734 abstracts...
187672 sentences found...
15693 sentences have a relation information.


Unnamed: 0_level_0,source,target,interaction,abstract,sentence,n_lines,match
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10047461,raltitrexed,egfr,ACTIVATOR,raltitrexed (ZD1694) is a specific antifolate-...,raltitrexed ZD1694 is a specific antifolatebas...,2815,0
10047461,raltitrexed,egfr,ACTIVATOR,raltitrexed (ZD1694) is a specific antifolate-...,Studies were carried out in vitro to evaluate...,2815,0
10047461,raltitrexed,egfr,ACTIVATOR,raltitrexed (ZD1694) is a specific antifolate-...,Twentyfour hours following the initial 2h tre...,2815,0
10047461,raltitrexed,egfr,ACTIVATOR,raltitrexed (ZD1694) is a specific antifolate-...,The changes in cyclin and cdk protein express...,2815,0
10047461,raltitrexed,egfr,ACTIVATOR,raltitrexed (ZD1694) is a specific antifolate-...,raltitrexed treatment resulted in the decreas...,2815,1


In [21]:
# Keep only the columns that are written to disk; the full abstract and
# 'n_lines' are dropped.
kept_columns = ['source', 'target', 'interaction', 'sentence', 'match']
rel_df = rel_df.loc[:, kept_columns]

In [22]:
# Write the result to the current working directory; the pmid index is
# written as the first CSV column (to_csv keeps the index by default).
rel_df.to_csv(path_or_buf='chemprot_norm_rel.csv')