Import Libraries

In [1]:
import os

In [2]:
home_dir = '../'
os.chdir(home_dir)
from ext import download_pubmed_articles, search_gene_id, download_gene_names
from norm import normalize_genes
from relation_finder import find_sentences

"""
Required: A valid entrez registered email and entrez api-key.
More information: Entrez Programming Utilities (E-utilities)
"""  
from config import entrez_api_key, entrez_email

In [3]:
import pandas as pd
import numpy as np
import re

In [4]:
from Bio import Entrez
import xmltodict

Preprocessing <b>BioGRID</b>. <br>
<a href="https://thebiogrid.org/"> Biological General Repository for Interaction Datasets</a>

In [5]:
biogrid = pd.read_csv('Data/BIOGRID-ALL-4.3.196.mitab.txt', sep = '\t')
biogrid = biogrid[(biogrid['#ID Interactor A'].str[:6] == 'entrez') & (biogrid['ID Interactor B'].str[:6] == 'entrez')]
biogrid['source'] = biogrid['Alt IDs Interactor A'].apply(lambda x: re.findall(r'entrez gene/locuslink:[\w]*', x)[0][22:])
biogrid['target'] = biogrid['Alt IDs Interactor B'].apply(lambda x: re.findall(r'entrez gene/locuslink:[\w]*', x)[0][22:])
biogrid['interaction'] = biogrid['Interaction Types'].str[17:-1]
biogrid['pubmed'] = biogrid['Publication Identifiers'].str[7:]
biogrid = biogrid[~biogrid.pubmed.isna()]
biogrid.head(5)

Unnamed: 0,#ID Interactor A,ID Interactor B,Alt IDs Interactor A,Alt IDs Interactor B,Aliases Interactor A,Aliases Interactor B,Interaction Detection Method,Publication 1st Author,Publication Identifiers,Taxid Interactor A,Taxid Interactor B,Interaction Types,Source Database,Interaction Identifiers,Confidence Values,source,target,interaction,pubmed
0,entrez gene/locuslink:6416,entrez gene/locuslink:2318,biogrid:112315|entrez gene/locuslink:MAP2K4|un...,biogrid:108607|entrez gene/locuslink:FLNC|unip...,entrez gene/locuslink:JNKK(gene name synonym)|...,entrez gene/locuslink:ABP-280(gene name synony...,"psi-mi:""MI:0018""(two hybrid)",Marti A (1997),pubmed:9006895,taxid:9606,taxid:9606,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0463""(biogrid)",biogrid:103,-,MAP2K4,FLNC,direct interaction,9006895
1,entrez gene/locuslink:84665,entrez gene/locuslink:88,biogrid:124185|entrez gene/locuslink:MYPN|unip...,biogrid:106603|entrez gene/locuslink:ACTN2|uni...,entrez gene/locuslink:CMD1DD(gene name synonym...,entrez gene/locuslink:CMD1AA(gene name synonym),"psi-mi:""MI:0018""(two hybrid)",Bang ML (2001),pubmed:11309420,taxid:9606,taxid:9606,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0463""(biogrid)",biogrid:117,-,MYPN,ACTN2,direct interaction,11309420
2,entrez gene/locuslink:90,entrez gene/locuslink:2339,biogrid:106605|entrez gene/locuslink:ACVR1|uni...,biogrid:108625|entrez gene/locuslink:FNTA|unip...,entrez gene/locuslink:ACTRI(gene name synonym)...,entrez gene/locuslink:FPTA(gene name synonym)|...,"psi-mi:""MI:0018""(two hybrid)",Wang T (1996),pubmed:8599089,taxid:9606,taxid:9606,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0463""(biogrid)",biogrid:183,-,ACVR1,FNTA,direct interaction,8599089
3,entrez gene/locuslink:2624,entrez gene/locuslink:5371,biogrid:108894|entrez gene/locuslink:GATA2|uni...,biogrid:111384|entrez gene/locuslink:PML|unipr...,entrez gene/locuslink:DCML(gene name synonym)|...,entrez gene/locuslink:MYL(gene name synonym)|e...,"psi-mi:""MI:0018""(two hybrid)",Tsuzuki S (2000),pubmed:10938104,taxid:9606,taxid:9606,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0463""(biogrid)",biogrid:278,-,GATA2,PML,direct interaction,10938104
4,entrez gene/locuslink:6118,entrez gene/locuslink:6774,biogrid:112038|entrez gene/locuslink:RPA2|entr...,biogrid:112651|entrez gene/locuslink:STAT3|uni...,entrez gene/locuslink:REPA2(gene name synonym)...,entrez gene/locuslink:ADMIO(gene name synonym)...,"psi-mi:""MI:0018""(two hybrid)",Kim J (2000),pubmed:10875894,taxid:9606,taxid:9606,"psi-mi:""MI:0407""(direct interaction)","psi-mi:""MI:0463""(biogrid)",biogrid:418,-,RPA2,STAT3,direct interaction,10875894


In [6]:
print('No of entries in BioGRID:', biogrid.shape[0])

No of entries in BioGRID: 1996885


In [16]:
# Create a list of all Pubmed IDs
pubmed_list = list(set(biogrid['Publication Identifiers'].str[7:].to_list()))
# Download abstracts from Pubmed IDs
print('Set 1')
res1 = download_pubmed_articles(entrez_api_key, entrez_email, pubmed_list[:10000])
print('Set 2')
res2 = download_pubmed_articles(entrez_api_key, entrez_email, pubmed_list[10000:20000])
print('Set 3')
res3 = download_pubmed_articles(entrez_api_key, entrez_email, pubmed_list[20000:30000])
print('Set 4')
res4 = download_pubmed_articles(entrez_api_key, entrez_email, pubmed_list[30000:40000])
print('Set 5')
res5 = download_pubmed_articles(entrez_api_key, entrez_email, pubmed_list[40000:50000])
print('Set 6')
res6 = download_pubmed_articles(entrez_api_key, entrez_email, pubmed_list[50000:60000])
print('Set 7')
res7 = download_pubmed_articles(entrez_api_key, entrez_email, pubmed_list[60000:])
res = res1 + res2 + res3 + res4 + res5 + res6 + res7
print('-'*15)
print(f'{len(res)} articles found out of {len(pubmed_list)}')

KeyError: 'Publication Identifiers'

In [None]:
# Create DataFrame for downloaded pubmed articles.
pubmed_articles = pd.DataFrame(res).dropna()
pubmed_articles.index = pubmed_articles.pubmed
pubmed_articles.index.name = 'pmid'
pubmed_articles.head(5)

In [9]:
# Merge BioGRID with pubmed dataframe
biogrid.index = biogrid.pubmed
biogrid.index.name = 'pmid'
biogrid = biogrid[['source', 'target', 'interaction']]
df = biogrid.join(pubmed_articles).dropna(subset = ['abstract'])
df = df[['source', 'target', 'interaction', 'abstract']]
df.head(5)

Unnamed: 0_level_0,source,target,interaction,abstract
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10021333,apt,aret,physical association,The product of the oskar gene directs posterio...
10021333,apt,osk,direct interaction,The product of the oskar gene directs posterio...
10021350,arm,alpha,physical association,Drosophila Armadillo and its vertebrate homolo...
10021350,arm,shg,physical association,Drosophila Armadillo and its vertebrate homolo...
10021361,GRAP2,LCP2,physical association,The adaptor protein Gads is a Grb2-related pro...


In [None]:
# Search gene names
gene_name_list = list(set(df.source.to_list() + df.target.to_list()))
gene_name_list = [i for i in gene_name_list if len(i)>0]
gene_search_res = search_gene_id(entrez_api_key, entrez_email, gene_name_list)

In [11]:
# Download genes.
gene_ids = list(set([i['id'] for i in gene_search_res]))
print('Set 1')
gene_dict1 = download_gene_names(entrez_api_key, entrez_email, gene_ids[:10000])
print('Set 2')
gene_dict2 = download_gene_names(entrez_api_key, entrez_email, gene_ids[10000:20000])
print('Set 3')
gene_dict3 = download_gene_names(entrez_api_key, entrez_email, gene_ids[20000:30000])
print('Set 4')
gene_dict4 = download_gene_names(entrez_api_key, entrez_email, gene_ids[30000:40000])
print('Set 5')
gene_dict5 = download_gene_names(entrez_api_key, entrez_email, gene_ids[40000:])
gene_dict = gene_dict1 + gene_dict2 + gene_dict3 + gene_dict4 + gene_dict5
print('-'*15)
print(f'{len(gene_dict)} genes downloaded out of {len(gene_ids)}')

Set 1
No of unique gene ids: 10000
Downloading from gene db...
Decoding downloaded data...
Parsing to a dict
9846 found.
Set 2
No of unique gene ids: 10000
Downloading from gene db...
Decoding downloaded data...
Parsing to a dict
9841 found.
Set 3
No of unique gene ids: 10000
Downloading from gene db...
Decoding downloaded data...
Parsing to a dict
9849 found.
Set 4
No of unique gene ids: 10000
Downloading from gene db...
Decoding downloaded data...
Parsing to a dict
9817 found.
Set 5
No of unique gene ids: 11823
Downloading from gene db...
Decoding downloaded data...
Parsing to a dict
9835 found.
---------------
49188 genes downloaded out of 51823


In [14]:
# Normalize genes
df['abstract'] = normalize_genes(gene_dict, df.source.to_list(), df.target.to_list(), df.abstract.to_list())
df.head(5)

1977720 abstracts to be normalized...
509402 abstracts normalized...

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



593931 abstracts normalized...

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



677975 abstracts normalized...

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



758167 abstracts normalized...

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



839502 abstracts normalized...

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



918915 abstracts normalized...

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



1000310 abstracts normalized...

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



1977720 abstracts normalized...
Done!


Unnamed: 0_level_0,source,target,interaction,abstract
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10021333,apt,aret,physical association,The product of the oskar gene directs posterio...
10021333,apt,osk,direct interaction,The product of the osk gene directs posterior ...
10021350,arm,alpha,physical association,Drosophila Armadillo and its vertebrate homolo...
10021350,arm,shg,physical association,Drosophila Armadillo and its vertebrate homolo...
10021361,GRAP2,LCP2,physical association,The adaptor protein grap2 is a Grb2-related pr...


In [15]:
# Find sentences with gene mentions
rel_df = find_sentences(df)
rel_df.head(5)

1977720 abstracts...
14189269 sentences found...
313403 sentences have a relation information.


Unnamed: 0_level_0,source,target,interaction,abstract,sentence,n_lines,match
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10021333,apt,aret,physical association,The product of the oskar gene directs posterio...,The product of the oskar gene directs posterio...,1172,0
10021333,apt,aret,physical association,The product of the oskar gene directs posterio...,Proper expression relies on the coordinated l...,1172,0
10021333,apt,aret,physical association,The product of the oskar gene directs posterio...,Translational repression prior to localizatio...,1172,0
10021333,apt,aret,physical association,The product of the oskar gene directs posterio...,To begin to understand how Bruno acts in tran...,1172,0
10021333,apt,aret,physical association,The product of the oskar gene directs posterio...,One interactor described here is the product ...,1172,0


In [17]:
rel_df = rel_df[['source', 'target', 'interaction', 'sentence', 'match']]

In [18]:
rel_df.to_csv('biogrid_norm_rel.csv')