Import Libraries

In [1]:
import os

In [2]:
# Switch the working directory to the parent folder; the relative paths used
# later in this notebook (e.g. 'Data/TRRUST_Human.txt') are resolved from there.
# Presumably this is also what makes the local modules imported below
# importable — TODO confirm.
# NOTE(review): the path is relative, so re-running this cell moves up one
# more level each time; restart the kernel before re-running.
home_dir = '../'
os.chdir(home_dir)
from ext import download_pubmed_articles, search_gene_id, download_gene_names
from norm import normalize_genes
from relation_finder import find_sentences

"""
Required: A valid entrez registered email and entrez api-key.
More information: Entrez Programming Utilities (E-utilities)
"""  
from config import entrez_api_key, entrez_email

In [3]:
import pandas as pd
import numpy as np
import re

In [4]:
from Bio import Entrez
import xmltodict

Preprocessing <b>TRRUST</b>. <br>
<a href="https://www.grnpedia.org/trrust/">Transcriptional Regulatory Relationships Unravelled by Sentence-based Text-mining</a>

In [5]:
# Load the TRRUST table (tab-separated). `pubmed` is read explicitly as text:
# a later cell parses it with string methods, which must not depend on pandas
# inferring an object dtype for that column. Missing values stay NaN.
trrust = pd.read_csv('Data/TRRUST_Human.txt', sep='\t', dtype={'pubmed': str})

In [6]:
# Preview the first rows of the raw table (head() defaults to 5 rows).
trrust.head()

Unnamed: 0,source,source type,target,target type,interaction,pubmed
0,AATF,TF,BAX,PROTEIN,Repression,22909821
1,AATF,TF,CDKN1A,PROTEIN,Unknown,17157788
2,AATF,TF,KLK3,PROTEIN,Unknown,23146908
3,AATF,TF,MYC,PROTEIN,Activation,20549547
4,AATF,TF,TP53,PROTEIN,Unknown,17157788


In [7]:
# Report how many rows the raw TRRUST table contains.
print('No of entries in TRRUST:', len(trrust))

No of entries in TRRUST: 9396


In [8]:
# Drop datapoints without pubmed ids, then turn each comma-separated id string
# into a list of numeric PMID strings (non-numeric tokens are discarded).
def _parse_pmids(raw):
    """Return the numeric PMID tokens of one `pubmed` cell as a list of strings."""
    if isinstance(raw, list):
        # Already parsed (the cell was re-run): keep it instead of failing on .split().
        return raw
    # Split on ',' and strip, so '1,2', '1, 2' and '1 ,  2' all parse the same way
    # instead of being silently dropped by the isdigit() filter.
    return [tok for tok in (part.strip() for part in str(raw).split(',')) if tok.isdigit()]

# .copy() makes the filtered frame independent of the raw one, so the column
# assignment below cannot trigger SettingWithCopyWarning.
trrust = trrust[trrust.pubmed.notna()].copy()
trrust['pubmed'] = trrust.pubmed.apply(_parse_pmids)

In [9]:
# Flatten the per-row id lists into one list of Pubmed IDs (row order kept,
# duplicates included).
pubmed_list = []
for pmids in trrust.pubmed:
    pubmed_list.extend(pmids)

# Download abstracts for the collected Pubmed IDs.
res = download_pubmed_articles(entrez_api_key, entrez_email, pubmed_list)

No of unique pubmed ids: 6561
Downloading from pubmed db...
Decoding downloaded data...
Parsing to a dict...
6561 pubmed matches found out of 6561 pubmed articles.
Extracting abstracts and titles...
Done!


In [10]:
# Build a DataFrame from the downloaded records, drop rows with any missing
# value and index it by Pubmed id (the `pubmed` column itself is kept).
pubmed_articles = (
    pd.DataFrame(res)
    .dropna()
    .set_index('pubmed', drop=False)
    .rename_axis('pmid')
)
pubmed_articles.head()

Unnamed: 0_level_0,pubmed,title,abstract
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
7914192,7914192,Binding of an ETS-related protein within the D...,Promoter elements accounting for HER2 (c-erbB-...
7635140,7635140,Identification of a transcriptional regulatory...,Human aromatase cytochrome P450 catalyzes the ...
18441094,18441094,Glucocorticoid regulation of CD38 expression i...,"The enzymatic activity of CD38, ADP-ribosyl cy..."
10677505,10677505,Myeloblastin is a granulocyte colony-stimulati...,Hematopoiesis depends on a pool of quiescent h...
21637919,21637919,RB1CC1 activates the p16 promoter through the ...,"RB1-inducible coiled-coil 1 (RB1CC1, also know..."


In [11]:
# One row per Pubmed id reference: explode the id lists, discard rows left
# without an id and index the result by that id (the column is kept).
trrust = (
    trrust
    .explode('pubmed')
    .dropna(subset=['pubmed'])
    .set_index('pubmed', drop=False)
    .rename_axis('pmid')
)
trrust.head()

Unnamed: 0_level_0,source,source type,target,target type,interaction,pubmed
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
22909821,AATF,TF,BAX,PROTEIN,Repression,22909821
17157788,AATF,TF,CDKN1A,PROTEIN,Unknown,17157788
23146908,AATF,TF,KLK3,PROTEIN,Unknown,23146908
20549547,AATF,TF,MYC,PROTEIN,Activation,20549547
17157788,AATF,TF,TP53,PROTEIN,Unknown,17157788


In [12]:
# Merge TRRUST with the pubmed dataframe on their shared 'pmid' index and keep
# only the relations for which an abstract is available.
df = (
    trrust[['source', 'target', 'interaction']]
    .join(pubmed_articles[['abstract']])
    .dropna(subset=['abstract'])
)
df.head()

Unnamed: 0_level_0,source,target,interaction,abstract
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10022128,MYC,TERT,Activation,The telomerase reverse transcriptase component...
10022513,ERG,HMOX1,Unknown,Overexpression of human heme oxygenase-1 has b...
10022513,ETS1,HMOX1,Unknown,Overexpression of human heme oxygenase-1 has b...
10022513,FLI1,HMOX1,Unknown,Overexpression of human heme oxygenase-1 has b...
10022869,SMAD3,JUN,Unknown,Transcriptional regulation by transforming gro...


In [13]:
# Gather every gene name used as a source or a target (sources first), search
# for their gene ids, then download the gene records for the ids found.
gene_name_list = [*df.source, *df.target]
gene_search_res = search_gene_id(entrez_api_key, entrez_email, gene_name_list)
gene_ids = [hit['id'] for hit in gene_search_res]
gene_dict = download_gene_names(entrez_api_key, entrez_email, gene_ids)

2861 unique gene names...
Searching...
No of unique gene ids: 2859s...
Downloading from gene db...
Decoding downloaded data...
Parsing to a dict
2855 found.


In [14]:
# Normalize genes: overwrite the abstract column with the output of
# normalize_genes, called with the row-aligned sources, targets and abstracts.
df['abstract'] = normalize_genes(
    gene_dict,
    [*df.source],
    [*df.target],
    [*df.abstract],
)
df.head()

11686 abstracts to be normalized...
11686 abstracts normalized...
Done!


Unnamed: 0_level_0,source,target,interaction,abstract
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10022128,MYC,TERT,Activation,The tert component (TERT) is not expressed in ...
10022513,ERG,HMOX1,Unknown,Overexpression of human heme oxygenase-1 has b...
10022513,ETS1,HMOX1,Unknown,Overexpression of human heme oxygenase-1 has b...
10022513,FLI1,HMOX1,Unknown,Overexpression of human heme oxygenase-1 has b...
10022869,SMAD3,JUN,Unknown,Transcriptional regulation by transforming gro...


In [15]:
# Find sentences with gene mentions. find_sentences is a project helper;
# judging by the output below it returns one row per sentence with a `match`
# flag — TODO confirm against relation_finder.
rel_df = find_sentences(df)
rel_df.head()

11686 abstracts...
115615 sentences found...
22276 sentences have a relation information.


Unnamed: 0_level_0,source,target,interaction,abstract,sentence,n_lines,match
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10022128,MYC,TERT,Activation,The tert component (TERT) is not expressed in ...,The tert component TERT is not expressed in mo...,1319,0
10022128,MYC,TERT,Activation,The tert component (TERT) is not expressed in ...,Here we identify the myc transcription factor...,1319,1
10022128,MYC,TERT,Activation,The tert component (TERT) is not expressed in ...,Through the use of a hormone inducible form o...,1319,1
10022128,MYC,TERT,Activation,The tert component (TERT) is not expressed in ...,These findings demonstrate that the TERT gene...,1319,1
10022128,MYC,TERT,Activation,The tert component (TERT) is not expressed in ...,Since telomerase activation frequently correl...,1319,1


In [16]:
# Keep only the columns needed downstream, in this order.
rel_df = rel_df.loc[:, ['source', 'target', 'interaction', 'sentence', 'match']]

In [17]:
# Save the sentence-level table as CSV. The path is relative to the current
# working directory, which an earlier cell changes with os.chdir('../').
# to_csv writes the index as the first column by default.
rel_df.to_csv('trrust_norm_rel.csv')