Import Libraries

In [1]:
import os

In [2]:
# Switch the working directory to the parent folder before the project
# imports that follow. All relative paths used later in the notebook are
# resolved against this directory.
# NOTE(review): presumably ext/norm/relation_finder/config live in the parent
# folder and are importable only after this chdir - confirm.
home_dir = '../'
os.chdir(home_dir)
from ext import download_pubmed_articles, search_gene_id, download_gene_names
from norm import normalize_genes
from relation_finder import find_sentences

"""
Required: A valid entrez registered email and entrez api-key.
More information: Entrez Programming Utilities (E-utilities)
"""  
from config import entrez_api_key, entrez_email

In [3]:
import pandas as pd
import numpy as np
import re

In [4]:
from Bio import Entrez
import xmltodict

Preprocessing <b>AIR</b>. <br>
<a href="https://air.bio.informatik.uni-rostock.de/">Atlas of Inflammation Resolution</a>

In [5]:
# Load the AIR table from CSV; the path is relative to the current working directory.
air = pd.read_csv('Data/air_mim.csv')

In [6]:
# Preview the first three rows of the loaded table.
air.head(3)

Unnamed: 0,Source,SourceType,SourceIDs,InteractionTypeShort,InteractionType,Modification,Modifier,ModifierType,ModifierIDs,Target,TargetType,TargetIDs,Reference
0,ITGAL:ITGB2,COMPLEX,,positive,,,,,,ICAM1,RECEPTOR,"ensembl:ENSG00000090339, ncbigene:3383, HGNC:5...",15300248
1,ITGB4:ITGB1,COMPLEX,,positive,,,,,,VCAM1,RECEPTOR,"HGNC:12663, ensembl:ENSG00000162692, uniprot:P...",10626664;7504895
2,CXCL12,PROTEIN,"HGNC:10672, ensembl:ENSG00000107562, uniprot:P...",positive,,,,,,CXCR4,RECEPTOR,"ensembl:ENSG00000121966, ncbigene:7852, HGNC:2...",22220212;16267013;27180275


In [7]:
# Report how many rows the AIR table holds at this point.
print('No of entries in AIR:', len(air))

No of entries in AIR: 19230


In [8]:
# Keep only rows that cite at least one reference. dropna() and assign() each
# return a new frame, so the column is never written through a filtered slice
# (assigning `air.Reference = ...` on a boolean-masked frame triggers pandas'
# SettingWithCopyWarning).
air = air.dropna(subset=['Reference'])
# Split the ';'-separated reference string into a list of PubMed IDs.
# Tokens are stripped first so IDs padded with whitespace ("123; 456") are
# kept; any token that is not purely numeric is still discarded.
air = air.assign(
    Reference=air['Reference'].map(
        lambda refs: [tok.strip() for tok in refs.split(';') if tok.strip().isdigit()]
    )
)

In [9]:
# Flatten the per-row reference lists into one list of PubMed IDs
# (duplicates are kept at this stage).
pubmed_list = []
for reference_ids in air.Reference.to_list():
    pubmed_list.extend(reference_ids)
# Download the articles for those IDs using the Entrez credentials.
res = download_pubmed_articles(entrez_api_key, entrez_email, pubmed_list)

No of unique pubmed ids: 15625
Downloading from pubmed db...
Decoding downloaded data...
Parsing to a dict...
9997 pubmed matches found out of 15625 pubmed articles.
Extracting abstracts and titles...
Done!


In [10]:
# Build a table from the downloaded articles, discarding rows with any
# missing field.
pubmed_articles = pd.DataFrame(res).dropna()
# Index the table by PubMed ID under the name 'pmid'; the 'pubmed' column
# itself stays in the frame.
pubmed_articles = pubmed_articles.set_index(pubmed_articles['pubmed'].rename('pmid'))
pubmed_articles.head(5)

Unnamed: 0_level_0,pubmed,title,abstract
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
21761340,21761340,Prostate cancer stem cells: do they have a bas...,The prostate is a luminal secretory tissue who...
26663434,26663434,Systemic genome screening identifies the outco...,Systemic analyses using large-scale genomic pr...
23970470,23970470,Inhibitory effect of microRNA-34a on retinal p...,Retinal pigment epithelial (RPE) cells play im...
12057865,12057865,A myeloperoxidase polymorphism associated with...,Myeloperoxidase (MPO) is a metabolic/oxidative...
20533062,20533062,The CAG repeat in SCA12 functions as a cis ele...,"PPP2R2B, a protein widely expressed in neurons..."


In [11]:
# Give every cited PubMed ID its own row. Rows whose reference list was
# empty come out of explode() as NaN and are dropped.
air = air.explode('Reference').dropna(subset=['Reference'])
# Index the table by PubMed ID under the name 'pmid', keeping 'Reference'
# as a regular column too.
air = air.set_index(air['Reference'].rename('pmid'))
air.head(5)

Unnamed: 0_level_0,Source,SourceType,SourceIDs,InteractionTypeShort,InteractionType,Modification,Modifier,ModifierType,ModifierIDs,Target,TargetType,TargetIDs,Reference
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
15300248,ITGAL:ITGB2,COMPLEX,,positive,,,,,,ICAM1,RECEPTOR,"ensembl:ENSG00000090339, ncbigene:3383, HGNC:5...",15300248
10626664,ITGB4:ITGB1,COMPLEX,,positive,,,,,,VCAM1,RECEPTOR,"HGNC:12663, ensembl:ENSG00000162692, uniprot:P...",10626664
7504895,ITGB4:ITGB1,COMPLEX,,positive,,,,,,VCAM1,RECEPTOR,"HGNC:12663, ensembl:ENSG00000162692, uniprot:P...",7504895
22220212,CXCL12,PROTEIN,"HGNC:10672, ensembl:ENSG00000107562, uniprot:P...",positive,,,,,,CXCR4,RECEPTOR,"ensembl:ENSG00000121966, ncbigene:7852, HGNC:2...",22220212
16267013,CXCL12,PROTEIN,"HGNC:10672, ensembl:ENSG00000107562, uniprot:P...",positive,,,,,,CXCR4,RECEPTOR,"ensembl:ENSG00000121966, ncbigene:7852, HGNC:2...",16267013


In [12]:
# Merge AIR with the PubMed dataframe on the shared 'pmid' index and keep
# only the rows for which an abstract is available.
merged = air.join(pubmed_articles)
merged = merged[merged['abstract'].notna()]
# Keep the interaction triple plus the abstract, with lower-case names.
df = merged[['Source', 'Target', 'InteractionTypeShort', 'abstract']].rename(
    columns={
        'Source': 'source',
        'Target': 'target',
        'InteractionTypeShort': 'interaction',
    }
)
df.head(5)

Unnamed: 0_level_0,source,target,interaction,abstract
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10022118,CDKN1A,CDK2,negative,The cyclin-dependent kinase inhibitor p21waf1/...
10022118,CASP3,CDKN1A,negative,The cyclin-dependent kinase inhibitor p21waf1/...
10022513,ERG,HMOX1,positive,Overexpression of human heme oxygenase-1 has b...
10022513,ETS1,HMOX1,positive,Overexpression of human heme oxygenase-1 has b...
10022513,FLI1,HMOX1,positive,Overexpression of human heme oxygenase-1 has b...


In [13]:
# Collect every name that appears as a source or as a target.
gene_name_list = df['source'].tolist() + df['target'].tolist()
# Search for those names and keep the 'id' field of each search result.
gene_search_res = search_gene_id(entrez_api_key, entrez_email, gene_name_list)
gene_ids = [hit['id'] for hit in gene_search_res]
# Download the gene records for the collected IDs.
gene_dict = download_gene_names(entrez_api_key, entrez_email, gene_ids)

4678 unique gene names...
Searching...
No of unique gene ids: 4415s...
Downloading from gene db...
Decoding downloaded data...
Parsing to a dict
4365 found.


In [14]:
# Normalize genes: replace the 'abstract' column with the output of
# normalize_genes(). assign() builds a new frame instead of writing into
# `df`, which is a column subset of the joined table; the in-place
# `df['abstract'] = ...` form can raise pandas' SettingWithCopyWarning.
# NOTE(review): assumes normalize_genes returns one abstract per input row,
# in the same order - confirm against norm.py.
df = df.assign(
    abstract=normalize_genes(
        gene_dict,
        df.source.to_list(),
        df.target.to_list(),
        df.abstract.to_list(),
    )
)

16835 abstracts to be normalized...
16835 abstracts normalized...
Done!


In [15]:
# Find sentences with gene mentions.
# NOTE(review): find_sentences is a project helper; judging by the output
# below it yields one row per sentence with extra 'sentence', 'n_lines' and
# 'match' columns - confirm against relation_finder.py.
rel_df = find_sentences(df)
# Preview the first five rows.
rel_df.head(5)

16835 abstracts...
164451 sentences found...
30214 sentences have a relation information.


Unnamed: 0_level_0,source,target,interaction,abstract,sentence,n_lines,match
pmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10022118,CDKN1A,CDK2,negative,The cyclin-dependent kinase inhibitor cdkn1acd...,The cyclindependent kinase inhibitor cdkn1acdk...,747,0
10022118,CDKN1A,CDK2,negative,The cyclin-dependent kinase inhibitor cdkn1acd...,We report herein that cdkn1a was cleaved by c...,747,0
10022118,CDKN1A,CDK2,negative,The cyclin-dependent kinase inhibitor cdkn1acd...,The cleaved cdkn1a fragment could no more arr...,747,0
10022118,CDKN1A,CDK2,negative,The cyclin-dependent kinase inhibitor cdkn1acd...,Thus caspase3mediated cleavage and inactivati...,747,0
10022118,CASP3,CDKN1A,negative,The cyclin-dependent kinase inhibitor cdkn1acd...,The cyclindependent kinase inhibitor cdkn1acdk...,752,0


In [17]:
# Keep only the columns that go into the exported file.
export_columns = ['source', 'target', 'interaction', 'sentence', 'match']
rel_df = rel_df[export_columns]

In [18]:
# Write the table to CSV in the current working directory; the index is
# written as well (pandas default).
rel_df.to_csv('air_norm_rel.csv')