## Entrez package

Thanks to the Entrez package we can communicate with the Entrez databases.
( https://www.ncbi.nlm.nih.gov/Class/MLACourse/Original8Hour/Entrez/ ) 

In [1]:
# protein search based on protein ID. result in gb format
from Bio import Entrez
from Bio import SeqIO
Entrez.email = "ap85@seznam.cz"

protein_id = 'NP_035357.1'
# Download the GenBank flat file and save it as '<protein_id>.gb' in the
# working directory. The `with` blocks close both handles even when the
# download or the write raises.
with Entrez.efetch(db="protein", id=protein_id, rettype="gb", retmode="text") as net_handle:
    with open(protein_id + '.gb', "w") as out_handle:
        out_handle.write(net_handle.read())
# Parse the saved file; the read handle is closed as soon as parsing is done.
with open(protein_id + '.gb', mode="r") as handle:
    protein_record = SeqIO.read(handle, "genbank")

In [6]:
# protein_record.features is a plain Python list (type(...) is list), so it can
# be indexed. Keep the first feature in `x` and show it as the cell output.
x = protein_record.features[0]
x

SeqFeature(FeatureLocation(ExactPosition(0), ExactPosition(233)), type='source')

In [7]:
# Look up the 'organism' qualifier of the feature stored in `x`; the value is a list of strings.
x.qualifiers['organism']

['Mus musculus']

In [10]:
# Print the database cross-references of every CDS feature in the record.
# .get() keeps the loop going when a CDS feature carries no 'db_xref'
# qualifier (an empty list is printed for it instead of raising KeyError).
for feature in protein_record.features:
    if feature.type == 'CDS':
        print(feature.qualifiers.get('db_xref', []))

['CCDS:CCDS22095.1', 'GeneID:19332', 'MGI:MGI:102789']


GenBank records are not perfectly consistent...
* RefSeq - they are trying to keep the records ordered
* GeneID is not always in the second position... always check that you are getting what you want
* it is better to solve this by searching for the GeneID term with a function - get_gene_id

In [11]:
# gene search based on the ID of its corresponding protein product
import os
from Bio import Entrez
from Bio import SeqIO

Entrez.email = "ap85@seznam.cz"  # Always tell NCBI who you are

def read_protein(protein_id):
    """Return the GenBank record of a protein, downloading it on first use.

    The record is cached in a file named exactly ``protein_id`` (no
    extension), resolved relative to the current working directory. When
    that file already exists it is parsed directly and no request is made.

    Args:
        protein_id: identifier passed to the Entrez "protein" database,
            e.g. 'NP_035357.1'. It is also used as the cache file name.

    Returns:
        The record produced by ``SeqIO.read(handle, "genbank")``.
    """
    filename = protein_id
    if not os.path.isfile(filename):  # record is not cached locally yet
        # efetch is evaluated before open(), so a failed request does not
        # leave an empty cache file behind. Both handles are closed on exit.
        with Entrez.efetch(db="protein", id=protein_id, rettype="gb", retmode="text") as net_handle, \
                open(filename, "w") as out_handle:
            out_handle.write(net_handle.read())
    # Close the cache file once parsing has finished.
    with open(filename, mode="r") as handle:
        return SeqIO.read(handle, "genbank")

def get_gene_id(protein_record):
    """Return the GeneID cross-reference of the first CDS feature that has one.

    Every ``db_xref`` entry of every CDS feature is inspected, so the result
    does not depend on where in the list the GeneID entry sits.

    Args:
        protein_record: object with a ``features`` iterable; each feature
            needs a ``type`` string and a ``qualifiers`` mapping.

    Returns:
        The identifier as a string (the text after ``GeneID:``), or ``None``
        when no CDS feature carries a GeneID cross-reference.
    """
    for feature in protein_record.features:
        if feature.type != "CDS":
            continue
        # A CDS feature may lack 'db_xref' altogether; treat that as "no entries".
        for item in feature.qualifiers.get("db_xref", []):
            parts = item.split(':')
            # Match on the database prefix itself, not on any substring, and
            # skip malformed entries that have nothing after the prefix.
            if parts[0] == 'GeneID' and len(parts) > 1:
                return parts[1]
    return None
                
# Load the record for NP_035357.1, then report the GeneID found in it.
protein_record = read_protein('NP_035357.1')
gene_id = get_gene_id(protein_record)
print('gene id: {0}'.format(gene_id))

gene id: 19332


## Pubmed

In [32]:
# number of publications for given author
from Bio import Entrez
Entrez.email = "ap85@seznam.cz"     # Always tell NCBI who you are
# Close the network handle as soon as the reply has been parsed.
with Entrez.esearch(db="pubmed", term="stanislav kmoch", retmax=100000) as handle:
    record = Entrez.read(handle)
# 'Count' is the total number of matches reported by the search, whereas
# len(record["IdList"]) only counts the IDs actually returned (capped by retmax).
int(record["Count"])

76

In [26]:
# Dump the whole esearch result held in `record`.
# NOTE(review): `record` is whatever search cell ran last in the kernel, so
# re-run the notebook top to bottom before trusting the saved output.
print(record)

{'Count': '18', 'RetMax': '18', 'RetStart': '0', 'IdList': ['30411505', '29967284', '29875394', '29352102', '29311744', '27839525', '27466185', '27412140', '27392076', '27296017', '25058500', '24587672', '24449431', '24114807', '23602711', '23543484', '23415546', '21820099'], 'TranslationSet': [{'From': 'Anna Pristoupilova', 'To': 'Pristoupilova, Anna[Full Author Name]'}], 'TranslationStack': [{'Term': 'Pristoupilova, Anna[Full Author Name]', 'Field': 'Full Author Name', 'Count': '18', 'Explode': 'N'}, 'GROUP'], 'QueryTranslation': 'Pristoupilova, Anna[Full Author Name]'}


In [22]:
# coauthor search
from Bio import Entrez
from Bio import Medline

MAX_COUNT = 10               # how many publications to download
TERM = 'Anna Pristoupilova'  # PubMed search term

print('Getting {0} publications containing {1}...'.format(MAX_COUNT, TERM))
Entrez.email = "ap85@seznam.cz"
# Search first: this yields the total hit count and up to MAX_COUNT PubMed IDs.
with Entrez.esearch(db='pubmed', retmax=MAX_COUNT, term=TERM) as h:
    result = Entrez.read(h)
print('Total number of publications containing {0}: {1}'.format(TERM, result['Count']))
ids = result['IdList']

# Collect the distinct author names of the fetched publications. A set makes
# de-duplication cheap; Medline.parse reads lazily, so the records must be
# consumed while the handle is still open.
unique_authors = set()
with Entrez.efetch(db='pubmed', id=ids, rettype='medline', retmode='text') as h:
    records = Medline.parse(h)
    # `medline_record` (not `record`) so the search result that other cells
    # keep in `record` is not overwritten.
    for medline_record in records:
        # Records without an 'AU' field contribute the placeholder '?'.
        unique_authors.update(medline_record.get('AU', ['?']))
authors = sorted(unique_authors)  # sort once, after all records are read
print('Authors: {0}'.format(', '.join(authors)))

Getting 10 publications containing Anna Pristoupilova...
Total number of publications containing Anna Pristoupilova: 18
Authors: Acott PD, Adamkova V, Adams DJ, Alper SL, Anderson GW, Azou M, Bahlo M, Baresova V, Beck BB, Beltran S, Berkovic SF, Bleyer AJ, Blumenstiel B, Bolar NA, Breuss M, Brunner H, Cadieux-Dion M, Carpenter S, Ceuterick-de Groote C, Conlon P, Connaughton DM, Cossette P, Cotman SL, Crocker JF, Damiano JA, DeFelice M, Deltas CC, Dermaut B, Durnberger G, Edwards A, Flint J, Golzio C, Greka A, Gstrein T, Gubler MC, Gut I, Gut M, Hansen AH, Hansikova H, Harden M, Hartmannova H, Hayot G, Hildebrand MS, Hnizda A, Hochstoeger T, Hodanova K, Hoischen A, Honzik T, Houstek J, Hubacek JA, Hulkova H, Huyghe JR, Jedlickova I, Jiricka V, Jirsa M, Kaplanova V, Katsanis N, Keane TM, Keays DA, Kidd K, Kmoch S, Kucerova-Vidrova V, Landler L, Lavin P, Leca I, Loeys BL, Magner M, Mallet M, Matej R, Matthys E, Mazurova S, McFadden K, Mechtler K, Meurs A, Mole SE, Mortier G, Mracek T, Mra

In [29]:
# try to get the publications for a given search term (here a gene name, not an author); esearch returns PubMed IDs, not titles
from Bio import Entrez
Entrez.email = "ap85@seznam.cz"     # Always tell NCBI who you are
# Search PubMed for the term and parse the reply; the `with` block closes the
# network handle once parsing is done.
with Entrez.esearch(db="pubmed", term="NOTCH2NLC", retmax=100000) as handle:
    record = Entrez.read(handle)
print(record)

{'Count': '5', 'RetMax': '5', 'RetStart': '0', 'IdList': ['31433517', '31413119', '31332381', '31332380', '31178126'], 'TranslationSet': [], 'TranslationStack': [{'Term': 'NOTCH2NLC[All Fields]', 'Field': 'All Fields', 'Count': '5', 'Explode': 'N'}, 'GROUP'], 'QueryTranslation': 'NOTCH2NLC[All Fields]'}
