# NCBI Databases Information

In [1]:
from Bio import Entrez
# Contact address that Bio.Entrez sends along with its requests to NCBI.
# NOTE(review): a personal address is hardcoded here; consider loading it from
# configuration before sharing the notebook.
Entrez.email = 'sbwiecko@free.fr'

## Obtaining information about the Entrez databases

See also [https://biopython.org/DIST/docs/tutorial/Tutorial.html#sec145](https://biopython.org/DIST/docs/tutorial/Tutorial.html#sec145).

In [2]:
# einfo() is called without arguments here; the parsed reply carries a
# 'DbList' entry with the database names.
handle = Entrez.einfo()
record = Entrez.read(handle)

# Inspect the parsed reply: its type, its keys, then the names themselves.
print(type(record))
print(record.keys())
print(record['DbList'])

<class 'Bio.Entrez.Parser.DictionaryElement'>
dict_keys(['DbList'])
['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'structure', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'homologene', 'medgen', 'mesh', 'ncbisearch', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'popset', 'proteinclusters', 'pcassay', 'protfam', 'biosystems', 'pccompound', 'pcsubstance', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'gtr']


See more details in [A General Introduction to the E-utilities](https://www.ncbi.nlm.nih.gov/books/NBK25497/).

| Entrez Database   | UID common name    | E-utility Database Name |
| ----------------- | ------------------ | ----------------------- |
| BioProject        | BioProject ID      | bioproject              |
| BioSample         | BioSample ID       | biosample               |
| Biosystems        | BSID               | biosystems              |
| Books             | Book ID            | books                   |
| Conserved Domains | PSSM-ID            | cdd                     |
| dbGaP             | dbGaP ID           | gap                     |
| dbVar             | dbVar ID           | dbvar                   |
| Epigenomics       | Epigenomics ID     | epigenomics             |
| EST               | GI number          | nucest                  |
| Gene              | Gene ID            | gene                    |
| Genome            | Genome ID          | genome                  |
| GEO Datasets      | GDS ID             | gds                     |
| GEO Profiles      | GEO ID             | geoprofiles             |
| GSS               | GI number          | nucgss                  |
| HomoloGene        | HomoloGene ID      | homologene              |
| MeSH              | MeSH ID            | mesh                    |
| NCBI C++ Toolkit  | Toolkit ID         | toolkit                 |
| NCBI Web Site     | Web Site ID        | ncbisearch              |
| NLM Catalog       | NLM Catalog ID     | nlmcatalog              |
| Nucleotide        | GI number          | nuccore                 |
| OMIA              | OMIA ID            | omia                    |
| PopSet            | PopSet ID          | popset                  |
| Probe             | Probe ID           | probe                   |
| Protein           | GI number          | protein                 |
| Protein Clusters  | Protein Cluster ID | proteinclusters         |
| PubChem BioAssay  | AID                | pcassay                 |
| PubChem Compound  | CID                | pccompound              |
| PubChem Substance | SID                | pcsubstance             |
| PubMed            | PMID               | pubmed                  |
| PubMed Central    | PMCID              | pmc                     |
| SNP               | rs number          | snp                     |
| SRA               | SRA ID             | sra                     |
| Structure         | MMDB-ID            | structure               |
| Taxonomy          | TaxID              | taxonomy                |
| UniGene           | UniGene Cluster ID | unigene                 |
| UniSTS            | STS ID             | unists                  |

## Read information from a particular database

In [174]:
# Passing `db` restricts einfo to a single database.
handle = Entrez.einfo(db='sra')

# Parse the XML reply into Python data structures.
record = Entrez.read(handle)

# With `db` given, the details are stored under 'DbInfo' (not 'DbList').
for key, value in record['DbInfo'].items():
    print(key, ':', value)

DbName : sra
MenuName : SRA
Description : SRA Database
DbBuild : Build211012-1433m.1
Count : 17057960
LastUpdate : 2021/10/12 19:10
FieldList : [{'Name': 'ALL', 'FullName': 'All Fields', 'Description': 'All terms from all searchable fields', 'TermCount': '117175019', 'IsDate': 'N', 'IsNumerical': 'N', 'SingleToken': 'N', 'Hierarchy': 'N', 'IsHidden': 'N'}, {'Name': 'UID', 'FullName': 'UID', 'Description': 'Unique number assigned to publication', 'TermCount': '0', 'IsDate': 'N', 'IsNumerical': 'Y', 'SingleToken': 'Y', 'Hierarchy': 'N', 'IsHidden': 'Y'}, {'Name': 'FILT', 'FullName': 'Filter', 'Description': 'Limits the records', 'TermCount': '100', 'IsDate': 'N', 'IsNumerical': 'N', 'SingleToken': 'Y', 'Hierarchy': 'N', 'IsHidden': 'N'}, {'Name': 'ACCN', 'FullName': 'Accession', 'Description': 'Accession number of sequence', 'TermCount': '50701385', 'IsDate': 'N', 'IsNumerical': 'N', 'SingleToken': 'N', 'Hierarchy': 'N', 'IsHidden': 'N'}, {'Name': 'TITL', 'FullName': 'Title', 'Descriptio

In [86]:
# Query einfo once and reuse the parsed 'DbInfo' entry, instead of sending a
# second identical request to NCBI for each printed field.
pmc_info = Entrez.read(Entrez.einfo(db='pmc'))['DbInfo']
print(pmc_info['Description'])
print(pmc_info['Count'])

PubMed Central
7481530


In [87]:
# Boolean query: either spelling (OR), combined with "interferon" (AND).
term = "(U-937 OR U937) AND interferon"

# Search the 'sra' database and parse the reply.
handle = Entrez.esearch(db='sra', term=term)
record = Entrez.read(handle)

print(record)

{'Count': '0', 'RetMax': '0', 'RetStart': '0', 'IdList': [], 'TranslationSet': [], 'TranslationStack': [{'Term': 'U-937[All Fields]', 'Field': 'All Fields', 'Count': '86', 'Explode': 'N'}, {'Term': 'U937[All Fields]', 'Field': 'All Fields', 'Count': '904', 'Explode': 'N'}, 'OR', 'GROUP', {'Term': 'interferon[All Fields]', 'Field': 'All Fields', 'Count': '8472', 'Explode': 'N'}, 'AND'], 'QueryTranslation': '(U-937[All Fields] OR U937[All Fields]) AND interferon[All Fields]'}


In [88]:
# Field tags in square brackets restrict each part of the query to one field;
# the trailing `*` is a wildcard on the title term.
term = "BET inhibitor*[TITL] AND homo sapiens[ORGN]"

# Search the 'sra' database and parse the reply.
handle = Entrez.esearch(db='sra', term=term)
record = Entrez.read(handle)

print(record)

{'Count': '645', 'RetMax': '20', 'RetStart': '0', 'IdList': ['13928271', '13928270', '13928269', '13928268', '11190443', '11190442', '11190441', '11190440', '11190439', '11190438', '11190437', '11190436', '11190435', '11190434', '6864792', '6864791', '6864790', '6864789', '6864788', '6864787'], 'TranslationSet': [{'From': 'homo sapiens[ORGN]', 'To': '"Homo sapiens"[Organism]'}], 'TranslationStack': [{'Term': 'bet inhibitor[TITL]', 'Field': 'TITL', 'Count': '369', 'Explode': 'N'}, {'Term': 'bet inhibitor jq1[TITL]', 'Field': 'TITL', 'Count': '51', 'Explode': 'N'}, 'OR', {'Term': 'bet inhibitor resistance[TITL]', 'Field': 'TITL', 'Count': '44', 'Explode': 'N'}, 'OR', {'Term': 'bet inhibitors[TITL]', 'Field': 'TITL', 'Count': '280', 'Explode': 'N'}, 'OR', 'GROUP', {'Term': '"Homo sapiens"[Organism]', 'Field': 'Organism', 'Count': '2773483', 'Explode': 'Y'}, 'AND'], 'QueryTranslation': '(bet inhibitor[TITL] OR bet inhibitor jq1[TITL] OR bet inhibitor resistance[TITL] OR bet inhibitors[TITL

## Obtaining spelling suggestions

Suggests spelling corrections. See also [https://biopython.org/DIST/docs/tutorial/Tutorial.html#sec152](https://biopython.org/DIST/docs/tutorial/Tutorial.html#sec152). The result includes the original query, the complete corrected query, and a breakdown of the terms in the complete corrected query flagged as either "replaced" or "original".

In [89]:
term = 'biobython'  # deliberately misspelled search term

# Ask espell for a spelling suggestion against the 'pmc' database.
handle = Entrez.espell(db='pmc', term=term)
record = Entrez.read(handle)

print(type(record))
print(record.keys())

# Dump every field of the parsed reply.
for key, value in record.items():
    print(key, ':', value)

<class 'Bio.Entrez.Parser.DictionaryElement'>
dict_keys(['Database', 'Query', 'CorrectedQuery', 'SpelledQuery'])
Database : pmc
Query : biobython
CorrectedQuery : biopython
SpelledQuery : ['', 'biopython']


### Exercise: spelling correction

The following list contains 45 scientific names for animals. We want to ensure that these names are correct. Print the name of the query and the correction you received.

In the second part, we want to print the names THAT ARE CORRECTED ONLY and the correction you received.

In [10]:
# Scientific names to be checked with espell, exactly as given in the exercise.
# NOTE(review): two entries carry a trailing space ('Giraffa camelopardalis ',
# 'Hydrophiinae ') and 'Canis lupus' appears twice; left untouched because the
# list is the exercise input.
sciNames = [
    'Bos gaurus',
    'Antelope cervicapra',
    'Gazella bennettii',
    'Boselaphus tragocamelus',
    'Canis lupus',
    'Panthera leo',
    'Elephas maximus',
    'Equus africanus',
    'Panthera pardus',
    'Cervus canadensis',
    'Pavo cristatus',
    'Grus leucogeranus',
    'Vulpes vulpes',
    'Rhinoceros unicornis',
    'Panthera Tigris',
    'Crocodylus palustris',
    'Gavialis gangeticus',
    'Equus caballus',
    'Equus quagga',
    'Babalus bubalis',
    'Sus scrofa',
    'Camelus dromedaries',
    'Giraffa camelopardalis ',
    'Hemidactylus flaviviridis',
    'Hippopotamus amphibius',
    'Macaca mulatta',
    'Canis lupus',
    'Felis domesticus',
    'Acinonyx jubatus',
    'Rattus rattus',
    'Mus musculus',
    'Oryctolagus cuniculus',
    'Bubo virginianus',
    'Passer domesticus',
    'Corvus splendens',
    'Acridotheres tristis',
    'Psittacula eupatria',
    'Molpastes cafer',
    'Eudynamis scolopaccus',
    'Columba livia',
    'Naja naja',
    'Ophiophagus hannah',
    'Hydrophiinae ',
    'Python molurus',
    'Ptyas mucosa'
]

In [11]:
# Display the description that einfo reports for the 'taxonomy' database.
Entrez.read(Entrez.einfo(db='taxonomy'))['DbInfo']['Description']

'Taxonomy db'

In [18]:
# Print every query next to the spelling that espell returns for it.
for sciName in sciNames:
    # 'taxonomy' is used here; also works with PMC and certainly other databases
    record = Entrez.read(Entrez.espell(db='taxonomy', term=sciName))

    print("Query: ", record['Query'], " - Corrected query: ", record['CorrectedQuery'])

Query:  Bos gaurus  - Corrected query:  bos taurus
Query:  Antelope cervicapra  - Corrected query:  antilope cervicapra
Query:  Gazella bennettii  - Corrected query:  gazella bennettii
Query:  Boselaphus tragocamelus  - Corrected query:  boselaphus tragocamelus
Query:  Canis lupus  - Corrected query:  canis lupus
Query:  Panthera leo  - Corrected query:  panthera leo
Query:  Elephas maximus  - Corrected query:  elephas maximus
Query:  Equus africanus  - Corrected query:  equus africanus
Query:  Panthera pardus  - Corrected query:  panthera pardus
Query:  Cervus canadensis  - Corrected query:  cervus canadensis
Query:  Pavo cristatus  - Corrected query:  pavo cristatus
Query:  Grus leucogeranus  - Corrected query:  grus leucogeranus
Query:  Vulpes vulpes  - Corrected query:  vulpes vulpes
Query:  Rhinoceros unicornis  - Corrected query:  rhinoceros unicornis
Query:  Panthera Tigris  - Corrected query:  panthera tigris
Query:  Crocodylus palustris  - Corrected query:  crocodylus palustri

In [34]:
# Print only the queries for which espell actually proposed a correction.
for sciName in sciNames:
    # 'taxonomy' is used here; also works with PMC and certainly other databases
    record = Entrez.read(Entrez.espell(db='taxonomy', term=sciName))

    # An empty 'SpelledQuery' list is treated as "nothing was corrected".
    if not record['SpelledQuery']:
        continue

    print("Query: ", record['Query'], " - Corrected query: ", record['CorrectedQuery'])

Query:  Bos gaurus  - Corrected query:  bos taurus
Query:  Antelope cervicapra  - Corrected query:  antilope cervicapra
Query:  Babalus bubalis  - Corrected query:  bubalus bubalis
Query:  Camelus dromedaries  - Corrected query:  camelus dromedarius
Query:  Molpastes cafer  - Corrected query:  molasses caffer
Query:  Eudynamis scolopaccus  - Corrected query:  eudynamys scolopaceus


## Searching NCBI databases and retrieving document summaries

To search any of these databases, we use [esearch](https://biopython.org/DIST/docs/tutorial/Tutorial.html#sec146). In the output, you see PubMed IDs. You can also use ESearch to search GenBank, where each of the IDs is a GenBank identifier (Accession number). You can do things like Jones[AUTH] to search the author field, or Sanger[AFFL] to restrict to authors at the Sanger Centre. This can be very handy - especially if you are not so familiar with a particular database.

[ESummary](https://biopython.org/DIST/docs/tutorial/Tutorial.html#sec148) retrieves document summaries from a list of primary IDs

In [156]:
# Show the description of the database that is about to be searched.
print(Entrez.read(Entrez.einfo(db='pcassay'))['DbInfo']['Description'])

term = "antibody AND phagocytosis"
handle = Entrez.esearch(
    term=term,
    db='pcassay',
)

record = Entrez.read(handle)

print(record)

# Fetch the document summary of each hit and print its assay name.
# The loop variable is not called `id`, which would shadow the Python builtin
# for the rest of the kernel session.
for assay_id in record['IdList']:
    handle = Entrez.esummary(
        db='pcassay',
        id=assay_id,
    )
    # The parsed reply is indexable; the first element is the summary.
    assay = Entrez.read(handle)[0]

    print(assay['AssayName'])

PubChem BioAssay Database
{'Count': '21', 'RetMax': '20', 'RetStart': '0', 'IdList': ['339171', '339170', '339167', '339165', '339164', '1139164', '1139163', '1139162', '1139161', '1139160', '1139159', '1139158', '1139157', '339169', '339168', '339166', '290243', '290242', '290238', '290237'], 'TranslationSet': [], 'TranslationStack': [{'Term': 'antibody[All Fields]', 'Field': 'All Fields', 'Count': '6068', 'Explode': 'N'}, {'Term': 'phagocytosis[All Fields]', 'Field': 'All Fields', 'Count': '507', 'Explode': 'N'}, 'AND'], 'QueryTranslation': 'antibody[All Fields] AND phagocytosis[All Fields]'}
Immunostimulant activity in sheep RBC-immunized BALB/c albino mouse assessed as hemagglutininating antibody titer at 1 mg/kg, po administered daily for 7 days by microtiter hemagglutinination technique
Immunostimulant activity in sheep RBC-immunized BALB/c albino mouse assessed as macrophage migration index at 1 mg/kg, po administered daily for 7 days measured after 18 to 24 hrs
Nonspecific immu

In [90]:
# Retrieve the description of the 'pubmed' database.
record = Entrez.read(Entrez.einfo(db="pubmed"))

print(record['DbInfo'].keys())
print("-"*15)

# One line per searchable field: short tag, full name, description.
for field in record["DbInfo"]["FieldList"]:
    print(field['Name'], field['FullName'], field['Description'], sep=', ')

dict_keys(['DbName', 'MenuName', 'Description', 'DbBuild', 'Count', 'LastUpdate', 'FieldList', 'LinkList'])
---------------
ALL, All Fields, All terms from all searchable fields
UID, UID, Unique number assigned to publication
FILT, Filter, Limits the records
TITL, Title, Words in title of publication
WORD, Text Word, Free text associated with publication
MESH, MeSH Terms, Medical Subject Headings assigned to publication
MAJR, MeSH Major Topic, MeSH terms of major importance to publication
AUTH, Author, Author(s) of publication
JOUR, Journal, Journal abbreviation of publication
AFFL, Affiliation, Author's institutional affiliation and address
ECNO, EC/RN Number, EC number for enzyme or CAS registry number
SUBS, Supplementary Concept, CAS chemical name or MEDLINE Substance Name
PDAT, Date - Publication, Date of publication
EDAT, Date - Entrez, Date publication first accessible through Entrez
VOL, Volume, Volume number of publication
PAGE, Pagination, Page number(s) of publication
PTYP, P

In [137]:
# Collect the search options in one place, then expand them into the call.
# NOTE(review): confirm that 'most+recent' is an accepted `sort` value once
# Bio.Entrez has URL-encoded it; the '+' may reach NCBI literally.
search_options = dict(
    db='pubmed',
    term="BET inhibitors[TITL] AND (review[PTYP] OR review[TITL])",
    retmax=100,
    sort='most+recent',
    # date range; (mindate, maxdate) must be used together
    mindate=2016,
    maxdate=2021,
)

handle = Entrez.esearch(**search_options)
record = Entrez.read(handle)

# Number of records matching the query.
print(record['Count'])

15


In [138]:
# Keep the IDs of the search result currently held in `record` for later lookups.
list_pubs = record['IdList']
print(list_pubs)

['34072421', '33723398', '32951810', '32827765', '32218352', '30693439', '29189147', '29050361', '29032717', '28838216', '28786345', '27769357', '27729803', '27250929', '26924192']


In [139]:
# Fetch the summary of the first publication only, to see which fields a
# summary record offers.
first_summary = Entrez.read(Entrez.esummary(db='pubmed', id=list_pubs[0]))[0]
first_summary.keys()

dict_keys(['Item', 'Id', 'PubDate', 'EPubDate', 'Source', 'AuthorList', 'LastAuthor', 'Title', 'Volume', 'Issue', 'Pages', 'LangList', 'NlmUniqueID', 'ISSN', 'ESSN', 'PubTypeList', 'RecordStatus', 'PubStatus', 'ArticleIds', 'DOI', 'History', 'References', 'HasAbstract', 'PmcRefCount', 'FullJournalName', 'ELocationID', 'SO'])

In [140]:
# Print title, publication date and journal for every publication found.
# The loop variable is not called `id`, which would shadow the Python builtin
# for the rest of the kernel session.
for pmid in record['IdList']:
    handle = Entrez.esummary(
        db='pubmed',
        id=pmid,
    )

    # The parsed reply is indexable; the first element is the summary.
    summary = Entrez.read(handle)[0]
    print(f"{summary['Title']}\t{summary['PubDate']}\t{summary['FullJournalName']}")

Are BET Inhibitors yet Promising Latency-Reversing Agents for HIV-1 Reactivation in AIDS Therapy?	2021 May 29	Viruses
Achieving clinical success with BET inhibitors as anti-cancer agents.	2021 Apr	British journal of cancer
Current advances on the development of BET inhibitors: insights from computational methods.	2020	Advances in protein chemistry and structural biology
The emerging role of BET inhibitors in breast cancer.	2020 Oct	Breast (Edinburgh, Scotland)
Role of BET Inhibitors in Triple Negative Breast Cancers.	2020 Mar 25	Cancers
How Might Bromodomain and Extra-Terminal (BET) Inhibitors Operate in Cardiovascular Disease?	2019 Apr	American journal of cardiovascular drugs : drugs, devices, and other interventions
Disrupting Acetyl-lysine Interactions: Recent Advance in the Development of BET Inhibitors.	2018	Current drug targets
BET inhibitors as novel therapeutic agents in breast cancer.	2017 Sep 19	Oncotarget
BET inhibitors in metastatic prostate cancer: therapeutic implications

### Exercise: searching NCBI databases and retrieving document summaries

We have a list of search queries for finding all research papers about molecular markers related to human lung cancer. Test each query and determine which one is the best, and why.

NOTE: \[Title\] is very important because when searching for a research paper, the title is the first indication of the information inside the paper, so if the title contains the words you are searching for, then by a large percentage it will contain the information you want to get.

In [111]:
# Candidate PubMed queries for the exercise, from least to most specific.
terms = [
    # no field tags: every term is looked up in all fields, not only the title
    'homo sapiens AND Lung Cancer AND Marker',
    # requires BOTH "Human" and "homo sapiens", although they are used interchangeably
    'Human AND homo sapiens AND Lung Cancer AND Marker',
    # species restricted to the title; "Lung Cancer" and "Marker" still match any field
    '(Human[Title] OR homo sapiens[Title]) AND Lung Cancer AND Marker',
    # the top-level OR matches any title containing "Human", whatever the topic
    '(Human[Title] OR homo sapiens[Title]) OR (Lung Cancer[Title] AND Marker[Title])',
    # bad: matches anything about lung cancer OR marker OR human
    # (the closing parenthesis was missing in the original query)
    '(Human[Title] OR homo sapiens[Title] OR Lung Cancer[Title] OR Marker[Title])',
    # like the 3rd query, but with every term restricted to the title
    '(Human[Title] OR homo sapiens[Title]) AND (Lung Cancer[Title] AND Marker[Title])',
]

In [113]:
# Run every candidate query against PubMed and report how many records match.
for term in terms:
    record = Entrez.read(Entrez.esearch(db='pubmed', term=term))

    # The query and its hit count are printed on separate lines.
    print(term, record['Count'], sep='\n')

homo sapiens AND Lung Cancer AND Marker
37082
Human AND homo sapiens AND Lung Cancer AND Marker
37081
(Human[Title] OR homo sapiens[Title]) AND Lung Cancer AND Marker
2757
(Human[Title] OR homo sapiens[Title]) OR (Lung Cancer[Title] AND Marker[Title])
1029577
(Human[Title] OR homo sapiens[Title] OR Lung Cancer[Title] OR Marker[Title]
1179756
(Human[Title] OR homo sapiens[Title]) AND (Lung Cancer[Title] AND Marker[Title])
27


In [141]:
# Same logic as the last candidate query, written with the [TITL] field tag.
term = "Lung Cancer[TITL] AND Marker[TITL] AND (homo sapiens[TITL] OR human[TITL])"

handle = Entrez.esearch(db='pubmed', term=term)
record = Entrez.read(handle)

print(record)

{'Count': '27', 'RetMax': '20', 'RetStart': '0', 'IdList': ['34558311', '31307200', '30536321', '30250539', '28875417', '27390594', '26851650', '24829194', '21619671', '19170237', '17628801', '16955534', '12660007', '12627521', '11251963', '9815576', '10920891', '8705998', '7767913', '7514953'], 'TranslationSet': [], 'TranslationStack': [{'Term': 'Lung Cancer[TITL]', 'Field': 'TITL', 'Count': '106471', 'Explode': 'N'}, {'Term': 'Marker[TITL]', 'Field': 'TITL', 'Count': '52458', 'Explode': 'N'}, 'AND', {'Term': 'homo sapiens[TITL]', 'Field': 'TITL', 'Count': '742', 'Explode': 'N'}, {'Term': 'human[TITL]', 'Field': 'TITL', 'Count': '1028305', 'Explode': 'N'}, 'OR', 'GROUP', 'AND'], 'QueryTranslation': 'Lung Cancer[TITL] AND Marker[TITL] AND (homo sapiens[TITL] OR human[TITL])'}


What is the Accession Number of the reference nucleotide sequence for the rice plant plastid?

>Tips to facilitate the solution: The accession number prefix is NC_

In [158]:
terms = ['Rice[Organism] AND RefSeq[Keyword] AND Plastid[Title]']

# 'nuccore' is the same as 'nucleotide'.
# idtype='acc' asks for accession identifiers in the ID list.
# retmax had to be increased because the search reports count=24.
search_options = dict(
    db='nuccore',
    term=terms[0],
    idtype='acc',
    retmax=25,
)

handle = Entrez.esearch(**search_options)
record = Entrez.read(handle)

print(record)

{'Count': '24', 'RetMax': '24', 'RetStart': '0', 'IdList': ['XM_015755471.2', 'XM_015756359.2', 'XM_015789572.2', 'XM_015786234.2', 'XM_015786233.2', 'XM_015782372.2', 'XM_015782371.2', 'XM_015781167.2', 'XM_015767993.2', 'XM_026021934.1', 'XM_015760769.2', 'XR_001542440.2', 'XM_015765844.2', 'XM_015765843.2', 'XM_015765984.2', 'XM_015766187.2', 'XM_015766186.2', 'XM_015766185.2', 'XM_015757722.2', 'XM_015787291.2', 'XM_015757346.1', 'XM_015785362.1', 'XM_015765845.1', 'NC_001320.1'], 'TranslationSet': [{'From': 'Rice[Organism]', 'To': '"Oryza sativa"[Organism]'}], 'TranslationStack': [{'Term': '"Oryza sativa"[Organism]', 'Field': 'Organism', 'Count': '2283562', 'Explode': 'Y'}, {'Term': 'RefSeq[Keyword]', 'Field': 'Keyword', 'Count': '79496272', 'Explode': 'N'}, 'AND', {'Term': 'Plastid[Title]', 'Field': 'Title', 'Count': '144512', 'Explode': 'N'}, 'AND'], 'QueryTranslation': '"Oryza sativa"[Organism] AND RefSeq[Keyword] AND Plastid[Title]'}


In [160]:
# Pick the accession carrying the 'NC_' prefix and show its summary.
# The loop variable is not called `id`, which would shadow the Python builtin.
for accession in record['IdList']:
    # 'NC_' is a prefix, so test the start of the accession rather than
    # looking for the substring anywhere inside it.
    if accession.startswith('NC_'):
        print(f"The Accession Number is: {accession}")
        handle = Entrez.esummary(
            db='nuccore',
            id=accession,
        )

        # The parsed reply is indexable; the first element is the summary.
        summary = Entrez.read(handle)[0]
        print(summary)

The Accession Number is: NC_001320.1
{'Item': [], 'Id': '11466763', 'Caption': 'NC_001320', 'Title': 'Oryza sativa Japonica Group plastid, complete genome', 'Extra': 'gi|11466763|ref|NC_001320.1||gnl|NCBI_GENOMES|10013[11466763]', 'Gi': IntegerElement(11466763, attributes={}), 'CreateDate': '1989/11/23', 'UpdateDate': '2009/04/15', 'Flags': IntegerElement(768, attributes={}), 'TaxId': IntegerElement(39947, attributes={}), 'Length': IntegerElement(134525, attributes={}), 'Status': 'live', 'ReplacedBy': '', 'Comment': '  ', 'AccessionVersion': 'NC_001320.1'}


## Global search at NCBI

[EGQuery](https://biopython.org/DIST/docs/tutorial/Tutorial.html#sec151) provides counts for a search term in each of the Entrez databases (i.e. a global query). This is particularly useful to find out how many items your search terms would find in each database without actually performing lots of separate searches with ESearch

In [109]:
# Global query: one hit count per Entrez database for the same search term.
term = "BET inhibitors[TITL] AND (review[PTYP] OR review[TITL])"
handle = Entrez.egquery(term=term)
record = Entrez.read(handle)

# Each row of the result names a database and its count.
for row in record["eGQueryResult"]:
    print(row['DbName'], ": ", row['Count'])

pubmed :  16
pmc :  32
mesh :  0
books :  0
pubmedhealth :  Error
omim :  0
ncbisearch :  0
nuccore :  0
nucgss :  0
nucest :  0
protein :  0
genome :  0
structure :  0
taxonomy :  0
snp :  0
dbvar :  0
gene :  0
sra :  0
biosystems :  0
unigene :  0
cdd :  0
clone :  0
popset :  0
geoprofiles :  0
gds :  0
homologene :  0
pccompound :  0
pcsubstance :  0
pcassay :  0
nlmcatalog :  0
probe :  0
gap :  0
proteinclusters :  0
bioproject :  0
biosample :  0
biocollections :  0


### Exercise

```python
r = Entrez.read(Entrez.egquery(term="Cancer"))
```

1. How many PubMed records?
2. What is the number of MeSH records?
3. How many books?
4. How many nucleotide records?

In [164]:
# Global query for "Cancer" across the Entrez databases.
handle = Entrez.egquery(term="Cancer")
record = Entrez.read(handle)

# Only the four databases asked about in the exercise are reported.
wanted_dbs = ['pubmed', 'mesh', 'books', 'nuccore']

for row in record["eGQueryResult"]:
    db = row['DbName']
    if db in wanted_dbs:
        print(db, ": ", row['Count'])

pubmed :  4430710
mesh :  397
books :  127903
nuccore :  10692351


## Fetching data from NCBI

Three examples of Homo sapiens genes:

- HBB
- DNASE1L3
- OCA2

In [165]:
# Unrestricted search: 'HBB' carries no field tag.
handle = Entrez.esearch(db='nucleotide', term='HBB')
record = Entrez.read(handle)

print(record)

{'Count': '6798', 'RetMax': '20', 'RetStart': '0', 'IdList': ['1531243787', '1217338357', '2090987776', '2090987765', '1441565460', '1401724401', '142384133', '2086922094', '2086918440', '2085699295', '2084455553', '2084449657', '2077483945', '2077483944', '2077154353', '2077154352', '2077154351', '2077154350', '2077154349', '2077154348'], 'TranslationSet': [], 'TranslationStack': [{'Term': 'HBB[All Fields]', 'Field': 'All Fields', 'Count': '6798', 'Explode': 'N'}, 'GROUP'], 'QueryTranslation': 'HBB[All Fields]'}


In [167]:
# The [Gene Name] tag focuses the search on items with that gene name specified.
term = 'HBB[Gene Name]'

handle = Entrez.esearch(db='nucleotide', term=term)
record = Entrez.read(handle)

print(record)

{'Count': '3889', 'RetMax': '20', 'RetStart': '0', 'IdList': ['2090987776', '2090987765', '1401724401', '2085699295', '2084455553', '2084449657', '164448673', '2055116838', '1043620671', '1005667215', '918400561', '28380636', '2043735767', '2043735766', '2043735765', '2043735764', '2043735763', '2043735762', '2043735761', '2043735760'], 'TranslationSet': [], 'TranslationStack': [{'Term': 'HBB[Gene Name]', 'Field': 'Gene Name', 'Count': '3889', 'Explode': 'N'}, 'GROUP'], 'QueryTranslation': 'HBB[Gene Name]'}


In [191]:
# Additionally require the RefSeq keyword and ask for accession identifiers.
search_options = dict(
    db='nucleotide',
    term='HBB[Gene Name] AND RefSeq[Keyword]',
    retmax=2000,  # beware: use a decent number here!
    idtype='acc',
)

handle = Entrez.esearch(**search_options)
record = Entrez.read(handle)

print(record)

{'Count': '71', 'RetMax': '71', 'RetStart': '0', 'IdList': ['NM_000518.5', 'NM_001097648.1', 'NC_056068.1', 'NG_000007.3', 'NC_000011.10', 'NC_000012.12', 'NC_054388.1', 'NM_131020.3', 'NM_001086273.2', 'NM_001304885.1', 'NM_001246752.1', 'NM_001168847.1', 'NM_001164428.1', 'NM_001144841.1', 'NM_001164018.1', 'NM_001123666.1', 'NM_033234.1', 'NG_059281.1', 'NM_001314043.1', 'NM_001082260.3', 'NM_173917.2', 'NC_051336.1', 'NM_001329918.1', 'NW_023666043.1', 'XM_037840313.1', 'NC_051312.1', 'XM_017507286.1', 'NW_016107685.1', 'NC_048596.1', 'NW_003645186.1', 'XM_027407893.2', 'XM_027386456.2', 'NM_001304883.1', 'NM_001304110.1', 'NM_001303935.1', 'NM_001303868.1', 'NM_001303858.1', 'NC_047579.1', 'NW_022611653.1', 'XM_032166808.1', 'XM_032240524.1', 'NW_022436987.1', 'NC_045446.1', 'XM_023209613.1', 'NC_044987.1', 'XM_004090649.3', 'NC_044395.1', 'NG_000940.4', 'NM_001279263.1', 'NM_001201019.1', 'NC_041767.1', 'NC_037342.1', 'NC_036890.1', 'XM_508242.4', 'XM_002822127.4', 'NC_036914.1',

In [192]:
# Collect the FASTA header line of every mRNA record found by the last search.
counter = 0
fetch_list = []

for ID in record['IdList']:
    # NM stands for mRNA. 'NM_' is a prefix, so test the start of the accession
    # rather than looking for the substring anywhere inside it.
    if ID.startswith('NM_'):
        counter += 1
        fetch = Entrez.efetch(
            db='nucleotide',
            id=ID,
            rettype='fasta',
            retmode='text',
        )

        try:
            # readline() returns only the first line, i.e. the FASTA header;
            # read() would return the whole record as text.
            read_fetch = fetch.readline()
        finally:
            # Release the connection even if reading fails.
            fetch.close()

        fetch_list.append(read_fetch)

print(counter)
print(fetch_list)

23
['>NM_000518.5 Homo sapiens hemoglobin subunit beta (HBB), mRNA\n', '>NM_001097648.1 Ovis aries hemoglobin, beta (HBB), mRNA\n', '>NM_131020.3 Danio rerio hemoglobin, beta adult 1 (hbba1), mRNA\n', '>NM_001086273.2 Xenopus laevis hemoglobin subunit delta S homeolog (hbd.S), mRNA\n', '>NM_001304885.1 Ailuropoda melanoleuca hemoglobin subunit beta (LOC100499574), mRNA\n', '>NM_001246752.1 Cricetulus griseus beta major globin chain (Hbb), mRNA\n', '>NM_001168847.1 Papio anubis hemoglobin, beta (HBB), mRNA\n', '>NM_001164428.1 Macaca mulatta hemoglobin subunit beta (HBB), mRNA\n', '>NM_001144841.1 Sus scrofa hemoglobin, beta (HBB), mRNA\n', '>NM_001164018.1 Equus caballus hemoglobin, beta (HBB), mRNA\n', '>NM_001123666.1 Salmo salar beta globin (LOC100136576), mRNA\n', '>NM_033234.1 Rattus norvegicus hemoglobin subunit beta (Hbb), mRNA\n', '>NM_001314043.1 Oryctolagus cuniculus hemoglobin, beta (HBB2), transcript variant 1, mRNA\n', '>NM_001082260.3 Oryctolagus cuniculus hemoglobin, bet

In [193]:
# Download every mRNA record of the last search as FASTA text and save them.
fetch_list = []

for ID in record['IdList']:
    # NM stands for mRNA, XM predicted mRNA, NC genomic. 'NM_' is a prefix, so
    # test the start of the accession rather than searching inside it.
    if ID.startswith('NM_'):
        fetch = Entrez.efetch(
            db='nucleotide',
            id=ID,
            rettype='fasta',
            retmode='text',
        )

        try:
            # read() returns the raw text of the whole record; it is not parsed
            # the way Entrez.read parses XML.
            read_fetch = fetch.read()
        finally:
            # Release the connection even if reading fails.
            fetch.close()

        fetch_list.append(read_fetch)

# Open the file once, in write mode: appending ('a+') would duplicate every
# record each time this cell is re-run.
with open('HBB.fasta', 'w') as saved_file:
    saved_file.writelines(fetch_list)

For another search and fetch, we just have to change the `term` and the final filename

In [194]:
# Search the OCA2 RefSeq records, keep the mRNA ones and save them as FASTA.
fetch_list = []

handle = Entrez.esearch(
    db='nucleotide',
    term='OCA2[Gene Name] AND RefSeq[Keyword]',
    retmax=2000,
    idtype='acc',
)

record = Entrez.read(handle)
print(f"Item count after search = {record['Count']}")

for ID in record['IdList']:
    # NM stands for mRNA. 'NM_' is a prefix, so test the start of the accession
    # rather than looking for the substring anywhere inside it.
    if ID.startswith('NM_'):
        fetch = Entrez.efetch(
            db='nucleotide',
            id=ID,
            rettype='fasta',
            retmode='text',
        )

        try:
            # read() returns the raw text of the whole record; it is not parsed
            # the way Entrez.read parses XML.
            read_fetch = fetch.read()
        finally:
            # Release the connection even if reading fails.
            fetch.close()

        fetch_list.append(read_fetch)

print(f"Item count after filtering = {len(fetch_list)}")

# Open the file once, in write mode: appending ('a+') would duplicate every
# record each time this cell is re-run.
with open('OCA2.fasta', 'w') as saved_file:
    saved_file.writelines(fetch_list)

Item count after search = 1152
Item count after filtering = 10


In [198]:
# Search the DNASE1L3 RefSeq records, keep the mRNA ones and save them as FASTA.
fetch_list = []

handle = Entrez.esearch(
    db='nucleotide',
    term='DNASE1L3[Gene Name] AND RefSeq[Keyword]',
    retmax=1000,
    idtype='acc',
)

record = Entrez.read(handle)
print(f"Item count after search = {record['Count']}")

for ID in record['IdList']:
    # NM stands for mRNA. 'NM_' is a prefix, so test the start of the accession
    # rather than looking for the substring anywhere inside it.
    if ID.startswith('NM_'):
        fetch = Entrez.efetch(
            db='nucleotide',
            id=ID,
            rettype='fasta',
            retmode='text',
        )

        try:
            # read() returns the raw text of the whole record; it is not parsed
            # the way Entrez.read parses XML.
            read_fetch = fetch.read()
        finally:
            # Release the connection even if reading fails.
            fetch.close()

        fetch_list.append(read_fetch)

print(f"Item count after filtering = {len(fetch_list)}")

# Open the file once, in write mode: appending ('a+') would duplicate every
# record each time this cell is re-run.
with open('DNASE1L3.fasta', 'w') as saved_file:
    saved_file.writelines(fetch_list)

Item count after search = 730
Item count after filtering = 10


Fetch the sequences from a specific organism

In [199]:
# Same search as before, restricted to one organism through the [Organism] tag.
fetch_list = []

handle = Entrez.esearch(
    db='nucleotide',
    term='DNASE1L3[Gene Name] AND RefSeq[Keyword] AND Homo sapiens[Organism]',
    retmax=1000,
    idtype='acc',
)

record = Entrez.read(handle)
print(f"Item count after search = {record['Count']}")

for ID in record['IdList']:
    # NM stands for mRNA. 'NM_' is a prefix, so test the start of the accession
    # rather than looking for the substring anywhere inside it.
    if ID.startswith('NM_'):
        fetch = Entrez.efetch(
            db='nucleotide',
            id=ID,
            rettype='fasta',
            retmode='text',
        )

        try:
            # read() returns the raw text of the whole record; it is not parsed
            # the way Entrez.read parses XML.
            read_fetch = fetch.read()
        finally:
            # Release the connection even if reading fails.
            fetch.close()

        fetch_list.append(read_fetch)

print(f"Item count after filtering = {len(fetch_list)}")

# Open the file once, in write mode: appending ('a+') would duplicate every
# record each time this cell is re-run.
with open('DNASE1L3-human.fasta', 'w') as saved_file:
    saved_file.writelines(fetch_list)

Item count after search = 4
Item count after filtering = 2


A different format, e.g. GenBank

In [200]:
# Same search and filter as before; only the requested format (rettype='gb')
# and the output file name differ.
fetch_list = []

handle = Entrez.esearch(
    db='nucleotide',
    term='DNASE1L3[Gene Name] AND RefSeq[Keyword] AND Homo sapiens[Organism]',
    retmax=1000,
    idtype='acc',
)

record = Entrez.read(handle)
print(f"Item count after search = {record['Count']}")

for ID in record['IdList']:
    # NM stands for mRNA. 'NM_' is a prefix, so test the start of the accession
    # rather than looking for the substring anywhere inside it.
    if ID.startswith('NM_'):
        fetch = Entrez.efetch(
            db='nucleotide',
            id=ID,
            rettype='gb',
            retmode='text',
        )

        try:
            # read() returns the raw text of the whole record; it is not parsed
            # the way Entrez.read parses XML.
            read_fetch = fetch.read()
        finally:
            # Release the connection even if reading fails.
            fetch.close()

        fetch_list.append(read_fetch)

print(f"Item count after filtering = {len(fetch_list)}")

# Open the file once, in write mode: appending ('a+') would duplicate every
# record each time this cell is re-run.
with open('DNASE1L3-human.gb', 'w') as saved_file:
    saved_file.writelines(fetch_list)

Item count after search = 4
Item count after filtering = 2
