# BIOINFORMATICS DATABASES WITH BIOPYTHON

1. [NCBI](#1.-NCBI)<br>
    1.1. [Nucleotide BLAST](#1.1.-Nucleotide-BLAST)<br>
    1.2. [Protein BLAST](#1.2.-Protein-BLAST)
    
2. [ENTREZ](#2.-ENTREZ)<br>
    2.1. [PUBMED](#2.1.-PUBMED)<br>
    2.2. [Nucleotide](#2.2.-Nucleotide)
    
3. [PDB](#3.-PDB)

4. [EXPASY](#4.-EXPASY)<br>
    4.1. [PROSITE](#4.1.-PROSITE)<br>
    4.2. [ScanProsite](#4.2.-ScanProsite)
    
5. [KEGG](#5.-KEGG)

# 1. NCBI

### Import Modules

In [1]:
from Bio.Blast import NCBIWWW
from Bio import SeqIO, SearchIO

In [2]:
# help(NCBIWWW.qblast)

## 1.1. Nucleotide BLAST

In [3]:
# !ls

In [4]:
# Load the query nucleotide sequence; the FASTA file is expected to hold a
# single record.
nuc_record = SeqIO.read("nuc_seq.fasta", "fasta")
# Sequence length (a SeqRecord's length is the length of its .seq).
len(nuc_record)

774

In [5]:
# Submit the nucleotide query to NCBI's online BLAST service
# (program "blastn", database "nt"). This cell needs network access.
result_handle = NCBIWWW.qblast("blastn", "nt", nuc_record.seq)
# Parse the XML reply into a single query result.
blast_result = SearchIO.read(result_handle, "blast-xml")
# The reply has been fully parsed, so release the handle instead of leaving
# it open for the rest of the session.
result_handle.close()

In [6]:
# print(blast_result[0:2])

In [7]:
# First entry of the BLAST result and the first element of that entry.
# NOTE: `Seq` here is a BLAST hit, not a sequence object.
Seq = blast_result[0]
details = Seq[0]

# One line per field, emitted by a single print call.
print(
    f"Sequence ID: {Seq.id}",
    f"Sequence Description: {Seq.description}",
    f"E-value: {details.evalue}",
    sep="\n",
)

Sequence ID: gi|1946508550|gb|MW375728.1|
Sequence Description: Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/ESP/AST-232026126/2020, complete genome
E-value: 0.0


In [8]:
# Show the alignment stored on `details` (the first element of the top hit).
print(f"alignment:\n{details.aln}")

alignment:
Alignment with 2 rows and 774 columns
ATCGCTCCAGGGCAAACTGGAAAGATTGCTGATTATAATTATAA...GGT No
ATCGCTCCAGGGCAAACTGGAAAGATTGCTGATTATAATTATAA...GGT gi|1946508550|gb|MW375728.1|


## 1.2. Protein BLAST

In [9]:
# Load the query protein sequence; the FASTA file is expected to hold a
# single record.
prot_record = SeqIO.read("prot_seq.fasta", "fasta")
# Sequence length (a SeqRecord's length is the length of its .seq).
len(prot_record)

258

In [10]:
# Submit the protein query to NCBI's online BLAST service
# (program "blastp", database "pdb"). This cell needs network access.
result_handle = NCBIWWW.qblast("blastp", "pdb", prot_record.seq)
# Parse the XML reply into a single query result.
blast_result = SearchIO.read(result_handle, "blast-xml")
# The reply has been fully parsed, so release the handle instead of leaving
# it open for the rest of the session.
result_handle.close()

In [11]:
# print(blast_result[0:2])

In [12]:
# First entry of the BLAST result and the first element of that entry.
# NOTE: `Seq` here is a BLAST hit, not a sequence object.
Seq = blast_result[0]
details = Seq[0]

# One line per field, emitted by a single print call.
print(
    f"Sequence ID: {Seq.id}",
    f"Sequence Description: {Seq.description}",
    f"E-value: {details.evalue}",
    sep="\n",
)

Sequence ID: pdb|7BYR|A
Sequence Description: BD23-Fab in complex with the S ectodomain trimer [Severe acute respiratory syndrome coronavirus 2]
E-value: 0.0


In [13]:
# Show the alignment stored on `details` (the first element of the top hit).
# No space after the newline: the stray space indented the first line of the
# alignment and made this output differ from the nucleotide section's.
print(f"alignment:\n{details.aln}")

alignment:
 Alignment with 2 rows and 258 columns
IAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLY...PIG unnamed
IAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLY...PIG pdb|7BYR|A


------------------------------------------------------

# 2. ENTREZ

### Import Modules

In [47]:
from Bio import Entrez

In [48]:
# help(Entrez)

In [49]:
# Contact address attached to the Entrez requests made below.
# NOTE(review): this is a hardcoded personal address; consider reading it from
# configuration or an environment variable before sharing the notebook.
Entrez.email = "datacyclopes@gmail.com"

In [17]:
# Ask Entrez which databases it exposes.
handle = Entrez.einfo()
record = Entrez.read(handle)
# The reply is fully parsed; close the handle rather than leaving it open.
handle.close()
record["DbList"]

['pubmed', 'protein', 'nuccore', 'ipg', 'nucleotide', 'structure', 'sparcle', 'protfam', 'genome', 'annotinfo', 'assembly', 'bioproject', 'biosample', 'blastdbinfo', 'books', 'cdd', 'clinvar', 'gap', 'gapplus', 'grasp', 'dbvar', 'gene', 'gds', 'geoprofiles', 'homologene', 'medgen', 'mesh', 'ncbisearch', 'nlmcatalog', 'omim', 'orgtrack', 'pmc', 'popset', 'proteinclusters', 'pcassay', 'biosystems', 'pccompound', 'pcsubstance', 'seqannot', 'snp', 'sra', 'taxonomy', 'biocollections', 'gtr']

## 2.1. PUBMED

In [18]:
# Fetch the metadata Entrez publishes about the PubMed database.
handle = Entrez.einfo(db="pubmed")
record = Entrez.read(handle)
# The reply is fully parsed; close the handle rather than leaving it open.
handle.close()
record["DbInfo"]["Description"]

'PubMed bibliographic record'

In [19]:
# Record count reported for PubMed by the einfo call above (a string).
record["DbInfo"]["Count"]

'31943423'

In [20]:
# Search PubMed for articles matching the term "biopython".
handle = Entrez.esearch(db="pubmed", term="biopython")
record = Entrez.read(handle)
# The reply is fully parsed; close the handle rather than leaving it open.
handle.close()
record["IdList"]

['33242467', '32044951', '31762715', '31278684', '31069053', '30013827', '29641230', '28011774', '24929426', '24497503', '24267035', '24194598', '23842806', '23157543', '22909249', '22399473', '21666252', '21210977', '20015970', '19811691']

In [21]:
# Fetch document summaries for two PubMed IDs and print a few fields of each.
handle = Entrez.esummary(db="pubmed", id='33242467, 32044951')
records = Entrez.parse(handle)
for record in records:
    print(record['AuthorList'],record['Title'],record['PubDate'],record['FullJournalName'])
# Entrez.parse reads from the handle lazily, so it may only be closed once the
# loop has consumed every record.
handle.close()

['Kricka LJ', 'Cornish TC', 'Park JY'] Eponyms in clinical chemistry. 2021 Jan Clinica chimica acta; international journal of clinical chemistry
['Ireland SM', 'Martin ACR'] atomium-a Python structure parser. 2020 May 1 Bioinformatics (Oxford, England)


In [22]:
# Download the full PubMed entry for one article and dump the raw reply.
handle = Entrez.efetch(db="pubmed", id="19811691")
print(handle.read())
# Everything has been read; close the handle rather than leaving it open.
handle.close()


Pubmed-entry ::= {
  pmid 19811691,
  medent {
    em std {
      year 2009,
      month 10,
      day 9,
      hour 6,
      minute 0
    },
    cit {
      title {
        name "Exploratory visual analysis of conserved domains on multiple
 sequence alignments."
      },
      authors {
        names std {
          {
            name ml "Jankun-Kelly TJ",
            affil str "Institute for Digital Biology and Department of
 Computer Science and Engineering, Bagley College of Engineering, Mississippi
 State University, Mississippi, USA. tjk@acm.org"
          },
          {
            name ml "Lindeman AD"
          },
          {
            name ml "Bridges SM"
          }
        }
      },
      from journal {
        title {
          iso-jta "BMC Bioinformatics",
          ml-jta "BMC Bioinformatics",
          issn "1471-2105",
          name "BMC bioinformatics"
        },
        imp {
          date std {
            year 2009,
            month 10,
            day 8
   

## 2.2. Nucleotide

In [42]:
# Search the nucleotide database, limited to the first 10 matching IDs.
handle = Entrez.esearch(db="nucleotide", retmax=10, term="Severe acute respiratory syndrome")
record = Entrez.read(handle)
# The reply is fully parsed; close the handle rather than leaving it open.
handle.close()
record["IdList"]

['1949110243', '1949110228', '1949110214', '1949110198', '1949110184', '1949110171', '1949110158', '1949110144', '1949110130', '1949110116']

In [57]:
# Request one nucleotide record as GenBank-formatted text. The handle is left
# open on purpose: the next cell parses the record from it.
handle2 = Entrez.efetch(db="nucleotide", id="1946981036", rettype="gb", retmode="text")
# Keep the line below commented out: reading the handle here would consume the
# data the next cell needs.
# print(handle2.read())

In [58]:
# Parse the single GenBank record fetched in the previous cell.
short_record = SeqIO.read(handle2, "genbank")
# The record is fully parsed; close the download handle, which the original
# cell left open.
handle2.close()
print(short_record.description)

Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/CA-LACPHL-AD00003/2020, complete genome


In [59]:
# Search the nucleotide database for accD genes in E. coli, capped at 20 IDs.
# retmax is an integer here, matching the earlier nucleotide search.
handle3 = Entrez.esearch(db='nucleotide', term='accD[Gene Name] AND "E. coli"[Organism]', retmax=20)
result_list = Entrez.read(handle3)
# The reply is fully parsed; close the handle rather than leaving it open.
handle3.close()

In [60]:
# Pull the matching IDs and the total hit count out of the search result.
id_list, count = result_list['IdList'], result_list['Count']
# Print the IDs, a "\n" spacer, then the count, each on its own line.
print(id_list, "\n", count, sep="\n")

['1949182897', '1949182621', '1949182501', '1949181018', '1949175854', '1949174750', '1949174165', '1949173738', '1949173077', '1949172476', '1949172241', '1949171947', '1949171725', '1949170850', '1949169771', '1949169741', '1949167201', '1949164081', '1949163981', '1949163801']


90795


In [61]:
# Close every Entrez handle opened in this section. The original cell closed
# only `handle`, leaving `handle2` and `handle3` open. Closing a handle that is
# already closed is harmless.
for open_handle in (handle, handle2, handle3):
    open_handle.close()

------------------------------------------------------

# 3. PDB

In [62]:
# import warnings
# warnings.filterwarnings('ignore')
# warnings.filterwarnings(action='once')

### Import Modules

In [63]:
from Bio.PDB import PDBParser,PDBList

In [64]:
# help(PDBList)

In [65]:
# Download the PDB-format file for entry 7BYR into the local "dir" folder.
pdbl = PDBList()
pdb_path = pdbl.retrieve_pdb_file("7BYR", file_format="pdb", pdir="dir")
# Display the value returned by the download call.
pdb_path

Downloading PDB structure '7BYR'...


'dir/pdb7byr.ent'

In [66]:
# Parse the downloaded file into a structure object labelled "7BYR".
# NOTE(review): the path is hardcoded to match what the download cell
# returned; it has to change if that cell's pdir or entry ID changes.
parser = PDBParser()
structure = parser.get_structure("7BYR","dir/pdb7byr.ent")



In [67]:
# Walk the chains of the structure's first model and print each chain's ID.
first_model = structure[0]
for chain in first_model:
    print(f"chainid: {chain.id}")

chainid: A
chainid: B
chainid: C
chainid: H
chainid: L
chainid: D
chainid: E
chainid: F
chainid: G
chainid: I
chainid: J


In [68]:
# Resolution recorded in the parsed PDB header.
# NOTE(review): presumably in ångströms — confirm against the PDB entry.
resolution = structure.header["resolution"]
resolution

3.84

In [69]:
# Keyword string recorded in the parsed PDB header.
keywords = structure.header["keywords"]
keywords

'sars-cov-2, antigen, rbd, neutralizing antibody, viral protein'

------------------------------------------------------

# 4. EXPASY

## 4.1. PROSITE

### Import Modules

In [70]:
from Bio import ExPASy
from Bio.ExPASy import Prosite

In [71]:
# help(Prosite)

In [72]:
# Download the raw PROSITE entry PS51442 and parse it into a record.
handle = ExPASy.get_prosite_raw('PS51442')
record = Prosite.read(handle)
# The entry is fully parsed; close the handle rather than leaving it open.
handle.close()

In [73]:
# Description line of the PROSITE entry parsed above.
print(record.description)

Coronavirus main protease (M-pro) domain profile.


In [74]:
# First ten PDB structure IDs listed on the PROSITE entry.
print(record.pdb_structs[:10])

['1LVO', '1P9S', '1P9U', '1Q2W', '1UJ1', '1UK2', '1UK3', '1UK4', '1WOF', '1Z1I']


In [75]:
# Download the raw PROSITE entry PS00001 and parse it into a record.
handle = ExPASy.get_prosite_raw('PS00001')
record = Prosite.read(handle)
# The entry is fully parsed; close the handle rather than leaving it open.
handle.close()
# Show the entry's pattern.
print(record.pattern)

N-{P}-[ST]-{P}.


## 4.2. ScanProsite

### Import Modules

In [76]:
from Bio.ExPASy import ScanProsite

In [77]:
# Reload the protein query so this section can run on its own; the FASTA file
# is expected to hold a single record.
prot_record = SeqIO.read("prot_seq.fasta", "fasta")
# Sequence length (a SeqRecord's length is the length of its .seq).
len(prot_record)

258

In [78]:
# Submit the protein sequence to the ScanProsite web service.
# The mirror has no trailing slash: scan() appends "/cgi-bin/..." to it, so a
# trailing slash would put a double slash in the request URL.
handle = ScanProsite.scan(seq=prot_record.seq, mirror="https://prosite.expasy.org")

In [79]:
# Parse the ScanProsite reply into a result object.
result = ScanProsite.read(handle)
# The reply is fully parsed; close the handle rather than leaving it open.
handle.close()

In [80]:
# Number of matches reported by the scan.
result.n_match

1

In [81]:
# Inspect the first reported match.
result[0]

{'sequence_ac': 'USERSEQ1',
 'start': 1,
 'stop': 118,
 'signature_ac': 'PS51921',
 'score': '32.871',
 'level': '0'}

------------------------------------------------------

# 5. KEGG

### Import Modules

In [82]:
from Bio.KEGG import REST, Enzyme

In [83]:
# help(Enzyme)

In [84]:
# Download the KEGG entry for enzyme EC 5.4.2.2 and save it to a local file.
request = REST.kegg_get("ec:5.4.2.2")
# A context manager guarantees the file is flushed and closed before the next
# cell reads it back; the original left the file object unclosed.
with open("ec_5.4.2.2.txt", 'w') as out_file:
    chars_written = out_file.write(request.read())
# The reply is fully read; close the download handle as well.
request.close()
# Display the number of characters written, as the original cell did.
chars_written

194908

In [85]:
# Parse the saved KEGG file and keep its first enzyme record.
# The file is opened in a context manager so it is closed after parsing; the
# original passed an anonymous open() that was never closed.
with open("ec_5.4.2.2.txt") as in_file:
    records = Enzyme.parse(in_file)
    # Consume the parser while the file is still open.
    record = list(records)[0]
record.classname

['Isomerases;',
 'Intramolecular transferases;',
 'Phosphotransferases (phosphomutases)']

In [86]:
# Pathway entries recorded for this enzyme.
record.pathway

[('PATH', 'ec00010', 'Glycolysis / Gluconeogenesis'),
 ('PATH', 'ec00030', 'Pentose phosphate pathway'),
 ('PATH', 'ec00052', 'Galactose metabolism'),
 ('PATH', 'ec00230', 'Purine metabolism'),
 ('PATH', 'ec00500', 'Starch and sucrose metabolism'),
 ('PATH', 'ec00520', 'Amino sugar and nucleotide sugar metabolism'),
 ('PATH', 'ec00521', 'Streptomycin biosynthesis'),
 ('PATH', 'ec01100', 'Metabolic pathways'),
 ('PATH', 'ec01110', 'Biosynthesis of secondary metabolites'),
 ('PATH', 'ec01120', 'Microbial metabolism in diverse environments')]

In [87]:
# First ten (organism code, list of gene IDs) pairs recorded for this enzyme.
record.genes[:10]

[('HSA', ['5236', '55276']),
 ('PTR', ['456908', '461162']),
 ('PPS', ['100977295', '100993927']),
 ('GGO', ['101128874', '101131551']),
 ('PON', ['100190836', '100438793']),
 ('NLE', ['100596081', '100600656']),
 ('MCC', ['100424648', '699401']),
 ('MCF', ['101925921', '102130622']),
 ('CSAB', ['103224690', '103246223']),
 ('RRO', ['104664350', '104668067'])]

In [88]:
# Collect the organism code (first element) of every (code, gene IDs) pair.
# The original wrapped each code in a list via x.split("\n") before appending
# and never used the gene IDs; a comprehension states the intent directly.
list_genes = [organism_code for organism_code, _gene_ids in record.genes]

print(list_genes[:10])

['HSA', 'PTR', 'PPS', 'GGO', 'PON', 'NLE', 'MCC', 'MCF', 'CSAB', 'RRO']


------------------------------------------------------