In [None]:
'''
Biopython 
* started in 1999-2000. 
* is an open source community project.
* has various modules.

https://biopython.org/DIST/docs/api/Bio-module.html

We will discuss how Bio can be applied to 
* work with sequences, 
* different data formats 
* and retrieving info from biological DBs.
'''

In [None]:
# ==================================== Sequences ================================================

In [1]:
from Bio.Seq import Seq
my_seq = Seq("AGTACACTGGT")

In [2]:
my_seq

Seq('AGTACACTGGT')

In [None]:
'''
We haven’t specified an alphabet so we end up with a default generic alphabet. 
Biopython doesn’t know if this is a nucleotide sequence 
or a protein rich in alanines (A), glycines (G), cysteines (C) and threonines (T). 
If you know, you should supply this information:
'''

In [4]:
# help(Seq)

In [5]:
print(my_seq.alphabet)

Alphabet()


In [6]:
from Bio.Alphabet import generic_dna, generic_protein

In [7]:
my_dna = Seq("AGTACACTGGT", alphabet=generic_dna)
print(my_dna)

AGTACACTGGT


In [8]:
my_dna.alphabet

DNAAlphabet()

In [9]:
my_protein = Seq("AGTACACTGGT", alphabet=generic_protein)
print(my_protein)

AGTACACTGGT


In [10]:
my_protein.alphabet

ProteinAlphabet()

In [11]:
# https://biopython.org/DIST/docs/api/Bio.Alphabet-module.html

import Bio.Alphabet

In [12]:
print(Bio.Alphabet.IUPAC.unambiguous_dna.letters)
print(Bio.Alphabet.IUPAC.ambiguous_dna.letters)

GATC
GATCRYWSMKHBVDN


In [13]:
print(Bio.Data.IUPACData.ambiguous_dna_values["A"])
print(Bio.Data.IUPACData.ambiguous_dna_values["R"])
print(Bio.Data.IUPACData.ambiguous_dna_values["Y"])
print(Bio.Data.IUPACData.ambiguous_dna_values["S"])

A
AG
CT
CG


In [14]:
print(len(Bio.Alphabet.ThreeLetterProtein.letters))
print(Bio.Alphabet.ThreeLetterProtein.letters)
# Selenocysteine (Sec), Pyrrolysine (Pyl) -- these are not normally found in proteins,
# Asx - Asparagine (Asn) or aspartic acid (Asp)
# Xaa - Unspecified or unknown amino acid

24
['Ala', 'Asx', 'Cys', 'Asp', 'Glu', 'Phe', 'Gly', 'His', 'Ile', 'Lys', 'Leu', 'Met', 'Asn', 'Pro', 'Gln', 'Arg', 'Ser', 'Thr', 'Sec', 'Val', 'Trp', 'Xaa', 'Tyr', 'Glx']


In [15]:
print(Bio.Alphabet.SecondaryStructure.letters) 
# H - helix
# S - strand 
# T - turn
# C - coil

HSTC


In [None]:
# ==================================== Back to Sequences ================================================

In [16]:
coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", alphabet=generic_dna)
print(coding_dna)

ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG


In [17]:
messenger_rna = coding_dna.transcribe()
print(messenger_rna)

AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG


In [18]:
messenger_rna.alphabet

RNAAlphabet()

In [19]:
cDNA = messenger_rna.back_transcribe()
print(cDNA)

ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG


In [20]:
cDNA.alphabet

DNAAlphabet()

In [21]:
protein1 = coding_dna.translate()
protein2 = messenger_rna.translate()

print(protein1)
print(protein2)

MAIVMGR*KGAR*
MAIVMGR*KGAR*


In [22]:
protein1.alphabet

HasStopCodon(ExtendedIUPACProtein(), '*')

In [23]:
protein2.alphabet

HasStopCodon(ExtendedIUPACProtein(), '*')

In [None]:
# Examples of errors
# 1:

In [24]:
Seq("ATGCCCAA").translate()



Seq('MP', ExtendedIUPACProtein())

In [None]:
# 2:

In [25]:
rna_seq = Seq("CCGGGUU", alphabet = Bio.Alphabet.IUPAC.unambiguous_rna)

In [26]:
rna_seq.transcribe()

ValueError: RNA cannot be transcribed!

In [None]:
# 3:

In [27]:
seq = Seq("CCGGGTT")
print(seq)
print(seq[:5]) # it behaves almost like a string 
print(seq[0])
seq[0]="T" # they are immutable

CCGGGTT
CCGGG
C


TypeError: 'Seq' object does not support item assignment

In [28]:
# Seq objects are not mutable but...

mut_seq = seq.tomutable()
mut_seq

MutableSeq('CCGGGTT')

In [29]:
mut_seq[0]="T"
print(mut_seq)
# append(), insert(), pop(), remove()... --> check all methods via `tab`

TCGGGTT


In [30]:
# There are also some methods specific for changing a DNA sequence:
mut_seq.reverse()
print("\nreversed: %s" % (mut_seq))

mut_seq.complement()
print("\ncomplement: %s" % (mut_seq))

mut_seq.reverse_complement()
print("\nreverse_complement: %s" % (mut_seq))

print("\nNote, Seq isn't mutable and thus method reverse() isn't applicable there:")
seq.reverse()


reversed: TTGGGCT

complement: AACCCGA

reverse_complement: TCGGGTT

Note, Seq isn't mutable and thus method reverse() isn't applicable there:


AttributeError: 'Seq' object has no attribute 'reverse'

In [None]:
# ==================================== Sequence Records ================================================

In [31]:
# SeqRecord is a Seq object with associated metadata:

from Bio.SeqRecord import SeqRecord

# let's define out Seq as SeqRecord
seq_rec = SeqRecord(seq, 
                    id="001", 
                    name="My Sequence", 
                    description="Gene ***",
                    dbxrefs=["Pfam:PF05077", "InterPro:IPR007769", "DIP:2186N"])
                    # dbxrefs A list of strings, each string is a database cross reference id.

In [33]:
# help(SeqRecord)

In [34]:
seq_rec

SeqRecord(seq=Seq('CCGGGTT'), id='001', name='My Sequence', description='Gene ***', dbxrefs=['Pfam:PF05077', 'InterPro:IPR007769', 'DIP:2186N'])

In [35]:
# changing features post-factum:
seq_rec.description = "gene of toxic membrane protein"

In [36]:
seq_rec

SeqRecord(seq=Seq('CCGGGTT'), id='001', name='My Sequence', description='gene of toxic membrane protein', dbxrefs=['Pfam:PF05077', 'InterPro:IPR007769', 'DIP:2186N'])

In [37]:
print(seq_rec.format("fasta"))

>001 gene of toxic membrane protein
CCGGGTT



In [41]:
# Bio.SeqIO is a common interface to input (read) and output (write) sequence file formats. 

# Sequences retrieved with this interface are passed to your program as SeqRecord objects.

from Bio import SeqIO

for seq_record in SeqIO.parse("/home/octopus/Documents/2scripts/ScientificPython/1_Basics/Lecture5_Bio/data/ls_orchid.fasta", 
                              "fasta"):
#     print(seq_record)
    print("------------------")
    print(seq_record.id)
#     print(seq_record.seq)
    print(repr(seq_record.seq))
    print(len(seq_record.seq))
    print("=============================")

------------------
gi|2765658|emb|Z78533.1|CIZ78533
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC', SingleLetterAlphabet())
740
------------------
gi|2765657|emb|Z78532.1|CCZ78532
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAACAG...GGC', SingleLetterAlphabet())
753
------------------
gi|2765656|emb|Z78531.1|CFZ78531
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGCAG...TAA', SingleLetterAlphabet())
748
------------------
gi|2765655|emb|Z78530.1|CMZ78530
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAAACAACAT...CAT', SingleLetterAlphabet())
744
------------------
gi|2765654|emb|Z78529.1|CLZ78529
Seq('ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAA...AAA', SingleLetterAlphabet())
733
------------------
gi|2765652|emb|Z78527.1|CYZ78527
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...CCC', SingleLetterAlphabet())
718
------------------
gi|2765651|emb|Z78526.1|CGZ78526
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG.

In [43]:
# GenBank file

for seq_record in SeqIO.parse("/home/octopus/Documents/2scripts/ScientificPython/1_Basics/Lecture5_Bio/data/ls_orchid.gbk", 
                              "genbank"):
#     print(seq_record)
    print("------------------")
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record.seq))
    print("=============================")

------------------
Z78533.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC', IUPACAmbiguousDNA())
740
------------------
Z78532.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAACAG...GGC', IUPACAmbiguousDNA())
753
------------------
Z78531.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGCAG...TAA', IUPACAmbiguousDNA())
748
------------------
Z78530.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAAACAACAT...CAT', IUPACAmbiguousDNA())
744
------------------
Z78529.1
Seq('ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAA...AAA', IUPACAmbiguousDNA())
733
------------------
Z78527.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...CCC', IUPACAmbiguousDNA())
718
------------------
Z78526.1
Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGTAG...TGT', IUPACAmbiguousDNA())
730
------------------
Z78525.1
Seq('TGTTGAGATAGCAGAATATACATCGAGTGAATCCGGAGGACCTGTGGTTATTCG...GCA', IUPACAmbiguousDNA())
704
------------------
Z78524.1
Seq(

In [44]:
record = SeqIO.parse("/home/octopus/Documents/2scripts/ScientificPython/1_Basics/Lecture5_Bio/data/ls_orchid.gbk",
                     "genbank")

In [45]:
record

<generator object InsdcScanner.parse_records at 0x7f6fa86ee4d0>

In [46]:
record_dict = SeqIO.to_dict(record)

In [47]:
record_dict

{'Z78533.1': SeqRecord(seq=Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC', IUPACAmbiguousDNA()), id='Z78533.1', name='Z78533', description='C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA', dbxrefs=[]),
 'Z78532.1': SeqRecord(seq=Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAACAG...GGC', IUPACAmbiguousDNA()), id='Z78532.1', name='Z78532', description='C.californicum 5.8S rRNA gene and ITS1 and ITS2 DNA', dbxrefs=[]),
 'Z78531.1': SeqRecord(seq=Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAGACAGCAG...TAA', IUPACAmbiguousDNA()), id='Z78531.1', name='Z78531', description='C.fasciculatum 5.8S rRNA gene and ITS1 and ITS2 DNA', dbxrefs=[]),
 'Z78530.1': SeqRecord(seq=Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGTTGAAACAACAT...CAT', IUPACAmbiguousDNA()), id='Z78530.1', name='Z78530', description='C.margaritaceum 5.8S rRNA gene and ITS1 and ITS2 DNA', dbxrefs=[]),
 'Z78529.1': SeqRecord(seq=Seq('ACGGCGAGCTGCCGAAGGACATTGTTGAGACAGCAGAATATACGATTGAGTGAA...AAA',

In [48]:
record_dict["Z78533.1"]

SeqRecord(seq=Seq('CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGG...CGC', IUPACAmbiguousDNA()), id='Z78533.1', name='Z78533', description='C.irapeanum 5.8S rRNA gene and ITS1 and ITS2 DNA', dbxrefs=[])

In [None]:
# Let's create our own fasta file from scratch:

In [49]:
Rec1 = SeqRecord(Seq("ACCA"), id="1", description="")
Rec2 = SeqRecord(Seq("CDRFAA"), id="2", description="")
Rec3 = SeqRecord(Seq("GRKLM"), id="3", description="")
My_records = [Rec1, Rec2, Rec3]
handle = open("/home/octopus/Documents/2scripts/ScientificPython/1_Basics/Lecture5_Bio/data/MySeqs.fasta","w")
SeqIO.write(My_records, handle, "fasta")
handle.close()

In [None]:
# Converting between sequence file formats has never been easier than it's now:
# You can do file conversion by combining Bio.SeqIO.parse() and Bio.SeqIO.write()

In [50]:
In_handle = open("/home/octopus/Documents/2scripts/ScientificPython/1_Basics/Lecture5_Bio/data/ls_orchid.gbk", "r")
Out_handle = open("/home/octopus/Documents/2scripts/ScientificPython/1_Basics/Lecture5_Bio/data/manuallyCreated_ls_orchid.fasta", "w")

records = SeqIO.parse(In_handle, "genbank")

SeqIO.write(records, Out_handle, "fasta")

In_handle.close()
Out_handle.close()

In [51]:
from Bio.Align import Applications

In [53]:
Applications.

In [54]:
from Bio import pairwise2
# https://biopython.org/docs/1.75/api/Bio.pairwise2.html
alignments = pairwise2.align.globalxx("ACCGT", "ACG")
alignments

[('ACCGT', 'A-CG-', 3.0, 0, 5), ('ACCGT', 'AC-G-', 3.0, 0, 5)]

In [55]:
from Bio.pairwise2 import format_alignment
print(format_alignment(*alignments[0]))

ACCGT
| || 
A-CG-
  Score=3



In [56]:
from Bio import AlignIO

align = AlignIO.read("/home/octopus/Documents/2scripts/ScientificPython/1_Basics/Lecture5_Bio/data/opuntia.aln", "clustal")
print(align)

SingleLetterAlphabet() alignment with 7 rows and 906 columns
TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191
TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191
TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191
TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191
TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191
TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191
TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191


In [57]:
from Bio import Phylo

tree = Phylo.read("/home/octopus/Documents/2scripts/ScientificPython/1_Basics/Lecture5_Bio/data/opuntia.dnd", "newick")
Phylo.draw_ascii(tree)

                             _______________ gi|6273291|gb|AF191665.1|AF191665
  __________________________|
 |                          |   ______ gi|6273290|gb|AF191664.1|AF191664
 |                          |__|
 |                             |_____ gi|6273289|gb|AF191663.1|AF191663
 |
_|_________________ gi|6273287|gb|AF191661.1|AF191661
 |
 |__________ gi|6273286|gb|AF191660.1|AF191660
 |
 |    __ gi|6273285|gb|AF191659.1|AF191659
 |___|
     | gi|6273284|gb|AF191658.1|AF191658



In [58]:
from Bio import motifs 
from Bio.Seq import Seq 
DNA_motif = [ Seq("AGCTAAAAGTGAGA"), 
              Seq("TCGAAATATGGAGA"), 
              Seq("AACTAATATAGGCC"),
              Seq("AGCTAATATTGGGA")
            ] 
seq = motifs.create(DNA_motif) 
print(seq)
print("\nconsensus: %s\n" % (seq.consensus)) 
print(seq.counts) 

AGCTAAAAGTGAGA
TCGAAATATGGAGA
AACTAATATAGGCC
AGCTAATATTGGGA


consensus: AGCTAATATTGAGA

        0      1      2      3      4      5      6      7      8      9     10     11     12     13
A:   3.00   1.00   0.00   1.00   4.00   4.00   1.00   4.00   0.00   1.00   0.00   2.00   0.00   3.00
C:   0.00   1.00   3.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00   0.00   1.00   1.00
G:   0.00   2.00   1.00   0.00   0.00   0.00   0.00   0.00   1.00   1.00   4.00   2.00   3.00   0.00
T:   1.00   0.00   0.00   3.00   0.00   0.00   3.00   0.00   3.00   2.00   0.00   0.00   0.00   0.00



In [59]:
seq.weblogo("seq.png")

In [60]:
pwd()

'/home/octopus/Documents/2scripts/ScientificPython/1_Basics/Lecture5_Bio'

In [None]:
# ==================================== BioData and DataBases ================================================

In [61]:
from Bio.Data import CodonTable

print(CodonTable.generic_by_id[5])

Table 5 Invertebrate Mitochondrial, SGC4

  |  U      |  C      |  A      |  G      |
--+---------+---------+---------+---------+--
U | UUU F   | UCU S   | UAU Y   | UGU C   | U
U | UUC F   | UCC S   | UAC Y   | UGC C   | C
U | UUA L   | UCA S   | UAA Stop| UGA W   | A
U | UUG L(s)| UCG S   | UAG Stop| UGG W   | G
--+---------+---------+---------+---------+--
C | CUU L   | CCU P   | CAU H   | CGU R   | U
C | CUC L   | CCC P   | CAC H   | CGC R   | C
C | CUA L   | CCA P   | CAA Q   | CGA R   | A
C | CUG L   | CCG P   | CAG Q   | CGG R   | G
--+---------+---------+---------+---------+--
A | AUU I(s)| ACU T   | AAU N   | AGU S   | U
A | AUC I(s)| ACC T   | AAC N   | AGC S   | C
A | AUA M(s)| ACA T   | AAA K   | AGA S   | A
A | AUG M(s)| ACG T   | AAG K   | AGG S   | G
--+---------+---------+---------+---------+--
G | GUU V   | GCU A   | GAU D   | GGU G   | U
G | GUC V   | GCC A   | GAC D   | GGC G   | C
G | GUA V   | GCA A   | GAA E   | GGA G   | A
G | GUG V(s)| GCG A   | GAG E   | GGG G 

In [62]:
print(CodonTable.generic_by_id[5].stop_codons)

['TAA', 'UAA', 'TAG', 'UAG']


In [None]:
# Each amino acid (represented by a single letter) has an individual weight. 

# Each amino acid bond release a water molecule (with a weight of 18 units).

from Bio.Data.IUPACData import protein_weights as protweight

protweight.get("A") # Alanine

In [None]:
protseq = "MFPTWYV"

totalW = 0
for aa in protseq:
    totalW += protweight.get(aa)

totalW -= 18 * (len(protseq) - 1)

print(totalW)

In [63]:
# This function implements the “nearest neighbor method”
# and can be use for both DNA and RNA sequences in primer design task:

from Bio.SeqUtils import MeltingTemp

mt = MeltingTemp.Tm_staluc("TCTTGGCGGAGACA")
print("%.2f" % mt)

43.74


In [None]:
'''
Entrez is a search engine, 
it integrates several health sciences databases at NCBI website.
'''

In [65]:
from Bio import Entrez
my_em = "anastasiia.gainullina@gmail.com"
db = "pubmed"

# Search in the Entrez databases by keyword(s):
h_search = Entrez.esearch(db=db, 
                          email=my_em, 
                          term="itaconate") # term="Homo sapiens AND mRNA AND MapK"
print("\nESearch:")
print(h_search)

record = Entrez.read(h_search)
print("\nRecord:")
print(record)

# Get the list of Ids returned by previous search
print("\nRecord_IdList:")
res_ids = record["IdList"]
print(res_ids) # https://www.ncbi.nlm.nih.gov/pubmed/32178247


ESearch:
<_io.TextIOWrapper encoding='utf-8'>

Record:
{'Count': '866', 'RetMax': '20', 'RetStart': '0', 'IdList': ['33899904', '33899162', '33862463', '33859194', '33841646', '33840027', '33795019', '33780051', '33775760', '33762950', '33749263', '33728906', '33719217', '33713917', '33708036', '33691120', '33691097', '33679790', '33672256', '33670656'], 'TranslationSet': [{'From': 'itaconate', 'To': '"itaconic acid"[Supplementary Concept] OR "itaconic acid"[All Fields] OR "itaconate"[All Fields]'}], 'TranslationStack': [{'Term': '"itaconic acid"[Supplementary Concept]', 'Field': 'Supplementary Concept', 'Count': '353', 'Explode': 'N'}, {'Term': '"itaconic acid"[All Fields]', 'Field': 'All Fields', 'Count': '681', 'Explode': 'N'}, 'OR', {'Term': '"itaconate"[All Fields]', 'Field': 'All Fields', 'Count': '324', 'Explode': 'N'}, 'OR', 'GROUP'], 'QueryTranslation': '"itaconic acid"[Supplementary Concept] OR "itaconic acid"[All Fields] OR "itaconate"[All Fields]'}

Record_IdList:
['338999

In [66]:
# For each id in the list
for r_id in res_ids[0:2]:
    
    # Get summary information for each id
    h_summ = Entrez.esummary(db=db, 
                             email=my_em,
                             id=r_id)
    
    # Parse the result with Entrez.read()
    summ = Entrez.read(h_summ)
    # print(summ[0])
    # print("\n")
    print(summ[0]["Title"])
    print("\n")
    print(summ[0]["DOI"])
    print("==============================================")

Matsuda-Heck arylation of itaconates: a versatile approach to heterocycles from a renewable resource.


10.1039/d1ob00392e
Optimization, in vitro release and toxicity evaluation of novel pH sensitive itaconic acid-g-poly(acrylamide)/sterculia gum semi-interpenetrating networks.


10.1007/s40199-021-00395-8


In [None]:
'''
PDB files store information regarding three dimensional structures of molecules held at the Protein Data Bank.
(crystallography, NMR, EM, ...)

To analyze protein structure data, your need to be able to parse the data from PDB files. 
This is the role of the Bio.PDB module.

To effectively use the Bio.PDB module, you have first to understand the PDB file structure.
'''

In [67]:
from Bio.PDB.PDBParser import PDBParser

pdbfn = "/home/octopus/Documents/2scripts/ScientificPython/1_Basics/Lecture5_Bio/data/6w4b.pdb"
parser = PDBParser(PERMISSIVE=1)
structure = parser.get_structure(id = "some_structure_id", file = pdbfn)
print(structure)
type(structure)

<Structure id=some_structure_id>


Bio.PDB.Structure.Structure

In [None]:
# help(PDBParser)

In [68]:
print(structure.header.keys())

dict_keys(['name', 'head', 'idcode', 'deposition_date', 'release_date', 'structure_method', 'resolution', 'structure_reference', 'journal_reference', 'author', 'compound', 'source', 'has_missing_residues', 'missing_residues', 'keywords', 'journal'])


In [69]:
print(structure.header["resolution"])

2.95


In [70]:
print(structure.header["structure_method"])

x-ray diffraction


In [71]:
for i in structure.get_chains():
    print(i)

<Chain id=A>
<Chain id=B>


In [73]:
model = structure.get_models()
# In a structure derived from an X-ray crystallography experiment, 
# only a single model will be present (with some exceptions). 
# NMR structures normally contain many different models.

In [74]:
models = list(model)
print(models)

[<Model id=0>]


In [75]:
list(models[0].get_chains())

[<Chain id=A>, <Chain id=B>]

In [76]:
chains = list(models[0].get_chains()) 

In [77]:
residue = list(chains[0].get_residues())
print(residue[1:5])
print(len(residue))

[<Residue PRO het=  resseq=7 icode= >, <Residue VAL het=  resseq=8 icode= >, <Residue ALA het=  resseq=9 icode= >, <Residue LEU het=  resseq=10 icode= >]
109


In [78]:
list(residue[0].get_atoms()) 

[<Atom N>, <Atom CA>, <Atom C>, <Atom O>, <Atom CB>, <Atom OG>]

In [79]:
atoms = list(residue[0].get_atoms())
atoms[0].get_vector() 

<Vector 36.76, -1.24, 13.21>

In [None]:
'''
KEGG is a collection of databases dealing with genomes, 
biological pathways, diseases, drugs, and chemical substances.
'''

In [80]:
from Bio.KEGG import REST
# This module aims to make the KEGG online REST-style API easier to use.
# See: https://www.kegg.jp/kegg/rest/keggapi.html
human_pathways = REST.kegg_list("pathway", "hsa").read()

In [84]:
human_pathways[1:500]

'ath:hsa00010\tGlycolysis / Gluconeogenesis - Homo sapiens (human)\npath:hsa00020\tCitrate cycle (TCA cycle) - Homo sapiens (human)\npath:hsa00030\tPentose phosphate pathway - Homo sapiens (human)\npath:hsa00040\tPentose and glucuronate interconversions - Homo sapiens (human)\npath:hsa00051\tFructose and mannose metabolism - Homo sapiens (human)\npath:hsa00052\tGalactose metabolism - Homo sapiens (human)\npath:hsa00053\tAscorbate and aldarate metabolism - Homo sapiens (human)\npath:hsa00061\tFatty acid biosynt'

In [85]:
repair_pathways = []
for line in human_pathways.split("\n"):
    entry, description = line.split("\t")
    print(entry)
    print(description)
    if "repair" in description:
        repair_pathways.append(entry)
        # Get the genes for pathways and add them to a list

path:hsa00010
Glycolysis / Gluconeogenesis - Homo sapiens (human)
path:hsa00020
Citrate cycle (TCA cycle) - Homo sapiens (human)
path:hsa00030
Pentose phosphate pathway - Homo sapiens (human)
path:hsa00040
Pentose and glucuronate interconversions - Homo sapiens (human)
path:hsa00051
Fructose and mannose metabolism - Homo sapiens (human)
path:hsa00052
Galactose metabolism - Homo sapiens (human)
path:hsa00053
Ascorbate and aldarate metabolism - Homo sapiens (human)
path:hsa00061
Fatty acid biosynthesis - Homo sapiens (human)
path:hsa00062
Fatty acid elongation - Homo sapiens (human)
path:hsa00071
Fatty acid degradation - Homo sapiens (human)
path:hsa00072
Synthesis and degradation of ketone bodies - Homo sapiens (human)
path:hsa00100
Steroid biosynthesis - Homo sapiens (human)
path:hsa00120
Primary bile acid biosynthesis - Homo sapiens (human)
path:hsa00130
Ubiquinone and other terpenoid-quinone biosynthesis - Homo sapiens (human)
path:hsa00140
Steroid hormone biosynthesis - Homo sapiens

ValueError: not enough values to unpack (expected 2, got 1)

In [86]:
repair_pathways

['path:hsa03410', 'path:hsa03420', 'path:hsa03430']

In [87]:
pathway = repair_pathways[0]
REST.kegg_get(pathway).read()
# https://www.genome.jp/dbget-bin/www_bget?hsa03410

'ENTRY       hsa03410                    Pathway\nNAME        Base excision repair - Homo sapiens (human)\nDESCRIPTION Base excision repair (BER) is the predominant DNA damage repair pathway for the processing of small base lesions, derived from oxidation and alkylation damages. BER is normally defined as DNA repair initiated by lesion-specific DNA glycosylases and completed by either of the two sub-pathways: short-patch BER where only one nucleotide is replaced and long-patch BER where 2-13 nucleotides are replaced. Each sub-pathway of BER relies on the formation of protein complexes that assemble at the site of the DNA lesion and facilitate repair in a coordinated fashion. This process of complex formation appears to provide an increase in specificity and efficiency to the BER pathway, thereby facilitating the maintenance of genome integrity by preventing the accumulation of highly toxic repair intermediates.\nCLASS       Genetic Information Processing; Replication and repair\nPATHWA

In [89]:
repair_genes = []

for pathway in repair_pathways:
    
    pathway_file = REST.kegg_get(pathway).read() # query and read each pathway
#     print(pathway_file)
    
    for line in pathway_file.split("\n"):
        section = line[:12].strip() # section names are within 12 columns (know by specification or empirically)
        # print(section)
        
        if section == "GENE":

            gene_identifiers, gene_description = line[12:].split("; ")
            gene_id, gene_symbol = gene_identifiers.split()

            if not gene_symbol in repair_genes:
                repair_genes.append(gene_symbol)

In [90]:
print("There are %d repair pathways and %d repair genes. The genes are:" 
      % (len(repair_pathways), len(repair_genes)))
print(", ".join(repair_genes))

There are 3 repair pathways and 3 repair genes. The genes are:
OGG1, RBX1, SSBP1
