Date Created: 01/26/21
## Goal of Notebook: Retrieve Substrate and Primer Sequences for BRED

In [22]:
# import libraries
import requests 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [23]:
### USER CONFIG VARS
target_phage = "D29"
target_gp = "25"

In [24]:
# retrieve all phages names from phagesDB
query_url = "https://phagesdb.org/api/phages/{}/".format(target_phage)
out = requests.get(url = query_url).json()#["results"]

genome_length = out['genome_length']
fasta_file = out["fasta_file"]

In [25]:
# Retrieve gene info
query_url = "https://phagesdb.org/api/genesbyphage/{}/".format(target_phage)
out = requests.get(url = query_url).json()["results"]
genes_df = pd.DataFrame(out)
genes_df.head()

Unnamed: 0,GeneID,PhageID,phams,Start,Stop,Length,Name,translation,Orientation,Notes
0,D29_CDS_1,"{'PhageID': 'D29', 'Accession': 'AF022214', 'N...",[48212],400,1213,813,1,MYGTRSSAYWSTQPGKFDVLNLRMTFPSSSAYEIPDLRPTTYVPAN...,F,b''
1,D29_CDS_2,"{'PhageID': 'D29', 'Accession': 'AF022214', 'N...",[14135],1326,2106,780,2,MDPAQKRAAFNALAVAARDQHSPSDAKRIAKRDQMLGYVRGPWEQL...,F,b''
2,D29_CDS_3,"{'PhageID': 'D29', 'Accession': 'AF022214', 'N...",[9720],2105,2357,252,3,MALMQATHTIEGFLAVETHPRAFVAENGHVITRLSATKWGGWEGLE...,F,b''
3,D29_CDS_4,"{'PhageID': 'D29', 'Accession': 'AF022214', 'N...",[52081],2353,2650,297,4,MSWAGSKRRQELPEDWELNYRLPVLSAAGWLCEVDGPGCVRAATDV...,F,b''
4,D29_CDS_5,"{'PhageID': 'D29', 'Accession': 'AF022214', 'N...",[7918],2685,3153,468,5,MGTRGPIGKRDEERVRRNTPENPTETISMIGTVEIPELGDMSYMGE...,F,b''


In [26]:
# extract info from gene
target_gene_df = genes_df[genes_df["Name"] == str(target_gp)]
start_bp = int(target_gene_df["Start"].iloc[0])
stop_bp = int(target_gene_df["Stop"].iloc[0])
function = target_gene_df["Notes"].iloc[0]
orientation = target_gene_df["Orientation"].iloc[0]
translation = target_gene_df["translation"].iloc[0]
# genome_length = phages_df[phages_df["phage_name"]==target_phage]['genome_length'].iloc[0]


print(function, orientation, start_bp, stop_bp)

b'' F 16221 16527


## Substrate Sequence Using Interval Location

In [27]:
# https://phagesdb.org/media/fastas/{}.fasta

## Substrate Sequence Using GP 

In [28]:
# download the fasta file
url = "https://phagesdb.org/media/fastas/{}.fasta".format(target_phage)
r = requests.get(url, allow_redirects=True)
open('fasta_files/{}'.format(target_phage), 'wb').write(r.content) # delete the first line

49818

In [29]:
f = open('fasta_files/{}'.format(target_phage), "r")
DNA = ''.join(f.read().split("\n")[1:])

# if orientation == "R":
gene = DNA[start_bp-1:stop_bp-1] # -1 to account for zero indexing, -4 to account for start site
# else:
#     gene = DNA[start_bp-1:stop_bp-1] # -1 to account for zero indexing, -4 to account for start site
# gene

In [30]:
gene

'GGTGATTCACCTCCCGAAGACAGGCGCGTTCTACGCGGAGCGTCGAGGTGGTCAGCAGTACCGAGGTTGGGATGAGGACCGGTACGCGCTCGCGGACATCTACGACGCAGTCCAGGCGGGCAACCACATCCTCCTGATGGCCAACCGGGATCCGAAGAAGCCAAAGCCGAAGGCACCCAAGGCATACCCGCGTCCCGACGACTTTGAGAAGACAACGCCGAAGCCAGGTTCGTTCGCCGCGATGGTAGTGGCCGCGAAGAAGGCTGCGCGAGAGAAGAGGGAAAGGGAGGAGGCGAATGCCGAATA'

In [45]:
table = { 
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', 
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', 
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', 
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',                  
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', 
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R', 
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',  # GTG in bacteria codes for MET
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A', 
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', 
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L', 
        'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_', 
        'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W', 
    }

comp_table = {"A":"T", "T":"A", "C":"G", "G":"C"}

def translate(seq):
    protein = "" 
    num_codons = len(seq)-len(seq)%3
    for i in range(0, num_codons, 3): 
        codon = seq[i:i + 3] 
        #print(codon, table[codon])
        protein += table[codon] 
            
    return protein

def complementation(seq):
    comp_seq = ""
    for bp in seq:
        comp_seq += comp_table[bp]
    return comp_seq
    
def reverse(seq):
    return seq[::-1]

In [46]:
translation

'MIHLPKTGAFYAERRGGQQYRGWDEDRYALADIYDAVQAGNHILLMANRDPKKPKPKAPKAYPRPDDFEKTTPKPGSFAAMVVAAKKAAREKREREEANAE'

In [47]:
'MIHLPKTGAFYAERRGGQQYRGWDEDRYALADIYDAVQAGNHILLMANRDPKKPKPKAPKAYPRPDDFEKTTPKPGSFAAMVMAAKKAAREKREREEANAE'

'MIHLPKTGAFYAERRGGQQYRGWDEDRYALADIYDAVQAGNHILLMANRDPKKPKPKAPKAYPRPDDFEKTTPKPGSFAAMVMAAKKAAREKREREEANAE'

In [48]:
'MIHLPKTGAFYAERRGGQQYRGWDEDRYALADIYDAVQAGNHILLMANRDPKKPKPKAPKAYPRPDDFEKTTPKPGSFAAMVMAAKKAAREKREREEANAE'==translation

False

In [49]:
t = [i for i in 'MIHLPKTGAFYAERRGGQQYRGWDEDRYALADIYDAVQAGNHILLMANRDPKKPKPKAPKAYPRPDDFEKTTPKPGSFAAMVMAAKKAAREKREREEANAE']
arr = [i for i in translation]
for i in range(len(arr)):
    if t[i]!=arr[i]:
        print(i, t[i], arr[i])

82 M V


In [44]:
len(t)

101

In [41]:
t[2]

IndexError: list index out of range

Transcribe the gene

In [53]:
# CHECK IF Nucleotide sequence in any of the 6 reading frames lines up with protien
print("No Alterations")
for reading_frame in range(3): # iterate through forward reading frames
    protien = translate(gene[reading_frame:])
    if translation[1:] == protien[1:]:
        print(protien)
        print(translation)
        
print("\n")
print("Reverse Complementation")
for reading_frame in range(3): # iterate through reverse reading frames
    protien = translate(complementation(reverse(gene))[reading_frame:])
    if translation[1:] == protien[1:]:
        print(protien)
        print(translation)


No Alterations
VIHLPKTGAFYAERRGGQQYRGWDEDRYALADIYDAVQAGNHILLMANRDPKKPKPKAPKAYPRPDDFEKTTPKPGSFAAMVVAAKKAAREKREREEANAE
MIHLPKTGAFYAERRGGQQYRGWDEDRYALADIYDAVQAGNHILLMANRDPKKPKPKAPKAYPRPDDFEKTTPKPGSFAAMVVAAKKAAREKREREEANAE


Reverse Complementation


In [57]:
substrate = DNA[start_bp-1-100:stop_bp-1+100]
primer_F = DNA[start_bp-1-100-20:start_bp-1-100]
primer_R = reverse(complementation(DNA[stop_bp-1+100:stop_bp-1+100+20]))
                   
print("Substrate w/ homologous ends:", substrate, "\n")
print("Forward Primer:", primer_F, "\n")
print("Reverse Primer:", primer_R, "\n")

# biopython Melting tempatures, self/homo diamorization
# Unique binding site
# Primer3 https://pypi.org/project/primer3-py/
#.        https://www.biostars.org/p/99865/ https://libnano.github.io/primer3-py/quickstart.html
# attttt-gggg-ctctctc
# attt-...-ttctc-...-tctc 20 bp

Substrate w/ homologous ends: GCGGGGCGATTCTCTCCGACTTGTCCGAGTACCACGGGGTCGACCTGCGCGATCTGTTCAGAGATGAAGATCCGCTGTCCCCCAGGTACGTCCTGAATCTGGTGATTCACCTCCCGAAGACAGGCGCGTTCTACGCGGAGCGTCGAGGTGGTCAGCAGTACCGAGGTTGGGATGAGGACCGGTACGCGCTCGCGGACATCTACGACGCAGTCCAGGCGGGCAACCACATCCTCCTGATGGCCAACCGGGATCCGAAGAAGCCAAAGCCGAAGGCACCCAAGGCATACCCGCGTCCCGACGACTTTGAGAAGACAACGCCGAAGCCAGGTTCGTTCGCCGCGATGGTAGTGGCCGCGAAGAAGGCTGCGCGAGAGAAGAGGGAAAGGGAGGAGGCGAATGCCGAATAGTGCTGGCGTAGAGGTCGCACGGATCTCCGTCAAGGTCAGCCCCAACACCAAAGAGTTCCGTCGTGAGCTGAAGACCGATCTGGAGAAGATCGAGCGGGA 

Forward Primer: CGAGCTGATCGACAAGTTCG 

Reverse Primer: CCGGAACGTCGGCCGACAGC 



Transcribe the entire DNA sequence

Double check FASTA belongs to D29

In [92]:
len(DNA) == genome_length

True

In [93]:
for i in np.unique(list(translation)):
    if i not in table.values():
        print(i)

In [94]:
translation

'MQASYVSPIDGQRYFGPRNYDNRMDAEAWLASEKRLIDNEEWTPPAEREKKAAASAITVEEYTKKWIAERDLAGGTKDLYSTHARKRIYPVLGDTPVAEMTPALVRAWWAGMGKQYPTARRHAYNVLRAVMNTAVEDKLVSENPCRIEQKAPAERDVEALTPEELDVVAGEVFEHYRVAVYILAWTSLRFGELIEIRRKDIVDDGETMKLRVRRGAARVGEKIVVGNTKTVRSKRPVTVPPHVAAMIREHMADRTKMNKGPEALLVTTTRGQRLSKSAFTRSLKKGYAKIGRPDLRIHDLRAVGATLAAQAGATTKELMVRLGHTTPRMAMKYQMASAARDEEIARRMSELAGITP'