In [1]:
from primers import primers
import primer3 as pr
import melting as melt
from Bio import SeqIO
from Bio.SeqUtils import GC
from Bio.SeqUtils import MeltingTemp
import numpy as np
from tqdm import tqdm
from itertools import product
import pandas as pd

In [46]:
# Design assembly primers for every construct in `files_bulk` and name them.
#
# Each inner list of `files_bulk` is one circular assembly: fragment i receives, as 5' additions,
# the last `overlap_len` bases of the previous fragment (forward primer) and the reverse complement
# of the first `overlap_len` bases of the next fragment (reverse primer); the last fragment wraps
# around to the first.
folder = 'datasets/genbank'
files_bulk = [
    ['cymRC-pCym.gb', 'e11-N-term.gb', 'araC-pBAD.gb', 'e11-C-term-backbone.gb'],
    ['cymRC-pCym.gb', 'e16-N-term.gb', 'araC-pBAD.gb', 'e16-C-term-backbone.gb'],
    ['cymRC-pCym.gb', 'e32-N-term.gb', 'araC-pBAD.gb', 'e32-C-term-backbone.gb'],
    ['cymRC-pCym.gb', 'e41-N-term.gb', 'araC-pBAD.gb', 'e41-C-term-backbone.gb'],
    ['cymRC-pCym.gb', 'e42-N-term.gb', 'araC-pBAD.gb', 'e42-C-term-backbone.gb'],
    ['pLuxB.gb', 'e11-N-term.gb', 'pSalTTC.gb', 'e11-C-term-backbone.gb'],
    ['pLuxB.gb', 'e16-N-term.gb', 'pSalTTC.gb', 'e16-C-term-backbone.gb'],
    ['pLuxB.gb', 'e32-N-term.gb', 'pSalTTC.gb', 'e32-C-term-backbone.gb'],
    ['pLuxB.gb', 'e41-N-term.gb', 'pSalTTC.gb', 'e41-C-term-backbone.gb'],
    ['pLuxB.gb', 'e42-N-term.gb', 'pSalTTC.gb', 'e42-C-term-backbone.gb']
]
last_num = 167      # primer numbering continues at last_num + 1
overlap_len = 16    # bases borrowed from each neighbouring fragment

# Collect one small frame per fragment and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on every call.
primer_frames = []

for files in tqdm(files_bulk):

    seqs = []
    overhangs = []
    for filename in files:
        with open('{}/{}'.format(folder, filename)) as input_handle:
            for record in SeqIO.parse(input_handle, 'genbank'):
                seq = record.seq
                seqs.append(seq)
                # (first overlap_len bases, last overlap_len bases) of this fragment
                overhangs.append((seq[:overlap_len], seq[-overlap_len:]))

    # fragments[i] is paired with files[i] below, which only holds for one record per file.
    if len(seqs) != len(files):
        raise ValueError(
            'expected exactly one GenBank record per file, got {} records for {} files'.format(
                len(seqs), len(files)))

    # (forward 5' addition, fragment sequence, reverse 5' addition); index -1 and the modulo
    # close the circle between the last and the first fragment.
    fragments = [(overhangs[i-1][1], seqs[i], overhangs[(i+1)%len(seqs)][0].reverse_complement()) for i in range(len(seqs))]

    for i, filename in enumerate(files):
        fwd, rev = primers(str(fragments[i][1]), add_fwd=str(fragments[i][0]), add_rev=str(fragments[i][2]))
        temp = pd.DataFrame([fwd, rev])
        temp['fragment'] = filename
        primer_frames.append(temp)

# Same row order and per-fragment (0, 1) index labels as repeated DataFrame.append produced.
df = pd.concat(primer_frames)

df['type'] = df['fwd'].apply(lambda x: 'F' if x else 'R')
# Fragments shared between constructs yield identical primers; keep the first occurrence only.
df = df.drop_duplicates(subset=['seq'])
# The second reset_index exposes the fresh 0..n-1 positions as an 'index' column for numbering.
df = df.reset_index(drop=True).reset_index()
df['primer-idx'] = df['index'].apply(lambda x: f'{(x+last_num+1):03d}')
df['primer-name'] = '(P' + df['primer-idx'] + '-RM)_' + df['fragment'].str.split('.', expand=True)[0] + '.' + df['type']
df[['fragment', 'primer-name', 'seq']]

100%|██████████| 10/10 [00:14<00:00,  1.46s/it]


Unnamed: 0,fragment,primer-name,seq
0,cymRC-pCym.gb,(P168-RM)_cymRC-pCym.F,CGTTTATATACTAGAGTCAACGTTTGAATTTTGCATAAC
1,cymRC-pCym.gb,(P169-RM)_cymRC-pCym.R,CTCTAGTAATGGAGAAATAATACAAACAGACCAGATTGTCT
2,e11-N-term.gb,(P170-RM)_e11-N-term.F,GGTCTGTTTGTATTATTTCTCCATTACTAGAGTCACACA
3,e11-N-term.gb,(P171-RM)_e11-N-term.R,CGTCAAGTTGTCATAATAAGGATCCTAATTGGTAACGAATC
4,araC-pBAD.gb,(P172-RM)_araC-pBAD.F,CCAATTAGGATCCTTATTATGACAACTTGACGGCTAC
5,araC-pBAD.gb,(P173-RM)_araC-pBAD.R,ACATCCGGTGACAGCTACAGTAGAGAGTTGCGATAAAA
6,e11-C-term-backbone.gb,(P174-RM)_e11-C-term-backbone.F,CGCAACTCTCTACTGTAGCTGTCACCGGATGTGCTT
7,e11-C-term-backbone.gb,(P175-RM)_e11-C-term-backbone.R,AAAATTCAAACGTTGACTCTAGTATATAAACGCAGAAAGGCC
8,e16-N-term.gb,(P176-RM)_e16-N-term.F,GGTCTGTTTGTATTATTTCTCCATTACTAGAGTCACAC
9,e16-N-term.gb,(P177-RM)_e16-N-term.R,CGTCAAGTTGTCATAATAAGGATCCTAATTGGTAACGAAT


In [47]:
# Write only the primer names and sequences to CSV; index=False leaves the DataFrame index out of the file.
df[['primer-name', 'seq']].to_csv('datasets/genbank/primer-list-2.csv', index=False)

### From Scratch

In [None]:
def check_single_repetition(s):
    """Return True if `s` contains a run of four or more identical consecutive characters.

    The counter tracks how many adjacent pairs in a row are equal; three equal
    pairs in a row means four identical characters.
    """
    equal_pairs = 0
    for current, following in zip(s, s[1:]):
        equal_pairs = equal_pairs + 1 if current == following else 0
        if equal_pairs >= 3:
            return True
    return False

def check_double_repetition(s):
    """Return True if `s` contains a dinucleotide repeat, False otherwise.

    At every position the two characters starting there are compared with the
    two characters that follow them. Two such matches at consecutive positions
    mean a period-2 pattern spanning at least five characters (e.g. ``ATATA``,
    and also homopolymers such as ``AAAAA``).

    Fix: the original evaluated a bare ``True`` instead of returning it, so the
    function always returned False.
    """
    rep = 0
    for i in range(len(s) - 1):
        # Near the end of `s` the right-hand slice is shorter than two characters and never matches.
        if s[i:i+2] == s[i+2:i+4]:
            rep += 1
        else:
            rep = 0
        if rep >= 2:
            return True

    return False

def check_end_clamp(s):
    """Score the final character of `s`: 1 for G or C, -1 for T, 0 for anything else.

    Raises IndexError when `s` is empty.
    """
    last_base = s[-5:][-1]
    if last_base in ('G', 'C'):
        return 1
    return -1 if last_base == 'T' else 0

def check_gc_clamp(s, max_gc=2):
    """Score the G/C content of the last five characters of `s`.

    Parameters
    ----------
    s : sequence of single-character strings (typically a primer string)
    max_gc : int, default 2
        Largest combined number of 'G' and 'C' allowed in the clamp.

    Returns
    -------
    -1 when the clamp holds more than `max_gc` G/C characters, otherwise 1.

    Fix: the original condition ``bases_count['G'] or bases_count['C'] > 2``
    parsed as ``G or (C > 2)``, so a single G anywhere in the clamp already
    returned -1. The combined G + C count is compared instead.
    Characters other than upper-case 'G'/'C' are simply not counted (the
    original raised KeyError for anything outside A/C/G/T).
    """
    clamp = s[-5:]
    gc_count = sum(1 for base in clamp if base in ('G', 'C'))
    return -1 if gc_count > max_gc else 1

In [None]:
# Load every fragment and pre-compute, per fragment:
#   f_overhangs - its last `overlap_len` bases
#   r_overhangs - the complement of its first `overlap_len` bases
#   f_anneals   - its first 18..25 bases (one candidate per length)
#   r_anneals   - the complement of its last 18..25 bases (one candidate per length)
# NOTE(review): these file names are all lower-case, unlike the mixed-case names used in the first
# primer cell - confirm which spelling exists on disk.
folder = 'datasets/genbank'
files = ['cymrc-pcym.gb', 'e11-n-term.gb', 'arac-pbad.gb', 'e11-c-term-backbone.gb']
overlap_len = 16
seqs, f_overhangs, f_anneals, r_overhangs, r_anneals = [], [], [], [], []

for f in files:
    with open(f'{folder}/{f}') as input_handle:
        for record in SeqIO.parse(input_handle, 'genbank'):
            seq = record.seq
            head, tail = seq[:overlap_len], seq[-overlap_len:]
            seqs.append(seq)
            f_overhangs.append(tail)
            r_overhangs.append(head.complement())
            f_anneals.append([seq[:n] for n in range(18, 26)])
            r_anneals.append([seq[-n:].complement() for n in range(18, 26)])
num_fragments = len(seqs)

In [None]:
# Display the loaded sequences.
seqs

In [None]:
# Rotate right by one so that position i holds the overhang taken from fragment i-1
# (position 0 wraps around to the last fragment).
# NOTE: not idempotent - every re-run of this cell rotates the list one step further.
f_overhangs = [f_overhangs[-1]] + f_overhangs[:-1]
f_overhangs

In [None]:
# Display the forward annealing candidates (one list per fragment).
f_anneals

In [None]:
# Rotate left by one so that position i holds the overhang taken from fragment i+1
# (the last position wraps around to the first fragment).
# NOTE: not idempotent - every re-run of this cell rotates the list one step further.
r_overhangs = r_overhangs[1:] + [r_overhangs[0]]
r_overhangs

In [None]:
# Display the reverse annealing candidates (one list per fragment).
r_anneals

In [None]:
# Pair each fragment's single overhang with every annealing candidate of that fragment.
# Forward tuples are (overhang, anneal); reverse tuples are (anneal, overhang).
forward_primers = [
    [(f_overhangs[i], anneal) for anneal in f_anneals[i]]
    for i in range(num_fragments)
]
reversed_primers = [
    [(anneal, r_overhangs[i]) for anneal in r_anneals[i]]
    for i in range(num_fragments)
]

In [None]:
# Display the forward (overhang, anneal) candidates.
forward_primers

In [None]:
# Display the reverse (anneal, overhang) candidates.
reversed_primers

In [None]:
# Score every forward candidate of every fragment.
# Result: candidate_f_primers[k] is a list of
#   (full primer string, SantaLucia Tm of the annealing part, GC value of the annealing part)
# for fragment k. All filters below are currently disabled, so every candidate is kept.
candidate_f_primers = []

for forward_primer in forward_primers:

    candidate_f_primer = []
    for primer_o, primer_a in forward_primer:

        # Tm, GC, repeat and clamp metrics are computed on the annealing part only.
        primer_a = str(primer_a)
        gc = GC(primer_a)
        tm_breslaur = pr.calcTm(primer_a, tm_method=0) #breslaur
        tm_santalucia = pr.calcTm(primer_a, tm_method=1) #santalucia

        repeated_single = check_single_repetition(primer_a)
        repeated_double = check_double_repetition(primer_a)
        end_clamp_score = check_end_clamp(primer_a)
        gc_clamp_score = check_gc_clamp(primer_a)
        #print(primer_o, primer_a, gc, tm_santalucia)

        # Forward primer = overhang followed by annealing part.
        primer = str(primer_o) + primer_a

        # Secondary-structure checks use the full primer; each calculation is run once
        # and its result reused (the original ran every one twice).
        hairpin_result = pr.calcHairpin(primer)
        hairpin = hairpin_result.structure_found
        hairpin_tm = hairpin_result.tm

        homodimer_result = pr.calcHomodimer(primer)
        homodimer = homodimer_result.structure_found
        homodimer_tm = homodimer_result.tm

        #if tm_santalucia < 40 or tm_santalucia > 60:
        #    continue
        #if gc < 40 or gc > 60:
        #    continue
        #if hairpin and (tm_santalucia - hairpin_tm) < 10:
        #    continue
        #if homodimer and (tm_santalucia - homodimer_tm) < 10:
        #    continue
        #if repeated_single or repeated_double:
        #    continue
        #if end_clamp_score < 0:
        #    continue

        candidate_f_primer.append((primer, tm_santalucia, gc))
    candidate_f_primers.append(candidate_f_primer)

In [None]:
# Score every reverse candidate of every fragment.
# Result: candidate_r_primers[k] is a list of
#   (full primer string, SantaLucia Tm of the annealing part, GC value of the annealing part)
# for fragment k. All filters below are currently disabled, so every candidate is kept.
# NOTE(review): the annealing and overhang parts were built with `.complement()` (not
# `.reverse_complement()`) in the loading cell, so the strings scored here are presumably
# written 3'->5' - confirm the primer3 results are meaningful for that orientation.
candidate_r_primers = []

for reversed_primer in reversed_primers:

    candidate_r_primer = []
    for primer_a, primer_o in reversed_primer:

        # Tm, GC, repeat and clamp metrics are computed on the annealing part only.
        primer_a = str(primer_a)
        gc = GC(primer_a)
        tm_breslaur = pr.calcTm(primer_a, tm_method=0) #breslaur
        tm_santalucia = pr.calcTm(primer_a, tm_method=1) #santalucia

        repeated_single = check_single_repetition(primer_a)
        repeated_double = check_double_repetition(primer_a)
        end_clamp_score = check_end_clamp(primer_a)
        gc_clamp_score = check_gc_clamp(primer_a)
        #print(primer_o, primer_a, gc, tm_santalucia)

        # Reverse primer = annealing part followed by overhang (mirror of the forward layout).
        primer = primer_a + str(primer_o)

        # Secondary-structure checks use the full primer; each calculation is run once
        # and its result reused (the original ran every one twice).
        hairpin_result = pr.calcHairpin(primer)
        hairpin = hairpin_result.structure_found
        hairpin_tm = hairpin_result.tm

        homodimer_result = pr.calcHomodimer(primer)
        homodimer = homodimer_result.structure_found
        homodimer_tm = homodimer_result.tm

        #if tm_santalucia < 40 or tm_santalucia > 60:
        #    continue
        #if gc < 40 or gc > 60:
        #    continue
        #if hairpin and (tm_santalucia - hairpin_tm) < 10:
        #    continue
        #if homodimer and (tm_santalucia - homodimer_tm) < 10:
        #    continue
        #if repeated_single or repeated_double:
        #    continue
        #if end_clamp_score < 0:
        #    continue

        candidate_r_primer.append((primer, tm_santalucia, gc))
    candidate_r_primers.append(candidate_r_primer)

In [None]:
# One (forward candidates, reverse candidates) tuple per fragment.
primers_combo = list(zip(candidate_f_primers, candidate_r_primers))

In [None]:
# For every fragment pick the forward/reverse pair with the smallest Tm difference among the
# pairs that pass the heterodimer check.
# final_primers[k] = (|Tm_fwd - Tm_rev|, forward candidate tuple, reverse candidate tuple)
final_primers = []
for x in tqdm(range(num_fragments)):
    final_primer = []
    for FP in primers_combo[x][0]:
        for RP in primers_combo[x][1]:
            # Run the heterodimer calculation once and reuse the result.
            heterodimer_result = pr.calcHeterodimer(FP[0], RP[0])
            tm_margin = np.min([FP[1], RP[1]]) - heterodimer_result.tm
            # Accept a pair when no heterodimer forms at all, or when the dimer melts at least
            # 10 degrees below the lower primer Tm. The original condition required a dimer to be
            # found, which threw away every dimer-free pair - the opposite of how the
            # hairpin/homodimer filters in the scoring cells are written.
            if not heterodimer_result.structure_found or tm_margin >= 10:
                final_primer.append((np.abs(FP[1] - RP[1]), FP, RP))
    if not final_primer:
        # Previously surfaced as an opaque IndexError from sorted(...)[0].
        raise ValueError(f'no acceptable primer pair found for fragment {x}')
    # Tuples compare on the Tm difference first, so min() yields the same element sorted(...)[0] did.
    final_primers.append(min(final_primer))

In [None]:
from Bio.Seq import Seq

# Final (forward, reverse) primer strings per fragment.
# The forward string is passed through unchanged; for the reverse string, complement()
# followed by reverse_complement() amounts to reversing the stored string.
to_recommend = [
    (
        str(Seq(chosen[1][0])),
        str(Seq(chosen[2][0]).complement().reverse_complement()),
    )
    for chosen in (final_primers[i] for i in range(num_fragments))
]
to_recommend

In [None]:
# Inspect the first entry of `final_primers`.
final_primers[0]

In [None]:
# Print the last 16, 24 and 32 items of every forward overhang.
# presumably each overhang is only `overlap_len` (16) long, so the three rows would be identical - verify
for i in range(16, 33, 8):
    print([a[-i:] for a in f_overhangs])

In [None]:

            
# Scratch cell: rebuilds the forward/reverse candidate lists from `overhangs` and `anneals`.
# NOTE(review): `anneals` is not defined anywhere in the visible notebook, so this cell raises
# NameError on a fresh kernel; `overhangs` is only assigned by the first primer cell.
# NOTE(review): this overwrites the `forward_primers` / `reversed_primers` built by the earlier
# product cell.
shifted_overhangs = [overhangs[-1]] + overhangs[:-1]
num_fragments = len(seqs)

forward_anneals = [[a[0] for a in aa] for aa in anneals]
forward_overhangs = [[b[1] for b in bb] for bb in shifted_overhangs]
reversed_anneals = [[a[0].complement() for a in aa] for aa in anneals]
reversed_overhangs = [[b[1].complement() for b in bb] for bb in shifted_overhangs]

forward_primers = []
reversed_primers = []
# Every overhang option is combined with every annealing option of the same fragment.
for i in range(num_fragments):
    forward_primers.append(list(product(forward_overhangs[i], forward_anneals[i])))
    reversed_primers.append(list(product(reversed_overhangs[i], reversed_anneals[i])))

In [None]:
# Length of each loaded sequence.
[len(seq) for seq in seqs]

In [None]:
# Display the current forward candidates.
forward_primers

In [None]:
# Display the current reverse candidates.
reversed_primers

In [None]:
# Scratch cell - does not run as written.
# NOTE(review): the `for i in range(16, 40, 8):` line below has no indented body, so the whole
# cell fails to parse (IndentationError).
# NOTE(review): `dntp`, `ext_primer` and `component` are not assigned anywhere above this cell in
# the visible notebook.
# NOTE(review): `primers = [[], []]` would shadow the `primers` function imported in the first cell.
for i, fragment in tqdm(enumerate(fragments)):
    
    
    for i in range(16, 40, 8):
    for j in range(0, i+1, 8):
        overhangs = [fragment[0][:j], fragment[1][:(i-j)]]
        
        
        

    # presumably `fragment` is one of the (fwd addition, sequence, rev addition) tuples built in the
    # first primer cell, in which case fragment[-1] is the reverse addition, not the sequence - verify
    seq = fragment[-1]
    primers = [[], []]
    final_primers = []

    # `i` is reused here, clobbering the enumerate index of the outer loop.
    for i in range(18, 25):

        # index 0: first i bases (forward candidate); index 1: reverse complement of the last i bases
        candidate_primers = [str(seq[:i]), str(seq[-i:].reverse_complement())]
        
        for j, primer in enumerate(candidate_primers):

            gc = GC(primer)
            tm_breslaur = pr.calcTm(primer, dntp_conc=dntp, tm_method=0) #breslaur
            tm_santalucia =  pr.calcTm(primer, dntp_conc=dntp, tm_method=1) #santalucia
            
            hairpin = pr.calcHairpin(primer, dntp_conc=dntp).structure_found
            hairpin_tm = pr.calcHairpin(primer, dntp_conc=dntp).tm

            homodimer = pr.calcHomodimer(primer, dntp_conc=dntp).structure_found
            homodimer_tm = pr.calcHomodimer(primer, dntp_conc=dntp).tm

            #print('Tm_Wallace', MeltingTemp.Tm_Wallace(FP))
            #print('Tm_GC', MeltingTemp.Tm_GC(FP))
            #print('Tm_NN', MeltingTemp.Tm_NN(FP))
            #print('IDT', melt.temp(FP))

            repeated_single = check_single_repetition(primer)
            repeated_double = check_double_repetition(primer)
            end_clamp_score = check_end_clamp(primer)
            gc_clamp_score = check_gc_clamp(primer)

            # Reject candidates outside the Tm / GC windows or with a hairpin/homodimer melting
            # within 10 degrees of the primer Tm. Note the lower Tm bound here is 0.
            if tm_santalucia < 0 or tm_santalucia > 60:
                continue
            if gc < 40 or gc > 60:
                continue
            if hairpin and (tm_santalucia - hairpin_tm) < 10:
                continue
            if homodimer and (tm_santalucia - homodimer_tm) < 10:
                continue
            #if repeated_single or repeated_double:
            #    continue
            if end_clamp_score < 0:
                continue
            #if gc_clamp_score < 0:
            #    continue
            
            primers[j].append((ext_primer, tm_santalucia, gc))
            
            # `oh_primer` is assigned but never read; for k == 0 the slice [-0:] is the whole sequence.
            for k in range(0, 40, 8):
            
                oh_primer = str(component[j][-k:])
                
            
    print(primers)

    # Keeps only pairs for which a heterodimer IS found and melts >= 10 degrees below the lower
    # primer Tm; pairs with no heterodimer at all are not kept.
    for FP in primers[0]:
        for RP in primers[1]:
            heterodimer = pr.calcHeterodimer(FP[0], RP[0], dntp_conc=dntp).structure_found
            heterodimer_tm = pr.calcHeterodimer(FP[0], RP[0], dntp_conc=dntp).tm
            if heterodimer and (np.min([FP[1], RP[1]]) - heterodimer_tm) >= 10:
                final_primers.append((np.abs(FP[1] - RP[1]), FP, RP))
                
    print(final_primers)

In [None]:
# Scratch cell: per-component primer search.
# NOTE(review): `components` and `ext_primer` are not defined anywhere in the visible notebook, and
# `dntp` is only present as the commented-out assignment below, so this cell raises NameError on a
# fresh kernel.
# NOTE(review): `folder`, `files` and `seqs` are assigned here but not used in this cell.
# NOTE(review): `primers = [[], []]` shadows the `primers` function imported in the first cell.
folder = 'datasets/genbank'
files = ['cymrc-pcym.gb', 'e11-n-term.gb', 'arac-pbad.gb', 'e11-c-term-backbone.gb']
seqs = []
#dntp = 0.8

for i, component in tqdm(enumerate(components)):

    seq = component[-1]
    primers = [[], []]
    final_primers = []

    # `i` is reused here, clobbering the enumerate index of the outer loop.
    for i in range(18, 25):

        # index 0: first i bases (forward candidate); index 1: reverse complement of the last i bases
        candidate_primers = [str(seq[:i]), str(seq[-i:].reverse_complement())]
        
        for j, primer in enumerate(candidate_primers):

            gc = GC(primer)
            tm_breslaur = pr.calcTm(primer, dntp_conc=dntp, tm_method=0) #breslaur
            tm_santalucia =  pr.calcTm(primer, dntp_conc=dntp, tm_method=1) #santalucia
            
            hairpin = pr.calcHairpin(primer, dntp_conc=dntp).structure_found
            hairpin_tm = pr.calcHairpin(primer, dntp_conc=dntp).tm

            homodimer = pr.calcHomodimer(primer, dntp_conc=dntp).structure_found
            homodimer_tm = pr.calcHomodimer(primer, dntp_conc=dntp).tm

            #print('Tm_Wallace', MeltingTemp.Tm_Wallace(FP))
            #print('Tm_GC', MeltingTemp.Tm_GC(FP))
            #print('Tm_NN', MeltingTemp.Tm_NN(FP))
            #print('IDT', melt.temp(FP))

            repeated_single = check_single_repetition(primer)
            repeated_double = check_double_repetition(primer)
            end_clamp_score = check_end_clamp(primer)
            gc_clamp_score = check_gc_clamp(primer)

            # Reject candidates outside the Tm / GC windows or with a hairpin/homodimer melting
            # within 10 degrees of the primer Tm. Note the lower Tm bound here is 0.
            if tm_santalucia < 0 or tm_santalucia > 60:
                continue
            if gc < 40 or gc > 60:
                continue
            if hairpin and (tm_santalucia - hairpin_tm) < 10:
                continue
            if homodimer and (tm_santalucia - homodimer_tm) < 10:
                continue
            #if repeated_single or repeated_double:
            #    continue
            if end_clamp_score < 0:
                continue
            #if gc_clamp_score < 0:
            #    continue
            
            primers[j].append((ext_primer, tm_santalucia, gc))
            
            # `oh_primer` is assigned but never read; for k == 0 the slice [-0:] is the whole sequence.
            for k in range(0, 40, 8):
            
                oh_primer = str(component[j][-k:])
                
            
    print(primers)

    # Keeps only pairs for which a heterodimer IS found and melts >= 10 degrees below the lower
    # primer Tm; pairs with no heterodimer at all are not kept.
    for FP in primers[0]:
        for RP in primers[1]:
            heterodimer = pr.calcHeterodimer(FP[0], RP[0], dntp_conc=dntp).structure_found
            heterodimer_tm = pr.calcHeterodimer(FP[0], RP[0], dntp_conc=dntp).tm
            if heterodimer and (np.min([FP[1], RP[1]]) - heterodimer_tm) >= 10:
                final_primers.append((np.abs(FP[1] - RP[1]), FP, RP))
                
    print(final_primers)

In [None]:
# Display `final_primers` as left by whichever cell assigned it last.
final_primers

In [None]:
# Smallest entry of `final_primers`; raises IndexError when the list is empty.
# presumably entries are (Tm difference, forward, reverse) tuples, making this the pair with the
# closest melting temperatures - depends on which cell filled `final_primers`
sorted(final_primers)[0]

In [None]:
# Fix: PairwiseAligner was used without being imported anywhere in the visible notebook
# (NameError on a fresh kernel). Imported in-cell, like the other late imports in this notebook.
from Bio.Align import PairwiseAligner

# Global alignment demo scoring +2 per match and -1 per mismatch.
aligner = PairwiseAligner()
aligner.mode = 'global'
aligner.match_score = 2
aligner.mismatch_score = -1
alignments = aligner.align("TCAGACCG", "CCG")
for alignment in sorted(alignments):
    print("Score = %.1f:" % alignment.score)
    print(alignment)

In [None]:
# Length of a hard-coded sequence string.
len('CTCTAGTATATAAACGCAGAAAGGCCCAC')

In [None]:
# Read the first listed file and display the resulting object.
# NOTE(review): `read` is not imported anywhere in the visible notebook; presumably
# `pydna.readers.read` - TODO confirm and import it.
# NOTE(review): file names are lower-case here, unlike the mixed-case names in the first primer
# cell - confirm which spelling exists on disk.
folder = 'datasets/genbank'
files = ['cymrc-pcym.gb', 'e11-n-term.gb', 'arac-pbad.gb', 'e11-c-term-backbone.gb']
f = files[0]
gb_file = read('{}/{}'.format(folder, f))
gb_file

In [None]:
# Show the features of the record that was read.
gb_file.list_features()

In [None]:
# Pull out feature number 0 and display its sequence.
# NOTE: rebinds `seq`, which the loading cells also use as a loop variable.
seq = gb_file.extract_feature(0)
seq.seq

In [None]:
from pydna.design import primer_design
# Design primers for `seq` with primer_design's default settings (no extra arguments passed).
# NOTE(review): the result is named `gfp_amplicon`, but `seq` is whatever the preceding cell
# extracted - confirm the name matches the feature.
gfp_amplicon = primer_design(seq)

In [None]:
# Display the figure representation of the designed amplicon.
gfp_amplicon.figure()

In [None]:
class GC_clamps(object):
    '''
    Description:
    ------------
    Describes the clamp of a primer, i.e. its last five characters:
      clamp  - the clamp itself
      score  - sum of the `strength` values of every overlapping dinucleotide in the clamp
      num_GC - number of G and C characters in the clamp
    NN model strength table taken from
    Khandelwal G, Bhyravabhotla J. A Phenomenological Model for Predicting Melting Temperatures of DNA Sequences. PLOS ONE. 2010;5: e12433. doi:10.1371/journal.pone.0012433
    '''
    # class-level defaults; an instance gets its own value once it accumulates one
    clamp = ""
    num_GC = 0
    strength = {
        'GC': 13, 'CC': 11, 'GG': 11, 'CG': 10,
        'AC': 10, 'TC': 8, 'AG': 8, 'TG': 7,
        'GT': 10, 'CT': 8, 'GA': 8, 'CA': 7,
        'AT': 7, 'TT': 5, 'AA': 5, 'TA': 4,
    }
    score = 0

    def __init__(self, primer):
        self.clamp = primer[-5:]
        # walk the clamp one position at a time, scoring each adjacent pair
        for first, second in zip(self.clamp, self.clamp[1:]):
            self.score += self.strength[first + second]
        for base in self.clamp:
            if base in ("G", "C"):
                self.num_GC += 1

In [None]:
# Lookup table of reaction conditions keyed by name; every entry has the same seven keys
# (Tris, Na, K, Mg, dNTPs, MV_tot, DV_tot).
# NOTE(review): units are not stated anywhere in the visible notebook - presumably mM; confirm.
# NOTE(review): MV_tot / DV_tot presumably mean total monovalent / divalent cation concentration,
# but MV_tot is not a simple sum of the Tris/Na/K values in every row - confirm how it was derived.
# Neither `salts` nor `def_salts` is referenced elsewhere in the visible notebook.
salts={'Pfu':{"Tris":20.0,"Na":0.0,"K":10.0,"Mg":1.5,"dNTPs":0.8, "MV_tot":20.0,"DV_tot":1.5},
           'LongAmp':{"Tris":60.0,"Na":0.0,"K":0.0,"Mg":2.0,"dNTPs":1.2, "MV_tot":20.0,"DV_tot":2.0},
           'PhusionHF':{"Tris":25.0,"Na":0.0,"K":50.0,"Mg":1.5,"dNTPs":0.8, "MV_tot":50,"DV_tot":1.5},
           'SuperMix':{"Tris":66.0,"Na":0.0,"K":0.0,"Mg":2.4,"dNTPs":0.88, "MV_tot":19.8,"DV_tot":2.4},
           'Vent':{"Tris":20.0,"Na":0.0,"K":10.0,"Mg":2.0,"dNTPs":0.8, "MV_tot":20.0,"DV_tot":2.0},
           'PrimeSTAR':{"Tris":0,"Na":0.0,"K":0,"Mg":1,"dNTPs":1.0, "MV_tot":50,"DV_tot":1.0},
           'Custom':{"Tris":10,"Na":0.0,"K":50.0,"Mg":1.5,"dNTPs":0.8, "MV_tot":50,"DV_tot":1.5},
           'Taq':{"Tris":0.0,"Na":0.0,"K":50.5,"Mg":1.5,"dNTPs":0.8, "MV_tot":50.5,"DV_tot":1.5},
           'Q5':{"Tris":25.0,"Na":0.0,"K":50.0,"Mg":2.0,"dNTPs":0.2, "MV_tot":50.0,"DV_tot":2.0}
            }
# Fallback values; only Mg, dNTPs and Na are given here.
def_salts={
    'Mg':1.5, 'dNTPs':0.2, 'Na':0.5
}