In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from itertools import permutations
from primers import primers
from Bio.Seq import Seq
from Bio.Restriction import BsaI, BsmBI

In [30]:
# Recognition sites of BsaI and BsmBI; each site is stored together with its
# reverse complement so both strand orientations can be searched for.
res_enzymes = [BsaI.site, BsmBI.site]
forbidden_sites = []
for enz in res_enzymes:
    forbidden_sites += [enz, str(Seq(enz).reverse_complement())]
forbidden_sites

['GGTCTC', 'GAGACC', 'CGTCTC', 'GAGACG']

In [31]:
# Load the level-0 parts and upper-case the sequences before searching them.
parts = pd.read_csv('datasets/moclo/level-0-parts.csv').assign(
    sequence=lambda frame: frame['sequence'].str.upper()
)
#forbidden_sites = ['GGTCTC', 'GAGACC', 'CGTCTC', 'GAGACG']
# Despite its name, 'valid' holds the total number of forbidden-site
# occurrences found in the sequence (0 means none were found).
parts['valid'] = parts['sequence'].map(
    lambda seq: np.sum([seq.count(site) for site in forbidden_sites])
)

In [32]:
# Left-join the jump-sites table onto the parts via the shared 'sites' column,
# keeping every part even if it has no matching row in the sites table.
sites = pd.read_csv('datasets/moclo/jump-sites.csv')
parts = parts.merge(sites, how='left', on='sites')
parts

Unnamed: 0,name,type,sites,sequence,valid,overhang_1,overhang_2
0,P(11_LnE20),Promoter,P,GCCTCCACACCGCTCGTCACATCCTGTGATCCATTCCCCGCCCATC...,0,GGAG,TACT
1,E11-PhoRadA-N,CDS,NOC,ATGAGCGATAGTCCGCAGAAACTGGGTCGTAATGAATGGAATGCAT...,0,AATG,GCTT
2,E11-PhoRadA-C,CDS,NOC,ACAGATGTAACGATTAAAAGAATAATATCCAAAGGAGAACTTGAAT...,0,AATG,GCTT
3,P(20_T5Ln17),Promoter,P,GCGCGGATAAAAATTTCATTTGCCCGCGACGGTTTTTCCGCCCATC...,0,GGAG,TACT
4,E20-gp411-N,CDS,NOC,AATGAAACCGATCCTGATCTGGAACTGCTGAAACGTATTGGTAATA...,0,AATG,GCTT
5,E20-gp411-C,CDS,NOC,CTGAAAAAAATCCTGAAAATCGAGGAACTGGATGAACGCGAACTGA...,0,AATG,GCTT
6,P(38_up1322),Promoter,P,CAGTACAAAATTTTTTAGATGCGTTGTACAACCCTCACGGGGGTGG...,0,GGAG,TACT
7,E38-gp418-N,CDS,NOC,CCTGCCGCACGTCCGGCACGTATTACCAATCAGCGTGATGGTGCAG...,0,AATG,GCTT
8,E38-gp418-C,CDS,NOC,TGTGAAATCTTTGAAAACGAGATCGACTGGGATGAAATTGCCAGCA...,0,AATG,GCTT
9,P(42_nat+),Promoter,P,CAGTACAAAATTTTTTAGATGCGTTCGCTGTCGATCCGGCCCGTCG...,0,GGAG,TACT


In [41]:
# Preview the two flanking strings assembled from the forbidden-site list.
(
    ''.join([forbidden_sites[2], forbidden_sites[0], 'A']),
    ''.join(['T', forbidden_sites[1], forbidden_sites[3]]),
)

('CGTCTCGGTCTCA', 'TGAGACCGAGACG')

In [42]:
#prefix, suffix = 'CGTCTCGGTCTCA', 'TGAGACCTGAGACG'
prefix, suffix = (
    forbidden_sites[2] + forbidden_sites[0] + 'A',
    'T' + forbidden_sites[1] + forbidden_sites[3],
)
# Full construct per part: prefix, overhang_1, the part sequence, overhang_2, suffix.
parts['bases'] = (
    prefix
    + parts['overhang_1']
    + parts['sequence']
    + parts['overhang_2']
    + suffix
)
# Two-column export view with capitalised headers.
parts_output = parts[['name', 'bases']].rename(columns={'name': 'Name', 'bases': 'Bases'})
#parts_output.to_csv('datasets/moclo/level-0-parts-output.csv', index=False)
parts_output

Unnamed: 0,Name,Bases
0,P(11_LnE20),CGTCTCGGTCTCAGGAGGCCTCCACACCGCTCGTCACATCCTGTGA...
1,E11-PhoRadA-N,CGTCTCGGTCTCAAATGATGAGCGATAGTCCGCAGAAACTGGGTCG...
2,E11-PhoRadA-C,CGTCTCGGTCTCAAATGACAGATGTAACGATTAAAAGAATAATATC...
3,P(20_T5Ln17),CGTCTCGGTCTCAGGAGGCGCGGATAAAAATTTCATTTGCCCGCGA...
4,E20-gp411-N,CGTCTCGGTCTCAAATGAATGAAACCGATCCTGATCTGGAACTGCT...
5,E20-gp411-C,CGTCTCGGTCTCAAATGCTGAAAAAAATCCTGAAAATCGAGGAACT...
6,P(38_up1322),CGTCTCGGTCTCAGGAGCAGTACAAAATTTTTTAGATGCGTTGTAC...
7,E38-gp418-N,CGTCTCGGTCTCAAATGCCTGCCGCACGTCCGGCACGTATTACCAA...
8,E38-gp418-C,CGTCTCGGTCTCAAATGTGTGAAATCTTTGAAAACGAGATCGACTG...
9,P(42_nat+),CGTCTCGGTCTCAGGAGCAGTACAAAATTTTTTAGATGCGTTCGCT...


In [11]:
def design_primers(parts, prefix, suffix, start=1):
    """Design a forward/reverse primer pair for every row of ``parts``.

    For each part the forward primer is extended with ``prefix`` followed by
    the part's ``overhang_1``; the reverse primer is extended with the reverse
    complement of ``suffix`` followed by the reverse complement of the part's
    ``overhang_2``.

    Parameters
    ----------
    parts : pandas.DataFrame
        Must provide the columns ``name``, ``sequence``, ``overhang_1`` and
        ``overhang_2``.
    prefix : str
        Sequence placed in front of ``overhang_1`` on the forward primer.
    suffix : str
        Sequence whose reverse complement is placed in front of the reverse
        complement of ``overhang_2`` on the reverse primer.
    start : int, default 1
        Number given to the first primer; numbering is consecutive over the
        primers that were designed successfully.

    Returns
    -------
    pandas.DataFrame
        One row per primer with the columns ``name``, ``sequence``, ``tm``,
        ``tm_total``, ``gc``, ``dg``, ``fwd``, ``offtargets`` and ``penalty``.
        Parts for which primer design fails are reported on stdout and
        skipped; an empty input yields an empty frame with the same columns.
    """
    columns = ['name', 'sequence', 'tm', 'tm_total', 'gc', 'dg',
               'fwd', 'offtargets', 'penalty']

    def _record(number, part_name, direction, primer):
        # One output row: numbered label followed by the primer's metrics.
        label = '(P{}J-RM)_{}.{}'.format(str(number).zfill(3), part_name, direction)
        return (label, primer.seq, primer.tm, primer.tm_total, primer.gc,
                primer.dg, primer.fwd, primer.offtargets, primer.penalty)

    # The suffix is the same for every part, so reverse-complement it once.
    suffix_rc = str(Seq(suffix).reverse_complement())

    primers_list = []
    counter = 0
    for i, part in tqdm(parts.iterrows(), total=len(parts)):
        try:
            fp, rp = primers(part['sequence'], add_fwd=prefix + part['overhang_1'],
                             add_rev=suffix_rc + str(Seq(part['overhang_2']).reverse_complement()))
            # Build both rows before storing either, so a failure can never
            # leave a forward primer without its reverse partner.
            pair = [_record(counter + start, part['name'], 'F', fp),
                    _record(counter + start + 1, part['name'], 'R', rp)]
        except Exception as err:
            # Best-effort: report the failing part and carry on. Catching
            # Exception (not a bare except) lets KeyboardInterrupt through.
            print('Error at', i, part['name'], '-', repr(err))
            continue

        primers_list.extend(pair)
        counter += len(pair)

    return pd.DataFrame(primers_list, columns=columns)
        
# Design primers for all level-0 parts, numbering from the default start of 1.
primers_df = design_primers(parts, prefix=prefix, suffix=suffix)
#primers_df.to_csv('datasets/moclo/level-0-primers.csv', index=False)

17it [00:05,  3.93it/s]

Error at 15 RBS32
Error at 16 RBS33


25it [00:08,  3.01it/s]


In [13]:
# Label the primer table: identifier and sequence first, then the metrics
# in the order design_primers stores them.
primers_df.columns = pd.Index([
    'name', 'sequence',
    'tm', 'tm_total', 'gc', 'dg',
    'fwd', 'offtargets', 'penalty',
])
primers_df

Unnamed: 0,name,sequence,tm,tm_total,gc,dg,fwd,offtargets,penalty
0,(P001J-RM)_P(11_LnE20).F,CGTCTCGGTCTCAGGAGGCCTCCACACCGCTCG,68.5,79.9,0.7,-1.76807,True,0,16.63614
1,(P002J-RM)_P(11_LnE20).R,CGTCTCAGGTCTCAAGTAAGAGGTGTTACGATAGATGGGCG,68.2,75.2,0.5,-0.341245,False,0,7.88249
2,(P003J-RM)_E11-PhoRadA-N.F,CGTCTCGGTCTCAAATGATGAGCGATAGTCCGCA,62.8,74.1,0.5,-2.760765,True,0,11.32153
3,(P004J-RM)_E11-PhoRadA-N.R,CGTCTCAGGTCTCAAAGCTTATTATCTTCTCAGTACCTCTTCC,62.1,72.7,0.4,0.0,False,0,3.4
4,(P005J-RM)_E11-PhoRadA-C.F,CGTCTCGGTCTCAAATGACAGATGTAACGATTAAAAGAATAATATC,60.6,70.1,0.3,-3.57113,True,0,16.14226
5,(P006J-RM)_E11-PhoRadA-C.R,CGTCTCAGGTCTCAAAGCTTATTAGACGCTCGCG,60.5,74.2,0.5,-1.443465,False,0,10.38693
6,(P007J-RM)_P(20_T5Ln17).F,CGTCTCGGTCTCAGGAGGCGCGGATAAAAATTTCATTTG,63.5,75.3,0.5,-3.634815,True,0,8.76963
7,(P008J-RM)_P(20_T5Ln17).R,CGTCTCAGGTCTCAAGTATGGGTTCAACGATAGATGG,63.0,73.1,0.5,0.0,False,0,4.0
8,(P009J-RM)_E20-gp411-N.F,CGTCTCGGTCTCAAATGAATGAAACCGATCCTGATC,60.0,72.2,0.5,-6.3339,True,0,17.6678
9,(P010J-RM)_E20-gp411-N.R,CGTCTCAGGTCTCAAAGCTTATTATTCTTTAACATACAGGCAC,59.6,71.2,0.4,0.0,False,0,5.7


In [14]:
# Persist the primer table without the row index.
primers_df.to_csv(
    path_or_buf='datasets/moclo/level-0-primers.csv',
    index=False,
)

In [None]:
# Second workflow: these reads rebind `parts` and `sites` to different files
# from the ones loaded earlier in the notebook.
parts = pd.read_csv(filepath_or_buffer='datasets/moclo/parts.csv')
sites = pd.read_csv(filepath_or_buffer='datasets/moclo/sites.csv')

In [None]:
# Normalise all part sequences to upper case.
parts = parts.assign(sequence=parts['sequence'].str.upper())

In [None]:
# Record the length of every part sequence.
parts['len'] = parts['sequence'].map(len)
parts

In [None]:
# These rebind `prefix` and `suffix`, replacing the values set earlier in the notebook.
prefix, suffix = 'gggtctca', 'aggtctct'
# Lookup tables: 'overhang' value -> 'sites' value, and part name -> sequence.
fusion_sites = sites.set_index('overhang')['sites'].to_dict()
parts_seq = parts.set_index('part_name')['sequence'].to_dict()

In [None]:
# Design a primer pair for part P11 without passing add_fwd / add_rev.
primers(parts_seq['P11'])

In [None]:
# Same call as above, wrapped in a DataFrame so the result displays as a table.
pd.DataFrame(primers(parts_seq['P11']))

In [None]:
# P11 primer pairs for two forward fusion sites (A, then C); both pairs use
# fusion site B on the reverse primer.
p11_ab_f, p11_ab_r = primers(
    parts_seq['P11'],
    add_fwd=prefix + fusion_sites['A'],
    add_rev=suffix + fusion_sites['B'],
)
p11_cb_f, p11_cb_r = primers(
    parts_seq['P11'],
    add_fwd=prefix + fusion_sites['C'],
    add_rev=suffix + fusion_sites['B'],
)

In [None]:
# The first eight part names are used as the promoter list.
promoters = parts['part_name'].iloc[:8].tolist()
promoters

In [None]:
# Design a primer pair for every promoter with each forward fusion site in the
# list below, always using fusion site 'B' on the reverse primer.
# The per-pair frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all accumulated rows on every iteration.
primer_frames = []
for p in tqdm(promoters):
    for s in ['A', 'C', 'D', 'E', 'F', 'G', 'H']:
        primer_pair = pd.DataFrame(primers(parts_seq[p], add_fwd=prefix+fusion_sites[s], add_rev=suffix+fusion_sites['B']))
        primer_pair['part_name'] = p
        primer_pair['scar'] = s + 'B'  # e.g. 'AB': forward site + reverse site
        primer_frames.append(primer_pair)

# pd.concat raises on an empty list, so fall back to the empty frame the
# original loop would have left behind when there are no promoters.
prom_primers = pd.concat(primer_frames) if primer_frames else pd.DataFrame()

In [None]:
# Replace the repeated per-pair row labels with a fresh 0..n-1 index.
prom_primers = prom_primers.reset_index(drop=True)

In [None]:
# Display the combined promoter primer table.
prom_primers