In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from Bio.Restriction import BsaI, BsmBI
from Bio import SeqIO
from Bio.Seq import Seq
import warnings
warnings.filterwarnings("ignore")

In [2]:
def reindex_ps1(plasmid):
    '''Rotate a plasmid sequence so that it starts at the PS1 annealing region.

    The sequence is treated as circular: everything before the first occurrence
    of the PS1 anchor is moved to the end, so the fragment of interest ends up in
    the middle of the string.

    Parameters
    ----------
    plasmid : str
        Sequence to rotate. The search is case-sensitive and the anchor is
        upper-case.

    Returns
    -------
    str
        The rotated sequence, or `plasmid` unchanged when the anchor is absent
        (e.g. fragments from PCR products, which do not have a PS1 region).
    '''

    new_start = plasmid.find('AGGGCGGCGGATTTGTCC')
    if new_start == -1:
        # str.find signals "not found" with -1; slicing with it would silently
        # rotate the sequence by one base instead of leaving it alone.
        return plasmid
    return plasmid[new_start:] + plasmid[:new_start]

In [3]:
# Read every FASTA collection into one table: one row per record, tagged with
# the collection ('level') it came from.
fastas = ['vectors', 'fragments', 'level-0', 'level-1', 'addgene']
plasmids = pd.DataFrame(
    [
        (record.id, str(record.seq), collection)
        for collection in fastas
        for record in SeqIO.parse('datasets/jump/{}.fasta'.format(collection), 'fasta')
    ],
    columns=['name', 'sequence', 'level'],
)
plasmids['sequence'] = plasmids['sequence'].str.upper()
# Everything except the 'fragments' collection is rotated to start at PS1.
plasmids.loc[plasmids['level'].ne('fragments'), 'sequence'] = plasmids['sequence'].apply(reindex_ps1)
plasmids

Unnamed: 0,name,sequence,level
0,pJUMP18-Uac,AGGGCGGCGGATTTGTCCTACTCAGGAGAGCGTTCACCGACAAACA...,vectors
1,pJUMP26-1A(sfGFP),AGGGCGGCGGATTTGTCCTACTCAGGAGAGCGTTCACCGACAAACA...,vectors
2,pJUMP29-1A(sfGFP),AGGGCGGCGGATTTGTCCTACTCAGGAGAGCGTTCACCGACAAACA...,vectors
3,pJUMP29-1B(sfGFP),AGGGCGGCGGATTTGTCCTACTCAGGAGAGCGTTCACCGACAAACA...,vectors
4,pJUMP29-1B*(sfGFP),AGGGCGGCGGATTTGTCCTACTCAGGAGAGCGTTCACCGACAAACA...,vectors
5,pJUMP29-1C(sfGFP),AGGGCGGCGGATTTGTCCTACTCAGGAGAGCGTTCACCGACAAACA...,vectors
6,pJUMP29-1C*(sfGFP),AGGGCGGCGGATTTGTCCTACTCAGGAGAGCGTTCACCGACAAACA...,vectors
7,pJUMP29-1D'(sfGFP),AGGGCGGCGGATTTGTCCTACTCAGGAGAGCGTTCACCGACAAACA...,vectors
8,pJUMP29-1D(sfGFP),AGGGCGGCGGATTTGTCCTACTCAGGAGAGCGTTCACCGACAAACA...,vectors
9,pJUMP29-1E'(sfGFP),AGGGCGGCGGATTTGTCCTACTCAGGAGAGCGTTCACCGACAAACA...,vectors


In [4]:
# Short labels -> full plasmid names, for the vectors and parts obtained from Addgene.
# `mapping['UAC']` selects the acceptor vector for the level-0 assemblies, and the
# labels in the level-1 assembly plan are translated through this dict; labels that
# are not listed here are resolved by prefixing them with 'pJ0-' instead.
mapping = {
    # pJUMP18 acceptor
    'UAC': 'pJUMP18-Uac',
    # pJUMP29 vectors
    '1A': 'pJUMP29-1A(sfGFP)',
    '1B': 'pJUMP29-1B(sfGFP)',
    '1B*': 'pJUMP29-1B*(sfGFP)',
    '1C': 'pJUMP29-1C(sfGFP)',
    '1C*': 'pJUMP29-1C*(sfGFP)',
    '1Dp': 'pJUMP29-1D\'(sfGFP)',
    '1D': 'pJUMP29-1D(sfGFP)',
    '1Ep': 'pJUMP29-1E\'(sfGFP)',
    '1E': 'pJUMP29-1E(sfGFP)',
    '1F': 'pJUMP29-1F(sfGFP)',
    # pJUMP49 vectors
    '2A': 'pJUMP49-2A(sfGFP)',
    '2B': 'pJUMP49-2B(sfGFP)',
    '2B*': 'pJUMP49-2B*(sfGFP)',
    '2C': 'pJUMP49-2C(sfGFP)',
    '2C*': 'pJUMP49-2C*(sfGFP)',
    '2Dp': 'pJUMP49-2D\'(sfGFP)',
    '2D': 'pJUMP49-2D(sfGFP)',
    '2E': 'pJUMP49-2E(sfGFP)',
    # parts carried in pJUMP18 / pJUMP19 plasmids
    'B0033_RN': 'pJUMP18-B0033-MV_RN',
    'B0033_R': 'pJUMP18-B0033_R',
    'B0034_RN': 'pJUMP18-B0034-MV_RN',
    'B0034_R': 'pJUMP18-B0034_R',
    'mCherry_O': 'pJUMP18-mCherry_O',
    'sGFP_O': 'pJUMP18-sfGFP_O',
    'B0015_CT': 'pJUMP19-B0015_CT',
    'B0015_T': 'pJUMP19-B0015_T',
    'J100_P': 'pJUMP19-23100_P',
}

In [5]:
def get_sites(part, odd_level=True, vector=False):
    '''Locate the two 4-base cut windows of a part and extract the released piece.

    The enzyme is BsaI when `odd_level` is true and BsmBI otherwise. The left
    window starts one base after the first occurrence of the enzyme's
    recognition site; the right window starts five bases before the first
    occurrence of the reverse-complemented site.

    Parameters
    ----------
    part : str
        Sequence to inspect.
    odd_level : bool
        Selects BsaI (True) or BsmBI (False).
    vector : bool
        When True the returned piece wraps around the end of the string
        (from the left window to the end, then from the start through the
        right window); when False it is the stretch between the two windows.

    Returns
    -------
    tuple of (str, str, str)
        `(left_cut, right_cut, fragment)`; `fragment` begins with `left_cut`
        and ends with `right_cut`.

    NOTE(review): a missing site is not detected. `str.find` returns -1 in that
    case and the windows are computed from that offset without raising.
    '''

    enzyme = BsaI if odd_level else BsmBI
    forward_site = enzyme.site
    reverse_site = str(Seq(forward_site).reverse_complement())

    # Both orientations use the same window arithmetic; only the way the
    # released piece is sliced depends on `vector`.
    left_idx = part.find(forward_site) + len(forward_site) + 1
    right_idx = part.find(reverse_site) - 5
    left_cut = part[left_idx:left_idx + 4]
    right_cut = part[right_idx:right_idx + 4]

    if vector:
        fragment = part[left_idx:] + part[:right_idx + 4]
    else:
        fragment = part[left_idx:right_idx + 4]

    return left_cut, right_cut, fragment

#part_name_u, get_sites(part_seq_u, odd_level=False, vector=True)

Level 0.

In [7]:
# Split the plasmid table by collection and build name -> sequence lookups.
fragments = plasmids.loc[plasmids['level'].eq('fragments')]
fragments_map = fragments.set_index('name')['sequence'].to_dict()
vectors = plasmids.loc[plasmids['level'].eq('vectors')]
vectors_map = vectors.set_index('name')['sequence'].to_dict()
# Acceptor vector used for the level-0 assemblies.
uac = vectors_map[mapping['UAC']]

def assemble_lvl_0(fragments, uac):
    '''Simulate inserting each fragment into the acceptor vector.

    Both the acceptor and the fragments are processed with
    `get_sites(..., odd_level=False)`. A fragment is assembled only when its
    left cut equals the acceptor's right cut and its right cut equals the
    acceptor's left cut; other fragments are skipped without a message.

    Parameters
    ----------
    fragments : dict
        Fragment name -> sequence.
    uac : str
        Sequence of the acceptor vector.

    Returns
    -------
    pandas.DataFrame
        Columns `name` and `sequence`, one row per compatible fragment, with
        the assembled sequence passed through `reindex_ps1`.
    '''

    # The acceptor is the same for every fragment, so process it once.
    uac_left, uac_right, uac_backbone = get_sites(uac, odd_level=False, vector=True)

    constructs = []
    for name, sequence in fragments.items():
        frag_left, frag_right, insert = get_sites(sequence, odd_level=False, vector=False)

        if uac_right == frag_left and uac_left == frag_right:
            # Each piece ends with the 4 bases the next piece starts with;
            # trimming them keeps every junction from being written twice.
            assembly = reindex_ps1(insert[:-4] + uac_backbone[:-4])
            constructs.append((name, assembly))

    return pd.DataFrame(constructs, columns=['name', 'sequence'])

lvl0 = assemble_lvl_0(fragments_map, uac)
# Keep only the new promoter constructs. The explicit copy makes `new_promoters`
# an independent frame, so the column assignments below are not writes into a
# filtered slice of `lvl0` (the pattern behind SettingWithCopyWarning).
new_promoters = lvl0[lvl0['name'].isin(['PBAD-RiboJ', 'PLuxB-RiboJ', 'PSalTTC-RiboJ', 'PBetI-RiboJ', 'PTac-RiboJ'])].copy()
new_promoters['name'] = 'pJ0-' + new_promoters['name'] + '_P'
new_promoters['size'] = new_promoters['sequence'].apply(len)
new_promoters.to_csv('datasets/jump/level-0-outputs.csv', index=False)

In [12]:
# Load the level-1 plan and resolve every distinct part/vector label to a sequence.
assembly_plan = pd.read_csv('datasets/jump/level-1-assembly.csv')
slot_columns = ['promoter', 'rbs', 'cds', 'terminator', 'vector']
lvl1_assembly = assembly_plan[slot_columns].melt()
# Labels listed in `mapping` take the mapped plasmid name; all others are
# named by prefixing the label with 'pJ0-'.
lvl1_assembly['name'] = lvl1_assembly['value'].map(lambda label: mapping.get(label, ''))
needs_prefix = lvl1_assembly['name'].eq('')
lvl1_assembly.loc[needs_prefix, 'name'] = 'pJ0-' + lvl1_assembly['value']
lvl1_assembly = (
    lvl1_assembly
    .drop_duplicates()
    .reset_index(drop=True)
    .merge(plasmids, on='name', how='left')
)
# Plan label -> sequence (missing for names with no match in `plasmids`).
assembly_plan_map = dict(zip(lvl1_assembly['value'], lvl1_assembly['sequence']))

In [13]:
def assemble_lvl_1(assembly_plan, mapping):
    '''Simulate the level-1 assemblies listed in an assembly plan.

    Each row of the plan is read by position: column 1 is the construct name,
    the last column is the vector label, and the columns in between are the
    part labels in assembly order (column 0 is not used). Every label is
    looked up in `mapping` and processed with `get_sites(..., odd_level=True)`.

    A row is assembled only when the cuts chain into a circle: each piece's
    right cut equals the next piece's left cut, and the last part's right cut
    equals the vector's left cut. Rows that do not chain are reported with
    `print('Error at', name)` and left out of the result.

    Parameters
    ----------
    assembly_plan : pandas.DataFrame
        One row per construct, laid out as described above.
    mapping : dict
        Part/vector label -> sequence.

    Returns
    -------
    pandas.DataFrame
        Columns `name` ('pJ1-' + construct name), `sequence` (passed through
        `reindex_ps1`) and `length`.
    '''

    constructs = []
    for _, entry in assembly_plan.iterrows():

        # Rows are indexed by column label, so positional access has to go
        # through .iloc; plain integer keys are label lookups in newer pandas.
        name = entry.iloc[1]
        vector = entry.iloc[-1]
        parts = entry.iloc[2:-1].tolist()

        fragments = [get_sites(mapping[vector], odd_level=True, vector=True)]
        fragments.extend(
            get_sites(mapping[part], odd_level=True, vector=False) for part in parts
        )

        left_cuts = [fragment[0] for fragment in fragments]
        right_cuts = [fragment[1] for fragment in fragments]

        # Shift the left cuts by one so position i holds the left cut of the
        # piece that follows piece i (wrapping back to the vector).
        next_left_cuts = left_cuts[1:] + left_cuts[:1]

        if next_left_cuts == right_cuts:
            # Trim the trailing 4 bases of every piece: they are repeated at
            # the start of the piece that follows.
            assembly = reindex_ps1(''.join(fragment[2][:-4] for fragment in fragments))
            constructs.append(('pJ1-' + name, assembly))
        else:
            print('Error at', name)

    df_constructs = pd.DataFrame(constructs, columns=['name', 'sequence'])
    df_constructs['length'] = df_constructs['sequence'].apply(len)
    return df_constructs
        
# Simulate every level-1 construct in the plan, then persist the result.
lvl1_constructs = assemble_lvl_1(assembly_plan, assembly_plan_map)
lvl1_constructs.to_csv('datasets/jump/level-1-simulate.csv', index=False)