In [14]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from Bio.Restriction import BsaI, BsmBI
from Bio import SeqIO
from Bio.Seq import Seq
import warnings
warnings.filterwarnings("ignore")

In [117]:
def reindex_ps1(plasmid):
    '''Reindex plasmids to start from the annealing region of PS1 so the fragment will not end at the end of the sequence.
    This will not apply for fragments as they do not have PS1 region'''
    
    new_start = plasmid.find('AGGGCGGCGGATTTGTCC')
    return plasmid[new_start:] + plasmid[:new_start]

In [116]:
#import fasta files
fastas = ['vectors', 'fragments', 'level-0', 'level-1']
plasmids = pd.DataFrame([(p.id, str(p.seq), fasta) for fasta in fastas \
                        for p in list(SeqIO.parse('datasets/jump/{}.fasta'.format(fasta), 'fasta'))], \
                        columns=['name', 'sequence', 'level'])
plasmids['sequence'] = plasmids['sequence'].str.upper()
plasmids.loc[plasmids['level']!='fragments', 'sequence'] = plasmids['sequence'].apply(reindex_ps1)
plasmids

Unnamed: 0,name,sequence,level
0,pJUMP18-Uac,TTAATTAAGGAGTTTTGCAGGTGCACCTGCTTTTCGCTGAATTCGC...,vectors
1,pJUMP26-1A(sfGFP),TTAATTAAGGAGTTTTGCAGGTGCCTTGGAACACCTGCTTTTCGCT...,vectors
2,pJUMP29-1A(sfGFP),TTAATTAAGGAGTTTTGCAGGTGCCTTGGAACACCTGCTTTTCGCT...,vectors
3,pJUMP29-1B(sfGFP),TTAATTAAGGAGTTTTGCAGGTGCCTTGGAACACCTGCTTTTCGCT...,vectors
4,pJUMP29-1B*(sfGFP),ACCACCGCTACCAGCGGTGGTTTGTTTGCCGGATCAAGAGCTACCA...,vectors
5,pJUMP29-1C(sfGFP),TTAATTAAGGAGTTTTGCAGGTGCCTTGGAACACCTGCTTTTCGCT...,vectors
6,pJUMP29-1C*(sfGFP),TCTAGTGTAGCCGTAGTTAGGCCACCACTTCAAGAACTCTGTAGCA...,vectors
7,pJUMP29-1D'(sfGFP),TTAATTAAGGAGTTTTGCAGGTGCCTTGGAACACCTGCTTTTCGCT...,vectors
8,pJUMP29-1D(sfGFP),TTAATTAAGGAGTTTTGCAGGTGCCTTGGAACACCTGCTTTTCGCT...,vectors
9,pJUMP29-1E'(sfGFP),TGCTCAACGGGAATCCTGCTCTGCGAGGCTGGCCGTAGGCCGGCCC...,vectors


In [137]:
#map the name of vectors and parts obtained from addgenes
mapping = {
    'UAC': 'pJUMP18-Uac',
    '1A': 'pJUMP29-1A(sfGFP)',
    '1B': 'pJUMP29-1B(sfGFP)',
    '1B*': 'pJUMP29-1B*(sfGFP)',
    '1C': 'pJUMP29-1C(sfGFP)',
    '1C*': 'pJUMP29-1C*(sfGFP)',
    '1Dp': 'pJUMP29-1D\'(sfGFP)',
    '1D': 'pJUMP29-1D(sfGFP)',
    '1Ep': 'pJUMP29-1E\'(sfGFP)',
    '1E': 'pJUMP29-1E(sfGFP)',
    '1F': 'pJUMP29-1F(sfGFP)',
    '2A': 'pJUMP49-2A(sfGFP)',
    '2B': 'pJUMP49-2B(sfGFP)',
    '2B*': 'pJUMP49-2B*(sfGFP)',
    '2C': 'pJUMP49-2C(sfGFP)',
    '2C*': 'pJUMP49-2C*(sfGFP)',
    '2Dp': 'pJUMP49-2D\'(sfGFP)',
    '2D': 'pJUMP49-2D(sfGFP)',
    '2E': 'pJUMP49-2E(sfGFP)',
    'B0033_RN': 'pJUMP18-B0033-MV_RN',
    'B0033_R': 'pJUMP18-B0033_R',
    'B0034_RN': 'pJUMP18-B0034-MV_RN',
    'B0034_R': 'pJUMP18-B0034_R',
    'mCherry_O': 'pJUMP18-mCherry_O',
    'sGFP_O': 'pJUMP18-sGFP_O',
    'B0015_CT': 'pJUMP19-B0015_CT',
    'B0015_T': 'pJUMP19-B0015_T',
    'J100_P': 'pJUMP19-23100_P',
}

In [133]:
def get_sites(part, odd_level=True, vector=False):
    
    enz = BsaI if odd_level else BsmBI
    
    if vector:
        
        site = str(Seq(enz.site).reverse_complement())
        right_idx = part.find(site) - 5
        right_cut = part[right_idx: right_idx+4]

        site = enz.site
        left_idx = part.find(site) + len(site) + 1
        left_cut = part[left_idx: left_idx+4]
        
        fragment = part[left_idx:] + part[:right_idx+4]
        
    else:
        site = enz.site
        left_idx = part.find(site) + len(site) + 1
        left_cut = part[left_idx: left_idx+4]
        
        site = str(Seq(enz.site).reverse_complement())
        right_idx = part.find(site) - 5
        right_cut = part[right_idx: right_idx+4]
        
        fragment = part[left_idx: right_idx+4]
        
    return left_cut, right_cut, fragment

#part_name_u, get_sites(part_seq_u, odd_level=False, vector=True)

Level 0.

In [134]:
fragments = plasmids[plasmids['level']=='fragments']
uac = plasmids.iloc[:, 1][0]

for i, fragment in fragments.iterrows():
    
    uac_sites = get_sites(uac, odd_level=False, vector=True)
    frag_sites = get_sites(fragment[1], odd_level=False, vector=False)
    
    print('---', fragment[0])
    if (uac_sites[1]==frag_sites[0] and uac_sites[0]==frag_sites[1]):
        assembly = reindex_ps1(frag_sites[2][:-4] + uac_sites[2][:-4])
        
        print(assembly)

--- B0015
AGGGCGGCGGATTTGTCCTACTCAGGAGAGCGTTCACCGACAAACAACAGATAAAACGAAAGGCCCAGTCTTTCGACTGAGCCTTTCGTTTTATTTGATGCCTTTAATTAAGGAGTTTTGCAGGTGCACCTGCTTTTCGCTGAATTCGCGGCCGCTTCTAGAGGGTCTGCGATGTTTGGTCTCAGCTTCCAGGCATCAAATAAAACGAAAGGCTCAGTCGAAAGACTGGGCCTTTCGTTTTATCTGTTGTTTGTCGGTGAACGCTCTCTACTAGAGTCACACTGGCTCACCTTCGGGTGGGCCTTTCTGCGTTTATACGCTTGAGACCTTTCATCGCGACCTACTAGTAGCGGCCGCTGCAGGGAGTTGTCTTCGAAGACTTCGCTCTAGTCTTGGACTCCTGTTGATAGATCCAGTAATGACCTCAGAACTCCATCTGGATTTGTTCAGAACGCTCGGTTGCCGCCGGGCGTTTTTTATTGGTGAGAATCCAGGGGTCCCCAATAATTACGATTTAAATTAGTAGCCCGCCTAATGAGCGGGCTTTTTTTTAATTCCCCTATTTGTTTATTTTTCTAAATACATTCAAATATGTATCCGCTCATGAGACAATAACCCTGATAAATGCTTCAATAATATTGAAAAAGGAAGAGTATGAGCATTCAGCATTTTCGTGTGGCGCTGATTCCGTTTTTTGCGGCGTTTTGCCTGCCGGTGTTTGCGCATCCGGAAACCCTGGTGAAAGTGAAAGATGCGGAAGATCAACTGGGTGCGCGCGTGGGCTATATTGAACTGGATCTGAACAGCGGCAAAATTCTGGAATCTTTTCGTCCGGAAGAACGTTTTCCGATGATGAGCACCTTTAAAGTGCTGCTGTGCGGTGCGGTTCTGAGCCGTGTGGATGCGGGCCAGGAACAACTGGGCCGTCGTATTCATTATAGCCAGAACGATCTGGTGGAATATAGCCCGGTGACCGAAAAACATCTGACC