In [94]:
import sys
import numpy as np
import re

In [95]:
# Read in the gene file +/- 1000 base pairs and break it up into 3 pieces
def read_gene_plus_string(Gene_plus):

    # # Import Gene -+1000 base pairs - clean in the same way
    # with open(genefile, 'r') as file:
    #     Gene_plus = file.read().replace('\n', '')

    # Clean away numbers and spaces
    Gene_plus = Gene_plus.replace(' ', '') # spaces
    remove_digits = str.maketrans('', '', '0123456789')
    Gene_plus = Gene_plus.translate(remove_digits)   # numbers
    Left_of_gene = Gene_plus[0:1000] # 1000 bp's to the left of the Gene
    Right_of_gene = Gene_plus[(len(Gene_plus)-1000):] # 1000 bp's to the right of the Gene (includes 3' UTR segment)
    Gene = Gene_plus[1000:(len(Gene_plus)-1000)]
    
    return (Left_of_gene, Gene, Right_of_gene)

In [96]:
# Read in and process the enzyme list
def read_enzyme_list(enzymefile):

    with open(enzymefile) as file:
        lines=file.readlines()
        enzymes=[line.rstrip() for line in lines]
        
    rsitelist = []
    enamelist = []
    if (len(enzymes) % 2) == 1:
        print('Enzyme list has an odd number of lines. Enzyme list should be a list of enzyme names and restriction sites in each line.')
    else:
        for linei in range(1, len(enzymes), 2):
            rsite = enzymes[linei]
            ename = enzymes[linei-1]
            # check length of restriction site
            if len(rsite) == 6 or len(rsite) == 8:
                # check whether any non-ACGT characters
                if len(re.sub('[ACGT]', '', rsite)) == 0:
                    #check whether palindromic so can focus on only one strand
                    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
                    reverse_complement = "".join(complement.get(base, base) for base in reversed(rsite))
                    if rsite==reverse_complement:
                        # add name only if enzyme already in the list
                        rsiteexists=0 
                        for linej in range(len(rsitelist)):
                            if rsitelist[linej] == rsite:
                                rsiteexists=1
                                enamelist[linej] = enamelist[linej]+', '+ename
                                break
                        #add new entry if not yet there
                        if rsiteexists==0:
                            rsitelist.append(rsite)
                            enamelist.append(ename)
    return (np.array(rsitelist), np.array(enamelist))

In [97]:
# genefile   = './Chris_code/clb2_pm_1000.txt'
# enzymefile = './Chris_code/raw_enzyme_list.txt'
# minhomology = 100
# alpha = 1

In [98]:
# Left_of_gene, Gene, Right_of_gene = read_gene_string(genefile)
# rsitelist, enamelist = read_enzyme_list(enzymefile)

### Loop

#### 3.

In [99]:
# gene_rsitelist_sorted, gene_enamelist_sorted, gene_rsite_position_list_sorted = right_search(Gene, rsitelist, enamelist, minhomology=100)
# gene_rsitelist_sorted, gene_enamelist_sorted, gene_rsite_position_list_sorted

#### 4. and 5.

In [100]:
# # 4. and 5.

# ind = 1

# rsite0 = gene_rsitelist_sorted[ind]
# rsite0_pos = gene_rsite_position_list_sorted[ind]
# ename0 = gene_enamelist_sorted[ind]
# X = Gene[rsite0_pos-minhomology:-3]
# alphaX = Right_of_gene[:int(alpha*len(X))]


# X.find(rsite0)==minhomology

In [101]:
# linker = load_linker('')
# FPG = load_FPG('')

# full_sequence = X + linker + FPG + alphaX
# if full_sequence.count(rsite0) != 1:
#     print('rsite is not unique in the gene')

#### 7.

In [102]:
# MCS = load_MCS('')
# # Search for the most outer CS from MCS that are not in 4. 

# rsite1 = ''
# rsite2 = ''

# msc_rsitelist_sorted_right, _, _ = left_search(MCS, rsitelist, enamelist)
# for i, rsite in enumerate(msc_rsitelist_sorted_right):
#     if rsite not in full_sequence:
#         rsite1 = rsite
#         break

# msc_rsitelist_sorted_left, _, _ = right_search(MCS, rsitelist, enamelist)
# for i, rsite in enumerate(msc_rsitelist_sorted_left):
#     if rsite not in full_sequence:
#         rsite2 = rsite
#         break

# if not rsite1 or not rsite2:
#     print('good rsites not found in MCS')


#### 8.

In [103]:
def assemble_plasmid(backbone_no_MCS_5, backbone_no_MCS_3, sequence): ## TODO CHECK if backbone_no_MCS_5 + sequence + backbone_no_MCS_3 + backbone_no_MCS_5 bad
    return backbone_no_MCS_5 + sequence + backbone_no_MCS_3

# FULL CODE

In [105]:
def read_from_fsa(fsa_file_path):
    if not fsa_file_path:
        return '',''
    try:
        with open(fsa_file_path) as f:
            name = f.readline()[1:-1]
            code = f.read().replace('\n', '')
            return name, code.upper()
    except Exception as e:
        print('Unable to read:', fsa_file_path,e)

# read_from_fsa('./Test_examples/pETUL_backbone.fsa')

In [124]:
# 3.

def rsite_search_func(Gene, rsitelist, enamelist, start_end, min_homology=0, 
                max_homology=1000):

    print(start_end)

    if start_end == 5:
        Gene_cut = Gene[min_homology:max_homology]
        rsite_position_list = np.array([Gene_cut.find(rsite) for rsite in rsitelist])
        sorted_inds = np.argsort(rsite_position_list)
        
    elif start_end == 3:
        Gene_cut = Gene[len(Gene)-max_homology:len(Gene)-min_homology]
        rsite_position_list = np.array([Gene_cut.rfind(rsite) for rsite in rsitelist]) 
        sorted_inds = np.argsort(-rsite_position_list)


    # remove not matched rsites
    rsite_position_list = rsite_position_list[sorted_inds]
    matched_inds = (rsite_position_list != -1)
    rsite_position_list = rsite_position_list[matched_inds]
    if start_end == 5:
        rsite_position_list += min_homology

    # apply sort and remove inds to rsitelist and enamelist
    rsitelist = rsitelist[sorted_inds][matched_inds]
    enamelist = enamelist[sorted_inds][matched_inds]

    return rsitelist, enamelist, rsite_position_list

In [159]:
def generate_full_gene_sequence(Gene, Left_of_gene, Right_of_gene, Gene_side, rsite_position, rsite, minhomology, alpha, linker, FPG):

    #TODO 5p i 3p with respect to the main gene sequence

    if Gene_side not in [3,5]:
        raise ValueError('Wrong Gene_side value')

    if Gene_side == 5:
        X = Gene[rsite_position-minhomology:-3] #We retrieve the 5p end of the gene without the stop codon
        alphaX = Right_of_gene[:int(alpha*len(X))]
        full_sequence = X + linker + FPG + alphaX + 'TGA' #we add the stop codon at the end
    elif Gene_side == 3:
        # TODO check
        X = Gene[:rsite_position+len(rsite)+minhomology]
        alphaX = Left_of_gene[-int(alpha*len(X)):]
        full_sequence = alphaX + FPG + linker + X 

    return full_sequence

def generate_full_3UTR_sequence(Gene, Right_of_gene, rsite_position, rsite, minhomology, alpha, linker, FPG):

    return generate_full_gene_sequence(
        Gene=Right_of_gene,
        Left_of_gene=Gene,
        Right_of_gene='', 
        Gene_side=3, 
        rsite_position=rsite_position, 
        rsite=rsite, 
        minhomology=minhomology, 
        alpha=alpha, 
        linker=linker, 
        FPG=FPG
    )

def generate_full_5UTR_sequence(Gene, Left_of_gene, rsite_position, rsite, minhomology, alpha, linker, FPG):

    return generate_full_gene_sequence(
        Gene=Left_of_gene,
        Right_of_gene=Gene, 
        Left_of_gene='',
        Gene_side=5, 
        rsite_position=rsite_position, 
        rsite=rsite, 
        minhomology=minhomology, 
        alpha=alpha, 
        linker=linker, 
        FPG=FPG
    )

def generate_full_delete_sequence(Left_of_gene, Right_of_gene, rsite_side, rsite_position, rsite, minhomology, alpha):

    #TODO 5p i 3p with respect to the main gene sequence

    if rsite_side not in [3,5]:
        raise ValueError('Wrong rsite_side value')
    if rsite_side == 5:
        X = Left_of_gene[rsite_position-minhomology:]
        alphaX = Right_of_gene[-int(alpha*len(X)):]
        full_sequence = X + alphaX
    elif rsite_side == 3:
        X = Right_of_gene[:rsite_position+len(rsite)+minhomology]
        alphaX = Left_of_gene[-int(alpha*len(X)):]
        full_sequence = alphaX + X

    return full_sequence


def find_compatible_MCS_rsites(MCS, rsitelist, enamelist, full_sequences, backbone_no_MCS_5, backbone_no_MCS_3):

    rsite1 = ''
    rsite2 = ''

    backbone_no_MCS = backbone_no_MCS_3 + backbone_no_MCS_5

    msc_rsitelist_sorted_right, _, _ = rsite_search_func(MCS, rsitelist, enamelist, start_end=5)
    for i, rsite in enumerate(msc_rsitelist_sorted_right):
        if not any(rsite in full_sequence for full_sequence in full_sequences) and not rsite in backbone_no_MCS:
            rsite1 = rsite
            break

    msc_rsitelist_sorted_left, _, _ = rsite_search_func(MCS, rsitelist, enamelist, start_end=3)
    for i, rsite in enumerate(msc_rsitelist_sorted_left):
        if not any(rsite in full_sequence for full_sequence in full_sequences) and not rsite in backbone_no_MCS:
            rsite2 = rsite
            break
        
    return rsite1, rsite2

In [160]:
def rsite_search(Gene, rsitelist, enamelist, modality, alpha, min_homology=0, 
                max_homology=1000, left_of_Gene='', right_of_Gene='', FPGs=[], linker=''):

    if modality not in [0,3,5]:
        raise ValueError('Wrong starting_position value')

    if modality == 5:

        rsitelist_gene, enamelist_gene, rsite_position_list_gene = rsite_search_func(
            Gene, rsitelist, enamelist, modality, min_homology)
        rsitelist_5UTR, enamelist_5UTR, rsite_position_list_35TR = rsite_search_func(
            left_of_Gene, rsitelist, enamelist, 3, min_homology)

        full_sequences_gene = [
            [
                generate_full_gene_sequence(Gene, left_of_Gene, right_of_Gene,
                                            3, rsite_pos, rsite, min_homology, alpha, linker, FPG)
                for FPG in FPGs
            ]
            for rsite_pos, rsite in zip(rsite_position_list_gene, rsitelist_gene)
        ]
        full_sequences_5UTR = [
            [
                generate_full_5UTR_sequence(Gene, right_of_Gene, rsite_pos, rsite, min_homology, alpha, linker, FPG)
                for FPG in FPGs
            ]
            for rsite_pos, rsite in zip(rsite_position_list_35TR, rsitelist_5UTR)
        ]
        

        return (
            np.append(rsitelist_gene, rsitelist_5UTR),
            np.append(enamelist_gene, enamelist_5UTR),
            np.append(rsite_position_list_gene, rsite_position_list_35TR),
            [3]*len(rsitelist_gene)+[5]*len(rsitelist_5UTR),
            full_sequences_gene+full_sequences_5UTR
        )

    elif modality == 3:

        rsitelist_gene, enamelist_gene, rsite_position_list_gene = rsite_search_func(
            Gene, rsitelist, enamelist, modality, min_homology)
        rsitelist_3UTR, enamelist_3UTR, rsite_position_list_3UTR = rsite_search_func(
            right_of_Gene, rsitelist, enamelist, 5, min_homology)

        full_sequences_gene = [
            [
                generate_full_gene_sequence(Gene, left_of_Gene, right_of_Gene,
                                            5, rsite_pos, rsite, min_homology, alpha, linker, FPG)
                for FPG in FPGs
            ]
            for rsite_pos, rsite in zip(rsite_position_list_gene, rsitelist_gene)
        ]
        full_sequences_3UTR = [
            [
                generate_full_3UTR_sequence(Gene, right_of_Gene, rsite_pos, rsite, min_homology, alpha, linker, FPG)
                for FPG in FPGs
            ]
            for rsite_pos, rsite in zip(rsite_position_list_3UTR, rsitelist_3UTR)
        ]

        return (
            np.append(rsitelist_gene, rsitelist_3UTR),
            np.append(enamelist_gene, enamelist_3UTR),
            np.append(rsite_position_list_gene, rsite_position_list_3UTR),
            [5]*len(rsitelist_gene)+[3]*len(rsitelist_3UTR),
            full_sequences_gene+full_sequences_3UTR
        )

    else:
        rsitelist_5, enamelist_5, rsite_position_list_5 = rsite_search_func(left_of_Gene, rsitelist,
                                                                            enamelist, 3, min_homology)
        rsitelist_3, enamelist_3, rsite_position_list_3 = rsite_search_func(right_of_Gene, rsitelist,
                                                                            enamelist, 5, min_homology)
        full_sequences_5 = [
            [
                generate_full_delete_sequence(left_of_Gene, right_of_Gene,
                                              5, rsite_pos, rsite, min_homology, alpha)
            ]
            for rsite_pos, rsite in zip(rsite_position_list_5, rsitelist_5)
        ]
        full_sequences_3 = [
            [
                generate_full_delete_sequence(left_of_Gene, right_of_Gene,
                                              3, rsite_pos, rsite, min_homology, alpha)
            ]
            for rsite_pos, rsite in zip(rsite_position_list_3, rsitelist_3)
        ]

        if len(rsitelist_5) and len(rsitelist_3) and \
            len(left_of_Gene)-rsite_position_list_5[0]>rsite_position_list_3[0]:
            return (
                np.append(rsitelist_3, rsitelist_5),
                np.append(enamelist_3, enamelist_5),
                np.append(rsite_position_list_3, rsite_position_list_5),
                [3]*len(rsitelist_3)+[5]*len(rsitelist_5),
                full_sequences_3+full_sequences_5
            )
        else:
            return (
                np.append(rsitelist_5, rsitelist_3),
                np.append(enamelist_5, enamelist_3),
                np.append(rsite_position_list_5, rsite_position_list_3),
                [5]*len(rsitelist_5)+[3]*len(rsitelist_3),
                full_sequences_5+full_sequences_3
            )


In [161]:

# INPUTS
# 1. Plasmid backbone (pETUL): '*_backbone.fsa'
# 2. Starting and ending index of MCS in the plasmid backbone (nucleotides counted from 1)
# 3. min_homology: minimal homology in bp that should be included around the cutsite for integration into the yeast genome
# 4. max_homology: maximal length in bp of the part used for the integration into the yeast genome 
# (this is used as a stopping criteria to search for cutsite, should be < 1000 bp).
# 4. alpha: relative length of the homology part used for integration and homology part used for pop-out
# 5. Gene to be tagged or deleted, including 1000 bp upstream of starting codon and 1000 bp downstream of stop: '*_gene.fsa'
# 6. Linker sequence: '*_linker.fsa'
# 7. A parameter that described the choice between deleting the gene (0), tagging at the 5' end (5) or tagging at the 3' end (3)
# 8. In case of tagging, coding sequences of FPG (including the stop codon) should be given as a separate files ending with: '*_FPG' 
# (or maybe a single file with fasta, whatever is easier)
# 9. A list of commercially available enzymes and their cutsites 

def main(backbone_path, MCS_start_ind, MCS_end_ind, min_homology, max_homology, alpha, 
        Gene_path, linker_path, modality, enzyme_path, FPG_paths=[]):
        
    # Read all the files (1. and 2.)
    _, backbone = read_from_fsa(backbone_path)
    print('Backbone len:', len(backbone))

    MCS = backbone[MCS_start_ind-1:MCS_end_ind]
    backbone_no_MCS_5 = backbone[0:MCS_start_ind-1]
    backbone_no_MCS_3 = backbone[MCS_end_ind:]
    print('MCS len', len(MCS))

    _, linker = read_from_fsa(linker_path)
    print('Linker len:', len(linker))
    _, Gene_plus = read_from_fsa(Gene_path)
    print('Gene plus len:', len(Gene_plus))

    left_of_Gene, Gene, right_of_Gene = read_gene_plus_string(Gene_plus)
    print('Gene len:', len(Gene))

    FPGs = [read_from_fsa(path)[1] for path in FPG_paths] 
    print('FPG len:', len(FPGs))

    rsitelist, enamelist = read_enzyme_list(enzyme_path)


    # 3.
    gene_rsitelist_sorted, gene_enamelist_sorted, gene_rsite_position_list_sorted, rs_places, full_sequences_per_rsite = rsite_search(
        Gene, rsitelist, enamelist, modality, alpha, min_homology, max_homology, left_of_Gene, right_of_Gene, FPGs, linker)

    print(gene_rsitelist_sorted, gene_enamelist_sorted, gene_rsite_position_list_sorted, rs_places)


    compatible_restriction_sites = []
    optimal_plasmid = ''
    MCS_rsites = []

    for i in range(len(gene_rsitelist_sorted)):

        # print('\n')

        # 4. and 5.
        rsite0 = gene_rsitelist_sorted[i]
        rsite0_pos = gene_rsite_position_list_sorted[i]
        ename0 = gene_enamelist_sorted[i]
        rs_place = rs_places[i]
        full_sequences = full_sequences_per_rsite[i]
        print(i, rsite0, ename0, 'side:',rs_place)
            
        # check if all full sequences have only one rsite0  
        if any([(full_sequence.count(rsite0) != 1) for full_sequence in full_sequences]):
            print('rsite {} is not unique in the full sequence'.format(rsite0))
            continue
            

        # 7.
        # Search for the most outer CS from MCS that are not in 4. 
        rsite1, rsite2 = find_compatible_MCS_rsites(MCS, rsitelist, enamelist, full_sequences, backbone_no_MCS_5, backbone_no_MCS_3)

        if not rsite1 or not rsite2:
            print('good rsites not found in MCS, rsites: ', rsite1, rsite2)
            # If not potential cutsites were find in MCS abort the program (finding another cutsite for gene integration would make the sequence for cloning longer)
            break


        # 8. 
        full_plasmid = assemble_plasmid(backbone_no_MCS_5, backbone_no_MCS_3, full_sequences[0])
            
        if full_plasmid.count(rsite0) != 1:
            print('bad backbone for rsite0', rsite0)

        else:
            if len(compatible_restriction_sites) == 0:
                optimal_plasmid = full_plasmid
                MCS_rsites = (rsite1, rsite2)
            compatible_restriction_sites.append((ename0, rsite0, rsite0_pos,rs_place))

    return optimal_plasmid, compatible_restriction_sites, MCS_rsites


In [162]:
backbone_path = './Test_examples/pETUL_backbone.fsa'
linker_path = './Test_examples/long_linker.fsa'

Gene_path = './Test_examples/CLB2_3p_labeling/CLB2_pm_1000.fsa'
FPG_paths = [
    './Test_examples/CLB2_3p_labeling/mCherry_FPG.fsa',
    './Test_examples/CLB2_3p_labeling/ymNeonGreen_FPG.fsa',
    './Test_examples/CLB2_3p_labeling/ymTq2_FPG.fsa'
]
modality = 3

# Gene_path = './Test_examples/CLB5_3p_labeling/CLB5_pm_1000_gene.fsa'
# FPG_paths = [
#     './Test_examples/CLB5_3p_labeling/mCherry_FPG.fsa',
#     './Test_examples/CLB5_3p_labeling/ymNeonGreen_FPG.fsa',
#     './Test_examples/CLB5_3p_labeling/ymTq2_FPG.fsa'
# ]
# gene_side = 5

# Gene_path = './Test_examples/GAL4_deletion/GAL4_pm_1000_gene.fsa'
# FPG_paths = ['']
# modality = 0

# Gene_path = './Test_examples/HTB2_3p_labeling/HTB2_pm_1000.fsa'
# FPG_paths = [
#     './Test_examples/HTB2_3p_labeling/mCherry_FPG.fsa',
#     './Test_examples/HTB2_3p_labeling/sfGFP_FPG.fsa'
# ]
# gene_side = 5

enzymefile = './Chris_code/raw_enzyme_list.txt'

optimal_plasmid, compatible_restriction_sites, mcs = main(backbone_path, 1, 108, 70, 1000, 1.4, Gene_path, linker_path, 
                                    modality, enzymefile, FPG_paths)

print(len(optimal_plasmid), mcs)
compatible_restriction_sites


Backbone len: 6598
MCS len 108
Linker len: 57
Gene plus len: 3476
Gene len: 1476
FPG len: 3
3
5
['AAGCTT' 'TCTAGA' 'TCATGA' 'CAATTG' 'GTATAC' 'CAGCTG' 'GATATC' 'TTCGAA'
 'TTTAAA' 'AATATT' 'AGATCT' 'TTTAAA' 'TTCGAA' 'ATTAAT' 'TCCGGA' 'ATGCAT'
 'TTATAA' 'AATATT' 'GAATTC' 'TCATGA' 'ACTAGT' 'AGTACT' 'CTTAAG' 'TGTACA'
 'ATCGAT'] ['HindIII, HindIII-HFÂ®' 'XbaI' 'BspHI' 'MfeI, MfeI-HFÂ®'
 'BstZ17I, BstZ17I-HFÂ®' 'PvuII, PvuII-HFÂ®' 'EcoRV, EcoRV-HFÂ®' 'BstBI'
 'DraI' 'SspI, SspI-HFÂ®' 'BglII' 'DraI' 'BstBI' 'AseI' 'BspEI'
 'NsiI, NsiI-HFÂ®' 'PsiI' 'SspI, SspI-HFÂ®' 'EcoRI, EcoRI-HFÂ®' 'BspHI'
 'SpeI, SpeI-HFÂ®' 'ScaI-HFÂ®' 'AflII' 'BsrGI, BsrGI-HFÂ®' 'BspDI, ClaI'] [923 907 879 872 813 737 684 627 567 289 270 148 171 232 303 308 447 554
 719 723 734 737 749 791 938] [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
0 AAGCTT HindIII, HindIII-HFÂ® side: 5
5
3
1 TCTAGA XbaI side: 5
5
3
2 TCATGA BspHI side: 5
rsite TCATGA is not unique in the full sequence
3 CAATTG MfeI,

[('HindIII, HindIII-HFÂ®', 'AAGCTT', 923, 5),
 ('XbaI', 'TCTAGA', 907, 5),
 ('BglII', 'AGATCT', 270, 5),
 ('BspEI', 'TCCGGA', 303, 3),
 ('SpeI, SpeI-HFÂ®', 'ACTAGT', 734, 3)]

In [142]:
_, backbone = read_from_fsa('./Test_examples/pETUL_backbone.fsa')
MCS = backbone[0:108]
MCS.find('GGTACC')

0

In [255]:

read_from_fsa('./Test_examples/CLB2_3p_labeling/ymNeonGreen_FPG.fsa')[1].find('GGTACC')

105

In [2]:
def dna_to_protein(dna):
     #Translates DNA into a Protein. it truncates the 3' tail that doesn't make a full codon
     #STOP is denoted by '*'
     
	dna = dna.upper()

	genetic_code = {
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',                 
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
        'TAC':'Y', 'TAT':'Y', 'TAA':'*', 'TAG':'*',
        'TGC':'C', 'TGT':'C', 'TGA':'*', 'TGG':'W',
    }
	protein = ''


	if(len(dna)%3 != 0):
		dna = dna[: -(len(dna)%3)]

	for i in range(0, len(dna), 3):
		code = dna[i:i+3]
		if(code in genetic_code.keys()):
		    protein += genetic_code[code]
		else:
			protein = ''
			break
	
	return protein


In [3]:
def  find_additional_cutsites(plasmid, rsitelist, enamelist):
#this function finds all appropropriate cutsites that could be added between different pieces of the insert in the plasmid (e.g. between the gene and the linker or between the linker and the FPG etc)
#a good cutsite has to fullfill three criteria:
#1. Divisible by 3 (this could be circuimvented by accomidating the linker length)
#2. Does not cut the final plasmid
#3. Does not introduce a stop codon

    good_rsite_list = []
    good_enzyme_list = []
    for rsite, enzyme in zip(rsitelist, enamelist):
        if len(rsite)%3 == 0 and plasmid.count(rsite) == 0 and dna_to_protein(rsite).count('*') == 0:
            good_rsite_list.append(rsite)
            good_enzyme_list.append(enzyme)

    return good_rsite_list, good_enzyme_list

In [4]:
dna_to_protein('TGCTACtga').count('*')

1