In [1]:
import sys
import numpy as np
import re

In [2]:
# Read in the gene file +/- 1000 base pairs and break it up into 3 pieces
def read_gene_string(genefile, flank=1000):
    """Read a gene sequence with flanking regions and split it into three pieces.

    The file content is cleaned by dropping every whitespace character
    (spaces, newlines, tabs, ...) and every decimal digit, so position-numbered
    sequence listings can be used directly.

    Parameters
    ----------
    genefile : str
        Path of the text file holding ``flank`` bp + gene + ``flank`` bp.
    flank : int, optional
        Number of base pairs on each side of the gene (default 1000).

    Returns
    -------
    tuple of str
        ``(Left_of_gene, Gene, Right_of_gene)``.

    Raises
    ------
    ValueError
        If ``flank`` is negative or the cleaned sequence is shorter than
        ``2 * flank`` (the two flanks would otherwise overlap silently).
    """
    if flank < 0:
        raise ValueError('flank must be non-negative, got {}'.format(flank))

    with open(genefile, 'r') as file:
        raw_text = file.read()

    # str.split() without arguments splits on any whitespace run, so joining
    # the pieces removes spaces, newlines and tabs in one go.
    Gene_plus = ''.join(raw_text.split())
    remove_digits = str.maketrans('', '', '0123456789')
    Gene_plus = Gene_plus.translate(remove_digits)

    if len(Gene_plus) < 2 * flank:
        raise ValueError(
            'Sequence in {} has {} bp, which is shorter than the two {} bp flanks'.format(
                genefile, len(Gene_plus), flank))

    # Explicit end index (instead of a negative one) keeps flank=0 correct.
    gene_end = len(Gene_plus) - flank
    Left_of_gene = Gene_plus[:flank]          # flank bp to the left of the gene
    Gene = Gene_plus[flank:gene_end]
    Right_of_gene = Gene_plus[gene_end:]      # flank bp to the right of the gene

    return (Left_of_gene, Gene, Right_of_gene)

In [3]:
# Read in and process the enzyme list
def read_enzyme_list(enzymefile):
    """Read an enzyme list and keep the palindromic 6- or 8-bp restriction sites.

    The file is expected to alternate between an enzyme-name line and the line
    holding that enzyme's restriction site. A site is kept only if it is 6 or
    8 characters long, consists solely of A/C/G/T and equals its own reverse
    complement. Enzymes sharing a site are merged into one entry whose name is
    the individual names joined by ``', '``.

    Parameters
    ----------
    enzymefile : str
        Path of the enzyme list (read as UTF-8, with or without BOM).

    Returns
    -------
    tuple of numpy.ndarray
        ``(rsitelist, enamelist)``, two string arrays of equal length, in order
        of first appearance of each site. Both are empty if the file has an odd
        number of lines (a message is printed in that case).
    """
    # Explicit encoding: relying on the locale default garbles non-ASCII
    # characters in enzyme names (e.g. the registered-trademark sign).
    with open(enzymefile, encoding='utf-8-sig') as file:
        enzymes = [line.strip() for line in file]

    # Blank lines at the end of the file must not count towards the
    # name/site pairing.
    while enzymes and not enzymes[-1]:
        enzymes.pop()

    names_by_site = {}  # site -> list of enzyme names, in insertion order
    if (len(enzymes) % 2) == 1:
        print('Enzyme list has an odd number of lines. Enzyme list should be a list of enzyme names and restriction sites in each line.')
    else:
        complement = str.maketrans('ACGT', 'TGCA')
        for ename, rsite in zip(enzymes[0::2], enzymes[1::2]):
            # check length of restriction site
            if len(rsite) not in (6, 8):
                continue
            # reject sites with any non-ACGT character
            if not set(rsite) <= set('ACGT'):
                continue
            # keep only palindromic sites so a single strand can be searched
            if rsite != rsite.translate(complement)[::-1]:
                continue
            names_by_site.setdefault(rsite, []).append(ename)

    rsitelist = list(names_by_site)
    enamelist = [', '.join(names) for names in names_by_site.values()]
    # dtype=str keeps the arrays string-typed even when nothing was kept.
    return (np.array(rsitelist, dtype=str), np.array(enamelist, dtype=str))

In [211]:
# Input files: the gene sequence with its flanks, and the raw enzyme name / site list.
genefile   = './Chris_code/clb2_pm_1000.txt'
enzymefile = './Chris_code/raw_enzyme_list.txt'
# Number of bp excluded from the gene end when searching for sites, and the
# number of bp kept upstream of the chosen site when the fragment X is cut out.
minhomology = 100
# Multiplier applied to len(X) to decide how many flanking bp (alphaX) are taken.
alpha = 1

In [109]:
# Split the gene file into left flank / gene / right flank and load the
# filtered restriction-site and enzyme-name arrays.
Left_of_gene, Gene, Right_of_gene = read_gene_string(genefile)
rsitelist, enamelist = read_enzyme_list(enzymefile)

### Loop

#### 3.

In [241]:
# 3.
def right_search(Gene, rsitelist, enamelist, minhomology=0):
    """Locate the right-most occurrence of every restriction site in Gene.

    When ``minhomology`` is positive, the last ``minhomology`` characters of
    ``Gene`` are excluded from the search. Sites that do not occur are dropped
    and the remaining ones are ordered from the largest position to the
    smallest.

    Parameters
    ----------
    Gene : str
        Sequence to search.
    rsitelist, enamelist : numpy.ndarray
        Parallel arrays of restriction sites and enzyme names.
    minhomology : int, optional
        Number of trailing characters to exclude (default 0).

    Returns
    -------
    tuple
        ``(rsitelist, enamelist, rsite_position_list)`` restricted to the
        matched sites, in descending order of position.
    """
    searchable = Gene[:-minhomology] if minhomology > 0 else Gene
    last_hits = np.array([searchable.rfind(site) for site in rsitelist])

    # Negating the positions turns the ascending argsort into a descending one.
    order = np.argsort(-last_hits)
    found = last_hits[order] != -1   # str.rfind reports a miss as -1
    keep = order[found]

    return rsitelist[keep], enamelist[keep], last_hits[keep]



def left_search(Gene, rsitelist, enamelist, minhomology=0):
    """Locate the left-most occurrence of every restriction site in Gene.

    The first ``minhomology`` characters of ``Gene`` are skipped during the
    search; reported positions are nevertheless relative to the start of the
    full ``Gene``. Sites that do not occur are dropped and the remaining ones
    are ordered from the smallest position to the largest.

    Parameters
    ----------
    Gene : str
        Sequence to search.
    rsitelist, enamelist : numpy.ndarray
        Parallel arrays of restriction sites and enzyme names.
    minhomology : int, optional
        Number of leading characters to skip (default 0).

    Returns
    -------
    tuple
        ``(rsitelist, enamelist, rsite_position_list)`` restricted to the
        matched sites, in ascending order of position.
    """
    tail = Gene[minhomology:]
    first_hits = np.array([tail.find(site) for site in rsitelist])

    order = np.argsort(first_hits)
    found = first_hits[order] != -1   # str.find reports a miss as -1
    keep = order[found]

    # Shift back so positions refer to the untrimmed Gene.
    return rsitelist[keep], enamelist[keep], first_hits[keep] + minhomology




def rsite_search(Gene, rsitelist, enamelist, starting_position, minhomology=0):
    """Find the outermost occurrence of each restriction site in Gene.

    Parameters
    ----------
    Gene : str
        Sequence to search.
    rsitelist, enamelist : array-like of str
        Parallel collections of restriction sites and enzyme names.
    starting_position : {'left', 'right'}
        ``'left'``: report the left-most hit of each site, skipping the first
        ``minhomology`` characters, ordered by ascending position.
        ``'right'``: report the right-most hit of each site, ignoring the last
        ``minhomology`` characters, ordered by descending position.
    minhomology : int, optional
        Number of characters excluded at the searched end (default 0).

    Returns
    -------
    tuple of numpy.ndarray
        ``(rsitelist, enamelist, rsite_position_list)`` restricted to sites that
        were found. Positions are indices into the full ``Gene``.

    Raises
    ------
    ValueError
        If ``starting_position`` is not ``'left'`` or ``'right'``, or if
        ``minhomology`` is negative.
    """
    # TODO for deletion search left_of_gene and right_of_gene instead of gene

    if starting_position not in ['left','right']:
        raise ValueError('Wrong starting_position value')
    if minhomology < 0:
        raise ValueError('minhomology must be non-negative')

    rsitelist = np.asarray(rsitelist)
    enamelist = np.asarray(enamelist)

    # find left(right)-most occurrences of the given rsites in Gene and sort them
    if starting_position == 'left':
        Gene_cut = Gene[minhomology:]
        rsite_position_list = np.array([Gene_cut.find(rsite) for rsite in rsitelist], dtype=int)
        sorted_inds = np.argsort(rsite_position_list, kind='stable')
    else:
        # Clamp at 0: a negative stop index would wrap around and search most
        # of the gene when minhomology exceeds its length.
        Gene_cut = Gene[:max(len(Gene) - minhomology, 0)]
        rsite_position_list = np.array([Gene_cut.rfind(rsite) for rsite in rsitelist], dtype=int)
        sorted_inds = np.argsort(-rsite_position_list, kind='stable')

    # remove rsites without a match (find/rfind return -1)
    rsite_position_list = rsite_position_list[sorted_inds]
    matched_inds = (rsite_position_list != -1)
    rsite_position_list = rsite_position_list[matched_inds]
    if starting_position == 'left':
        # shift back so positions refer to the untrimmed Gene
        rsite_position_list = rsite_position_list + minhomology

    # apply the same ordering and filtering to rsitelist and enamelist
    rsitelist = rsitelist[sorted_inds][matched_inds]
    enamelist = enamelist[sorted_inds][matched_inds]

    return rsitelist, enamelist, rsite_position_list

In [159]:
# Right-most hit of every site in Gene, ignoring the last 100 bp, ordered from right to left.
gene_rsitelist_sorted, gene_enamelist_sorted, gene_rsite_position_list_sorted = right_search(Gene, rsitelist, enamelist, minhomology=100)
# Bare tuple as last expression so the notebook displays the three arrays.
gene_rsitelist_sorted, gene_enamelist_sorted, gene_rsite_position_list_sorted

asdsad


(array(['TCATGA', 'CAATTG', 'GTATAC', 'CAGCTG', 'GATATC', 'TTCGAA',
        'TTTAAA', 'AATATT', 'AGATCT', 'TGTACA', 'ACTAGT'], dtype='<U8'),
 array(['BspHI', 'MfeI, MfeI-HFÂ®', 'BstZ17I, BstZ17I-HFÂ®',
        'PvuII, PvuII-HFÂ®', 'EcoRV, EcoRV-HFÂ®', 'BstBI', 'DraI',
        'SspI, SspI-HFÂ®', 'BglII', 'BsrGI, BsrGI-HFÂ®', 'SpeI, SpeI-HFÂ®'],
       dtype='<U32'),
 array([1355, 1348, 1289, 1213, 1160, 1103, 1043,  765,  746,   65,   39]))

#### 4. and 5.

In [156]:
# TODO load as parameters and load from files
def load_linker(path):
    """Return the linker sequence.

    ``path`` is accepted but not used yet: the sequence is hard-coded.
    """
    linker_sequence = 'GGTGCTTCTGTTGGTGCTTCTGTTTCTGTTGGTCCGC'
    return linker_sequence

def load_FPG(path):
    """Return the FPG sequence in upper case.

    ``path`` is accepted but not used yet: the sequence (named mCherry in the
    original code) is hard-coded in lower case and upper-cased on return.
    """
    fpg_lowercase = 'atggtgagcaagggcgaggaggataacatggccatcatcaaggagttcatgcgcttcaaggtgcatatggagggctccgtgaacggccacgagttcgagatcgagggcgagggcgagggccgcccctacgagggcacccagaccgccaagctgaaggtgaccaagggtggccccctgcccttcgcctgggacatcctgtcccctcagttcatgtacggctccaaggcctacgtgaagcaccccgccgacatccccgactacttgaagctgtccttccccgagggcttcaagtgggagcgcgtgatgaacttcgaggacggcggcgtggtgaccgtgacccaggactcctccctgcaggacggcgagttcatctacaaggtgaagctgcgcggcaccaacttcccctccgacggccccgtaatgcagaagaagaccatgggctgggaggcctcctccgagcggatgtaccccgaggacggcgccctgaagggcgagatcaagcagaggctgaagctgaaggacggcggccactacgacgctgaggtcaagaccacctacaaggccaagaagcccgtgcagctgcccggcgcctacaacgtcaacatcaagttggacatcacctcccacaacgaggactacaccatcgtggaacagtacgaacgcgccgagggccgccactccaccggcggcatggacgagctgtactag'
    fpg_uppercase = fpg_lowercase.upper()
    return fpg_uppercase

In [203]:
# 4. and 5.

# Index of the candidate site to try (into the arrays sorted by right_search).
ind = 1

rsite0 = gene_rsitelist_sorted[ind]
rsite0_pos = gene_rsite_position_list_sorted[ind]
ename0 = gene_enamelist_sorted[ind]
# X: from minhomology bp before the site up to, but excluding, the last 3 bp of Gene.
# NOTE(review): if rsite0_pos < minhomology the start index is negative and the
# slice wraps around to the end of Gene.
X = Gene[rsite0_pos-minhomology:-3]
# alphaX: the first alpha*len(X) bp of the right flank.
alphaX = Right_of_gene[:int(alpha*len(X))]


# Check that the first occurrence of the site in X sits exactly minhomology bp in.
X.find(rsite0)==minhomology

True

In [204]:
# The loaders currently ignore their path argument and return hard-coded sequences.
linker = load_linker('')
FPG = load_FPG('')

# Insert layout: gene fragment, linker, FPG, then the flanking sequence.
full_sequence = X + linker + FPG + alphaX
# The chosen site must occur exactly once in the insert.
if full_sequence.count(rsite0) != 1:
    print('rsite is not unique in the gene')
    pass # TODO go back to 3.

#### 7.

In [217]:
# 7.

# TODO load backbone with MCS, and two indices

def load_MCS(path):
    """Return the multiple cloning site (MCS) sequence.

    ``path`` is accepted but not used yet: the sequence is hard-coded.
    """
    mcs_sequence = 'GTAATACGACTCACTATAGGGCGAATTGGGTACCGGGCCCCCCCTCGAGGTCGACGGTATCGATAAGCTTGATATCGAATTCCTGCAGCCCGGGGGATCCACTAGTTCTAGAGCGGCCGCCACCGCGGTGGAGCTCCAGCTTTTGTTCCCTTTAGTGAGGGTTAATT'
    return mcs_sequence

In [206]:
MCS = load_MCS('')
# Search for the outermost cut sites of the MCS that do not occur in the insert
# built in step 4 (full_sequence).

rsite1 = ''
rsite2 = ''

# NOTE(review): despite the "_right" suffix this list comes from left_search,
# i.e. it is ordered from the left end of the MCS.
msc_rsitelist_sorted_right, _, _ = left_search(MCS, rsitelist, enamelist)
for i, rsite in enumerate(msc_rsitelist_sorted_right):
    # take the first (left-most) site that is absent from the insert
    if rsite not in full_sequence:
        rsite1 = rsite
        break

# NOTE(review): despite the "_left" suffix this list comes from right_search,
# i.e. it is ordered from the right end of the MCS.
msc_rsitelist_sorted_left, _, _ = right_search(MCS, rsitelist, enamelist)
for i, rsite in enumerate(msc_rsitelist_sorted_left):
    # take the first (right-most) site that is absent from the insert
    if rsite not in full_sequence:
        rsite2 = rsite
        break

# An empty string means no usable site was found on that side.
if not rsite1 or not rsite2:
    pass # TODO exit program
    print('good rsites not found in MCS')


In [207]:
rsite1, rsite2

('GGTACC', 'GAGCTC')

#### 8.

In [269]:
def load_backbones(path):
    backbone_1="TCGCGCGTTTCGGTGATGACGGTGAAAACCTCTGACACATGCAGCTCCcGGACTCTAATTTGTGAGTTTAGTATACATGCATTTACTTATAATACAGTTTTTTAGTTTTGCTGGCCGCATCTTCTCAAATATGCTTCCCAGCCTGCTTTTCTGTAACGTTCACCCTCTACCTTAGCATCCCTTCCCTTTGCAAATAGTCCTCTTCCAACAATAATAATGTCAGATCCTGTAGAGACCACATCATCCACGGTTCTATACTGTTGACCCAATGCGTCTCCCTTGTCATCTAAACCCACACCGGGTGTCATAATCAACCAATCGTAACCTTCATCTCTTCCACCCATGTCTCTTTGAGCAATAAAGCCGATAACAAAATCTTTGTCGCTCTTCGCAATGTCAACAGTACCCTTAGTATATTCTCCAGTAGATAGGGAGCCCTTGCATGACAATTCTGCTAACATCAAAAGGCCTCTAGGTTCCTTTGTTACTTCTTCTGCCGCCTGCTTCAAACCGCTAACAATACCTGGGCCCACCACACCGTGTGCATTCGTAATGTCTGCCCATTCTGCTATTCTGTATACACCCGCAGAGTACTGCAATTTGACTGTATTACCAATGTCAGCAAATTTTCTGTCTTCGAAGAGTAAAAAATTGTACTTGGCGGATAATGCCTTTAGCGGCTTAACTGTGCCCTCCATGGAAAAATCAGTCAAGATATCCACATGTGTTTTTAGTAAACAAATTTTGGGACCTAATGCTTCAACTAACTCCAGTAATTCCTTGGTGGTACGAACATCCAATGAAGCACACAAGTTTGTTTGCTTTTCGTGCATGATATTAAATAGCTTGGCAGCAACAGGACTAGGATGAGTAGCAGCACGTTCCTTATATGTAGCTTTCGACATGATTTATCTTCGTTTCCTGCAGGTTTTTGTTCTGTGCAGTTGGGTTAAGAATACTGGGCAATTTCATGTTTCTTCAACACTACATATGCGTATATATACCAATCTAAGTCTGTGCTCCTTCCTTCGTTCTTCCTTCTGTTCGGAGATTACCGAATCAAAAAAATTTCAAGGAAACCGAAATCAAAAAAAAGAATAAAAAAAAAATGATGAATTGAAAAGGTGGTATGGTGCACTCTCAGTACTCCgGGAGACGGTCACAGCTTGTCTGTAAGCGGATGCCGGGAGCAGACAAGCCCGTCAGGGCGCGTCAGCGGGTGTTGGCGGGTGTCGGGGCTGGCTTAACTATGCGGCATCAGAGCAGATTGTACTGAGAGTGCACCATATCGACTACGTCGTTAAGGCCGTTTCTGACAGAGTAAAATTCTTGAGGGAACTTTCACCATTATGGGAAATGGTTCAAGAAGGTATTGACTTAAACTCCATCAAATGGTCAGGTCATTGAGTGTTTTTTATTTGTTGTATTTTTTTTTTTTTAGAGAAAATCCTCCAATATATAAATTAGGAATCATAGTTTCATGATTTTCTGTTACACCTAACTTTTTGTGTGGTGCCCTCCTCCTTGTCAATATTAATGTTAAAGTGCAATTCTTTTTCCTTATCACGTTGAGCCATTAGTATCAATTTGCTTACCTGTATTCCTTTACATCCTCCTTTTTCTCCTTCTTGATAAATGTATGTAGATTGCGTATATAGTTTCGTCTACCCTATGAACATATTCCATTTTGTAATTTCGTGTCGTTTCTATTATGAATTTCATTTATAAAGTTTATGTACAAATATCATAAAAAAAGAGAATCTTTTTAAGCAAGGATTTTCTTAACTTCTTCGGCGACAGCATCACCGACTTCGGTGGTACTGTTGGAACCACCTAAATCACCAGTTCTGATACCTGCATCCAAAACCTTTTTAACTGCATCTTCAATGGCCTTACCTTCTTCAGGCAAGTTCAATGACAATTTCAACATCATTGCAGCAGACAAGATAGTGGCGATAGGGTCAACCTTATTCTTTGGCAAATC
TGGAGCAGAACCGTGGCATGGTTCGTACAAACCAAATGCGGTGTTCTTGTCTGGCAAAGAGGCCAAGGACGCAGATGGCAACAAACCCAAGGAACCTGGGATAACGGAGGCTTCATCGGAGATGATATCACCAAACATGTTGCTGGTGATTATAATACCATTTAGGTGGGTTGGGTTCTTAACTAGGATCATGGCGGCAGAATCAATCAATTGATGTTGAACCTTCAATGTAGGGAATTCGTTCTTGATGGTTTCCTCCACAGTTTTTCTCCATAATCTTGAAGAGGCCAAAAGATTAGCTTTATCCAAGGACCAAATAGGCAATGGTGGCTCATGTTGTAGGGCCATGAAAGCGGCCATTCTTGTGATTCTTTGCACTTCTGGAACGGTGTATTGTTCACTATCCCAAGCGACACCATCACCATCGTCTTCCTTTCTCTTACCAAAGTAAATACCTCCCACTAATTCTCTGACAACAACGAAGTCAGTACCTTTAGCAAATTGTGGCTTGATTGGAGATAAGTCTAAAAGAGAGTCGGATGCAAAGTTACATGGTCTTAAGTTGGCGTACAATTGAAGTTCTTTACGGATTTTTAGTAAACCTTGTTCAGGTCTAACACTACCGGTACCCCATTTAGGACCACCCACAGCACCTAACAAAACGGCATCAGCCTTCTTGGAGGCTTCCAGCGCCTCATCTGGAAGTGGAACACCTGTAGCATCGATAGCAGCACCACCAATTAAATGATTTTCGAAATCGAACTTGACATTGGAACGAACATCAGAAATAGCTTTAAGAACCTTAATGGCTTCGGCTGTGATTTCTTGACCAACGTGGTCACCTGGCAAAACGACGATCTTCTTAGGGGCAGACATTAGAATGGTATATCCTTGAAATATATATATATATATTGCTGAAATGTAAAAGGTAAGAAAAGTTAGAAAGTAAGACGATTGCTAACCACCTATTGGAAAAAACAATAGGTCCTTAAATAATATTGTCAACTTCAAGTATTGTGATGCAAGCATTTAGTCATGAACGCTTCTCTATTCTATATGAAAAGCCGGTTCCGGCGCTCTCACCTTTCCTTTTTCTCCCAATTTTTCAGTTGAAAAAGGTATATGCGTCAGGCGACCTCTGAAATTAACAAAAAATTTCCAGTCATCGAATTTGATTCTGTGCGATAGCGCCCCTGTGTGTTCTCGTTATGTTGAGGAAAAAAATAATGGTTGCTAAGAGATTCGAACTCTTGCATCTTACGATACCTGAGTATTCCCACAGTTAACTGCGGTCAAGATATTTCTTGAATCAGGCGCCTTAGACCGCTCGGCCAAACAACCAATTACTTGTTGAGAAATAGAGTATAATTATCCTATAAATATAACGTTTTTGAACACACATGAACAAGGAAGTACAGGACAATTGATTTTGAAGAGAATGTGGATTTTGATGTAATTGTTGGGATTCCATTTTTAATAAGGCAATAATATTAGGTATGTAGATATACTAGAAGTTCTCCTCGACCGGTCGATATGCGGTGTGAAATACCGCACAGATGCGTAAGGAGAAAATACCGCATCAGGAAATTGTAAgCGTTAATATTTTGTTAAAATTCGCGTTAAATTTTTGTTAAATCAGCTCATTTTTTAACCAATAGGCCGAAATCGGCAAAATCCCTTATAAATCAAAAGAATAGACCGAGATAGGGTTGAGTGTTGTTCCAGTTTGGAACAAGAGTCCACTATTAAAGAACGTGGACTCCAACGTCAAAGGGCGAAAAACCGTCTATCAGGGCGATGGCCCACTACGTGAACCATCACCCTAATCAAGTTTTTTGGGGTCGAGGTGCCGTAAAGCACTAAATCGGAACCCTAAAGGGAGCCCCCGATTTAGAGCTTGACGGGGAAAGCCGGCGAACGTGGCGAGAAAGGAAGGGAAGAAAGCGAAAGGAGCGGGCGCTAGGGCGCTGGCAAGTGTAGCGGTCACGCTGCGCGTAACC
ACCACACCCGCCGCGCTTAATGCGCCGCTACAGGGCGCGTCCATTCGCCATTCAGGCTGCGCAACTGTTGGGAAGGGCGATCGGTGCGGGCCTCTTCGCTATTACGCCAGCTGGCGAAAGGGGGATGTGCTGCAAGGCGATTAAGTTGGGTAACGCCAGGGTTTTCCCAGTCACGACGTTGTAAAACGACGGCCAGTGAGCGCGC"
    backbone_2="GCGCGCTTGGCGTAATCATGGTCATAGCTGTTTCCTGTGTGAAATTGTTATCCGCTCACAATTCCACACAACATAcGAGCCGGAAGCATAAAGTGTAAAGCCTGGGGTGCCTAATGAGTGAGcTAACTCACATTAATTGCGTTGCGCTCACTGCCCGCTTTCCAGTCGGGAAACCTGTCGTGCCAGCTGCATTAATGAATCGGCCAACGCGCGGGGAGAGGCGGTTTGCGTATTGGGCGCTCTTCCGCTTCCTCGCTCACTGACTCGCTGCGCTCGGTCGTTCGGCTGCGGCGAGCGGTATCAGCTCACTCAAAGGCGGTAATACGGTTATCCACAGAATCAGGGGATAACGCAGGAAAGAACATGTGAGCAAAAGGCCAGCAAAAGGCCAGGAACCGTAAAAAGGCCGCGTTGCTGGCGTTTTTCCATAGGCTCCGCCCCCCTGACGAGCATCACAAAAATCGACGCTCAAGTCAGAGGTGGCGAAACCCGACAGGACTATAAAGATACCAGGCGTTTCCCCCTGGAAGCTCCCTCGTGCGCTCTCCTGTTCCGACCCTGCCGCTTACCGGATACCTGTCCGCCTTTCTCCCTTCGGGAAGCGTGGCGCTTTCTCATAGCTCACGCTGTAGGTATCTCAGTTCGGTGTAGGTCGTTCGCTCCAAGCTGGGCTGTGTGCACGAACCCCCCGTTCAGCCCGACCGCTGCGCCTTATCCGGTAACTATCGTCTTGAGTCCAACCCGGTAAGACACGACTTATCGCCACTGGCAGCAGCCACTGGTAACAGGATTAGCAGAGCGAGGTATGTAGGCGGTGCTACAGAGTTCTTGAAGTGGTGGCCTAACTACGGCTACACTAGAAGGACAGTATTTGGTATCTGCGCTCTGCTGAAGCCAGTTACCTTCGGAAAAAGAGTTGGTAGCTCTTGATCCGGCAAACAAACCACCGCTGGTAGCGGTGGTTTTTTTGTTTGCAAGCAGCAGATTACGCGCAGAAAAAAAGGATCTCAAGAAGATCCTTTGATCTTTTCTACGGGGTCTGACGCTCAGTGGAACGAAAACTCACGTTAAGGGATTTTGGTCATGAGATTATCAAAAAGGATCTTCACCTAGATCCTTTTAAATTAAAAATGAAGTTTTAAATCAATCTAAAGTATATATGAGTAAACTTGGTCTGACAGTTACCAATGCTTAATCAGTGAGGCACCTATCTCAGCGATCTGTCTATTTCGTTCATCCATAGTTGCCTGACTCCCCGTCGTGTAGATAACTACGATACGGGAGGGCTTACCATCTGGCCCCAGTGCTGCAATGATACCGCGAGACCCACGCTCACCGGCTCCAGATTTATCAGCAATAAACCAGCCAGCCGGAAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTGTCACGCTCGTCGTTTGGTATGGCTTCATTCAGCTCCGGTTCCCAACGATCAAGGCGAGTTACATGATCCCCCATGTTGTGCAAAAAAGCGGTTAGCTCCTTCGGTCCTCCGATCGTTGTCAGAAGTAAGTTGGCCGCAGTGTTATCACTCATGGTTATGGCAGCACTGCATAATTCTCTTACTGTCATGCCATCCGTAAGATGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGGCGACCGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCAAGGATCTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCTTTTACTTTCACCAGCGTTTCTGGGTGAGCAAAAACAG
GAAGGCAAAATGCCGCAAAAAAGGGAATAAGGGCGACACGGAAATGTTGAATACTCATACTCTTCCTTTTTCAATATTATTGAAGCATTTATCAGGGTTATTGTCTCATGAGCGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACATTTCCCCGAAAAGTGCCACCTGACGTCTAAGAAACCATTATTATCATGACATTAACCTATAAAAATAGGCGTATCACGAGGCCCTTTCGTC"

    return [backbone_1,backbone_2]

def assemble_plasmid(backbone, sequence):
    """Return ``sequence + backbone + sequence``.

    The insert is placed on both sides of the backbone, so every site of the
    insert is counted twice in the result.
    NOTE(review): presumably done so that a linear search also sees sites
    spanning either insert/backbone junction of the circular plasmid - confirm.
    """
    insert = sequence
    plasmid = insert + backbone
    plasmid = plasmid + insert
    return plasmid
    

In [209]:
backbones = load_backbones('')
# Try each backbone in turn and stop at the first acceptable one.
for backbone in backbones:
    full_plasmid = assemble_plasmid(backbone, full_sequence)
    # assemble_plasmid includes the insert twice, so a count of 2 means the site
    # occurs once in the insert and nowhere else in the assembled string.
    if full_plasmid.count(rsite0) == 2:
        print('plasmid found')
        break



plasmid found


In [199]:
# NOTE(review): this cell's execution count (In [199]) is lower than that of the
# backbone loop (In [209]), so the displayed value may stem from an earlier state
# of full_plasmid / rsite0 - re-run after the loop to confirm.
full_plasmid.count(rsite0)

3

# FULL CODE

In [293]:
def generate_full_sequence(Gene, rsite_position, rsite, minhomology, alpha, linker, FPG, gene_position, Left_of_gene='', Right_of_gene=''): 
    """Build the insert made of a gene fragment, linker, FPG and flanking sequence.

    Parameters
    ----------
    Gene : str
        Gene sequence.
    rsite_position : int
        Position of the chosen restriction site in ``Gene``.
    rsite : str
        The restriction site itself (only its length is used, and only for
        ``gene_position='right'``).
    minhomology : int
        Number of bp of ``Gene`` kept beyond the restriction site.
    alpha : float
        Multiplier on the fragment length that sets how many flanking bp are used.
    linker, FPG : str
        Linker and FPG sequences.
    gene_position : {'left', 'right'}
        ``'left'``: fragment + linker + FPG + right flank + ``'TGA'``.
        ``'right'``: left flank + FPG + linker + fragment.
    Left_of_gene, Right_of_gene : str, optional
        Flanking sequences; only the one matching ``gene_position`` is read.

    Returns
    -------
    str
        The assembled sequence.

    Raises
    ------
    ValueError
        If ``gene_position`` is neither ``'left'`` nor ``'right'``.
    """
    #TODO 5p i 3p with respect to the main gene sequence

    if gene_position not in ['left','right']:
        raise ValueError('Wrong gene_position value')

    if gene_position == 'left':
        # Clamp at 0: when the site lies closer than minhomology to the gene
        # start, a negative start index would wrap around to the gene end.
        start = max(rsite_position - minhomology, 0)
        X = Gene[start:-3] # fragment up to, but excluding, the last 3 bp (stop codon)
        alphaX = Right_of_gene[:int(alpha*len(X))]
        # NOTE(review): the stop codon is appended after the flank, not right
        # after FPG - confirm this placement is intended.
        full_sequence = X + linker + FPG + alphaX + 'TGA' #we add the stop codon at the end
    else:
        # TODO check
        X = Gene[:rsite_position+len(rsite)+minhomology]
        flank_length = int(alpha*len(X))
        # [-0:] would return the whole flank, so a zero length needs its own case.
        alphaX = Left_of_gene[-flank_length:] if flank_length > 0 else ''
        full_sequence = alphaX + FPG + linker + X 

    return full_sequence


def find_compatible_MCS_rsites(MCS, rsitelist, enamelist, full_sequence):
    """Pick the outermost MCS restriction sites that are absent from full_sequence.

    Parameters
    ----------
    MCS : str
        Multiple cloning site sequence.
    rsitelist, enamelist : numpy.ndarray
        Parallel arrays of restriction sites and enzyme names.
    full_sequence : str
        Insert sequence that the chosen sites must not occur in.

    Returns
    -------
    tuple
        ``(rsite1, rsite2)``: ``rsite1`` is the first qualifying site when
        scanning the MCS from the right, ``rsite2`` the first when scanning
        from the left. Either is ``''`` when no site qualifies.
    """
    def first_absent(candidates):
        # First candidate that does not occur in the insert, '' if none.
        for candidate in candidates:
            if candidate not in full_sequence:
                return candidate
        return ''

    sites_from_right, _, _ = rsite_search(MCS, rsitelist, enamelist, starting_position='right')
    sites_from_left, _, _ = rsite_search(MCS, rsitelist, enamelist, starting_position='left')

    return first_absent(sites_from_right), first_absent(sites_from_left)

In [294]:
# 1. and 2.
# Inputs and parameters
genefile   = './Chris_code/clb2_pm_1000.txt'
enzymefile = './Chris_code/raw_enzyme_list.txt'
minhomology = 100   # bp excluded at the gene end / kept upstream of the site
alpha = 1           # multiplier on len(X) for the flanking sequence

Left_of_gene, Gene, Right_of_gene = read_gene_string(genefile)
rsitelist, enamelist = read_enzyme_list(enzymefile)

linker = load_linker('')
FPG = load_FPG('')

# The MCS and the backbones do not depend on the candidate site, so they are
# loaded once instead of on every loop iteration.
MCS = load_MCS('')
backbones = load_backbones('')


# 3. Candidate sites in the gene, ordered from right to left.
# Uses the configured minhomology rather than a repeated literal.
gene_rsitelist_sorted, gene_enamelist_sorted, gene_rsite_position_list_sorted = rsite_search(
    Gene, rsitelist, enamelist, starting_position='right', minhomology=minhomology)


# Defined before the loop so it exists even if no candidate reaches step 8.
plasmid_found = False
for i in range(len(gene_rsitelist_sorted)):

    # 4. and 5.
    rsite0 = gene_rsitelist_sorted[i]
    rsite0_pos = gene_rsite_position_list_sorted[i]
    ename0 = gene_enamelist_sorted[i]
    print(i, rsite0)

    # Pass the current candidate (rsite0); the previous code passed `rsite`,
    # a variable left over from an earlier cell.
    full_sequence = generate_full_sequence(
        Gene, rsite0_pos, rsite0, minhomology, alpha, linker, FPG, gene_position='left', Right_of_gene=Right_of_gene)
    if full_sequence.count(rsite0) != 1:
        print('rsite {} is not unique in the gene'.format(rsite0))
        continue

    # 7.
    # Search for the outermost cut sites of the MCS that are not in the insert.
    rsite1, rsite2 = find_compatible_MCS_rsites(MCS, rsitelist, enamelist, full_sequence)
    if not rsite1 or not rsite2:
        print('good rsites not found in MCS, rsites: ', rsite1, rsite2)
        # If no potential cut sites were found in the MCS, abort (finding another
        # cut site for gene integration would make the sequence for cloning longer)
        break

    # 8.
    for backbone in backbones:
        full_plasmid = assemble_plasmid(backbone, full_sequence)
        # assemble_plasmid includes the insert twice, hence the expected count of 2.
        if full_plasmid.count(rsite0) == 2:
            print('plasmid found')
            plasmid_found = True
            break
        else:
            print('bad backbone')

    if plasmid_found:
        break


0 TCATGA
bad backbone
bad backbone
1 CAATTG
bad backbone
plasmid found


In [2]:
def dna_to_protein(dna):
    """Translate a DNA string into a protein string.

    The input is upper-cased first. Trailing bases that do not form a complete
    codon are ignored. Stop codons are written as ``'*'`` and translation
    continues past them. If any codon is not one of the 64 A/C/G/T triplets,
    an empty string is returned.
    """
    # Codon table laid out in T, C, A, G order for the first, second and
    # third base: residue_row[16*i + 4*j + k] belongs to bases[i]+bases[j]+bases[k].
    bases = 'TCAG'
    residue_row = (
        'FFLLSSSSYY**CC*W'   # T..
        'LLLLPPPPHHQQRRRR'   # C..
        'IIIMTTTTNNKKSSRR'   # A..
        'VVVVAAAADDEEGGGG'   # G..
    )
    codon_table = {
        first + second + third: residue_row[16 * i + 4 * j + k]
        for i, first in enumerate(bases)
        for j, second in enumerate(bases)
        for k, third in enumerate(bases)
    }

    sequence = dna.upper()
    translatable_length = len(sequence) - len(sequence) % 3

    residues = []
    for start in range(0, translatable_length, 3):
        residue = codon_table.get(sequence[start:start + 3])
        if residue is None:
            # unknown codon: give up and return an empty protein
            return ''
        residues.append(residue)

    return ''.join(residues)


In [3]:
def find_additional_cutsites(plasmid, rsitelist, enamelist):
    """Select cut sites that could be inserted between pieces of the insert.

    (For example between the gene and the linker, or between the linker and
    the FPG.) A site is kept when it fulfils all three criteria:

    1. its length is divisible by 3 (this could be circumvented by adapting
       the linker length);
    2. it does not occur in ``plasmid``;
    3. its translation by ``dna_to_protein`` contains no stop codon.

    Returns
    -------
    tuple of list
        ``(good_rsite_list, good_enzyme_list)``, parallel lists in input order.
    """
    def is_good(site):
        if len(site) % 3 != 0:
            return False
        if plasmid.count(site) != 0:
            return False
        return dna_to_protein(site).count('*') == 0

    selected = [(site, name) for site, name in zip(rsitelist, enamelist) if is_good(site)]
    good_rsite_list = [site for site, _name in selected]
    good_enzyme_list = [name for _site, name in selected]

    return good_rsite_list, good_enzyme_list

In [4]:
# Sanity check: the input is upper-cased, so the lower-case 'tga' counts as one stop codon.
dna_to_protein('TGCTACtga').count('*')

1