In [1]:
import dnacauldron as dc
import pandas as pd
from Bio import SeqIO
import csv
import io
import os
from pydna.all import *

In [2]:
## Note: I added the GenBank files of the pUAP4 backbone to the /data/CRISPR_library/parts/ folder

In [3]:
#make the parts folder used as input/output location by the cells below
dirName = '../../data/CRISPR_library/parts'
try:
    # Create the target directory together with any missing parent folders
    # (os.mkdir raises FileNotFoundError when a parent folder does not exist yet)
    os.makedirs(dirName)
    print("Directory " , dirName ,  " created")
except FileExistsError:
    # the folder is already there: report it and carry on
    print("Directory " , dirName ,  " already exists")

Directory  ../../data/CRISPR_library/parts  already exists


In [4]:
#make the folder that receives the simulated PCR products (guide + scaffold)
dirName = '../../data/CRISPR_library/parts/scaffolds'
try:
    # Create the target directory together with any missing parent folders
    # (os.mkdir raises FileNotFoundError when a parent folder does not exist yet)
    os.makedirs(dirName)
    print("Directory " , dirName ,  " created")
except FileExistsError:
    # the folder is already there: report it and carry on
    print("Directory " , dirName ,  " already exists")

Directory  ../../data/CRISPR_library/parts/scaffolds  already exists


In [5]:
#make the folder that receives the assembly reports
dirName = '../../data/CRISPR_library/reports'
try:
    # Create the target directory together with any missing parent folders
    # (os.mkdir raises FileNotFoundError when a parent folder does not exist yet)
    os.makedirs(dirName)
    print("Directory " , dirName ,  " created")
except FileExistsError:
    # the folder is already there: report it and carry on
    print("Directory " , dirName ,  " already exists")

Directory  ../../data/CRISPR_library/reports  already exists


In [6]:
def create_primer_fasta(all_guides, output_fasta='../../data/CRISPR_library/parts/guides.fasta'):
    """Write the guides listed in a CSV file to a FASTA file (one record per row).

    Parameters
    ----------
    all_guides : path or file-like object
        CSV with a header row that contains at least the columns 'Name'
        (used as the record id) and 'Sequence'.
    output_fasta : str, optional
        Destination FASTA file. Defaults to the guides.fasta file in the parts
        folder, which was previously the hard-coded destination.

    Returns
    -------
    None
    """
    #read allguides csv
    all_guides_df = pd.read_csv(all_guides, header=0)
    #create a temporary in-memory buffer holding "name<TAB>sequence" lines,
    #which is the layout Biopython's 'tab' format is converted from
    buffer = io.StringIO()
    all_guides_df[['Name','Sequence']].to_csv(buffer, sep='\t', header=False, index=False)
    #go back to beginning of the buffer so the conversion reads it from the start
    buffer.seek(0)
    #convert buffer into fasta file and save the file
    SeqIO.convert(buffer, 'tab', output_fasta, 'fasta')

In [7]:
def simulate_pcr(forward_primer_name,reverse_primer_name, scaffold_name, repository):
    """Simulate a PCR on a scaffold template and save the product as a GenBank file.

    The two primers and the template are looked up by name in the 'parts'
    collection of ``repository``. The product gets '<forward_primer_name>_scaffold'
    as both id and name and is written to
    ../../data/CRISPR_library/parts/scaffolds/<forward_primer_name>_scaffold.gb.
    Nothing is returned.
    """
    #fetch the three records by name from the 'parts' collection
    parts = repository.collections['parts']
    fwd_record, rev_record, template_record = (
        parts[forward_primer_name],
        parts[reverse_primer_name],
        parts[scaffold_name],
    )

    #virtual PCR of the template with the two primers
    amplicon = pcr(fwd_record, rev_record, template_record)
    #label the product after its forward primer
    product_label = forward_primer_name + '_scaffold'
    amplicon.id = product_label
    amplicon.name = product_label
    #save the product in genbank format
    amplicon.write(filename=f"../../data/CRISPR_library/parts/scaffolds/{forward_primer_name}_scaffold.gb",f='gb')

In [8]:
def add_PCRs2repo(primer_name_prefix,parts_list, reverse_primer_name, scaffold_name, repository):
    """Simulate one PCR for every forward primer named in ``parts_list``.

    A name counts as a forward primer when it starts with ``primer_name_prefix``
    (the notebook passes 'SW') and does not end with '_scaffold', the suffix
    simulate_pcr gives to PCR products. Every simulation uses the same reverse
    primer, scaffold template and repository.
    """
    for candidate in parts_list:
        #skip names with another prefix and PCR products that already exist
        if not candidate.startswith(primer_name_prefix) or candidate.endswith('_scaffold'):
            continue
        simulate_pcr(candidate, reverse_primer_name, scaffold_name, repository)

In [9]:
def create_repo(parts_folder_location):
    """Import the records found in a parts folder into a new sequence repository.

    Returns a tuple ``(repository, parts_list)`` where ``parts_list`` is the list
    of names in the repository's 'parts' collection.
    """
    repo = dc.SequenceRepository()
    #records are registered under their file names
    repo.import_records(folder=parts_folder_location, use_file_names_as_ids=True)
    #names of everything that ended up in the 'parts' collection
    part_names = [part_name for part_name in repo.collections['parts']]
    return repo, part_names

In [10]:
def map_guide_pairs(all_guides, guide_pairs):
    """Attach the guide names from the all-guides CSV to every guide pair.

    ``guide_pairs`` is a CSV with the columns 'guide1' and 'guide2'; their values
    are matched against the 'Description' column of the ``all_guides`` CSV.
    Pairs with an unmatched guide are dropped by the (inner) merges.
    Returns a DataFrame with the columns guide1, guide2, Name_guide1,
    Name_guide2, Description_guide1 and Description_guide2.
    """
    #read in the csvs
    pairs_table = pd.read_csv(guide_pairs, header=0)
    guides_table = pd.read_csv(all_guides, header=0)
    #one merge per member of the pair: (pair column, new name column, new description column)
    merge_steps = (
        ('guide1', 'Name_guide1', 'Description_guide1'),
        ('guide2', 'Name_guide2', 'Description_guide2'),
    )
    mapped = pairs_table
    for pair_column, name_column, description_column in merge_steps:
        mapped = pd.merge(mapped, guides_table, left_on=pair_column, right_on='Description')
        #rename straight away so the next merge can bring in 'Name'/'Description' again
        mapped = mapped.rename(columns={'Name': name_column, 'Description': description_column})
    #keep only the pair and the mapped name/description columns
    wanted_columns = ['guide1','guide2','Name_guide1','Name_guide2','Description_guide1','Description_guide2']
    return mapped[wanted_columns]

In [22]:
def create_goldengate_hierarchy_csv(mapped_guide_pairs, first_level1_ID,level2_ID_prefix,output_location):
    """Build the two-level Golden Gate assembly plan for a set of guide pairs and save it as CSV.

    Level 1: every distinct guide used as guide 1 gets a construct in acceptor
    pCk3, every distinct guide used as guide 2 gets one in acceptor pCk4. Each
    construct combines part 'picsl90002' with '<guide name>_scaffold'. The level 1
    constructs are sorted by scaffold name and numbered consecutively, starting
    at ``first_level1_ID``.

    Level 2: one construct per row of ``mapped_guide_pairs`` in acceptor pCsA,
    combining 'pepsw1kn0114', 'pepsw1kn0333' and the two level 1 constructs of
    that pair (part3 = guide 1 construct, part4 = guide 2 construct). Level 2
    numbers continue where the level 1 numbering stopped.

    Parameters
    ----------
    mapped_guide_pairs : pandas.DataFrame
        One row per guide pair with the string columns 'Name_guide1' and
        'Name_guide2'. The index is not used, so it does not have to be unique.
    first_level1_ID : str
        ID of the first level 1 construct, e.g. 'pEPSW1KN0115'. The last four
        characters are the starting number, everything before them is the prefix.
    level2_ID_prefix : str
        Prefix of the level 2 construct IDs, e.g. 'pEPSW2SP'.
    output_location : path or file-like object
        Where the plan is written (CSV with header, without index).

    Returns
    -------
    pandas.DataFrame
        Level 1 rows followed by level 2 rows, with the columns construct,
        acceptor, part1, part2, part3 and part4 (part3/part4 are empty for
        level 1). Each level keeps its own 0-based index.
    """
    scaffold_suffix = '_scaffold'
    #split the first ID into prefix and four-digit number; slicing from the end
    #keeps this correct for prefixes that are not exactly eight characters long
    level1_prefix = first_level1_ID[:-4]
    next_number = int(first_level1_ID[-4:])

    guide1_names = mapped_guide_pairs['Name_guide1']
    guide2_names = mapped_guide_pairs['Name_guide2']

    #level 1: guide 1 of each pair goes into pCk3, guide 2 into pCk4
    #(iterating over the values avoids label lookups, which break on a non-unique index)
    level1_rows = []
    for guide1_name, guide2_name in zip(guide1_names, guide2_names):
        level1_rows.append({'acceptor':'pCk3','part1':'picsl90002','part2':guide1_name + scaffold_suffix})
        level1_rows.append({'acceptor':'pCk4','part1':'picsl90002','part2':guide2_name + scaffold_suffix})
    level1 = pd.DataFrame(level1_rows, columns=['acceptor','part1','part2'])
    #a guide that occurs in several pairs only needs one construct per acceptor
    level1 = level1.drop_duplicates(subset=['acceptor', 'part1','part2'], keep='first')
    #sort by PCR product name
    level1 = level1.sort_values(['part2']).reset_index(drop=True)
    #number the constructs now that duplicates are removed (zero-padded to four digits)
    level1_IDs = [f'{level1_prefix}{number:04}' for number in range(next_number, next_number + len(level1))]
    level1.insert(0, 'construct', level1_IDs)
    next_number += len(level1)

    #look-up tables: acceptor -> guide name -> level 1 construct ID.
    #The guide name is recovered by removing the suffix added above, so names
    #that contain underscores themselves are handled correctly.
    constructs_by_guide = {'pCk3': {}, 'pCk4': {}}
    for construct, acceptor, scaffold_part in zip(level1['construct'], level1['acceptor'], level1['part2']):
        constructs_by_guide[acceptor][scaffold_part[:-len(scaffold_suffix)]] = construct

    #level 2: one construct per guide pair, numbered on from the last level 1 construct
    level2_rows = []
    for guide1_name, guide2_name in zip(guide1_names, guide2_names):
        level2_rows.append({
            'construct':f'{level2_ID_prefix}{next_number:04}',
            'acceptor':'pCsA',
            'part1':'pepsw1kn0114',
            'part2':'pepsw1kn0333',
            'part3':constructs_by_guide['pCk3'][guide1_name],
            'part4':constructs_by_guide['pCk4'][guide2_name],
        })
        next_number += 1
    level2 = pd.DataFrame(level2_rows, columns=['construct','acceptor','part1','part2','part3','part4'])

    #concatenate the level 1 and level 2 plans and save them
    hierarchy = pd.concat([level1, level2])
    hierarchy.to_csv(output_location, header=True, index=False)
    return hierarchy

In [11]:
# mapped_guide_pairs.to_csv('../../data/CRISPR_library/all_guide_pairs.csv',header=1,index=False)

NameError: name 'mapped_guide_pairs' is not defined

In [12]:
def hierarchical_goldengate_assembly(parts_location,hierarchy_csv, report_output):
    """Simulate the assemblies listed in a hierarchy CSV and write the reports.

    ``parts_location`` is the folder the parts are imported from (via
    create_repo), ``hierarchy_csv`` is the assembly plan spreadsheet and
    ``report_output`` is the target the report is written to. Nothing is returned.
    """
    #import the parts again so the repository reflects the current folder content
    repo, _part_names = create_repo(parts_location)
    #read the plan; every assembly in it is a Type2sRestrictionAssembly
    plan = dc.AssemblyPlan.from_spreadsheet(
        path=hierarchy_csv,
        assembly_class=dc.Type2sRestrictionAssembly,
    )
    #simulate the whole plan against the repository
    simulation = plan.simulate(sequence_repository=repo)
    #report writer with mix graphs and assembly plots switched on
    writer = dc.AssemblyReportWriter(
        include_assembly_plots=True,
        include_mix_graphs=True,
    )
    #write the report
    simulation.write_report(report_output, assembly_report_writer=writer)
    

In [482]:
# def goldengate_assembly(parts_location, report_output):
#     """function to do combinatorial goldengate assembly for all parts in a location. Specify parts location and the report_output location"""
#     #create an updated sequence repository of all parts
#     repository,parts_list = create_repo(parts_location)
#     #create an assembly type
#     assembly = dc.Type2sRestrictionAssembly(name="combinatorial_asm",
#     parts=parts_list,
#     expected_constructs="any_number")
#     #create a simulation
#     simulation = assembly.simulate(sequence_repository=repository)
#     #name the report writer
#     report_writer = dc.AssemblyReportWriter(include_mix_graphs=True, include_part_plots=True)
#     #write the report
#     simulation.write_report(target=os.path.join(report_output, "combinatorial"),
#     report_writer=report_writer)

In [23]:
#guide-pair CSV files, one per target (ARF9, ARF18, DREB26, NLP7); read by map_guide_pairs
ARF9pairs = "../../data/CRISPR_library/ARF9guidepairs.csv"
ARF18pairs = "../../data/CRISPR_library/ARF18guidepairs.csv"
DREB26pairs = "../../data/CRISPR_library/DREB26guidepairs.csv"
NLP7pairs = "../../data/CRISPR_library/NLP7guidepairs.csv"

In [24]:
#per-target sgRNA CSV files
#NOTE(review): none of these four names is used by the other cells shown here - confirm they are still needed
ARF9guides = "../../data/CRISPR_library/sgRNAs-ARF9_new.csv"
ARF18guides = "../../data/CRISPR_library/sgRNAs-ARF18_new.csv"
DREB26guides = "../../data/CRISPR_library/sgRNAs-DREB26_new.csv"
NLP7guides = "../../data/CRISPR_library/sgRNAs-NLP7_new.csv"

In [25]:
#CSV listing every guide (columns used below: Name, Sequence, Description)
all_guides = "../../data/CRISPR_library/all_guides.csv"

In [16]:
#Create the fasta file (parts/guides.fasta) containing the forward primers of all the guides,
#one record per row of all_guides.csv (Name -> record id, Sequence -> sequence)
create_primer_fasta(all_guides)

In [26]:
#create the sequence repository from the records in the parts folder;
#parts_list holds the names in the repository's 'parts' collection
repository,parts_list = create_repo('../../data/CRISPR_library/parts/')

In [18]:
# #run virtual PCR program for all primers

# simulate_pcr('SW107','QMD025','pslq1661-sgmuc4-e3fe-addgene-51025-chen-sgrna-scaffold', repository)

In [489]:
#run virtual PCRs using all given forward primers starting with 'SW' and save the files to parts/scaffolds
#(reverse primer 'QMD025', template 'pslq1661'; names ending in '_scaffold' are skipped)
add_PCRs2repo('SW',parts_list, 'QMD025','pslq1661', repository)

In [27]:
#look up the guide names in all_guides.csv for the guide pairs of each target
ARF9 = map_guide_pairs(all_guides=all_guides, guide_pairs=ARF9pairs)
ARF18 = map_guide_pairs(all_guides=all_guides, guide_pairs=ARF18pairs)
DREB26 = map_guide_pairs(all_guides=all_guides, guide_pairs=DREB26pairs)
NLP7 = map_guide_pairs(all_guides=all_guides, guide_pairs=NLP7pairs)

In [28]:
#stack the mapped pairs of all four targets into one table with a fresh 0..n-1 index
mapped_guide_pairs = pd.concat([ARF9, ARF18, DREB26, NLP7], ignore_index=True)

In [29]:
#build the assembly plan (de-duplicated level 1 constructs followed by the level 2 constructs),
#save it as hierarchy.csv and display it
no_dups = create_goldengate_hierarchy_csv(mapped_guide_pairs,'pEPSW1KN0115','pEPSW2SP','../../data/CRISPR_library/hierarchy.csv')
no_dups

Unnamed: 0,construct,acceptor,part1,part2,part3,part4
0,pEPSW1KN0115,pCk3,picsl90002,SW107_scaffold,,
1,pEPSW1KN0116,pCk3,picsl90002,SW108_scaffold,,
2,pEPSW1KN0117,pCk3,picsl90002,SW109_scaffold,,
3,pEPSW1KN0118,pCk3,picsl90002,SW110_scaffold,,
4,pEPSW1KN0119,pCk4,picsl90002,SW111_scaffold,,
...,...,...,...,...,...,...
91,pEPSW2SP0328,pCsA,pepsw1kn0114,pepsw1kn0333,pEPSW1KN0228,pEPSW1KN0232
92,pEPSW2SP0329,pCsA,pepsw1kn0114,pepsw1kn0333,pEPSW1KN0228,pEPSW1KN0233
93,pEPSW2SP0330,pCsA,pepsw1kn0114,pepsw1kn0333,pEPSW1KN0228,pEPSW1KN0234
94,pEPSW2SP0331,pCsA,pepsw1kn0114,pepsw1kn0333,pEPSW1KN0235,pEPSW1KN0234


In [31]:
#run hierarchical assembly: simulate every construct in hierarchy.csv from the records in the
#parts folder and write the report to the reports folder
hierarchical_goldengate_assembly('../../data/CRISPR_library/parts/','../../data/CRISPR_library/hierarchy.csv', '../../data/CRISPR_library/reports')

assembly:   4%|▍         | 9/218 [00:13<01:45,  1.98it/s, now=None]
assembly:   0%|          | 0/218 [00:00<?, ?it/s, now=None][A
assembly:   4%|▍         | 9/218 [00:00<00:02, 85.98it/s, now=None][A

Simulating assembly plan hierarchy...



assembly:   8%|▊         | 17/218 [00:00<00:02, 83.44it/s, now=None][A
assembly:  12%|█▏        | 26/218 [00:00<00:02, 83.35it/s, now=None][A
assembly:  16%|█▌        | 34/218 [00:00<00:02, 82.13it/s, now=None][A
assembly:  20%|█▉        | 43/218 [00:00<00:02, 82.28it/s, now=None][A
assembly:  24%|██▍       | 52/218 [00:00<00:02, 82.72it/s, now=None][A
assembly:  28%|██▊       | 60/218 [00:00<00:01, 81.86it/s, now=None][A
assembly:  32%|███▏      | 69/218 [00:05<00:24,  6.08it/s, now=None][A
assembly:  35%|███▍      | 76/218 [00:05<00:16,  8.36it/s, now=None][A
assembly:  39%|███▉      | 85/218 [00:05<00:11, 11.45it/s, now=None][A
assembly:  43%|████▎     | 93/218 [00:05<00:08, 15.40it/s, now=None][A
assembly:  47%|████▋     | 102/218 [00:05<00:05, 20.46it/s, now=None][A
assembly:  51%|█████     | 111/218 [00:05<00:04, 26.49it/s, now=None][A
assembly:  55%|█████▌    | 120/218 [00:06<00:02, 33.44it/s, now=None][A
assembly:  59%|█████▉    | 129/218 [00:06<00:02, 33.23it/s, 

Generating assemblies reports...



assembly:   1%|          | 2/218 [00:00<01:02,  3.46it/s, now=None][A
assembly:   1%|▏         | 3/218 [00:01<01:45,  2.04it/s, now=None][A
assembly:   2%|▏         | 4/218 [00:02<01:47,  1.99it/s, now=None][A
assembly:   2%|▏         | 5/218 [00:02<01:51,  1.92it/s, now=None][A
assembly:   3%|▎         | 6/218 [00:03<01:55,  1.84it/s, now=None][A
assembly:   3%|▎         | 7/218 [00:03<01:52,  1.87it/s, now=None][A
assembly:   4%|▎         | 8/218 [00:04<01:52,  1.87it/s, now=None][A
assembly:   4%|▍         | 9/218 [00:04<01:50,  1.90it/s, now=None][A
assembly:   5%|▍         | 10/218 [00:05<01:47,  1.94it/s, now=None][A
assembly:   5%|▌         | 11/218 [00:05<01:45,  1.97it/s, now=None][A
assembly:   6%|▌         | 12/218 [00:06<01:43,  1.99it/s, now=None][A
assembly:   6%|▌         | 13/218 [00:06<01:48,  1.89it/s, now=None][A
assembly:   6%|▋         | 14/218 [00:07<01:46,  1.92it/s, now=None][A
assembly:   7%|▋         | 15/218 [00:07<01:43,  1.96it/s, now=None][A

In [217]:
# goldengate_assembly('../../data/CRISPR_library/parts/','../../data/CRISPR_library/parts/reports')



In [156]:
#recommended PCR program
#NOTE(review): `amplicon` is not assigned in any cell shown here (simulate_pcr does not return
#its product), so this cell would fail on Restart & Run All - TODO define it explicitly or drop the cell
amplicon.program()

|95°C|95°C               |    |tmf:59.3
|____|_____          72°C|72°C|tmr:53.6
|5min|30s  \ 52.0°C _____|____|30s/kb
|    |      \______/ 0: 4|5min|GC 42%
|    |       30s         |    |138bp


In [157]:
#text figure of the two primers annealed to the template
#NOTE(review): relies on `amplicon`, which is not assigned in any cell shown here
amplicon.figure()

                                 5gtttaagagctatgctggaaac...agtcggtgctttttttc3
                                                           |||||||||||||||||
                                                          3tcagccacgaaaaaaagcgatCTCTGGt5
5tgtGGTCTCtattGTTACAGTTACAGAGCAGGAgtttaagagctatgctggaaac3
                                  ||||||||||||||||||||||
                                 3caaattctcgatacgacctttg...tcagccacgaaaaaaag5

In [158]:
#id of the PCR product
#NOTE(review): relies on `amplicon`, which is not assigned in any cell shown here
amplicon.id

'SW107_scaffold'

In [167]:
#scratch cell: builds an empty SequenceRepository and displays its repr; the object is not kept
dc.SequenceRepository()

<dnacauldron.SequenceRepository.SequenceRepository at 0x7fd54a8f4a10>