# CRISPEY-BAR Oligo Library Design

## Import packages and functions

In [1]:
import os, random
import pandas as pd

# Directory holding the library-design code, and the working directory of this design.
crispey_libdesign_code_dir = os.path.expanduser('~/crispey-epistasis/lib_design/')
working_dir = os.path.expanduser("~/crispey3/library_design/")

# Switch into the code directory before importing the design helpers
# (presumably extract_guides_functions.py lives there and is found via the current directory).
os.chdir(crispey_libdesign_code_dir)
from extract_guides_functions import (
    extract_guides_for_snps,
    design_donor_for_SNP_guides,
    rank_and_filter_SNP_guides,
    generate_oligo_seq_with_barcode,
    generate_oligo_from_guide_donor_barcode,
    assign_pool_barcode_to_oligos,
    write_output_oligos,
    annotate_variants_by_VEPoutput,
)

# All remaining cells run from the working directory.
os.chdir(working_dir)
print("Current directory: {}".format(os.getcwd()))

Current directory: /home/users/rang/crispey3/library_design


## Design parameters

In [2]:
lib_name = "crispey3"

#####################################################
# guide design for SNPs
#####################################################

guide_length = 20
edit_max_distance_from_PAM5prime = 9
PAM_seq = 'GG'
min_ok_Azimuth_score_SNP_guides = 0
# Increase this to also filter guides whose off-targets contain mismatches;
# 1 only filters out guides with perfect-match off-targets.
off_targets_min_mismatch_SNP_guides = 1

# BOWTIE_exe = "bowtie2"

#####################################################
# donor design for SNPs
#####################################################
agilent_homopolymer_max_len = 10

# Sequences handed to the donor design step as excluded:
# one homopolymer run of the length above per base, followed by restriction sites.
homopolymer_runs = [base * agilent_homopolymer_max_len for base in 'ACGT']
restriction_sites = ['GCATGC',    # SphI cut site
                     'GGCGCGCC',  # AscI cut site
                     'GCGGCCGC']  # NotI cut site
excluded_seqs = homopolymer_runs + restriction_sites

donor_length = 108
min_dist_5prime_arm = 30
min_dist_3prime_arm = 55

#####################################################
# barcode grouping parameters
#####################################################
barcodes_per_group = 118

## Input file names

In [3]:
#####################################################
# Input files
#####################################################
# Paths are built with os.path.join so they stay valid whether or not working_dir
# ends with a path separator. The trailing "" keeps a separator at the end of
# input_dir, so any code that concatenates onto it keeps working.
input_dir = os.path.join(working_dir, "Input", "")

# input VCF
input_snps_vcf_filename = os.path.join(input_dir, 'all_variants_design_oligos.vcf')

# pool assignments
input_vars_pool_assignment_filename = os.path.join(input_dir, 'crispey3_vars_pool_assignment.txt')

# VEP annotations of VCF (txt format).
# Only the file extension is swapped, so a '.vcf' elsewhere in the path is left alone.
input_snps_vep_output_filename = os.path.splitext(input_snps_vcf_filename)[0] + '_VEPoutput.txt'

# oligo design table - contains information about the other segments in the oligo
input_oligo_design_table_filename = os.path.join(input_dir, 'crispey3_oligo_design_table.txt')

# SNP guides donor design table
# contains set names and filtering to sort variants/guides for each set. (use filter_in and filter_out columns)
input_SNP_donor_design_table_filename = os.path.join(input_dir, 'crispey3_design_donor_for_snps.txt')

# programmed barcodes list
input_barcode_table_filename = os.path.join(input_dir, '12BP_PBCs_well_grouped.csv')

# technical guides-donors list
input_technical_guides_donors_filename = os.path.join(input_dir, 'crispey3_technical_guides_donors.txt')

#####################################################
# Yeast genome reference files
#####################################################
ref_files_dir = os.path.expanduser("~/yeast/genomes/")
# http://downloads.yeastgenome.org/sequence/S288C_reference/genome_releases/
# NOTE(review): the file name suggests a single-chromosome FASTA; confirm it contains
# every chromosome referenced by the input VCF.
input_genome_fasta_filename = os.path.join(ref_files_dir, 'Saccharomyces_cerevisiae.R64-1-1.dna.chromosome.I.fa')

# annotations files are from # http://downloads.yeastgenome.org/sequence/S288C_reference/genome_releases/
# version 64_1_1
input_gff_filename = os.path.join(ref_files_dir, 'saccharomyces_cerevisiae_R64-1-1_20110208.gff')

## Output file names

These are the names of the files that are used and created during the pipeline.
Names may vary between different designs.

In [4]:
###############################################################################################################
# Output files (no need to pass as argument -  depends on the output directory, PAM sequence and max edit distance)
###############################################################################################################
# The trailing "" keeps a path separator at the end of output_directory, so names can be appended directly.
output_directory = os.path.join(working_dir, "Output", "")
output_files_uniq_str = lib_name + "_" + PAM_seq + "_" + str(edit_max_distance_from_PAM5prime) + "bp"

# Common prefixes shared by the file names below.
all_snps_prefix = output_directory + "all_SNPs_" + output_files_uniq_str
oligos_nonuniq_prefix = output_directory + "oligos_nonuniq_" + output_files_uniq_str
oligos_uniq_prefix = output_directory + "oligos_uniq_" + output_files_uniq_str

# intermediate files during guide-donor-oligo design process
output_SNP_table_filename = all_snps_prefix + "_SNP.tab"
output_guides_table_filename = all_snps_prefix + "_GUIDE.tab"
output_guides_with_features_table_filename = all_snps_prefix + "_GUIDE_withFeatures.tab"
output_SNP_donor_table_filename = all_snps_prefix + "_DONOR.tab"
output_guides_with_features_and_rank_table_filename = all_snps_prefix + "_GUIDE_withFeatures_withRank.tab"
output_oligos_table_filename = all_snps_prefix + "_OLIGO.tab"
output_oligos_table_with_barcodes_filename = all_snps_prefix + "_OLIGO_withBarcodes.tab"
output_oligos_table_complete_filename = all_snps_prefix + "_OLIGO_complete.tab"

# oligos to one table
output_oligo_for_production_nonuniq_filename = oligos_nonuniq_prefix + "_OLIGO.txt"
# Given its own file name so that it cannot overwrite the plain non-uniq oligo file.
output_oligo_for_production_nonuniq_with_align_filename = oligos_nonuniq_prefix + "_OLIGO_with_align.txt"

output_oligo_for_production_uniq_filename = oligos_uniq_prefix + "_OLIGO.txt"
output_oligo_for_production_uniq_batch_prefix_filename = oligos_uniq_prefix + "_"

# SNP table with VEP annotations
output_SNP_withAnnotations_table_filename = output_directory + "all_SNPs_" + lib_name + "_annotated.txt"

# Step 1: Design and extract all guides

In [5]:
# Seed Python's built-in `random` module so that any later step drawing from it is repeatable.
# NOTE(review): this does not seed NumPy; confirm which random generator the design functions use.
random.seed(1)

In [6]:
# Extract guides for the SNPs in the input VCF; the SNP and guide table file names
# are passed as the third and fourth arguments.
pam_sequences = [PAM_seq]
extract_guides_for_snps(
    input_snps_vcf_filename,
    input_genome_fasta_filename,
    output_SNP_table_filename,
    output_guides_table_filename,
    pam_sequences,
    guide_length,
    edit_max_distance_from_PAM5prime,
    var_id_prefix="",
)

---------------------- extracting guides for SNPs -------------------------------
Parsing SNP file line: 5000, found guides for: 4999, #total guides: 12538
Finish parsing VCF: 8553, found: 8553, #guides: 21406

---------------------------- Done extracting guides for SNPs --------------------------


# Step 2: Add guides features using Azimuth
Azimuth is written in Python 2, so the script for extracting guide features must be run in a Python 2 environment that has Azimuth and Bowtie2 installed.

In this environment, run **crispey3_add_guide_features.py**. Adjust the following variables in the script accordingly:
- output_guides_table_filename
- output_guides_with_features_table_filename
- input_genome_fasta_filename (reference genome)
- list of genome fasta filenames for off-target search

# Step 3: Design donors for guides

In [7]:
###############################################################################################################
# design donor sequence for each guide (set names specified in donor_design_table)
###############################################################################################################
# Reads the guide table with features that Step 2 (the separate Azimuth script) is expected to have produced.
out_SNP_donor_df = design_donor_for_SNP_guides(
    input_SNP_donor_design_table_filename,
    output_SNP_table_filename,
    output_guides_with_features_table_filename,
    input_genome_fasta_filename,
    donor_length,
    excluded_seqs,
    min_dist_5prime_arm,
    min_dist_3prime_arm,
    output_SNP_donor_table_filename,
)

Excluded seq: AAAAAAAAAA
Excluded seq: CCCCCCCCCC
Excluded seq: GGGGGGGGGG
Excluded seq: TTTTTTTTTT
Excluded seq: GCATGC
Excluded seq: GGCGCGCC
Excluded seq: GCGGCCGC
--------- designing donors according to line: 0  (# guide ids = 3970)----------
set_name                                      ergosterol
filter_in            VAR:ERG;EGE;EGD;EGC;EGB;EGA;EG9;EG8
filter_out                                          None
donor_mut_type                                   REF2ALT
donor_seq_offsets                                   [14]
Name: 0, dtype: object
-----------------------------------------------------------




Designing donor number: 1000
Designing donor number: 2000




Designing donor number: 3000
--------- designing donors according to line: 1  (# guide ids = 4959)----------
set_name                 gxg
filter_in            VAR:GXG
filter_out              None
donor_mut_type       REF2ALT
donor_seq_offsets       [14]
Name: 1, dtype: object
-----------------------------------------------------------
Designing donor number: 4000
Designing donor number: 5000
Designing donor number: 6000




Designing donor number: 7000




Designing donor number: 8000
--------- designing donors according to line: 2  (# guide ids = 698)----------
set_name                  epival
filter_in            VAR:TDH;VAL
filter_out                  None
donor_mut_type           REF2ALT
donor_seq_offsets           [14]
Name: 2, dtype: object
-----------------------------------------------------------
Designing donor number: 9000




--------- designing donors according to line: 3  (# guide ids = 10998)----------
set_name                 gxe
filter_in            VAR:GXE
filter_out              None
donor_mut_type       REF2ALT
donor_seq_offsets       [14]
Name: 3, dtype: object
-----------------------------------------------------------




Designing donor number: 10000
Designing donor number: 11000
Designing donor number: 12000




Designing donor number: 13000
Designing donor number: 14000




Designing donor number: 15000




Designing donor number: 16000
Designing donor number: 17000




Designing donor number: 18000
Designing donor number: 19000
Designing donor number: 20000
--------- designing donors according to line: 4  (# guide ids = 781)----------
set_name                   hsp90
filter_in            VAR:HSP;HSX
filter_out                  None
donor_mut_type           REF2ALT
donor_seq_offsets           [14]
Name: 4, dtype: object
-----------------------------------------------------------
Designing donor number: 21000
saving donor sequences to: /home/users/rang/crispey3/library_design/Output/all_SNPs_crispey3_GG_9bp_DONOR.tab


# Step 4: Filter and rank guides

In [8]:
###############################################################################################################
# add filter and ranking to the SNP guides (depends on having a SNP and a donor tables)
###############################################################################################################
out_SNP_guides_withFandR_df = rank_and_filter_SNP_guides(
    input_guides_with_features_table_filename=output_guides_with_features_table_filename,
    input_SNP_table_filename=output_SNP_table_filename,
    input_donor_table_filename=output_SNP_donor_table_filename,
    output_guides_with_features_and_rank_table_filename=output_guides_with_features_and_rank_table_filename,
    off_targets_min_mismatch_SNP_guides=off_targets_min_mismatch_SNP_guides,
    min_ok_Azimuth_score_SNP_guides=min_ok_Azimuth_score_SNP_guides,
    edit_max_distance_from_PAM5prime=edit_max_distance_from_PAM5prime,
)

Saving : /home/users/rang/crispey3/library_design/Output/all_SNPs_crispey3_GG_9bp_GUIDE_withFeatures_withRank.tab


# Step 5a: Write guides and donors into oligos
Assemble guide and donor into oligos according to oligo_design_table. For now, N's are written into the barcode segment of the oligos; the barcode sequence and pool number are assigned in Step 5c.

In [9]:
# Build the oligo table from the ranked guides and their donors.
# No barcode table is given at this stage, so the barcode segment is filled with N's.
oligo_all_df = generate_oligo_from_guide_donor_barcode(
    input_oligo_design_table_filename=input_oligo_design_table_filename,
    input_guide_table_filename=output_guides_with_features_and_rank_table_filename,
    input_donor_table_filename=output_SNP_donor_table_filename,
    input_barcode_table_filename=None,  # use None to fill N's in barcode segment
    input_guide_iloc=None,              # option to filter out guides
    input_donor_iloc=None,              # option to filter out donors
    group_size=barcodes_per_group,
    output_oligos_table_filename=output_oligos_table_filename,
)

Before filtering there are 21406 guides and 21381 donors
After filtering there are 21406 guides and 21381 donors
shared columns
['var_id', 'guide_id']
joining the guides and the donors by shared columns (guide_id) creates 21381 oligos
No barcodes provided. Skipping barcode assignment.
parsing oligo 0 out of 21381
parsing oligo 1000 out of 21381

  oligo_design_df) )
  row['barcode_id'] + "#")# + \



parsing oligo 2000 out of 21381
parsing oligo 3000 out of 21381
parsing oligo 4000 out of 21381
parsing oligo 5000 out of 21381
parsing oligo 6000 out of 21381
parsing oligo 7000 out of 21381
parsing oligo 8000 out of 21381
parsing oligo 9000 out of 21381
parsing oligo 10000 out of 21381
parsing oligo 11000 out of 21381
parsing oligo 12000 out of 21381
parsing oligo 13000 out of 21381
parsing oligo 14000 out of 21381
parsing oligo 15000 out of 21381
parsing oligo 16000 out of 21381
parsing oligo 17000 out of 21381
parsing oligo 18000 out of 21381
parsing oligo 19000 out of 21381
parsing oligo 20000 out of 21381
parsing oligo 21000 out of 21381
Saving : /home/users/rang/crispey3/library_design/Output/all_SNPs_crispey3_GG_9bp_OLIGO.tab


# Step 5b: Add humanized yeast oligo set to oligo table
Oligos were generated using a separate script (see the design_library_humanized_yeast Jupyter notebook).

In [10]:
humanized_oligos_file = os.path.expanduser("~/crispey3/humanized/Output/all_SNPs_humanized_combined_GG_9bp_OLIGO.tab")

# NOTE: this cell overwrites the oligo table it reads from, so running it a second time
# appends the humanized oligos again. Re-run Step 5a first if this cell must be repeated.
designed_oligos = pd.read_csv(output_oligos_table_filename, sep='\t')
humanized_oligos = pd.read_csv(humanized_oligos_file, sep='\t')

# Stack the humanized oligos below the designed ones and renumber the rows.
oligo_all_df = pd.concat([designed_oligos, humanized_oligos]).reset_index(drop=True)

oligo_all_df.to_csv(output_oligos_table_filename, sep='\t', index = False)

# Step 5c: Assign pool number and barcode sequence to each oligo

In [11]:
###############################################################################################################
# update oligo sequences file with barcodes and pool number
###############################################################################################################
# max_pool_size uses the barcodes_per_group design parameter instead of a second hard-coded 118,
# so it cannot drift from the group size used when the oligos were generated in Step 5a.
# NOTE(review): assumes both values describe the same per-pool capacity for variant oligos - confirm.
oligo_all_df = assign_pool_barcode_to_oligos(
    input_oligo_design_table_filename=input_oligo_design_table_filename,
    input_oligo_table_filename=output_oligos_table_filename,
    input_pool_assignment_filename=input_vars_pool_assignment_filename,
    input_barcode_table_filename=input_barcode_table_filename,
    output_oligos_with_barcodes_filename=output_oligos_table_with_barcodes_filename,
    max_pool_size=barcodes_per_group,
)

Saving : /home/users/rang/crispey3/library_design/Output/all_SNPs_crispey3_GG_9bp_OLIGO_withBarcodes.tab


# Step 6: Assign technical oligos to pools
Technical oligos (neutral/editing controls) will be assigned barcodes and added to each pool to fill it to the maximum size of 121. These technical oligos are drawn from the last two pools of the CRISPEY3 library, which comprise:
- Pool 191: Editing controls
    - editing control (KO donor + targeting guide for 42 known effects) - "hdr"
    - Synonymous control (syn donor + targeting guide) - "synonymous"
    - NHEJ control (scramble donor + targeting guide) - "scramble"
- Pool 192: Neutral controls
    - Non-cutting guide control (KO donor + GFP guide) - "noncut"
    - Neutral oligos (scramble donor + scramble guide) - "neutral"
    - Neutral oligos (legacy versions, CRISPEY1) - "neutral (legacy)"


In [12]:
# Number of oligos every pool holds once technical controls have been added.
full_pool_size = 121

# Values used for the positional columns of technical oligos after joining with the library.
technical_fill_values = {'chrom': 'None',
                         'SNP_chr_pos': -4,
                         'guide0_chr_pos': -1,
                         'guide_cut_chr_pos': -4,
                         'SNP_pos_in_guide': 0,
                         'donor_seq_shift': 0,
                         'donor_mut_pos_in_guide': 0}

# complete table of technical guides-donors
technical_df = pd.read_csv(input_technical_guides_donors_filename, sep='\t')
# shuffle each control set with a fixed random_state so the assignment is repeatable
neutral_controls = technical_df.query('donor_info_str=="neutral"').sample(frac=1, random_state=1).reset_index(drop=True).copy()
editing_controls = technical_df.query('donor_info_str=="hdr"').sample(frac=1, random_state=1).reset_index(drop=True).copy()
if len(neutral_controls) == 0 or len(editing_controls) == 0:
    # the round-robin selection below takes indices modulo the size of each control set
    raise ValueError("technical guides-donors table must contain both 'neutral' and 'hdr' controls")

# read in barcodes list
barcodes_df = pd.read_csv(input_barcode_table_filename)
barcodes_df = barcodes_df.rename(columns={'Well':'pool','Barcode':'barcode_seq','Unique_ID':'barcode_id'})

# read oligo design table
oligo_design_df = pd.read_csv(input_oligo_design_table_filename, sep='\t', na_values = "None")
oligo_design_df['DNA_seq'] = oligo_design_df['DNA_seq'].str.upper()


# assign technical oligos to fill each pool
n_counter = 0  # position in neutral_controls of the next control to hand out
e_counter = 0  # position in editing_controls of the next control to hand out
technical_oligos_added = []
for pool_num, pool_df in oligo_all_df.groupby('pool'):
    num_to_add = full_pool_size - len(pool_df)
    if num_to_add < 0:
        raise ValueError("pool {} already holds {} oligos, more than the pool size of {}".format(
            pool_num, len(pool_df), full_pool_size))
    if num_to_add == 0:
        # pool is already full - nothing to add
        continue

    # split the open slots between neutral and editing controls (editing gets the extra one if odd)
    num_neutral_to_add = num_to_add // 2
    num_editing_to_add = num_to_add - num_neutral_to_add

    # select technical guide-donors for pool, cycling through each shuffled control set
    neutral_rows = [(n_counter + i) % len(neutral_controls) for i in range(num_neutral_to_add)]
    editing_rows = [(e_counter + i) % len(editing_controls) for i in range(num_editing_to_add)]
    tech_to_add = pd.concat([neutral_controls.loc[neutral_rows, :],
                             editing_controls.loc[editing_rows, :]]).reset_index(drop=True)

    # assign pool barcodes: barcodes of this pool that no oligo in the pool uses yet
    free_barcodes = barcodes_df.query('pool==@pool_num & ~barcode_id.isin(@pool_df.barcode_id)').reset_index(drop=True)
    if len(free_barcodes) < num_to_add:
        raise ValueError("pool {} needs {} technical oligos but only {} unused barcodes are available".format(
            pool_num, num_to_add, len(free_barcodes)))
    # the two tables are joined side by side by row position, so take exactly one barcode per
    # technical oligo; a length mismatch would otherwise create rows with missing values
    tech_to_add = pd.concat([tech_to_add, free_barcodes.head(num_to_add)], axis=1)

    # generate oligo_seq
    tech_to_add['oligo_seq'] = tech_to_add.apply(lambda x: generate_oligo_seq_with_barcode(x['guide_noPAM'], x['donor_seq'], x['barcode_seq'], oligo_design_df), axis=1)
    # add newly assigned technical oligos
    technical_oligos_added.append(tech_to_add)

    n_counter += num_neutral_to_add
    e_counter += num_editing_to_add

# join with rest of library (skipped when every pool was already full, since there is nothing to concatenate)
if technical_oligos_added:
    oligo_all_df = oligo_all_df.merge(pd.concat(technical_oligos_added), how='outer')
oligo_all_df = oligo_all_df.fillna(technical_fill_values)

In [13]:
# define pool 191 and 192
controls_per_pool = 121
# pool 191: the first `controls_per_pool` editing-type controls, ordered by type and oligo id
pool_191 = (
    technical_df
    .query('donor_info_str.isin(["hdr","synonymous","scramble"])')
    .sort_values(['donor_info_str', 'oligo_id'])
    .head(controls_per_pool)
    .reset_index(drop=True)
)
# pool 192: every technical entry whose guide is not used in pool 191
pool_192 = technical_df.query('~guide_id.isin(@pool_191.guide_id)').reset_index(drop=True)

# assign pool barcodes (joined side by side, row by row)
barcodes_191 = barcodes_df.query('pool==191').reset_index(drop=True)
barcodes_192 = barcodes_df.query('pool==192').reset_index(drop=True)
pool_191 = pd.concat([pool_191, barcodes_191], axis=1)
pool_192 = pd.concat([pool_192, barcodes_192], axis=1)

# generate oligo_seq
for control_pool in (pool_191, pool_192):
    control_pool['oligo_seq'] = control_pool.apply(
        lambda row: generate_oligo_seq_with_barcode(row['guide_noPAM'], row['donor_seq'], row['barcode_seq'], oligo_design_df),
        axis=1)

# join with rest of library
technical_fill_values = {'chrom': 'None',
                         'SNP_chr_pos': -4,
                         'guide0_chr_pos': -1,
                         'guide_cut_chr_pos': -4,
                         'SNP_pos_in_guide': 0,
                         'donor_seq_shift': 0,
                         'donor_mut_pos_in_guide': 0}
control_pools_df = pd.concat([pool_191, pool_192])
oligo_all_df = oligo_all_df.merge(control_pools_df, how='outer').fillna(technical_fill_values)

# adjust dtypes
integer_columns = ['SNP_chr_pos', 'guide0_chr_pos',
                   'guide_cut_chr_pos', 'SNP_pos_in_guide',
                   'donor_seq_shift', 'donor_mut_pos_in_guide']
oligo_all_df[integer_columns] = oligo_all_df[integer_columns].astype(int)

# write completed oligo table to file
oligo_all_df.to_csv(output_oligos_table_complete_filename, sep='\t', index = False)

# Step 7: Write completed library to submission file
Oligos are formatted for the matrixed oligonucleotide synthesis submission form. Entries are sorted by pool number, then by barcode ID. Oligo names contain the set name, pool number and barcode number.

In [14]:
# Reload the completed library from file and inspect it before writing the order files.
oligo_all_df = pd.read_csv(output_oligos_table_complete_filename, sep='\t')

set_sizes_df = oligo_all_df['set_name'].value_counts().sort_index()
pool_size_distribution = oligo_all_df.groupby('pool').size().value_counts()

# Each summary is printed as a heading line followed by its value on the next line.
print("set sizes:", set_sizes_df, sep="\n")
print("total # oligos:", len(oligo_all_df), sep="\n")
print("distribution of pool sizes", pool_size_distribution, sep="\n")

# Write the oligo tables used for ordering.
oligo_for_order_dfs = write_output_oligos(
    oligo_all_df,
    output_oligo_for_production_nonuniq_filename=output_oligo_for_production_nonuniq_filename,
    output_oligo_for_production_uniq_filename=output_oligo_for_production_uniq_filename,
    output_oligo_for_production_uniq_batch_prefix_filename=output_oligo_for_production_uniq_batch_prefix_filename,
)

print("--------------- Finished designing the library! -------------------")

  interactivity=interactivity, compiler=compiler, result=result)


set sizes:
DHFR             70
DPAGT1          118
NSDHL           118
PGK1            118
PKLR            118
UROS            118
epival          692
ergosterol     3968
gxe           10983
gxg            4957
hsp90           781
technical      1191
Name: set_name, dtype: int64
total # oligos:
23232
distribution of pool sizes
121    192
dtype: int64
Saving : /home/users/rang/crispey3/library_design/Output/oligos_nonuniq_crispey3_GG_9bp_OLIGO.txt
--- non uniq set counts:
gxe           10983
gxg            4957
ergosterol     3968
technical      1191
hsp90           781
epival          692
PKLR            118
PGK1            118
DPAGT1          118
UROS            118
NSDHL           118
DHFR             70
Name: set_name, dtype: int64
--- uniq set counts:
gxe           10983
gxg            4957
ergosterol     3968
technical      1191
hsp90           781
epival          692
PKLR            118
PGK1            118
DPAGT1          118
UROS            118
NSDHL           118
DHFR          