# CRISPEY Oligo Library Design
for initial design work

## Import packages and functions

In [49]:
import os
import random
import pandas as pd

# Directory holding the CRISPEY library-design code, and the directory in
# which this particular design is carried out.
crispey_libdesign_code_dir = os.path.expanduser('~/crispey-epistasis/lib_design/')
working_dir = os.path.expanduser("~/crispey3/initial_design/")

# The design helpers are imported while the code directory is the current
# working directory (presumably so that the local module is found on the
# import path -- verify if the import mechanism is ever changed).
os.chdir(crispey_libdesign_code_dir)
from extract_guides_functions import (
    extract_guides_for_snps,
    design_donor_for_SNP_guides,
    rank_and_filter_SNP_guides,
    generate_oligo_from_guide_donor_barcode,
    assign_pool_barcode_to_oligos,
    write_output_oligos,
    annotate_variants_by_VEPoutput,
)

# All subsequent cells run from the design's working directory.
os.chdir(working_dir)
print("Current directory: {}".format(os.getcwd()))


Current directory: /home/users/rang/crispey3/initial_design


## Design parameters

In [50]:
# Name of this library design; used as part of the output file names.
lib_name = "gxg_initial"

# ---------------------------------------------------
# Guide design for SNPs
# ---------------------------------------------------
guide_length = 20
edit_max_distance_from_PAM5prime = 9
PAM_seq = 'GG'
min_ok_Azimuth_score_SNP_guides = 0
# Increase this to also filter guides whose off-targets contain mismatches.
# A value of 1 only filters out guides with perfect-match off-targets.
off_targets_min_mismatch_SNP_guides = 1

# BOWTIE_exe = "bowtie2"

# ---------------------------------------------------
# Donor design for SNPs
# ---------------------------------------------------
agilent_homopolymer_max_len = 10

# Sequences that must not appear in a donor: homopolymer runs of the maximum
# length for each base (A, C, G, T), followed by restriction sites.
excluded_seqs = [base * agilent_homopolymer_max_len for base in 'ACGT'] + [
    'GCATGC',    # SphI cut site
    'GGCGCGCC',  # AscI cut site
    'GCGGCCGC',  # NotI cut site
]

donor_length = 108
min_dist_5prime_arm = 30
min_dist_3prime_arm = 55

# ---------------------------------------------------
# Barcode grouping parameters
# ---------------------------------------------------
barcodes_per_group = 118


## Input file names

In [51]:
#####################################################
# Input files
#####################################################
# Both base directories below end with "/", so os.path.join(base, name)
# produces exactly base + name.
input_dir = working_dir + "Input/"

# Input VCF with the variants to design oligos for.
input_snps_vcf_filename = os.path.join(input_dir, 'gxg_variants_design_oligos_initial.vcf')

# Pool assignments of the variants.
input_vars_pool_assignment_filename = os.path.join(input_dir, 'crispey3_vars_pool_assignment.txt')

# VEP annotations of the VCF (txt format); named after the VCF file.
input_snps_vep_output_filename = input_snps_vcf_filename.replace('.vcf', '_VEPoutput.txt')

# Oligo design table - contains information about the other segments in the oligo.
input_oligo_design_table_filename = os.path.join(input_dir, 'crispey3_oligo_design_table.txt')

# SNP guides donor design table: contains set names and the filtering used to
# sort variants/guides for each set (use the filter_in and filter_out columns).
input_SNP_donor_design_table_filename = os.path.join(input_dir, 'design_donor_for_snps.txt')

# Programmed barcodes list.
input_barcode_table_filename = os.path.join(input_dir, '12BP_PBCs_well_grouped.csv')

#####################################################
# Yeast genome reference files
#####################################################
ref_files_dir = os.path.expanduser("~/yeast/genomes/")

# Genome FASTA, from
# http://downloads.yeastgenome.org/sequence/S288C_reference/genome_releases/
input_genome_fasta_filename = os.path.join(
    ref_files_dir, 'Saccharomyces_cerevisiae.R64-1-1.dna.chromosome.I.fa')

# Annotation files are from
# http://downloads.yeastgenome.org/sequence/S288C_reference/genome_releases/
# (version 64_1_1).
input_gff_filename = os.path.join(
    ref_files_dir, 'saccharomyces_cerevisiae_R64-1-1_20110208.gff')

# Alternative (disabled) reference genome:
# input_genome_fasta_filename = os.path.expanduser("~/scratch/hg19/hg19.fa")

## Output file names

These are the names of the files that are used and created during the pipeline.
Names may vary between different designs.

In [52]:
###############################################################################################################
# Output files (no need to pass as argument -  depends on the output directory, PAM sequence and max edit distance)
###############################################################################################################
output_directory = working_dir + "Output/"
# Tag shared by the output file names: <library name>_<PAM>_<max edit distance>bp
output_files_uniq_str = lib_name + "_" + PAM_seq + "_" + str(edit_max_distance_from_PAM5prime) + "bp"

# intermediate files during guide-donor-oligo design process
output_SNP_table_filename =    output_directory + "all_SNPs_" + output_files_uniq_str + "_SNP.tab"
output_guides_table_filename = output_directory + "all_SNPs_" + output_files_uniq_str + "_GUIDE.tab"
output_guides_with_features_table_filename = output_directory + "all_SNPs_" + output_files_uniq_str + "_GUIDE_withFeatures.tab"
output_SNP_donor_table_filename = output_directory + "all_SNPs_" + output_files_uniq_str + "_DONOR.tab"
output_guides_with_features_and_rank_table_filename = output_directory + "all_SNPs_" + output_files_uniq_str + "_GUIDE_withFeatures_withRank.tab"
output_oligos_table_filename = output_directory + "all_SNPs_" + output_files_uniq_str + "_OLIGO.tab"
output_oligos_table_with_barcodes_filename = output_directory + "all_SNPs_" + output_files_uniq_str + "_OLIGO_withBarcodes.tab"

# oligos to one table
output_oligo_for_production_nonuniq_filename = output_directory + "oligos_nonuniq_" + output_files_uniq_str + "_OLIGO.txt"
# The "with align" table gets its own suffix: it previously resolved to the
# same path as the plain non-unique table above, so writing one would have
# overwritten the other.
output_oligo_for_production_nonuniq_with_align_filename = output_directory + "oligos_nonuniq_" + output_files_uniq_str + "_OLIGO_withAlign.txt"

output_oligo_for_production_uniq_filename = output_directory + "oligos_uniq_" + output_files_uniq_str + "_OLIGO.txt"
# Prefix only; the rest of each batch file name is appended by the code that writes the batches.
output_oligo_for_production_uniq_batch_prefix_filename = output_directory + "oligos_uniq_" + output_files_uniq_str + "_"

# SNP table with VEP annotations (named by library only, not by PAM / edit distance)
output_SNP_withAnnotations_table_filename = output_directory + "all_SNPs_" + lib_name + "_annotated.txt"


# Step 1: Design and extract all guides

In [53]:
# Fix the seed of Python's global random generator so that repeated runs of
# the cells below start from the same generator state.
RANDOM_SEED = 1
random.seed(RANDOM_SEED)


In [24]:
# Extract guides for the SNPs in the input VCF. Reads the VCF and the genome
# FASTA; the SNP table and guide table file names are passed as outputs.
pam_sequences = [PAM_seq]  # the PAM is passed as a list
extract_guides_for_snps(
    input_snps_vcf_filename,
    input_genome_fasta_filename,
    output_SNP_table_filename,
    output_guides_table_filename,
    pam_sequences,
    guide_length,
    edit_max_distance_from_PAM5prime,
    var_id_prefix="",
)


---------------------- extracting guides for SNPs -------------------------------
Finish parsing VCF: 403, found: 380, #guides: 626

---------------------------- Done extracting guides for SNPs --------------------------


# Step 2: Add guides features using Azimuth

Azimuth is written in Python 2, so the script that extracts guide features must be run in a Python 2 environment with Azimuth and Bowtie2 installed (for an example environment, see the crispr2_7 requirement file).

In this environment run **crispey3_add_guide_features.py** with:
- output_guides_table_filename
- output_guides_with_features_table_filename
- input_genome_fasta_filename (reference genome)
- list of genome fasta filenames for off-target search

Adjust the script's variables to CRISPEY library parameters before running!

# Step 3: Design donors for guides

In [54]:
# Design a donor for each guide, following the rows of the donor design table.
# Takes the SNP table and the guides-with-features table (the latter produced
# in Step 2) and saves the donor sequences to output_SNP_donor_table_filename.
out_SNP_donor_df = design_donor_for_SNP_guides(
    input_SNP_donor_design_table_filename,
    output_SNP_table_filename,
    output_guides_with_features_table_filename,
    input_genome_fasta_filename,
    donor_length,
    excluded_seqs,
    min_dist_5prime_arm,
    min_dist_3prime_arm,
    output_SNP_donor_table_filename,
)


Excluded seq: AAAAAAAAAA
Excluded seq: CCCCCCCCCC
Excluded seq: GGGGGGGGGG
Excluded seq: TTTTTTTTTT
Excluded seq: GCATGC
Excluded seq: GGCGCGCC
Excluded seq: GCGGCCGC
--------- designing donors according to line: 0  (# guide ids = 69940)----------
set_name                 gxg
filter_in               None
filter_out              None
donor_mut_type       REF2ALT
donor_seq_offsets       [14]
Name: 0, dtype: object
-----------------------------------------------------------




Designing donor number: 1000




Designing donor number: 2000






Designing donor number: 3000




Designing donor number: 4000




Designing donor number: 5000




Designing donor number: 6000






Designing donor number: 7000






Designing donor number: 8000




Designing donor number: 9000




Designing donor number: 10000






Designing donor number: 11000




Designing donor number: 12000




Designing donor number: 13000




Designing donor number: 14000






Designing donor number: 15000






Designing donor number: 16000




Designing donor number: 17000






Designing donor number: 18000






Designing donor number: 19000










Designing donor number: 20000






Designing donor number: 21000




Designing donor number: 22000




Designing donor number: 23000




Designing donor number: 24000








Designing donor number: 25000




Designing donor number: 26000




Designing donor number: 27000




Designing donor number: 28000






Designing donor number: 29000






Designing donor number: 30000








Designing donor number: 31000






Designing donor number: 32000






Designing donor number: 33000






Designing donor number: 34000




Designing donor number: 35000




Designing donor number: 36000






Designing donor number: 37000




Designing donor number: 38000




Designing donor number: 39000




Designing donor number: 40000




Designing donor number: 41000






Designing donor number: 42000






Designing donor number: 43000




Designing donor number: 44000








Designing donor number: 45000




Designing donor number: 46000




Designing donor number: 47000






Designing donor number: 48000




Designing donor number: 49000




Designing donor number: 50000




Designing donor number: 51000






Designing donor number: 52000










Designing donor number: 53000




Designing donor number: 54000






Designing donor number: 55000








Designing donor number: 56000






Designing donor number: 57000






Designing donor number: 58000




Designing donor number: 59000




Designing donor number: 60000




Designing donor number: 61000




Designing donor number: 62000






Designing donor number: 63000




Designing donor number: 64000




Designing donor number: 65000




Designing donor number: 66000








Designing donor number: 67000






Designing donor number: 68000




saving donor sequences to: /home/users/rang/crispey3/initial_design/Output/all_SNPs_gxg_initial_GG_9bp_DONOR.tab


# Step 4: Filter and rank guides

In [55]:
###############################################################################################################
# Add filtering and ranking to the SNP guides (requires both the SNP table and the donor table to exist)
###############################################################################################################
# Keyword arguments are gathered first: input tables, the output table, then
# the thresholds taken from the design-parameters cell.
rank_filter_kwargs = dict(
    input_guides_with_features_table_filename=output_guides_with_features_table_filename,
    input_SNP_table_filename=output_SNP_table_filename,
    input_donor_table_filename=output_SNP_donor_table_filename,
    output_guides_with_features_and_rank_table_filename=output_guides_with_features_and_rank_table_filename,
    off_targets_min_mismatch_SNP_guides=off_targets_min_mismatch_SNP_guides,
    min_ok_Azimuth_score_SNP_guides=min_ok_Azimuth_score_SNP_guides,
    edit_max_distance_from_PAM5prime=edit_max_distance_from_PAM5prime,
)
out_SNP_guides_withFandR_df = rank_and_filter_SNP_guides(**rank_filter_kwargs)

Saving : /home/users/rang/crispey3/initial_design/Output/all_SNPs_gxg_initial_GG_9bp_GUIDE_withFeatures_withRank.tab


# Step 5: Write guides, donors and assigned barcodes into oligos
Assemble guide and donor into oligos according to oligo_design_table. Write N's into barcode segment of the oligos for now. The barcode sequence and pool number will be assigned later

In [56]:
# Assemble guides and donors into oligos according to the oligo design table.
# No barcode table is given here, so the barcode segment is filled with N's.
SNPs_oligo_df = generate_oligo_from_guide_donor_barcode(
    # input tables
    input_oligo_design_table_filename=input_oligo_design_table_filename,
    input_guide_table_filename=output_guides_with_features_and_rank_table_filename,
    input_donor_table_filename=output_SNP_donor_table_filename,
    input_barcode_table_filename=None,  # use None to fill N's in barcode segment
    # optional row selections (None = no filtering)
    input_guide_iloc=None,  # option to filter out guides
    input_donor_iloc=None,  # option to filter out donors
    # barcode grouping and output
    group_size=barcodes_per_group,
    output_oligos_table_filename=output_oligos_table_filename,
)

Before filtering there are 69940 guides and 68670 donors
After filtering there are 69940 guides and 68670 donors
shared columns
['var_id', 'guide_id']
joining the guides and the donors by shared columns (guide_id) creates 68670 oligos
No barcodes provided. Skipping barcode assignment.
parsing oligo 0 out of 68670
parsing oligo 1000 out of 68670
parsing oligo 2000 out of 68670
parsing oligo 3000 out of 68670
parsing oligo 4000 out of 68670
parsing oligo 5000 out of 68670
parsing oligo 6000 out of 68670
parsing oligo 7000 out of 68670
parsing oligo 8000 out of 68670
parsing oligo 9000 out of 68670
parsing oligo 10000 out of 68670
parsing oligo 11000 out of 68670
parsing oligo 12000 out of 68670
parsing oligo 13000 out of 68670
parsing oligo 14000 out of 68670
parsing oligo 15000 out of 68670
parsing oligo 16000 out of 68670
parsing oligo 17000 out of 68670
parsing oligo 18000 out of 68670
parsing oligo 19000 out of 68670
parsing oligo 20000 out of 68670
parsing oligo 21000 out of 68670
p

# Step 6: Write oligos to file
Oligos formatted for matrixed oligonucleotide synthesis submission form. Entries are sorted by pool number, followed by barcode ID. Oligo names contain set name, pool number and barcode number

In [10]:
# All oligos of this design come from the SNP oligo table.
oligo_all_df = SNPs_oligo_df

# Number of oligos per set, ordered by set name (a pandas Series despite the name).
set_sizes_df = oligo_all_df['set_name'].value_counts().sort_index()

# Report the per-set sizes and the total number of oligos, one item per line.
print("set sizes:", set_sizes_df, "total # oligos:", len(oligo_all_df), sep="\n")

# Write the non-unique and unique oligo tables (and the batch files that share
# the given prefix) for production.
oligo_for_order_dfs = write_output_oligos(
    oligo_all_df,
    output_oligo_for_production_nonuniq_filename=output_oligo_for_production_nonuniq_filename,
    output_oligo_for_production_uniq_filename=output_oligo_for_production_uniq_filename,
    output_oligo_for_production_uniq_batch_prefix_filename=output_oligo_for_production_uniq_batch_prefix_filename,
)

print("--------------- Finished designing the library! -------------------")

set sizes:
gxg    4861
Name: set_name, dtype: int64
total # oligos:
4861
Saving : /home/users/rang/crispey3/costanzo_variants/Output/crispey_oligos_nonuniq_gxg_GG_9bp_OLIGO.txt
--- non uniq set counts:
gxg    4861
Name: set_name, dtype: int64
--- uniq set counts:
gxg    4861
Name: set_name, dtype: int64
Saving : /home/users/rang/crispey3/costanzo_variants/Output/crispey_oligos_uniq_gxg_GG_9bp_OLIGO.txt
--------------- Finished designing the library! -------------------


# Step 7: (optional) Annotate VCF file
Upload the VCF file to the VEP web interface for yeast (http://uswest.ensembl.org/Saccharomyces_cerevisiae/Tools/VEP?db=core) and download all annotations in TXT format. Use the annotate_variants_by_VEPoutput function to produce an annotated variants table for future reference.

TODO: add function from select_gxg_library_varaints_2020 Jupyter notebook to extract_guides_functions file, write final table to file

In [None]:
# Combine the VCF, its VEP output (TXT) and the GFF annotation into an
# annotated variants table at output_SNP_withAnnotations_table_filename.
annotate_variants_by_VEPoutput(
    input_snps_vcf_filename,
    input_snps_vep_output_filename,
    input_gff_filename,
    output_SNP_withAnnotations_table_filename,
)