## Designing guides for a Type I CRISPR-Cas3 system

Rich Stoner, Synthego

Based on the work described in this paper:

**Repurposing endogenous type I CRISPR-Cas systems for programmable gene repression**

https://www.ncbi.nlm.nih.gov/pubmed/25326321





In [None]:
# we'll use coral to manage the sequence data, and pandas for loading the csv
import coral as cor
import pandas as pd

In [None]:
# location of the plasmid sequence file
ape_path = './CRISPR/pUA66-lacZ.ape'

# location of the supplemental-material table from the paper (CSV), loaded
# straight into a DataFrame; its 'Distance from TSS' column is compared
# against the computed guides at the end of the notebook
nar_data = pd.read_csv('./CRISPR/nar_supplemental_material.csv')

In [None]:
# these are the guides found in the Luo et al. paper
nar_data

In [None]:
# read the APE file (coral uses biopython on the backend)
# the resulting object is what every later cell queries
# (features, locate(), and the top / bottom strands)
plasmid = cor.seqio.read_dna(ape_path)

In [None]:
# render it in the notebook (nothing is assigned here)
plasmid.display()

In [None]:
# show a bit of detail on the defined features
# (presumably these are what select_features() is queried against below)
plasmid.features

In [None]:
# extract the transcription start site (TSS): take the first feature returned
# for 'Start of transcription' and keep its start coordinate
# NOTE(review): [0] assumes at least one match and ignores any further
# matches -- confirm the plasmid file defines exactly one such feature
tss_feature = plasmid.select_features('Start of transcription')[0]
tss = tss_feature.start

In [None]:
# and the gfp domain (not used for the rest of doc however)
# NOTE(review): [0] assumes at least one feature matches 'gfpmut2'
gfpmut2_feature = plasmid.select_features('gfpmut2')[0]
gfpmut2 = plasmid.extract(gfpmut2_feature)
# show the length of the extracted region
len(gfpmut2)

In [None]:
# PAM sequences to search for on the watson strand
TYPE_1_PAM_W_SITES = ['AAG', 'ATG', 'AGG', 'GAG']

# map each PAM to whatever plasmid.locate() returns for it; the guide loops
# read index 0 of that result for the watson strand and index 1 for crick
pam_locations = {ps: plasmid.locate(ps) for ps in TYPE_1_PAM_W_SITES}

In [None]:
# example watson strand locations for the AGG PAM
# (index 0 of a locate() result is what the watson guide loop iterates over)
print(pam_locations['AGG'][0])

In [None]:
# list that collects one dict per candidate guide; the strand loops append to it
guides = []

# guide length: number of bases taken immediately after each PAM
guide_length = 30

# the repeat sequence (aka your clustered repeat)
# (not referenced by any of the other cells shown in this notebook)
repeated_section = 'GTTCCCCGCGCCAGCGGGGATAAACCG'

In [None]:
# locate all target sequences on the watson strand
#
# For every PAM hit on the top strand (index 0 of each locate() result) record
# the PAM, its position, its offset from the TSS, the PAM + guide sequence and
# the guide alone (the guide_length bases that follow the PAM).

for pam, pams_by_strand in pam_locations.items():

    for w_pam_locus in pams_by_strand[0]:

        guide_start = w_pam_locus + len(pam)
        guide_end = guide_start + guide_length

        guide_only = plasmid.top[guide_start:guide_end]

        # A PAM closer than guide_length bases to the end of the sequence
        # produces a slice shorter than guide_length; that is not a usable
        # guide, so skip it instead of recording a truncated sequence.
        # NOTE(review): on a circular plasmid these could instead be completed
        # by wrapping around the origin -- not attempted here.
        if len(guide_only) < guide_length:
            continue

        distance_from_tss = w_pam_locus - tss
        if distance_from_tss < 0:
            # NOTE(review): upstream hits are shifted by 3; this looks like the
            # PAM length (all PAMs used here are 3 nt) -- confirm before
            # adding PAMs of a different length
            distance_from_tss += 3

        guide = {
            'pam_locus': w_pam_locus,
            'distance_from_tss': distance_from_tss,
            'name': 'W Guide %d' % len(guides),
            'PAM': pam,
            'strand': 'watson',
            'pam_plus_sequence': plasmid.top[w_pam_locus:guide_end],
            'guide_only': guide_only,
        }

        guides.append(guide)

In [None]:
# now do the same on the crick strand

# PAM sequences to search for on the crick strand
TYPE_1_PAM_C_SITES = ['AAG', 'ATG', 'AGG', 'GAG']

# rebuild the PAM -> locate() result map; the crick loop reads index 1 of
# each result
pam_locations = {ps: plasmid.locate(ps) for ps in TYPE_1_PAM_C_SITES}

In [None]:
# For every PAM hit on the bottom strand (index 1 of each locate() result)
# record the same fields as for the watson strand. 'pam_locus' and both
# sequences are in bottom-strand coordinates; only the TSS distance is
# converted to top-strand coordinates.

for pam, pams_by_strand in pam_locations.items():

    for c_pam_locus in pams_by_strand[1]:

        guide_start = c_pam_locus + len(pam)
        guide_end = guide_start + guide_length

        guide_only = plasmid.bottom[guide_start:guide_end]

        # A PAM closer than guide_length bases to the end of the strand
        # produces a slice shorter than guide_length; that is not a usable
        # guide, so skip it instead of recording a truncated sequence.
        # NOTE(review): on a circular plasmid these could instead be completed
        # by wrapping around the origin -- not attempted here.
        if len(guide_only) < guide_length:
            continue

        # bottom-strand index mirrored onto the top strand
        c_pam_in_top_coords = len(plasmid) - c_pam_locus

        distance_from_tss = c_pam_in_top_coords - tss - len(pam)
        if distance_from_tss < 0:
            # NOTE(review): upstream hits are shifted by 3; this looks like the
            # PAM length (all PAMs used here are 3 nt) -- confirm before
            # adding PAMs of a different length
            distance_from_tss += 3

        guide = {
            'pam_locus': c_pam_locus,
            'distance_from_tss': distance_from_tss,
            'name': 'C Guide %d' % len(guides),
            'PAM': pam,
            'strand': 'crick',
            'pam_plus_sequence': plasmid.bottom[c_pam_locus:guide_end],
            'guide_only': guide_only,
        }

        guides.append(guide)


In [None]:
# report how many candidate guides were collected across both strands
print('Found valid %d guides' % len(guides))

In [None]:
# let's look and see if we've found every single guide listed in the paper
#
# a paper guide counts as found when any computed guide (either strand) has
# the same distance from the TSS
# NOTE(review): only the distance is compared, not strand / PAM / sequence --
# confirm that this is a sufficient match criterion

for n, row in nar_data.iterrows():
    dt_tss = row['Distance from TSS']

    b_found = any(g['distance_from_tss'] == dt_tss for g in guides)

    if not b_found:
        # show the full row from the paper so the miss can be inspected
        print('Didnt find it')
        print(row)
    else:
        print('found a matching guide from the paper')

In [None]:
# uncomment this to print all of the guides found
# for g in guides:
#     if g['strand'] == 'watson':
#         print(g['name'], g['strand'],g['pam_locus'], g['distance_from_tss'], g['pam_plus_sequence'])
        
#     elif g['strand'] == 'crick':
#         print(g['name'], g['strand'],g['pam_locus'], g['distance_from_tss'], g['pam_plus_sequence'])