## Designing guides for a Type 1 CRISPR Cas3 system

Rich Stoner, Synthego

Based on the work described in this paper:

**Repurposing endogenous type I CRISPR-Cas systems for programmable gene repression**

https://www.ncbi.nlm.nih.gov/pubmed/25326321





In [2]:
# we'll use coral to manage the sequence data, and pandas for loading the csv
import coral as cor
import pandas as pd

In [6]:
# path to the pUA66-lacZ plasmid file (APE format)
ape_path = './plasmids/pUA66-lacZ.ape'

# supplemental table of guides from the paper, loaded into a DataFrame
nar_csv_path = './supplemental_data/nar_supplemental_material.csv'
nar_data = pd.read_csv(nar_csv_path)

In [7]:
# these are the guides reported in the Luo et al. paper
nar_data

Unnamed: 0,Space name,target strand,Distance from TSS,Protospacer sequence
0,T1 -141,T,4361,AGGCCCTTTCGTCTTCACACTCGAGCACGACAG
1,T2/lacZ,T,-37,AGGCTTTACACTTTATGCTTCCGGCTCGTATGT
2,T3,T,27,AGGAAACAGCTATGACCATGATTACGGATTCAC
3,T4,T,149,AGGAGATATACATATGAGTAAAGGAGAAGAACT
4,T5,T,263,AGGTGATGCAACATACGGAAAACTTACCCTTAA
5,T6,T,506,AGGTGATACCCTTGTTAATAGAATCGAGTTAAA
6,NT1 -129,NT,4373,AAGACGAAAGGGCCTCGTGATACGCCTATTTTT
7,NT2,NT,-20,AAGCATAAAGTGTAAAGCCTGGGGTGCCTAATG
8,NT3,NT,119,AAGGCGATTAAGTTGGGTAACGCCAGGGTTTTC
9,NT4,NT,180,AAGTTCTTCTCCTTTACTCATATGTATATCTCC


In [8]:
# read the plasmid sequence and its annotated features from the APE file
# (coral uses biopython on the backend)
plasmid = cor.seqio.read_dna(ape_path)

In [9]:
# render the plasmid in the notebook
plasmid.display()

In [10]:
# list the features annotated on the plasmid (name, type, span, strand)
plasmid.features

[lacZ promoter 'misc_feature' feature (0 to 255) on strand 0,
 loc2 'misc_feature' feature (6 to 24) on strand 1,
 -35 element 'misc_feature' feature (89 to 95) on strand 0,
 -10 element 'misc_feature' feature (113 to 119) on strand 0,
 Start of transcription 'misc_feature' feature (125 to 126) on strand 0,
 loc1 'misc_feature' feature (237 to 255) on strand 1,
 gfpmut2 'misc_feature' feature (287 to 1004) on strand 0,
 rrnB_T1 'terminator' feature (1034 to 1112) on strand 0,
 sc101 ori 'rep_origin' feature (1146 to 2678) on strand 0,
 kanR 'CDS' feature (3498 to 4293) on strand 1]

In [11]:
# extract the transcription start site: take the first feature named
# 'Start of transcription' and use its start coordinate as the TSS position
tss_feature = plasmid.select_features('Start of transcription')[0]
tss = tss_feature.start

In [12]:
# extract the region covered by the first 'gfpmut2' feature and show its length
# (not used in the rest of this notebook)
gfpmut2_feature = plasmid.select_features('gfpmut2')[0]
gfpmut2 = plasmid.extract(gfpmut2_feature)
len(gfpmut2)

717

In [13]:
# PAM sequences to search for on the watson strand
TYPE_1_PAM_W_SITES = ['AAG', 'ATG', 'AGG', 'GAG']

# map each PAM to whatever plasmid.locate() returns for it
# (indexed by strand in the cells below: [0] for watson, [1] for crick)
pam_locations = {pam_site: plasmid.locate(pam_site) for pam_site in TYPE_1_PAM_W_SITES}

In [14]:
# example watson strand locations for the AGG PAM
# (index 0 of the locate() result is the list the watson-strand cell iterates over)
print(pam_locations['AGG'][0])

[12, 76, 85, 152, 254, 274, 295, 381, 388, 556, 567, 631, 664, 1017, 1034, 1053, 1118, 1143, 1220, 1279, 1340, 1382, 1438, 1845, 1849, 2112, 2258, 2351, 2489, 2628, 2913, 2966, 3031, 3115, 3139, 3183, 3290, 3321, 3340, 3438, 3461, 3517, 3527, 3571, 3719, 3902, 3966, 3973, 3985, 4065, 4123, 4135, 4385, 4406, 4474, 4486]


In [15]:
# define your guide length (number of bases taken immediately after the PAM)

guide_length = 30

# and the repeat sequence (aka your clustered repeat)
# NOTE(review): not referenced by any of the cells shown in this notebook

repeated_section = 'GTTCCCCGCGCCAGCGGGGATAAACCG'

# and create a list to store the guides
# (later cells append to this list, so re-run this cell first to avoid duplicates)
guides = []

In [16]:
# locate all target sequences on the watson strand
#
# For every PAM match in the first list returned by plasmid.locate(...), record
# the PAM locus, its offset from the TSS, and the sequence that follows the PAM.
# Results are appended to the shared `guides` list, so re-running this cell
# without resetting `guides` adds duplicates.

for pam, pams_by_strand in pam_locations.items():
    pam_size = len(pam)

    for w_pam_locus in pams_by_strand[0]:
        # slice bounds on the top strand: [PAM start, guide start, guide end)
        # NOTE(review): these are plain slices; a PAM close to the end of the
        # sequence presumably yields a guide shorter than guide_length -- confirm
        guide_start = w_pam_locus + pam_size
        guide_end = guide_start + guide_length

        # NOTE(review): upstream (negative) offsets are shifted by 3 -- presumably
        # the PAM length, so they line up with the paper's numbering; confirm
        tss_offset = w_pam_locus - tss
        if tss_offset < 0:
            tss_offset += 3

        guide = {
            'pam_locus': w_pam_locus,
            'distance_from_tss': tss_offset,
            'name': 'W Guide %d' % len(guides),
            'PAM': pam,
            'strand': 'watson',
            'pam_plus_sequence': plasmid.top[w_pam_locus:guide_end],
            'guide_only': plasmid.top[guide_start:guide_end],
        }
        guides.append(guide)

In [17]:
# now do the same on the crick strand

# PAM sequences to search for on the crick strand (same set as for watson)
TYPE_1_PAM_C_SITES = ['AAG', 'ATG', 'AGG', 'GAG']

# rebuild the PAM -> locate() result lookup from scratch
pam_locations = {pam_site: plasmid.locate(pam_site) for pam_site in TYPE_1_PAM_C_SITES}

In [18]:
# Collect guides on the crick strand: loci come from the second list returned by
# plasmid.locate(...) and are used to slice plasmid.bottom. Note that 'pam_locus'
# stores the bottom-strand locus, while 'distance_from_tss' is computed from the
# locus converted to top-strand coordinates. Results are appended to the shared
# `guides` list, so re-running this cell without resetting it adds duplicates.
for pam, pams_by_strand in pam_locations.items():
    pam_size = len(pam)

    for c_pam_locus in pams_by_strand[1]:
        # slice bounds on the bottom strand: [PAM start, guide start, guide end)
        guide_start = c_pam_locus + pam_size
        guide_end = guide_start + guide_length

        # express the bottom-strand locus as a top-strand coordinate
        c_pam_in_top_coords = len(plasmid) - c_pam_locus

        # NOTE(review): negative offsets are shifted back by 3, which cancels the
        # PAM-length subtraction for the current 3-base PAMs -- confirm intent
        tss_offset = c_pam_in_top_coords - tss - pam_size
        if tss_offset < 0:
            tss_offset += 3

        guide = {
            'pam_locus': c_pam_locus,
            'distance_from_tss': tss_offset,
            'name': 'C Guide %d' % len(guides),
            'PAM': pam,
            'strand': 'crick',
            'pam_plus_sequence': plasmid.bottom[c_pam_locus:guide_end],
            'guide_only': plasmid.bottom[guide_start:guide_end],
        }
        guides.append(guide)


In [19]:
# total number of guides collected across both strands; nothing has been
# filtered out, so every located PAM match is counted
print('Found valid %d guides' % len(guides))

Found valid 568 guides


In [20]:
# let's look and see if we've found every single guide listed in the paper
# NOTE(review): a paper guide counts as found when ANY of our guides has the same
# distance from the TSS; strand and sequence are not compared -- consider tightening

for n, row in nar_data.iterrows():
    dt_tss = row['Distance from TSS']

    b_found = any(g['distance_from_tss'] == dt_tss for g in guides)

    if not b_found:
        # show the full row so the missing guide can be identified
        print('Didnt find it')
        print(row)
    else:
        print('found a matching guide from the paper')

found a matching guide from the paper
found a matching guide from the paper
found a matching guide from the paper
found a matching guide from the paper
found a matching guide from the paper
found a matching guide from the paper
found a matching guide from the paper
found a matching guide from the paper
found a matching guide from the paper
found a matching guide from the paper
found a matching guide from the paper
found a matching guide from the paper


In [None]:
# uncomment the lines below to print every guide found (both strand branches print the same fields)
# for g in guides:
#     if g['strand'] == 'watson':
#         print(g['name'], g['strand'],g['pam_locus'], g['distance_from_tss'], g['pam_plus_sequence'])
        
#     elif g['strand'] == 'crick':
#         print(g['name'], g['strand'],g['pam_locus'], g['distance_from_tss'], g['pam_plus_sequence'])