In [None]:
# Choose guides (sgRNAs) that fall within my promoters of interest for the mutation library.
# Guide pairs lying roughly 100 bp apart are chosen to increase the chance of obtaining ~100 bp deletions.

In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist, squareform
import itertools

In [2]:
# sgRNA tables for each promoter of interest (one CSV per gene)
(
    ARF9_guides_file,
    ARF18_guides_file,
    DREB26_guides_file,
    NLP7_guides_file,
) = (
    '../../data/CRISPR_library/sgRNAs-ARF9.csv',
    '../../data/CRISPR_library/sgRNAs-ARF18.csv',
    '../../data/CRISPR_library/sgRNAs-DREB26.csv',
    '../../data/CRISPR_library/sgRNAs-NLP7.csv',
)

In [3]:
# ARF9_guides_file = '../../data/CRISPR_library/500bp_region/sgRNAs-ARF9.csv'
# ARF18_guides_file = '../../data/CRISPR_library/500bp_region/sgRNAs-ARF18.csv'
# DREB26_guides_file = '../../data/CRISPR_library/500bp_region/sgRNAs-DREB26.csv'
# NLP7_guides_file = '../../data/CRISPR_library/500bp_region/sgRNAs-NLP7.csv'

In [4]:
def select_guides(input_file, region_start, region_end,lower_bound_distance, upper_bound_distance, min_efficiency=40):
    """Select guides inside a chromosome region and pair those a given distance apart.

    The CSV is read with ``header=2`` (the third line is used as the header row)
    and its nine columns are renamed positionally to: Benchling_position,
    distance_from_ATG_bp, chromosome_position, strand, sequence, PAM,
    specificity_score_2013, specificity_score_2016, efficiency_score.

    Steps:
      1. Every guide gets an ``oldname`` ('guide<N>', N = 1-based row in the file).
      2. Guides are kept when ``region_start <= chromosome_position <= region_end``
         and ``efficiency_score >= min_efficiency``; survivors are renumbered
         'guide1', 'guide2', ... in file order (column ``name``).
      3. Two guides form a pair when the absolute difference of their
         ``distance_from_ATG_bp`` is strictly between ``lower_bound_distance``
         and ``upper_bound_distance``.
      4. The pair list is reduced: for every guide that occurs in some pair, only
         the pair with the shortest distance is kept (ties are broken by pair
         order, i.e. lowest guide1 number, then lowest guide2 number). A pair
         survives if it is the shortest pair of at least one of its guides.

    Parameters
    ----------
    input_file : path of the guide CSV.
    region_start, region_end : inclusive bounds on ``chromosome_position``.
    lower_bound_distance, upper_bound_distance : exclusive bounds on the
        distance between the two guides of a pair.
    min_efficiency : minimum ``efficiency_score`` a guide needs (default 40,
        the value that used to be hard-coded).

    Returns
    -------
    (guides_present, pairs, guide_info) : tuple of three DataFrames
        guides_present -- columns ['guides_present_in_pairs', 'oldname'], one row
            per guide that occurs in at least one kept pair, sorted by guide number.
        pairs -- columns ['pairs', 'guide1', 'guide2', 'distance']; ``guide1`` is
            the lower-numbered guide, ``pairs`` is the tuple (guide2, guide1) and
            ``distance`` is a float.
        guide_info -- every guide that passed the filters, with its new and old name.
        All three frames are empty (but carry their columns) when nothing qualifies.
    """
    guides = pd.read_csv(input_file, header=2)
    #rename headers
    cols = ['Benchling_position', 'distance_from_ATG_bp', 'chromosome_position', 'strand', 'sequence','PAM', 'specificity_score_2013', 'specificity_score_2016', 'efficiency_score']
    guides.columns = cols
    #remember each guide's 1-based position in the input file before any filtering
    guides['oldname'] = guides.index + 1
    guides['oldname'] = 'guide' + guides['oldname'].astype(str)

    #keep guides inside the region (inclusive) that are efficient enough
    in_region = (guides['chromosome_position'] >= region_start) & (guides['chromosome_position'] <= region_end)
    efficient_enough = guides['efficiency_score'] >= min_efficiency
    guides = guides[in_region & efficient_enough].reset_index(drop=True)
    #renumber the surviving guides guide1..guideN in file order
    guides.insert(0, 'name', ['guide' + str(number) for number in range(1, len(guides) + 1)])

    #pairwise absolute differences between distances from ATG (float, like the sqrt-based original)
    names = guides['name'].to_numpy(dtype=object)
    offsets = guides['distance_from_ATG_bp'].to_numpy(dtype=float)
    gaps = np.abs(offsets[:, None] - offsets[None, :])
    wanted = (gaps > lower_bound_distance) & (gaps < upper_bound_distance)
    #strict upper triangle: each unordered pair once, never a guide with itself,
    #listed row by row so the lower-numbered guide is always guide1
    first, second = np.nonzero(np.triu(wanted, k=1))

    pair_table = pd.DataFrame({
        'pairs': list(zip(names[second], names[first])),
        'guide1': names[first],
        'guide2': names[second],
        'distance': gaps[first, second],
        'number1': first + 1,
        'number2': second + 1,
    })

    #one row per (pair, member guide); pd.concat replaces Series.append, which was removed in pandas 2.0
    membership = pd.concat([
        pair_table.assign(guide_number=pair_table['number1']),
        pair_table.assign(guide_number=pair_table['number2']),
    ]).rename_axis('pair_order').reset_index()
    #for every guide keep only its shortest pair; pair_order makes ties deterministic
    membership = membership.sort_values(['distance', 'pair_order']).drop_duplicates(subset='guide_number')
    kept = membership.sort_values(['number1', 'number2'])

    #guides present in at least 1 kept pair, with their old names (guide number N is row N-1 of guides)
    present_rows = np.sort(kept['guide_number'].to_numpy(dtype=int)) - 1
    guides_present_in_pairs = pd.DataFrame({
        'guides_present_in_pairs': names[present_rows],
        'oldname': guides['oldname'].to_numpy(dtype=object)[present_rows],
    })

    #a pair kept by both of its guides appears twice in `kept`; list it once
    merged = kept.drop_duplicates(subset='pair_order').reset_index(drop=True)

    return guides_present_in_pairs[['guides_present_in_pairs', 'oldname']],merged[['pairs','guide1','guide2','distance']], guides[['name','oldname','Benchling_position', 'distance_from_ATG_bp', 'strand', 'sequence','PAM', 'specificity_score_2013', 'specificity_score_2016', 'efficiency_score']].reset_index(drop=True)

In [5]:
def list_used_guides(guide_pairs,guide_info):
    """Return the rows of guide_info whose guide occurs in at least one pair.

    guide_pairs needs the columns 'guide1' and 'guide2'; guide_info needs the
    column 'name'. Two counts are printed: the number of rows in guide_info and
    the number of those rows that belong to a pair.
    """
    guide_names = guide_info['name']
    #a guide counts as used when it is either member of any pair
    is_first_member = guide_names.isin(guide_pairs['guide1'])
    is_second_member = guide_names.isin(guide_pairs['guide2'])
    usedguides = guide_info[is_first_member | is_second_member]
    print(f'number of guides in region after filtering by efficiency = {len(guide_info)}')
    print(f'number of guides in pairs = {len(usedguides)}')
    return usedguides
    

In [6]:
#for each promoter return three dfs: the guides present in the kept pairs, the guide pairs, and the information on every guide that passed filtering
ARF9_guides_present, ARF9pairs, ARF9info = select_guides(
    ARF9_guides_file,
    region_start=12450889,
    region_end=12451388,
    lower_bound_distance=90,
    upper_bound_distance=140,
)
ARF18_guides_present, ARF18pairs, ARF18info = select_guides(
    ARF18_guides_file,
    region_start=22887610,
    region_end=22888109,
    lower_bound_distance=90,
    upper_bound_distance=140,
)
DREB26_guides_present, DREB26pairs, DREB26info = select_guides(
    DREB26_guides_file,
    region_start=7696155,
    region_end=7696654,
    lower_bound_distance=90,
    upper_bound_distance=140,
)
NLP7_guides_present, NLP7pairs, NLP7info = select_guides(
    NLP7_guides_file,
    region_start=12479404,
    region_end=12479903,
    lower_bound_distance=90,
    upper_bound_distance=140,
)

In [7]:
#ARF9 guides which are in at least one of the selected pairs
ARF9_usedguides = list_used_guides(guide_pairs=ARF9pairs, guide_info=ARF9info)

number of guides in region after filtering by efficiency = 13
number of guides in pairs = 13


In [8]:
ARF9_guides_present

Unnamed: 0,guides_present_in_pairs,oldname
0,guide1,guide14
1,guide2,guide15
2,guide3,guide16
3,guide4,guide19
4,guide5,guide22
5,guide6,guide23
6,guide7,guide24
7,guide8,guide25
8,guide9,guide26
9,guide10,guide27


In [9]:
#list the ARF9 pairs - note there are several guides present in more than 1 pair
ARF9pairs

Unnamed: 0,pairs,guide1,guide2,distance
0,"(guide5, guide1)",guide1,guide5,134.0
1,"(guide5, guide2)",guide2,guide5,130.0
2,"(guide5, guide3)",guide3,guide5,112.0
3,"(guide6, guide4)",guide4,guide6,121.0
4,"(guide11, guide6)",guide6,guide11,111.0
5,"(guide11, guide7)",guide7,guide11,110.0
6,"(guide11, guide8)",guide8,guide11,93.0
7,"(guide12, guide9)",guide9,guide12,131.0
8,"(guide12, guide10)",guide10,guide12,115.0
9,"(guide13, guide10)",guide10,guide13,128.0


In [10]:
#ARF18 guides which are in at least one of the selected pairs
ARF18_usedguides = list_used_guides(guide_pairs=ARF18pairs, guide_info=ARF18info)

number of guides in region after filtering by efficiency = 27
number of guides in pairs = 27


In [11]:
ARF18_guides_present

Unnamed: 0,guides_present_in_pairs,oldname
0,guide1,guide6
1,guide2,guide7
2,guide3,guide8
3,guide4,guide11
4,guide5,guide15
5,guide6,guide16
6,guide7,guide17
7,guide8,guide19
8,guide9,guide20
9,guide10,guide21


In [12]:
#ARF18 pairs
ARF18pairs

Unnamed: 0,pairs,guide1,guide2,distance
0,"(guide5, guide1)",guide1,guide5,108.0
1,"(guide5, guide2)",guide2,guide5,107.0
2,"(guide7, guide3)",guide3,guide7,102.0
3,"(guide9, guide4)",guide4,guide9,117.0
4,"(guide12, guide5)",guide5,guide12,95.0
5,"(guide12, guide6)",guide6,guide12,91.0
6,"(guide13, guide6)",guide6,guide13,99.0
7,"(guide14, guide7)",guide7,guide14,99.0
8,"(guide15, guide8)",guide8,guide15,106.0
9,"(guide16, guide8)",guide8,guide16,111.0


In [13]:
#DREB26 guides which are in at least one of the selected pairs
DREB26_usedguides = list_used_guides(guide_pairs=DREB26pairs, guide_info=DREB26info)

number of guides in region after filtering by efficiency = 21
number of guides in pairs = 17


In [14]:
DREB26info

Unnamed: 0,name,oldname,Benchling_position,distance_from_ATG_bp,strand,sequence,PAM,specificity_score_2013,specificity_score_2016,efficiency_score
0,guide1,guide1,4525,19,-1,TAATGTTGTTGTGTACGTAC,AGG,96.262657,50.836909,57.303532
1,guide2,guide2,4508,36,-1,TACAGGCTTTGTAGAGTGAT,TGG,97.881548,63.889747,53.110453
2,guide3,guide3,4499,45,-1,TGTAGAGTGATTGGACAATG,TGG,96.16782,59.789238,71.417088
3,guide4,guide4,4485,59,-1,ACAATGTGGTAGTGATCAAG,TGG,95.455236,44.605263,66.454743
4,guide5,guide5,4451,93,-1,TTGAGTTTTTGTTGTTGTTG,TGG,64.213558,7.601505,54.620118
5,guide6,guide7,4427,117,-1,TGGATAGAGATTTTTTGATG,AGG,95.652047,54.962529,54.740411
6,guide7,guide8,4391,153,-1,GAGAAGATTATATATAGAGA,TGG,76.256548,17.312048,52.713341
7,guide8,guide11,4377,167,-1,TAGAGATGGATTGATTTGGG,AGG,91.297425,33.304776,58.681317
8,guide9,guide12,4376,168,-1,AGAGATGGATTGATTTGGGA,GGG,92.874957,33.752324,46.92062
9,guide10,guide13,4375,169,-1,GAGATGGATTGATTTGGGAG,GGG,95.949611,45.485204,51.921321


In [15]:
DREB26_guides_present


Unnamed: 0,guides_present_in_pairs,oldname
0,guide1,guide1
1,guide2,guide2
2,guide3,guide3
3,guide4,guide4
4,guide6,guide7
5,guide7,guide8
6,guide8,guide11
7,guide9,guide12
8,guide10,guide13
9,guide11,guide14


In [19]:
DREB26pairs

Unnamed: 0,pairs,guide1,guide2,distance
0,"(guide6, guide1)",guide1,guide6,98.0
1,"(guide7, guide2)",guide2,guide7,117.0
2,"(guide7, guide3)",guide3,guide7,108.0
3,"(guide7, guide4)",guide4,guide7,94.0
4,"(guide13, guide6)",guide6,guide13,125.0
5,"(guide14, guide8)",guide8,guide14,103.0
6,"(guide14, guide9)",guide9,guide14,102.0
7,"(guide14, guide10)",guide10,guide14,101.0
8,"(guide14, guide11)",guide11,guide14,96.0
9,"(guide15, guide12)",guide12,guide15,101.0


In [20]:
DREB26info

Unnamed: 0,name,oldname,Benchling_position,distance_from_ATG_bp,strand,sequence,PAM,specificity_score_2013,specificity_score_2016,efficiency_score
0,guide1,guide1,4525,19,-1,TAATGTTGTTGTGTACGTAC,AGG,96.262657,50.836909,57.303532
1,guide2,guide2,4508,36,-1,TACAGGCTTTGTAGAGTGAT,TGG,97.881548,63.889747,53.110453
2,guide3,guide3,4499,45,-1,TGTAGAGTGATTGGACAATG,TGG,96.16782,59.789238,71.417088
3,guide4,guide4,4485,59,-1,ACAATGTGGTAGTGATCAAG,TGG,95.455236,44.605263,66.454743
4,guide5,guide5,4451,93,-1,TTGAGTTTTTGTTGTTGTTG,TGG,64.213558,7.601505,54.620118
5,guide6,guide7,4427,117,-1,TGGATAGAGATTTTTTGATG,AGG,95.652047,54.962529,54.740411
6,guide7,guide8,4391,153,-1,GAGAAGATTATATATAGAGA,TGG,76.256548,17.312048,52.713341
7,guide8,guide11,4377,167,-1,TAGAGATGGATTGATTTGGG,AGG,91.297425,33.304776,58.681317
8,guide9,guide12,4376,168,-1,AGAGATGGATTGATTTGGGA,GGG,92.874957,33.752324,46.92062
9,guide10,guide13,4375,169,-1,GAGATGGATTGATTTGGGAG,GGG,95.949611,45.485204,51.921321


In [21]:
#NLP7 guides which are in at least one of the selected pairs
NLP7_usedguides = list_used_guides(guide_pairs=NLP7pairs, guide_info=NLP7info)

number of guides in region after filtering by efficiency = 15
number of guides in pairs = 11


In [22]:
NLP7pairs

Unnamed: 0,pairs,guide1,guide2,distance
0,"(guide9, guide1)",guide1,guide9,108.0
1,"(guide9, guide2)",guide2,guide9,93.0
2,"(guide9, guide3)",guide3,guide9,92.0
3,"(guide10, guide7)",guide7,guide10,131.0
4,"(guide10, guide8)",guide8,guide10,127.0
5,"(guide14, guide10)",guide10,guide14,103.0
6,"(guide15, guide11)",guide11,guide15,129.0
7,"(guide15, guide12)",guide12,guide15,103.0


In [23]:
NLP7_guides_present

Unnamed: 0,guides_present_in_pairs,oldname
0,guide1,guide1
1,guide2,guide2
2,guide3,guide3
3,guide7,guide8
4,guide8,guide9
5,guide9,guide10
6,guide10,guide14
7,guide11,guide15
8,guide12,guide18
9,guide14,guide20


In [24]:
#ARF9, extending over the part 2 open chromatin region ending at 12450824
ARF9_guides_present2, ARF9pairs2, ARF9info2 = select_guides(
    ARF9_guides_file,
    region_start=12450310,
    region_end=12451388,
    lower_bound_distance=90,
    upper_bound_distance=157,
)

In [25]:
#guides of the extended ARF9 region which are in at least one of the selected pairs
ARF9_usedguides2 = list_used_guides(ARF9pairs2,ARF9info2)
#this result used to be stored under the mistyped name ARF2_usedguides2; keep that name as an alias in case other cells refer to it
ARF2_usedguides2 = ARF9_usedguides2

number of guides in region after filtering by efficiency = 30
number of guides in pairs = 30


In [26]:
ARF9_guides_present2

Unnamed: 0,guides_present_in_pairs,oldname
0,guide1,guide14
1,guide2,guide15
2,guide3,guide16
3,guide4,guide19
4,guide5,guide22
5,guide6,guide23
6,guide7,guide24
7,guide8,guide25
8,guide9,guide26
9,guide10,guide27


In [27]:
ARF9pairs2

Unnamed: 0,pairs,guide1,guide2,distance
0,"(guide5, guide1)",guide1,guide5,134.0
1,"(guide5, guide2)",guide2,guide5,130.0
2,"(guide5, guide3)",guide3,guide5,112.0
3,"(guide6, guide4)",guide4,guide6,121.0
4,"(guide11, guide6)",guide6,guide11,111.0
5,"(guide11, guide7)",guide7,guide11,110.0
6,"(guide11, guide8)",guide8,guide11,93.0
7,"(guide12, guide9)",guide9,guide12,131.0
8,"(guide12, guide10)",guide10,guide12,115.0
9,"(guide13, guide10)",guide10,guide13,128.0


In [28]:
#ARF18, extending until 30 PAMs
ARF18_guides_present2, ARF18pairs2, ARF18info2 = select_guides(
    ARF18_guides_file,
    region_start=22887460,
    region_end=22888109,
    lower_bound_distance=90,
    upper_bound_distance=140,
)

In [29]:
ARF18_guides_present2

Unnamed: 0,guides_present_in_pairs,oldname
0,guide1,guide6
1,guide2,guide7
2,guide3,guide8
3,guide4,guide11
4,guide5,guide15
5,guide6,guide16
6,guide7,guide17
7,guide8,guide19
8,guide9,guide20
9,guide10,guide21


In [30]:
ARF18pairs2

Unnamed: 0,pairs,guide1,guide2,distance
0,"(guide5, guide1)",guide1,guide5,108.0
1,"(guide5, guide2)",guide2,guide5,107.0
2,"(guide7, guide3)",guide3,guide7,102.0
3,"(guide9, guide4)",guide4,guide9,117.0
4,"(guide12, guide5)",guide5,guide12,95.0
5,"(guide12, guide6)",guide6,guide12,91.0
6,"(guide13, guide6)",guide6,guide13,99.0
7,"(guide14, guide7)",guide7,guide14,99.0
8,"(guide15, guide8)",guide8,guide15,106.0
9,"(guide16, guide8)",guide8,guide16,111.0


In [31]:
#guides of the extended ARF18 region which are in at least one of the selected pairs
ARF18_usedguides2 = list_used_guides(guide_pairs=ARF18pairs2, guide_info=ARF18info2)

number of guides in region after filtering by efficiency = 30
number of guides in pairs = 30


In [32]:
#DREB26 with a wider region and a larger upper bound on the pair distance
DREB26_guides_present2, DREB26pairs2, DREB26info2 = select_guides(
    DREB26_guides_file,
    region_start=7695780,
    region_end=7696654,
    lower_bound_distance=90,
    upper_bound_distance=168,
)

In [33]:
#guides of the extended DREB26 region which are in at least one of the selected pairs
DREB26_usedguides2 = list_used_guides(guide_pairs=DREB26pairs2, guide_info=DREB26info2)

number of guides in region after filtering by efficiency = 30
number of guides in pairs = 30


In [34]:
DREB26_guides_present2

Unnamed: 0,guides_present_in_pairs,oldname
0,guide1,guide1
1,guide2,guide2
2,guide3,guide3
3,guide4,guide4
4,guide5,guide5
5,guide6,guide7
6,guide7,guide8
7,guide8,guide11
8,guide9,guide12
9,guide10,guide13


In [35]:
DREB26pairs2

Unnamed: 0,pairs,guide1,guide2,distance
0,"(guide6, guide1)",guide1,guide6,98.0
1,"(guide7, guide2)",guide2,guide7,117.0
2,"(guide7, guide3)",guide3,guide7,108.0
3,"(guide7, guide4)",guide4,guide7,94.0
4,"(guide13, guide5)",guide5,guide13,149.0
5,"(guide13, guide6)",guide6,guide13,125.0
6,"(guide14, guide8)",guide8,guide14,103.0
7,"(guide14, guide9)",guide9,guide14,102.0
8,"(guide14, guide10)",guide10,guide14,101.0
9,"(guide14, guide11)",guide11,guide14,96.0


In [448]:
#NLP7 with a wider region and a larger upper bound on the pair distance
NLP7_guides_present2, NLP7pairs2, NLP7info2 = select_guides(
    NLP7_guides_file,
    region_start=12478898,
    region_end=12479903,
    lower_bound_distance=90,
    upper_bound_distance=256,
)

In [449]:
#guides of the extended NLP7 region which are in at least one of the selected pairs
NLP7_usedguides2 = list_used_guides(guide_pairs=NLP7pairs2, guide_info=NLP7info2)

number of guides in region after filtering by efficiency = 30
number of guides in pairs = 30


In [450]:
NLP7_guides_present2

Unnamed: 0,guides_present_in_pairs,oldname
0,guide1,guide1
1,guide2,guide2
2,guide3,guide3
3,guide4,guide4
4,guide5,guide6
5,guide6,guide7
6,guide7,guide8
7,guide8,guide9
8,guide9,guide10
9,guide10,guide14


In [447]:
NLP7pairs2

Unnamed: 0,pairs,guide1,guide2,distance
0,"(guide9, guide1)",guide1,guide9,108.0
1,"(guide9, guide2)",guide2,guide9,93.0
2,"(guide9, guide3)",guide3,guide9,92.0
3,"(guide10, guide4)",guide4,guide10,189.0
4,"(guide10, guide5)",guide5,guide10,151.0
5,"(guide10, guide6)",guide6,guide10,150.0
6,"(guide10, guide7)",guide7,guide10,131.0
7,"(guide10, guide8)",guide8,guide10,127.0
8,"(guide10, guide9)",guide9,guide10,103.0
9,"(guide13, guide9)",guide9,guide13,189.0
