In [None]:
# Choose the guides that fall within my promoters of interest for the mutation library.
# Guides are paired when they lie roughly 100 bp apart (between 90 and 110 bp),
# to increase the chance of generating a ~100 bp deletion.

In [71]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist, squareform

In [56]:
# sgRNA tables (CSV), one per promoter of interest: ARF9, ARF18, DREB26, NLP7.
(
    ARF9_guides_file,
    ARF18_guides_file,
    DREB26_guides_file,
    NLP7_guides_file,
) = (
    '../../data/CRISPR_library/sgRNAs-ARF9.csv',
    '../../data/CRISPR_library/sgRNAs-ARF18.csv',
    '../../data/CRISPR_library/sgRNAs-DREB26.csv',
    '../../data/CRISPR_library/sgRNAs-NLP7.csv',
)

In [209]:
def select_guides(input_file, min_distance=90, max_distance=110, min_efficiency=40):
    """Load an sgRNA table and pair up guides that lie roughly 100 bp apart.

    The CSV is read with its header on the third line (``header=2``) and must
    have exactly nine columns; the ninth is discarded. Guides are named
    ``guide1``, ``guide2``, ... in file order *before* filtering, so a name
    always refers to the same row of the input file.

    Parameters
    ----------
    input_file : str or path-like
        Path to the sgRNA CSV.
    min_distance, max_distance : number, optional
        Exclusive bounds on the absolute difference in ``distance_from_ATG_bp``
        for two guides to be paired (defaults: 90 and 110).
    min_efficiency : number, optional
        Guides whose ``efficiency_score`` is not strictly greater than this
        value are dropped (default: 40).

    Returns
    -------
    pairs : pandas.DataFrame
        One row per unordered guide pair with columns ``pairs`` (tuple of the
        two names, sorted as strings), ``guide1`` (the guide appearing earlier
        in the file), ``guide2`` (the later one) and ``distance`` (float).
        Empty, with the same columns, when nothing qualifies.
    guides : pandas.DataFrame
        The guides that passed the efficiency filter, index reset, with
        ``name`` as the first column.
    """
    raw_columns = ['Benchling_position', 'distance_from_ATG_bp', 'strand', 'sequence', 'PAM',
                   'specificity_score_2013', 'specificity_score_2016', 'efficiency_score',
                   'deleteme']
    guide_columns = raw_columns[:-1]

    guides = pd.read_csv(input_file, header=2)
    # rename headers, then drop the trailing placeholder column
    guides.columns = raw_columns
    # .copy() so that adding 'name' below never writes into a view of the raw frame
    guides = guides[guide_columns].copy()
    # number the guides before filtering so names stay tied to file rows
    guides['name'] = ['guide' + str(row_label + 1) for row_label in guides.index]
    # keep only sufficiently efficient guides
    guides = guides[guides['efficiency_score'] > min_efficiency]

    # all pairwise gaps between cut-site distances from the ATG, computed in one shot
    positions = guides['distance_from_ATG_bp'].to_numpy()
    names = guides['name'].to_numpy()
    gaps = np.abs(positions[:, None] - positions[None, :])
    in_window = (gaps > min_distance) & (gaps < max_distance)
    # The strict upper triangle visits each unordered pair exactly once and never
    # pairs a guide with itself; np.nonzero yields them ordered by first, then
    # second guide, which is the order the de-duplicated result is expected in.
    first, second = np.nonzero(np.triu(in_window, k=1))

    pairs = pd.DataFrame({
        'pairs': [tuple(sorted(pair)) for pair in zip(names[first], names[second])],
        'guide1': names[first],
        'guide2': names[second],
        # reported as float, matching the previously displayed output (e.g. 98.0)
        'distance': gaps[first, second].astype(float),
    })

    return pairs, guides[['name'] + guide_columns].reset_index(drop=True)

In [210]:
# Select ARF9 guides, then keep only those used by at least one pair
# (either as guide1 or as guide2).
pairs, ARF9 = select_guides(ARF9_guides_file)
inboth = ARF9[ARF9['name'].isin(pairs['guide1']) | ARF9['name'].isin(pairs['guide2'])]
pairs

Unnamed: 0,pairs,guide1,guide2,distance
0,"(guide1, guide8)",guide1,guide8,98.0
1,"(guide1, guide9)",guide1,guide9,106.0
2,"(guide12, guide2)",guide2,guide12,93.0
3,"(guide13, guide2)",guide2,guide13,99.0
4,"(guide13, guide3)",guide3,guide13,96.0
5,"(guide14, guide8)",guide8,guide14,99.0
6,"(guide15, guide8)",guide8,guide15,103.0
7,"(guide14, guide9)",guide9,guide14,91.0
8,"(guide15, guide9)",guide9,guide15,95.0
9,"(guide10, guide16)",guide10,guide16,108.0


In [211]:
# Show the ARF9 guides whose name appears in pairs.guide1 or pairs.guide2.
inboth

Unnamed: 0,name,Benchling_position,distance_from_ATG_bp,strand,sequence,PAM,specificity_score_2013,specificity_score_2016,efficiency_score
0,guide1,5721,33,1,GATAGAGAGACTGTGTGTTT,TGG,95.532602,55.27281,43.682139
1,guide2,5693,61,1,TTCAGCTTCCGGTTCTTCAT,CGG,86.672752,25.559409,42.003969
2,guide3,5690,64,-1,ATCTCTCTCCGATGAAGAAC,CGG,93.448259,49.000705,46.880008
6,guide8,5623,131,-1,AGGAATTAAATTGAAGAAGA,AGG,74.574487,13.560658,56.049875
7,guide9,5615,139,-1,AATTGAAGAAGAAGGTCCCT,TGG,95.172168,40.478023,50.287386
8,guide10,5610,144,1,AACGTGGAGACGGAATCCAA,GGG,99.027037,77.699162,66.886488
9,guide11,5609,145,1,AAACGTGGAGACGGAATCCA,AGG,97.977391,54.182481,56.554217
10,guide12,5600,154,1,AGAACAAGAAAACGTGGAGA,CGG,88.893283,26.254992,61.194235
11,guide13,5594,160,1,CATTAGAGAACAAGAAAACG,TGG,88.25412,30.854637,65.575095
12,guide14,5524,230,1,GTTACAGTTACAGAGCAGGA,AGG,99.0347,64.583247,63.565916


In [187]:
# Keep the ARF9 guides that take part in at least one selected pair,
# i.e. whose name appears in pairs.guide1 or pairs.guide2.
inboth = ARF9[ARF9.name.isin(pairs.guide1)| ARF9.name.isin(pairs.guide2)]

In [188]:
# Show the ARF9 guides whose name appears in pairs.guide1 or pairs.guide2.
inboth

Unnamed: 0,name,Benchling_position,distance_from_ATG_bp,strand,sequence,PAM,specificity_score_2013,specificity_score_2016,efficiency_score
0,guide1,5721,33,1,GATAGAGAGACTGTGTGTTT,TGG,95.532602,55.27281,43.682139
1,guide2,5693,61,1,TTCAGCTTCCGGTTCTTCAT,CGG,86.672752,25.559409,42.003969
2,guide3,5690,64,-1,ATCTCTCTCCGATGAAGAAC,CGG,93.448259,49.000705,46.880008
7,guide8,5623,131,-1,AGGAATTAAATTGAAGAAGA,AGG,74.574487,13.560658,56.049875
8,guide9,5615,139,-1,AATTGAAGAAGAAGGTCCCT,TGG,95.172168,40.478023,50.287386
9,guide10,5610,144,1,AACGTGGAGACGGAATCCAA,GGG,99.027037,77.699162,66.886488
10,guide11,5609,145,1,AAACGTGGAGACGGAATCCA,AGG,97.977391,54.182481,56.554217
11,guide12,5600,154,1,AGAACAAGAAAACGTGGAGA,CGG,88.893283,26.254992,61.194235
12,guide13,5594,160,1,CATTAGAGAACAAGAAAACG,TGG,88.25412,30.854637,65.575095
13,guide14,5524,230,1,GTTACAGTTACAGAGCAGGA,AGG,99.0347,64.583247,63.565916
