In [7]:
### This notebook contains code that filters for sgRNAs whose edits introduce only synonymous mutations.
### The main difference from the previous version is that it fetches the BE window sequences directly from the annotation file.
import pandas as pd 
from tqdm import tqdm
import pickle
from Bio import SeqIO
import re

from os.path import exists,join
from os import listdir,makedirs

import sys
sys.path.append('../')
from src import syn_sgFinder
from itertools import chain,product

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs and edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [8]:
# --- Paths ---------------------------------------------------------------
out_path = '../../data/output/sg_Finder'
mrna_path = '../../data/MANE'
# Directory that is scanned below for one sub-folder per gene.
# This cell used to assign fpath twice ('../../data/output/sg_out/' first, then
# '../../data/sg_out/'); only the second value was ever in effect, so the dead
# first assignment has been dropped.
fpath = '../../data/sg_out/'

# --- Gene lists ----------------------------------------------------------
# Header-less text files; column 0 is compared against the gene folder names below.
ess_genelist = pd.read_csv('../../data/output/K562_comm_ess.txt', header=None)
neutral_genelist = pd.read_csv('../../data/output/K562_neutral.txt', header=None)

# Load all expressed transcripts
# NOTE(review): pickle.load can execute arbitrary code; only use with trusted files.
with open('../../data/22Q2_gene_effect/expressed_transcripts.pkl', 'rb') as pkl_file:
    list_exp_trsp = pickle.load(pkl_file)

# Parse mRNA sequence dictionary, for use when sequence aligned to intron-exon junctions
record_dict = SeqIO.to_dict(
    SeqIO.parse(join(mrna_path, 'MANE.GRCh38.v1.0.refseq_rna.fna'), "fasta")
)

# --- Candidate genes -----------------------------------------------------
# Every non-hidden entry of fpath is treated as a gene folder; keep only those
# holding more than 5 entries.
# NOTE(review): the reason for the threshold of 5 is not visible here — confirm.
list_gene = [i for i in listdir(fpath) if not i.startswith('.')]
list_gene = [i for i in list_gene if len(listdir(join(fpath, i))) > 5]

# Build the membership sets once instead of re-creating a list for every gene.
ess_genes = set(ess_genelist[0])
neutral_genes = set(neutral_genelist[0])
list_gene_ess = [i for i in list_gene if i in ess_genes]
list_gene_neutral = [i for i in list_gene if i in neutral_genes]

In [9]:
def _concat_frames(frames):
    """Stack a list of DataFrames into one frame with a fresh 0..n-1 index.

    Returns an empty DataFrame when ``frames`` is empty, because ``pd.concat``
    raises ``ValueError`` when given nothing to concatenate.
    """
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


def _keep_all_synonymous(df):
    """Keep the rows whose ``syn_or_not`` entry is non-empty and entirely truthy."""
    # When nothing was collected the frame has no 'syn_or_not' column at all,
    # so indexing it would raise KeyError; pass the empty frame through instead.
    if df.empty:
        return df
    # all() alone is True for an empty sequence; any() rules that case out.
    mask = df['syn_or_not'].apply(lambda flags: all(flags) and any(flags))
    return df[mask]


def run_sgFinder(input_genes, length, tag = 'ess', sg_p ='../../data/output/sg_out' ):
    """Run the syn_sgFinder pipeline over a list of genes and write the results to CSV.

    For every gene, each (sgRNA index, transcript) pair is passed through
    ``sgFinder_transcript.process_BE_window``; pairs for which that call
    returns a truthy value are annotated with ``all_pos_consq`` and their
    result tables are collected.

    Reads the module-level ``record_dict`` and ``list_exp_trsp``.

    Files written (all below '../../data/output/sg_Finder'):
      * ``<gene>/<tag>/df_abe_detail.csv`` and ``df_cbe_detail.csv`` per gene
      * ``<tag>/ABE_synsg.csv`` and ``CBE_synsg.csv`` (rows passing the
        ``syn_or_not`` filter)
      * ``<tag>/ABE_CT.csv`` and ``CBE_CT.csv`` (control tables)

    Parameters
    ----------
    input_genes : iterable of str
        Gene names; each is handed to ``syn_sgFinder.sgFinder_gene`` and used
        as an output folder name.
    length : int
        Forwarded to ``process_BE_window`` as ``window_length``.
    tag : str
        Sub-folder name used to separate the outputs of different runs.
    sg_p : str
        NOTE(review): accepted but not used. The sgRNA directory is hard-coded
        below and differs from this default, so honouring it would change which
        files are read. Left as-is pending confirmation.

    Returns
    -------
    dict
        The dict passed to every ``process_BE_window`` call (intended to
        record filtered sgRNA indices).
    """
    out_path = '../../data/output/sg_Finder'
    sg_path = '../../data/sg_out'  # see NOTE(review) on `sg_p` in the docstring

    # Collect per-transcript tables in lists and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and copies the whole
    # accumulated frame on every call.
    # Assumes the sg_trsp.df_* attributes are DataFrames — TODO confirm.
    abe_frames, cbe_frames = [], []
    abe_ct_frames, cbe_ct_frames = [], []

    dict_filter = {} # Record filtered sgRNA index
    for gene in tqdm(input_genes):
        gene_out = join(out_path, gene, tag)
        makedirs(gene_out, exist_ok=True)

        # Get gene transcript, sequence information
        sg_gene = syn_sgFinder.sgFinder_gene(gene, sg_path=sg_path)
        sg_gene.gene_cds_proc(record_dict, list_exp_trsp) # filter out non-expressed transcript

        # Process sgRNA for each transcript
        abe_detail, cbe_detail = [], []
        for idx, transcript in product(sg_gene.sg_file.index, sg_gene.dict_exons.keys()):
            sg_trsp = syn_sgFinder.sgFinder_transcript(sg_gene.gene, idx, transcript)
            if not sg_trsp.process_BE_window(sg_gene, dict_filter, window_length=length):
                continue
            sg_trsp.all_pos_consq(sg_gene)

            abe_frames.append(sg_trsp.df_abe)
            cbe_frames.append(sg_trsp.df_cbe)

            abe_ct_frames.append(sg_trsp.df_abe_control)
            cbe_ct_frames.append(sg_trsp.df_cbe_control)

            abe_detail.append(sg_trsp.df_abe_ind)
            cbe_detail.append(sg_trsp.df_cbe_ind)

        _concat_frames(abe_detail).to_csv(join(gene_out, 'df_abe_detail.csv'))
        _concat_frames(cbe_detail).to_csv(join(gene_out, 'df_cbe_detail.csv'))

    # Process and save final result
    ABE_synsg = _keep_all_synonymous(_concat_frames(abe_frames))
    CBE_synsg = _keep_all_synonymous(_concat_frames(cbe_frames))

    tag_out = join(out_path, tag)
    makedirs(tag_out, exist_ok=True)
    ABE_synsg.to_csv(join(tag_out, 'ABE_synsg.csv'))
    CBE_synsg.to_csv(join(tag_out, 'CBE_synsg.csv'))

    _concat_frames(abe_ct_frames).to_csv(join(tag_out, 'ABE_CT.csv'))
    _concat_frames(cbe_ct_frames).to_csv(join(tag_out, 'CBE_CT.csv'))
    return dict_filter

In [10]:
# Essential genes: window length 15, outputs written under the 'ess_15' tag.
filter_15 = run_sgFinder(input_genes=list_gene_ess, length=15, tag='ess_15')

  0%|          | 0/122 [00:00<?, ?it/s]

Processing AARS1...Keep 1039/1167 low off-target sgRNAs...


  1%|          | 1/122 [00:15<30:56, 15.35s/it]

Processing ABCE1...Keep 44/486 low off-target sgRNAs...


  2%|▏         | 2/122 [00:15<13:22,  6.69s/it]

Processing ANAPC11...Keep 88/103 low off-target sgRNAs...


  2%|▏         | 3/122 [00:17<08:20,  4.20s/it]

Processing ANAPC4...Keep 525/624 low off-target sgRNAs...


  3%|▎         | 4/122 [00:24<10:42,  5.45s/it]

Processing ANKLE2...Keep 998/1165 low off-target sgRNAs...
ANKLE2 NM_015114                       sequence length: 4590,                       index length:4588
position not found


  4%|▍         | 5/122 [00:39<17:12,  8.82s/it]

Processing ATP6V0C...Keep 82/211 low off-target sgRNAs...


  5%|▍         | 6/122 [00:40<12:00,  6.21s/it]

Processing BUD31...Keep 116/152 low off-target sgRNAs...


  6%|▌         | 7/122 [00:42<09:06,  4.75s/it]

Processing CDC123...Keep 254/283 low off-target sgRNAs...


  7%|▋         | 8/122 [00:45<08:20,  4.39s/it]

Processing CDC16...Keep 337/416 low off-target sgRNAs...


  7%|▋         | 9/122 [00:50<08:34,  4.55s/it]

Processing CDC45...Keep 490/554 low off-target sgRNAs...
position not found
position not found


  8%|▊         | 10/122 [00:57<09:47,  5.25s/it]

Processing CDC5L...Keep 630/765 low off-target sgRNAs...
position not found


  9%|▉         | 11/122 [01:06<12:01,  6.50s/it]

Processing CDT1...Keep 692/822 low off-target sgRNAs...
position not found


 10%|▉         | 12/122 [01:17<13:58,  7.62s/it]

Processing CLP1...Keep 423/477 low off-target sgRNAs...


 11%|█         | 13/122 [01:23<13:03,  7.18s/it]

Processing COPA...Keep 1123/1279 low off-target sgRNAs...


 11%|█▏        | 14/122 [01:40<18:14, 10.14s/it]

Processing COPB2...Keep 733/856 low off-target sgRNAs...
position not found


 12%|█▏        | 15/122 [01:51<18:26, 10.34s/it]

Processing CPSF3...Keep 397/478 low off-target sgRNAs...


 13%|█▎        | 16/122 [01:57<15:55,  9.01s/it]

Processing CSE1L...Keep 621/743 low off-target sgRNAs...


 14%|█▍        | 17/122 [02:06<15:52,  9.07s/it]

Processing DDX54...Keep 1019/1194 low off-target sgRNAs...
position not found
position not found
position not found


 15%|█▍        | 18/122 [02:21<18:46, 10.83s/it]

Processing DDX56...Keep 424/571 low off-target sgRNAs...
DDX56 NM_019082                       sequence length: 1855,                       index length:2872


 16%|█▌        | 19/122 [02:21<13:25,  7.82s/it]

Processing DHX15...Keep 710/810 low off-target sgRNAs...
position not found


 16%|█▋        | 20/122 [02:32<14:41,  8.64s/it]

Processing DONSON...Keep 449/652 low off-target sgRNAs...
DONSON NM_017613                       sequence length: 2500,                       index length:2498


 17%|█▋        | 21/122 [02:39<13:42,  8.15s/it]

Processing DTL...Keep 455/540 low off-target sgRNAs...


 18%|█▊        | 22/122 [02:46<13:01,  7.81s/it]

Processing DYNC1H1...Keep 4860/5487 low off-target sgRNAs...
position not found


 19%|█▉        | 23/122 [04:22<56:31, 34.26s/it]

Processing ECD...Keep 504/594 low off-target sgRNAs...
ECD NM_007265                       sequence length: 3006,                       index length:2999
position not found


 20%|█▉        | 24/122 [04:30<43:01, 26.34s/it]

Processing EEF1A1...Keep 0/526 low off-target sgRNAs...
Processing EEF2...Keep 1082/1197 low off-target sgRNAs...


 21%|██▏       | 26/122 [04:47<29:03, 18.16s/it]

Processing EIF2S1...Keep 252/296 low off-target sgRNAs...
position not found


 22%|██▏       | 27/122 [04:51<23:07, 14.61s/it]

Processing EIF3I...Keep 207/348 low off-target sgRNAs...
EIF3I NM_003757                       sequence length: 1421,                       index length:1919


 23%|██▎       | 28/122 [04:51<17:00, 10.85s/it]

Processing EIF4A3...Keep 371/443 low off-target sgRNAs...
EIF4A3 NM_014740                       sequence length: 2524,                       index length:2523


 24%|██▍       | 29/122 [04:57<14:39,  9.46s/it]

Processing FBL...Keep 297/390 low off-target sgRNAs...


 25%|██▍       | 30/122 [05:01<12:21,  8.06s/it]

Processing GARS1...Keep 561/689 low off-target sgRNAs...
GARS1 NM_002047                       sequence length: 2437,                       index length:2436


 25%|██▌       | 31/122 [05:10<12:39,  8.35s/it]

Processing GPN3...Keep 187/253 low off-target sgRNAs...


 26%|██▌       | 32/122 [05:13<10:09,  6.77s/it]

Processing GRPEL1...Keep 211/265 low off-target sgRNAs...


 27%|██▋       | 33/122 [05:17<08:34,  5.78s/it]

Processing HCFC1...Keep 2481/2967 low off-target sgRNAs...


 28%|██▊       | 34/122 [06:01<24:59, 17.04s/it]

Processing HINFP...Keep 144/158 low off-target sgRNAs...
HINFP NM_198971                       sequence length: 3185,                       index length:3182


 29%|██▊       | 35/122 [06:03<18:23, 12.68s/it]

Processing HNRNPK...Keep 19/406 low off-target sgRNAs...


 30%|██▉       | 36/122 [06:03<12:54,  9.00s/it]

Processing HSPA9...Keep 181/670 low off-target sgRNAs...
HSPA9 NM_004134                       sequence length: 4404,                       index length:4401


 30%|███       | 37/122 [06:06<10:11,  7.19s/it]

Processing IARS1...Keep 800/940 low off-target sgRNAs...
IARS1 NM_002161                       sequence length: 4483,                       index length:4481


 31%|███       | 38/122 [06:19<12:33,  8.97s/it]

Processing ISCU...Keep 80/98 low off-target sgRNAs...


 32%|███▏      | 39/122 [06:21<09:14,  6.68s/it]

Processing KARS1...Keep 239/442 low off-target sgRNAs...


 33%|███▎      | 40/122 [06:25<07:59,  5.84s/it]

Processing KIF11...Keep 732/887 low off-target sgRNAs...


 34%|███▎      | 41/122 [06:37<10:30,  7.78s/it]

Processing KPNB1...Keep 554/729 low off-target sgRNAs...


 34%|███▍      | 42/122 [06:46<10:54,  8.19s/it]

Processing MARS1...Keep 879/1027 low off-target sgRNAs...


 35%|███▌      | 43/122 [07:00<13:07,  9.97s/it]

Processing NAPA...Keep 282/312 low off-target sgRNAs...


 36%|███▌      | 44/122 [07:05<10:46,  8.29s/it]

Processing NCBP2...Keep 70/81 low off-target sgRNAs...


 37%|███▋      | 45/122 [07:06<07:55,  6.17s/it]

Processing NEDD1...Keep 423/503 low off-target sgRNAs...
NEDD1 NM_152905                       sequence length: 4015,                       index length:4014


 38%|███▊      | 46/122 [07:13<08:03,  6.36s/it]

Processing NUDT21...Keep 166/226 low off-target sgRNAs...


 39%|███▊      | 47/122 [07:15<06:34,  5.26s/it]

Processing PAFAH1B1...Keep 141/408 low off-target sgRNAs...


 39%|███▉      | 48/122 [07:18<05:25,  4.40s/it]

Processing PCNA...Keep 112/269 low off-target sgRNAs...


 40%|████      | 49/122 [07:20<04:25,  3.63s/it]

Processing PFDN2...Keep 152/180 low off-target sgRNAs...


 41%|████      | 50/122 [07:22<04:00,  3.34s/it]

Processing PHB...Keep 16/165 low off-target sgRNAs...


 42%|████▏     | 51/122 [07:22<02:52,  2.43s/it]

Processing PLK1...Keep 406/797 low off-target sgRNAs...


 43%|████▎     | 52/122 [07:29<04:26,  3.80s/it]

Processing POLD1...Keep 1335/1521 low off-target sgRNAs...
position not found
position not found


 43%|████▎     | 53/122 [07:53<11:06,  9.65s/it]

Processing POLE...Keep 2486/2818 low off-target sgRNAs...
position not found
position not found


 44%|████▍     | 54/122 [08:40<23:48, 21.01s/it]

Processing POLR1A...Keep 1784/2028 low off-target sgRNAs...


 45%|████▌     | 55/122 [09:14<27:36, 24.73s/it]

Processing POLR2B...Keep 893/1017 low off-target sgRNAs...
position not found


 46%|████▌     | 56/122 [09:29<24:12, 22.01s/it]

Processing POLR2C...Keep 173/272 low off-target sgRNAs...


 47%|████▋     | 57/122 [09:32<17:37, 16.27s/it]

Processing POLR2E...Keep 68/133 low off-target sgRNAs...


 48%|████▊     | 58/122 [09:33<12:31, 11.73s/it]

Processing POLR2L...Keep 67/80 low off-target sgRNAs...


 48%|████▊     | 59/122 [09:35<08:59,  8.57s/it]

Processing POLR3A...Keep 1351/1530 low off-target sgRNAs...


 49%|████▉     | 60/122 [10:00<13:55, 13.48s/it]

Processing PRC1...Keep 451/529 low off-target sgRNAs...
PRC1 NM_003981                       sequence length: 3072,                       index length:3069


 50%|█████     | 61/122 [10:07<12:01, 11.82s/it]

Processing PRPF19...Keep 481/555 low off-target sgRNAs...
PRPF19 NM_014502                       sequence length: 2337,                       index length:2336


 51%|█████     | 62/122 [10:16<10:41, 10.69s/it]

Processing PSMA1...Keep 45/92 low off-target sgRNAs...


 52%|█████▏    | 63/122 [10:16<07:35,  7.72s/it]

Processing PSMA2...Keep 48/184 low off-target sgRNAs...


 52%|█████▏    | 64/122 [10:17<05:28,  5.66s/it]

Processing PSMA3...Keep 91/171 low off-target sgRNAs...


 53%|█████▎    | 65/122 [10:19<04:12,  4.43s/it]

Processing PSMA5...Keep 133/158 low off-target sgRNAs...


 54%|█████▍    | 66/122 [10:21<03:31,  3.79s/it]

Processing PSMA7...Keep 231/264 low off-target sgRNAs...


 55%|█████▍    | 67/122 [10:25<03:29,  3.81s/it]

Processing PSMB4...Keep 291/306 low off-target sgRNAs...


 56%|█████▌    | 68/122 [10:30<03:43,  4.13s/it]

Processing PSMB5...Keep 60/79 low off-target sgRNAs...


 57%|█████▋    | 70/122 [10:31<01:58,  2.28s/it]

Processing RAN...Keep 4/131 low off-target sgRNAs...
Processing RANGAP1...Keep 589/735 low off-target sgRNAs...


 59%|█████▉    | 72/122 [10:41<02:46,  3.33s/it]

Processing RBBP4...Keep 4/354 low off-target sgRNAs...
Processing RNGTT...Keep 360/454 low off-target sgRNAs...


 60%|█████▉    | 73/122 [10:48<03:28,  4.25s/it]

Processing RPA1...Keep 499/545 low off-target sgRNAs...


 61%|██████    | 74/122 [10:57<04:31,  5.66s/it]

Processing RPAP1...Keep 1518/1875 low off-target sgRNAs...


 61%|██████▏   | 75/122 [11:27<10:11, 13.01s/it]

Processing RPL11...Keep 50/174 low off-target sgRNAs...


 65%|██████▍   | 79/122 [11:28<02:57,  4.13s/it]

Processing RPL15...Keep 0/143 low off-target sgRNAs...
Processing RPL18A...Keep 0/227 low off-target sgRNAs...
Processing RPL19...Keep 0/220 low off-target sgRNAs...
Processing RPL4...Keep 12/456 low off-target sgRNAs...


 66%|██████▋   | 81/122 [11:28<01:52,  2.74s/it]

Processing RPS12...Keep 1/121 low off-target sgRNAs...
Processing RPS13...Keep 8/120 low off-target sgRNAs...


 70%|██████▉   | 85/122 [11:29<00:45,  1.22s/it]

Processing RPS15...Keep 0/192 low off-target sgRNAs...
Processing RPS15A...Keep 1/121 low off-target sgRNAs...
Processing RPS18...Keep 0/163 low off-target sgRNAs...
Processing RPS2...Keep 1/391 low off-target sgRNAs...
Processing RPS29...Keep 0/38 low off-target sgRNAs...


 72%|███████▏  | 88/122 [11:29<00:26,  1.30it/s]

Processing RPS4X...Keep 7/290 low off-target sgRNAs...
RPS4X NM_001007                       sequence length: 1474,                       index length:1473
Processing RPS6...Keep 21/265 low off-target sgRNAs...
RPS6 NM_001010                       sequence length: 1369,                       index length:1368


 74%|███████▍  | 90/122 [11:29<00:19,  1.62it/s]

Processing RPS8...Keep 1/220 low off-target sgRNAs...
Processing RRM1...Keep 350/434 low off-target sgRNAs...
Processing RRM2...Keep 99/407 low off-target sgRNAs...


 75%|███████▌  | 92/122 [11:38<00:48,  1.63s/it]

Processing RUVBL1...Keep 284/330 low off-target sgRNAs...


 76%|███████▌  | 93/122 [11:43<01:04,  2.22s/it]

Processing SDE2...Keep 451/540 low off-target sgRNAs...


 77%|███████▋  | 94/122 [11:51<01:35,  3.43s/it]

Processing SEC61A1...Keep 411/527 low off-target sgRNAs...


 78%|███████▊  | 95/122 [11:59<01:56,  4.33s/it]

Processing SF1...Keep 417/514 low off-target sgRNAs...


 79%|███████▊  | 96/122 [12:06<02:11,  5.07s/it]

Processing SF3A1...Keep 826/1012 low off-target sgRNAs...


 80%|███████▉  | 97/122 [12:22<03:14,  7.79s/it]

Processing SF3A2...Keep 404/727 low off-target sgRNAs...


 80%|████████  | 98/122 [12:30<03:04,  7.71s/it]

Processing SF3B3...Keep 1202/1364 low off-target sgRNAs...


 81%|████████  | 99/122 [12:54<04:42, 12.26s/it]

Processing SFPQ...Keep 723/924 low off-target sgRNAs...


 82%|████████▏ | 100/122 [13:08<04:39, 12.72s/it]

Processing SNRNP200...Keep 1979/2502 low off-target sgRNAs...


 83%|████████▎ | 101/122 [13:52<07:34, 21.66s/it]

Processing SNRPB...Keep 204/284 low off-target sgRNAs...


 84%|████████▎ | 102/122 [13:56<05:30, 16.53s/it]

Processing SNRPD2...Keep 23/62 low off-target sgRNAs...
Processing SNRPF...Keep 13/82 low off-target sgRNAs...
SNRPF NM_003095                       sequence length: 452,                       index length:1365


 85%|████████▌ | 104/122 [13:56<02:43,  9.11s/it]

Processing SNW1...Keep 192/536 low off-target sgRNAs...
SNW1 NM_012245                       sequence length: 2129,                       index length:2127


 87%|████████▋ | 106/122 [14:00<01:33,  5.82s/it]

Processing SPC24...Keep 74/98 low off-target sgRNAs...
SPC24 NM_182513                       sequence length: 2290,                       index length:2319
Processing SRSF2...Keep 256/337 low off-target sgRNAs...


 88%|████████▊ | 107/122 [14:05<01:24,  5.63s/it]

Processing SRSF7...Keep 213/233 low off-target sgRNAs...


 89%|████████▊ | 108/122 [14:09<01:12,  5.18s/it]

Processing SSU72...Keep 170/216 low off-target sgRNAs...


 89%|████████▉ | 109/122 [14:13<01:00,  4.68s/it]

Processing TARS1...Keep 567/681 low off-target sgRNAs...


 90%|█████████ | 110/122 [14:24<01:19,  6.64s/it]

Processing TUBB...Keep 0/213 low off-target sgRNAs...
TUBB NM_178014                       sequence length: 2515,                       index length:2511
Processing TUT1...Keep 564/683 low off-target sgRNAs...


 92%|█████████▏| 112/122 [14:35<01:01,  6.18s/it]

Processing TXNL4A...Keep 54/81 low off-target sgRNAs...


 93%|█████████▎| 113/122 [14:36<00:44,  4.95s/it]

Processing U2AF2...Keep 556/608 low off-target sgRNAs...


 93%|█████████▎| 114/122 [14:48<00:52,  6.59s/it]

Processing UBA1...Keep 1123/1263 low off-target sgRNAs...


 94%|█████████▍| 115/122 [15:12<01:18, 11.27s/it]

Processing UBL5...Keep 13/60 low off-target sgRNAs...


 95%|█████████▌| 116/122 [15:12<00:49,  8.23s/it]

Processing UBTF...Keep 553/835 low off-target sgRNAs...
UBTF NM_014233                       sequence length: 4795,                       index length:4794


 96%|█████████▌| 117/122 [15:23<00:45,  9.11s/it]

Processing VARS1...Keep 0/1631 low off-target sgRNAs...
VARS1 NM_006295                       sequence length: 4111,                       index length:4109
Processing VCP...Keep 762/856 low off-target sgRNAs...
VCP NM_007126                       sequence length: 3746,                       index length:3744


 98%|█████████▊| 119/122 [15:40<00:26,  8.69s/it]

Processing WDR70...Keep 284/571 low off-target sgRNAs...
WDR70 NM_018034                       sequence length: 2877,                       index length:2873


 98%|█████████▊| 120/122 [15:46<00:16,  8.00s/it]

Processing WEE1...Keep 261/414 low off-target sgRNAs...


 99%|█████████▉| 121/122 [15:51<00:07,  7.38s/it]

Processing YARS1...Keep 500/574 low off-target sgRNAs...
YARS1 NM_003680                       sequence length: 2443,                       index length:3117


100%|██████████| 122/122 [15:52<00:00,  7.81s/it]


In [11]:
# Neutral genes: window length 15, outputs written under the 'neutral_15' tag.
filter_neutral = run_sgFinder(input_genes=list_gene_neutral, length=15, tag='neutral_15')

  8%|▊         | 3/40 [00:00<00:01, 28.51it/s]

Processing ATP6V1G3...Keep 0/39 low off-target sgRNAs...
Processing BRDT...Keep 517/722 low off-target sgRNAs...
Processing C10orf53...Keep 77/82 low off-target sgRNAs...
Processing CER1...Keep 293/359 low off-target sgRNAs...
CER1 NM_005454                       sequence length: 1231,                       index length:1229
Processing CETN1...Keep 208/244 low off-target sgRNAs...


 25%|██▌       | 10/40 [00:07<00:20,  1.47it/s]

Processing CHAT...Keep 700/779 low off-target sgRNAs...
Processing CNPY1...Keep 58/71 low off-target sgRNAs...
Processing CST9...Keep 150/194 low off-target sgRNAs...
Processing CST9L...Keep 116/173 low off-target sgRNAs...
Processing DEFA6...Keep 105/123 low off-target sgRNAs...
Processing EGR4...Keep 693/803 low off-target sgRNAs...
Processing FCRL4...Keep 420/583 low off-target sgRNAs...


 35%|███▌      | 14/40 [00:07<00:10,  2.49it/s]

Processing GH2...Keep 68/195 low off-target sgRNAs...
Processing GLT6D1...Keep 285/322 low off-target sgRNAs...
Processing GPX6...Keep 84/239 low off-target sgRNAs...
Processing IFNA10...Keep 16/245 low off-target sgRNAs...
Processing IL26...Keep 118/149 low off-target sgRNAs...
Processing INSM2...Keep 853/970 low off-target sgRNAs...


 55%|█████▌    | 22/40 [00:20<00:17,  1.04it/s]

Processing KRT26...Keep 405/523 low off-target sgRNAs...
Processing KRT76...Keep 355/853 low off-target sgRNAs...
Processing LALBA...Keep 127/143 low off-target sgRNAs...
Processing LBX1...Keep 408/459 low off-target sgRNAs...
Processing LCT...Keep 2246/2509 low off-target sgRNAs...


 55%|█████▌    | 22/40 [00:39<00:17,  1.04it/s]

position not found


 65%|██████▌   | 26/40 [00:55<00:45,  3.24s/it]

Processing MBL2...Keep 235/304 low off-target sgRNAs...
Processing MRGPRX4...Keep 121/442 low off-target sgRNAs...
Processing MUCL3...Keep 0/1748 low off-target sgRNAs...
Processing NANOS2...Keep 205/221 low off-target sgRNAs...
Processing NOTO...Keep 336/382 low off-target sgRNAs...
Processing PDILT...Keep 565/653 low off-target sgRNAs...
Processing PRDM9...Keep 112/1156 low off-target sgRNAs...


 75%|███████▌  | 30/40 [00:57<00:21,  2.16s/it]

Processing PRLHR...Keep 550/618 low off-target sgRNAs...
Processing PRSS33...Keep 352/402 low off-target sgRNAs...


 80%|████████  | 32/40 [01:02<00:17,  2.24s/it]

Processing RFPL4B...Keep 269/337 low off-target sgRNAs...


 85%|████████▌ | 34/40 [01:06<00:13,  2.18s/it]

Processing RNASE11...Keep 175/227 low off-target sgRNAs...
Processing RPTN...Keep 527/1002 low off-target sgRNAs...
Processing SLC6A18...Keep 721/852 low off-target sgRNAs...
Processing TEX55...Keep 605/678 low off-target sgRNAs...
Processing TMPRSS15...Keep 765/903 low off-target sgRNAs...


100%|██████████| 40/40 [01:17<00:00,  1.94s/it]

Processing TPD52L3...Keep 130/150 low off-target sgRNAs...
Processing VN1R2...Keep 278/473 low off-target sgRNAs...





In [12]:
### Save the per-key entry counts of the dicts returned by run_sgFinder
# For each key, count how many entries were recorded under it.
values_2 = [len(filter_15[gene]) for gene in filter_15]
values_1 = [len(filter_neutral[gene]) for gene in filter_neutral]

ess_counts = {'gene': filter_15.keys(), 'n_filtered': values_2}
neutral_counts = {'gene': filter_neutral.keys(), 'n_filtered': values_1}

filter_ess = pd.DataFrame(ess_counts)
# NOTE(review): this rebinds `filter_neutral` from the dict returned by
# run_sgFinder to a DataFrame, so re-running this cell on its own no longer
# sees the dict. Re-run the run_sgFinder cell first.
filter_neutral = pd.DataFrame(neutral_counts)

# One summary table per run, stored next to that run's other outputs.
filter_ess.to_csv(join(out_path, 'ess_15', 'filter_gene.csv'))
filter_neutral.to_csv(join(out_path, 'neutral_15', 'filter_gene.csv'))

In [260]:
# When gene is + and sg is +  .seq[sequence_idx:sequence_idx+11]
# When gene is + and sg is - .seq[sequence_idx+12:sequence_idx+23].reverse_complement()
# When gene is - (the sequence is -, ATG) and sg is - .seq[sequence_idx-22:sequence_idx-11]
# When gene is - and sg is + .seq[sequence_idx-10:sequence_idx+1].reverse_complement()

Seq('CGCTTTCGCGC')