In [None]:
# This notebook is an additional one for including missense sgRNA

In [1]:
import pandas as pd 
from os import listdir
from os.path import join
import sys
sys.path.append('../src')
from utils import *
import matplotlib.pyplot as plt 

import collections
import itertools
from Bio import SeqIO
import math


In [28]:
%load_ext autoreload
%autoreload 2

In [2]:
out_path = '../data/output/sg_Finder'
mrna_path = '../data/MANE'
fpath = '../data/output/sg_out/'

# Load all expressed transcripts
ess_genelist = pd.read_csv('../data/output/k562_comm_ess.txt', header =None)
neutral_genelist = pd.read_csv('../data/output/k562_neutral.txt', header =None)

fpath = '../data/output/sg_out/'
list_gene = [i for i in listdir(fpath) if not i.startswith('.')]
list_gene = [i for i in list_gene if len(listdir(join(fpath,i))) > 5]
list_gene_ess = [i for i in list_gene if i in list(ess_genelist[0])]
list_gene_neutral = [i for i in list_gene if i in list(neutral_genelist[0])]

In [3]:
# How many genes not have any sgRNA detected from CHOPCHOP
# for gene_folder in join(out_path,gene,)
# 6 genes are first filtered out
inter_gene =list(set(ess_genelist[0]).intersection(set(list_gene_ess)))
left_out_genes = [ i for i in list(set(ess_genelist[0]))  if i not in list_gene_ess]

In [4]:
print(f'{len(left_out_genes)} genes didnt have any sgRNA\
generated by CHOPCHOP:{left_out_genes}')
### Usually because the isoforms varies too much that CHOPCHOP can't find a union. Tested SARS1 on website version.

6 genes didnt have any sgRNAgenerated by CHOPCHOP:['GINS1', 'POLD3', 'RPS9', 'FARSB', 'LRR1', 'SARS1']


In [5]:
### Filtered genes. They are the same across 11nt window and 13nt window
df_filter = pd.read_csv(join(out_path, 'ess_15','filter_gene.csv'), index_col =0)
df_filter = df_filter.set_index('gene')

***Get synonymous and stop codon sgRNA for ess and neutral genes***

In [6]:
# For missense controls: we want the sgRNA edit mut number be close to the syn edit number
# For empty window controls: Add more sgRNA that are close to the target edit site

In [3]:
stop_abe_ess,mis_abe_ess, syn_abe_ess = sg_consequence('ess_15', list_gene_ess, 'ess','df_abe_detail.csv')
stop_cbe_ess,mis_cbe_ess, syn_cbe_ess = sg_consequence('ess_15', list_gene_ess, 'ess','df_cbe_detail.csv')

In [5]:
syn_abe_ess['mut_n'] = syn_abe_ess['Synonymous'].apply(lambda x: len(x))
syn_cbe_ess['mut_n'] = syn_cbe_ess['Synonymous'].apply(lambda x: len(x))
mis_cbe_ess['mut_n'] = mis_cbe_ess['Synonymous'].apply(lambda x: len(x))
mis_abe_ess['mut_n'] = mis_abe_ess['Synonymous'].apply(lambda x: len(x))

In [6]:
syn_abe_stats = syn_abe_ess.groupby('gene').agg({'mut_n':lambda x: dict(collections.Counter(x)), 'Synonymous':'size',\
     'AA_pos':lambda x: [i[0].split('-')[0] for i in x]}) # Just keep the edited amino acid start positions, 1 position for each sg
syn_abe_stats['mut_freq'] = syn_abe_stats.apply(lambda x:{k:v/x['Synonymous'] for k,v in x['mut_n'].items()}, axis = 1)
syn_abe_stats['sg_freq'] = syn_abe_stats['Synonymous']/syn_abe_stats['Synonymous'].sum()

mis_abe_ess['cut_position'] = mis_abe_ess.apply(lambda x: [i.split('-')[0] for i in x['AA_pos']][0], axis = 1)

In [7]:
syn_cbe_stats = syn_cbe_ess.groupby('gene').agg({'mut_n':lambda x: dict(collections.Counter(x)), 'Synonymous':'size',\
     'AA_pos':lambda x: [i[0].split('-')[0] for i in x]}) # Just keep the edited amino acid start positions, 1 position for each sg
syn_cbe_stats['mut_freq'] = syn_cbe_stats.apply(lambda x:{k:v/x['Synonymous'] for k,v in x['mut_n'].items()}, axis = 1)
syn_cbe_stats['sg_freq'] = syn_cbe_stats['Synonymous']/syn_cbe_stats['Synonymous'].sum()

mis_cbe_ess['cut_position'] = mis_cbe_ess.apply(lambda x: [i.split('-')[0] for i in x['AA_pos']][0], axis = 1)

In [9]:
def calc_distance(x,target_pos):
    l_d = []
    for k in target_pos:
        d = abs(int(x['cut_position']) - int(k))
        if d == 0:
            d = 0.5
        l_d.append(d)
    return(sum([1/i for i in l_d]))

In [10]:
tot = math.ceil(len(syn_abe_ess)/10)
mis_sg_abe = []
with open('file_abe.txt', 'w') as output:

    for genes in list(syn_abe_ess.gene.unique()):
        # print(genes)
        n_sg_freq = syn_abe_stats.loc[genes, 'sg_freq']
        n_mis_sg = tot*n_sg_freq
        # First determine how many sgRNAs, then determine what cutsite number is for that sgRNA
        n_sg = np.rint(n_mis_sg) 
        if n_sg == 0: n_sg = 1 # add +1 if no sgRNA assigned for that gene

        mut_freq = syn_abe_stats.loc[genes, 'mut_freq']
        mut_mis_n = dict(sorted({k:v*n_sg for k,v in mut_freq.items()}.items(),key=lambda item: item[1],reverse = True))
        # print(n_sg, mut_mis_n, syn_abe_stats.loc[genes,'AA_pos'])

        df_mis_gene = mis_abe_ess[mis_abe_ess['gene'] == genes]
        if df_mis_gene.empty:
            print(f'no missense sg for {genes}')
            continue
        if not df_mis_gene[df_mis_gene['mut_n'].isin(list(mut_mis_n.keys()))].empty:
            df_mis_gene = df_mis_gene[df_mis_gene['mut_n'].isin(list(mut_mis_n.keys()))]
        # tot_dis = [(x-y for x in syn_abe_stats.loc[genes,'AA_pos']) for y in syn_abe_stats.loc[genes,'AA_pos']]

        df_mis_gene['distance'] = df_mis_gene.apply(lambda x: calc_distance(x,syn_abe_stats.loc[genes,'AA_pos']), axis = 1)
        df_mis_gene = df_mis_gene.sort_values('distance', ascending = False)


        if n_sg == 1:
            sg = df_mis_gene[df_mis_gene['mut_n'] == list(mut_mis_n.keys())[0]].head(1)
            if sg.empty and len(list(mut_mis_n.keys()))> 1:
                sg = df_mis_gene[df_mis_gene['mut_n'] == list(mut_mis_n.keys())[1]].head(1)
            else:
                sg = df_mis_gene.head(1)

        else:
            list_sg = []
            for k, v in mut_mis_n.items():
                v = math.ceil(v)
                list_sg.append(df_mis_gene[df_mis_gene['mut_n'] == k].drop_duplicates(subset='cut_position', keep="first").head(v))
            sg = pd.concat(list_sg, axis = 0)
        if len(sg) > n_sg:
            sg = sg.head(int(n_sg))
        #     output.write('genes'+'\n')
        #     output.write()
        output.write(genes+'\n')
        output.write(str(n_sg)+'\n')
        output.write(str(mut_mis_n)+'\n')
        output.write(str(syn_abe_stats.loc[genes,'AA_pos'])+'\n')
        output.write(str(sg)+'\n')
        mis_sg_abe.append(sg)
        

no missense sg for RPS2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [11]:
tot = math.ceil(len(syn_cbe_ess)/10)
mis_sg_cbe = []
with open('file_cbe.txt', 'w') as output:

    for genes in list(syn_cbe_ess.gene.unique()):
        # print(genes)
        n_sg_freq = syn_cbe_stats.loc[genes, 'sg_freq']
        n_mis_sg = tot*n_sg_freq
        # First determine how many sgRNAs, then determine what cutsite number is for that sgRNA
        n_sg = np.rint(n_mis_sg) 
        if n_sg == 0: n_sg = 1 # add +1 if no sgRNA assigned for that gene

        mut_freq = syn_cbe_stats.loc[genes, 'mut_freq']
        mut_mis_n = dict(sorted({k:v*n_sg for k,v in mut_freq.items()}.items(),key=lambda item: item[1],reverse = True))
        # print(n_sg, mut_mis_n, syn_abe_stats.loc[genes,'AA_pos'])

        df_mis_gene = mis_cbe_ess[mis_cbe_ess['gene'] == genes]
        if df_mis_gene.empty:
            print(f'no missense sg for {genes}')
            continue
        if not df_mis_gene[df_mis_gene['mut_n'].isin(list(mut_mis_n.keys()))].empty:
            df_mis_gene = df_mis_gene[df_mis_gene['mut_n'].isin(list(mut_mis_n.keys()))]
        # tot_dis = [(x-y for x in syn_abe_stats.loc[genes,'AA_pos']) for y in syn_abe_stats.loc[genes,'AA_pos']]

        df_mis_gene['distance'] = df_mis_gene.apply(lambda x: calc_distance(x,syn_cbe_stats.loc[genes,'AA_pos']), axis = 1)
        df_mis_gene = df_mis_gene.sort_values('distance', ascending = False)


        if n_sg == 1:
            sg = df_mis_gene[df_mis_gene['mut_n'] == list(mut_mis_n.keys())[0]].head(1)
            if sg.empty and len(list(mut_mis_n.keys()))> 1:
                sg = df_mis_gene[df_mis_gene['mut_n'] == list(mut_mis_n.keys())[1]].head(1)
            else:
                sg = df_mis_gene.head(1)

        else:
            list_sg = []
            for k, v in mut_mis_n.items():
                v = math.ceil(v)
                list_sg.append(df_mis_gene[df_mis_gene['mut_n'] == k].drop_duplicates(subset='cut_position', keep="first").head(v))
            sg = pd.concat(list_sg, axis = 0)
        if len(sg) > n_sg:
            sg = sg.head(int(n_sg))
        #     output.write('genes'+'\n')
        #     output.write()
        output.write(genes+'\n')
        output.write(str(n_sg)+'\n')
        output.write(str(mut_mis_n)+'\n')
        output.write(str(syn_cbe_stats.loc[genes,'AA_pos'])+'\n')
        output.write(str(sg)+'\n')
        mis_sg_cbe.append(sg)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
### 1. For each gene, the frequency of syn and mis sgRNA should match
### 2. For each gene, the sgRNAs should be targeting same region 
### 3. For each gene, the number of edits of syn and mis should match

In [12]:
df_ety_abe = pd.read_csv(join(out_path,'ess_15','ABE_CT.csv'))
df_ety_cbe = pd.read_csv(join(out_path,'ess_15','CBE_CT.csv'))
df_ety_abe = df_ety_abe.set_index('sgRNA')
df_ety_cbe = df_ety_cbe.set_index('sgRNA')

# For each gene, the sgRNAs should target the similar regions as possible

In [13]:
df_ety_abe['cut_position'] = df_ety_abe['sgRNA_start']
df_ety_cbe['cut_position'] = df_ety_cbe['sgRNA_start']

In [14]:
tot = math.ceil(len(syn_abe_ess)/10)
ety_sg_abe = []
with open('file_abe_ety.txt', 'w') as output:

    for genes in list(syn_abe_ess.gene.unique()):
        # print(genes)
        n_sg_freq = syn_abe_stats.loc[genes, 'sg_freq']
        n_ety_sg = tot*n_sg_freq
        # First determine how many sgRNAs, then determine what cutsite number is for that sgRNA
        n_sg = np.rint(n_ety_sg) 
        if n_sg == 0: n_sg = 1 # add +1 if no sgRNA assigned for that gene

        df_ety_gene = df_ety_abe[df_ety_abe['gene'] == genes]
        if df_ety_gene.empty:
            print(f'no empty window sg for {genes}')
            continue

        df_ety_gene['distance'] = df_ety_gene.apply(lambda x: calc_distance(x,syn_abe_stats.loc[genes,'AA_pos']), axis = 1)
        df_ety_gene = df_ety_gene.sort_values('distance', ascending = False)
        # print(df_ety_gene)

        sg = df_ety_gene.drop_duplicates(subset='cut_position', keep="first").head(int(n_sg))
        #     output.write('genes'+'\n')
        #     output.write()
        output.write(genes+'\n')
        output.write(str(n_sg)+'\n')
        output.write(str(syn_abe_stats.loc[genes,'AA_pos'])+'\n')
        output.write(str(sg)+'\n')
        ety_sg_abe.append(sg)
        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

no empty window sg for RPS2
no empty window sg for PCNA
no empty window sg for EIF2S1
no empty window sg for RPL11
no empty window sg for PSMA3
no empty window sg for PSMB5
no empty window sg for CDC16
no empty window sg for PSMA2
no empty window sg for PSMA5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

no empty window sg for HNRNPK
no empty window sg for UBL5
no empty window sg for NEDD1
no empty window sg for KPNB1
no empty window sg for POLR2E
no empty window sg for PAFAH1B1
no empty window sg for POLR2L
no empty window sg for PSMA1
no empty window sg for ISCU
no empty window sg for NCBP2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [15]:
tot = math.ceil(len(syn_cbe_ess)/10)
ety_sg_cbe = []
with open('file_cbeEty.txt', 'w') as output:

    for genes in list(syn_cbe_ess.gene.unique()):
        # print(genes)
        n_sg_freq = syn_cbe_stats.loc[genes, 'sg_freq']
        n_ety_sg = tot*n_sg_freq
        # First determine how many sgRNAs, then determine what cutsite number is for that sgRNA
        n_sg = np.rint(n_ety_sg) 
        if n_sg == 0: n_sg = 1 # add +1 if no sgRNA assigned for that gene

        df_ety_gene = df_ety_cbe[df_ety_cbe['gene'] == genes]
        if df_ety_gene.empty:
            print(f'no empty window sg for {genes}')
            continue

        df_ety_gene['distance'] = df_ety_gene.apply(lambda x: calc_distance(x,syn_cbe_stats.loc[genes,'AA_pos']), axis = 1)
        df_ety_gene = df_ety_gene.sort_values('distance', ascending = False)
        # print(df_ety_gene)

        sg = df_ety_gene.drop_duplicates(subset='cut_position', keep="first").head(int(n_sg))
        #     output.write('genes'+'\n')
        #     output.write()
        output.write(genes+'\n')
        output.write(str(n_sg)+'\n')
        output.write(str(syn_cbe_stats.loc[genes,'AA_pos'])+'\n')
        output.write(str(sg)+'\n')
        ety_sg_cbe.append(sg)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

no empty window sg for ATP6V0C
no empty window sg for ABCE1
no empty window sg for HINFP


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

no empty window sg for PSMB5
no empty window sg for PSMB4
no empty window sg for TXNL4A
no empty window sg for GRPEL1
no empty window sg for RAN
no empty window sg for POLR2C
no empty window sg for SRSF2
no empty window sg for POLR2E
no empty window sg for POLR2L
no empty window sg for SNRPF
no empty window sg for RPS13
no empty window sg for PSMA1
no empty window sg for NAPA
no empty window sg for NCBP2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [16]:
mis_df_cbe = pd.concat(mis_sg_cbe, axis = 0)
mis_df_abe = pd.concat(mis_sg_abe, axis = 0)
ety_df_cbe = pd.concat(ety_sg_cbe, axis = 0)
ety_df_abe = pd.concat(ety_sg_abe, axis = 0)


In [17]:
# drop the sgRNA that's already in the library
ETY_ordered = SeqIO.parse('../data/ETY.fasta','fasta')


In [18]:
list_ety_ordered = [i.seq for i in ETY_ordered]    

In [19]:
ety_cbe_order = [i for i in ety_df_cbe.index if i not in list_ety_ordered]
ety_abe_order = [i for i in ety_df_abe.index if i not in list_ety_ordered]
mis_cbe_order = list(mis_df_cbe.index)
mis_abe_order = list(mis_df_abe.index)

In [22]:
import pickle

In [23]:
pickle.dump(mis_abe_order+ety_abe_order, open('ABE_MIS_ETY.pkl','wb'))
pickle.dump(mis_cbe_order+ety_cbe_order, open('CBE_MIS_ETY.pkl','wb'))

In [28]:
rmisabe_noprimer_list,rmisabe_pass_list = get_noprimer_list(mis_abe_order+ety_abe_order)
rmiscbe_noprimer_list,rmiscbe_pass_list = get_noprimer_list(mis_cbe_order+ety_cbe_order)


In [30]:
list_primers = []

In [31]:
F = 'GGGATCACTTACTACTTCCG'
R = 'GGTACCCTATCACTTAACCG'
list_primers.append(F)
list_primers.append(R)

Fwd,Rev,oligoTM, primerTM = calc_best_primerTM(F,R, rmisabe_noprimer_list)
ABEMIS_final_seq = attach_group_primer(rmisabe_pass_list,Fwd[:18],Rev[:18])
df_misabe_final = pd.DataFrame({'sequence':ABEMIS_final_seq})
df_misabe_final['sequence'] = df_misabe_final['sequence'].apply(lambda x:''.join(x))
df_misabe_final = df_misabe_final.reset_index()
df_misabe_final['name'] = 'ABEMIS_'+df_misabe_final['index'].astype(str)

In [32]:
F = 'CACTAGAGTAATCGCTACCG'
R = 'GGGCTACTAGACATAACTCG'
list_primers.append(F)
list_primers.append(R)

Fwd,Rev,oligoTM, primerTM = calc_best_primerTM(F,R, rmiscbe_noprimer_list)
CBEMIS_final_seq = attach_group_primer(rmiscbe_pass_list,Fwd[:18],Rev[:18])
df_miscbe_final = pd.DataFrame({'sequence':CBEMIS_final_seq})
df_miscbe_final['sequence'] = df_miscbe_final['sequence'].apply(lambda x:''.join(x))
df_miscbe_final = df_miscbe_final.reset_index()
df_miscbe_final['name'] = 'CBEMIS_'+df_miscbe_final['index'].astype(str)

In [33]:
df = pd.concat([df_misabe_final,df_miscbe_final], axis = 0)

In [34]:
df[['name','sequence']].to_csv('../data/output/sgMIS_order2.csv',index = False)

***Blastn primers in terminal***

In [None]:
# conda activate synSg
# cd /data/output/primers
# makeblastdb -in rREsg_ABE_MIS.fasta -dbtype nucl -parse_seqids -max_file_sz 4000000000
# blastn -query PossiblePrimers.fasta -db rREsg_ABE_MIS.fasta -out Blast_Primer_ABE_MIS.txt -word_size 7 -dust no
#or
# blastn -query PossiblePrimers.fasta -db rREsg_ABE_MIS.fasta -out Blast_Primer_ABE_MIS.txt -word_size 7 -dust no -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids"

# makeblastdb -in rREsg_CBE_MIS.fasta -dbtype nucl -parse_seqids -max_file_sz 4000000000
# blastn -query PossiblePrimers.fasta -db rREsg_CBE_MIS.fasta -out Blast_Primer_CBE_MIS.txt -word_size 7 -dust no
#or
# blastn -query PossiblePrimers.fasta -db rREsg_CBE_MIS.fasta -out Blast_Primer_CBE_MIS.txt -word_size 7 -dust no -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids"

In [43]:
BlastQueryPrimer_ABEMIS = read_blast_res('Blast_Primer_ABE_MIS.txt')
BlastQueryPrimer_CBEMIS = read_blast_res('Blast_Primer_CBE_MIS.txt')

In [44]:
PrimerHeteroTM_ABEMIS = calc_TM(BlastQueryPrimer_ABEMIS[0:200],rmisabe_noprimer_list)

100%|██████████| 200/200 [02:32<00:00,  1.32it/s]


In [45]:
PrimerHeteroTM_CBEMIS = calc_TM(BlastQueryPrimer_CBEMIS[0:200],rmiscbe_noprimer_list)

100%|██████████| 200/200 [02:33<00:00,  1.31it/s]


In [46]:
GoodPrimer_ABEMIS = sel_primers(21,PrimerHeteroTM_ABEMIS)
GoodPrimer_CBEMIS = sel_primers(21,PrimerHeteroTM_CBEMIS)


In [48]:
GoodPrimer_ABEMIS

Unnamed: 0,Primer A,Tm A,Primer B,Tm B,Sense Tm,AntiSense B Tm,SenseAnti Sum,Tm Dif
5438,CTACCCTAACCACTATTCCG,48.630052,GGCAAATGATCTGACTGTCG,51.260223,-63.963557,-81.402469,-145.366027,2.630171
4293,CAGCCTAGACAAAGTTAGCG,50.630893,GGGATAATCTGAGGGAAACG,49.563550,-47.155030,-90.094286,-137.249316,1.067343
2043,CCGTATCTATATCCCGATCG,48.623922,CAGCCTAGACAAAGTTAGCG,50.630893,-54.602413,-76.064958,-130.667371,2.006970
1606,GCAAGAGGTTCGGTAAATCG,51.298629,GCACTTCCCATGTATTCACG,51.227038,-54.968901,-72.398886,-127.367787,0.071591
728,GCCGTAGTAAGTTGAATGCG,51.714860,GCCTGATATCACTCCTATCG,48.941093,-79.449986,-47.897866,-127.347852,2.773767
...,...,...,...,...,...,...,...,...
84,CGTTAAGAACTACCCGATCG,50.238133,CGTTAAGAACTACCCGATCG,50.238133,5.302336,50.169678,55.472015,0.000000
1008,GAGTAGATACCCTAGGAACG,47.800627,GAGTAGATACCCTAGGAACG,47.800627,8.088309,47.730453,55.818762,0.000000
3360,GTTGGATTGTGTACGCATCG,51.994069,GTTGGATTGTGTACGCATCG,51.994069,6.282672,51.925690,58.208362,0.000000
3192,CGATATCCATGGGTAGTACG,48.997316,CGATATCCATGGGTAGTACG,48.997316,9.848352,48.927760,58.776112,0.000000


In [51]:
list_primers = []

In [56]:
F = 'CTACCCTAACCACTATTCCG'
R = 'GGCAAATGATCTGACTGTCG'
list_primers.append(F)
list_primers.append(R)

Fwd,Rev,oligoTM, primerTM = calc_best_primerTM(F,R, rmisabe_noprimer_list)
ABEMIS_final_seq = attach_group_primer(rmisabe_pass_list,Fwd[:18],Rev[:18])
df_abemis_final = pd.DataFrame({'sequence':ABEMIS_final_seq})
df_abemis_final['sequence'] = df_abemis_final['sequence'].apply(lambda x:''.join(x))
df_abemis_final = df_abemis_final.reset_index()
df_abemis_final['name'] = 'ABEMIS_'+df_abemis_final['index'].astype(str)

In [52]:
GoodPrimer_CBEMIS

Unnamed: 0,Primer A,Tm A,Primer B,Tm B,Sense Tm,AntiSense B Tm,SenseAnti Sum,Tm Dif
6347,CCTCGTTCTAAGTACACTCG,49.500430,CCATAGGGCATTACTATCCG,49.120702,-63.330900,-79.449986,-142.780887,0.379728
3838,CGCAATCCTATACCGTTACG,50.670030,CGACAGTCAGACTAAGATCG,49.605820,-66.929547,-66.929547,-133.859095,1.064210
4917,CCGTAACCTAGCTATACACG,49.307670,CGACAGTCAGACTAAGATCG,49.605820,-66.929547,-66.929547,-133.859095,0.298151
3257,CCATAGGGCATTACTATCCG,49.120702,CGACAGTCAGACTAAGATCG,49.605820,-79.449986,-53.031819,-132.481806,0.485119
1699,CGACAGTCAGACTAAGATCG,49.605820,CCATAGGGCATTACTATCCG,49.120702,-79.449986,-52.083726,-131.533712,0.485119
...,...,...,...,...,...,...,...,...
6132,CCCTGGCTTATAACGTTACG,50.400600,CCCTGGCTTATAACGTTACG,50.400600,6.523076,50.331333,56.854409,0.000000
2016,CATTGGCGCTCCTATTAACG,51.431680,CATTGGCGCTCCTATTAACG,51.431680,5.511951,51.362720,56.874670,0.000000
4032,GGAGTTACCCTAGGAAATCG,48.916520,GGAGTTACCCTAGGAAATCG,48.916520,8.088309,48.846042,56.934351,0.000000
2268,GGAACCGGATCACATAATCG,50.443308,GGAACCGGATCACATAATCG,50.443308,7.061186,50.373846,57.435032,0.000000


In [53]:
F = 'CCTCGTTCTAAGTACACTCG'
R = 'CCATAGGGCATTACTATCCG'
list_primers.append(F)
list_primers.append(R)
Fwd,Rev,oligoTM, primerTM = calc_best_primerTM(F,R, rmiscbe_noprimer_list)
CBEMIS_final_seq = attach_group_primer(rmiscbe_pass_list,Fwd[:18],Rev[:18])
df_cbemis_final = pd.DataFrame({'sequence':CBEMIS_final_seq})
df_cbemis_final['sequence'] = df_cbemis_final['sequence'].apply(lambda x:''.join(x))
df_cbemis_final = df_cbemis_final.reset_index()
df_cbemis_final['name'] = 'CBEMIS_'+df_cbemis_final['index'].astype(str)

In [57]:
df = pd.concat([df_abemis_final,df_cbemis_final], axis = 0)

In [58]:
df[['name','sequence']].to_csv('../data/output/mis_sg_order.csv',index = False)

In [59]:
from itertools import combinations
for (i,j) in combinations(list_primers,2):
    print(p3.calcHeterodimer(i,j))

ThermoResult(structure_found=True, tm=-63.33, dg=-1747.76, dh=-19800.00, ds=-58.20)
ThermoResult(structure_found=True, tm=-56.51, dg=-1287.99, dh=-23000.00, ds=-70.00)
ThermoResult(structure_found=True, tm=-78.09, dg=-1598.14, dh=-16300.00, ds=-47.40)
ThermoResult(structure_found=True, tm=-56.51, dg=-1287.99, dh=-23000.00, ds=-70.00)
ThermoResult(structure_found=True, tm=-78.09, dg=-1598.14, dh=-16300.00, ds=-47.40)
ThermoResult(structure_found=True, tm=-11.91, dg=-4008.31, dh=-38500.00, ds=-111.21)
ThermoResult(structure_found=True, tm=-31.31, dg=-2488.00, dh=-30900.00, ds=-91.61)
ThermoResult(structure_found=True, tm=-11.91, dg=-4008.31, dh=-38500.00, ds=-111.21)
ThermoResult(structure_found=True, tm=-31.31, dg=-2488.00, dh=-30900.00, ds=-91.61)
ThermoResult(structure_found=True, tm=-63.96, dg=-1466.19, dh=-20200.00, ds=-60.40)
ThermoResult(structure_found=True, tm=-71.82, dg=-838.37, dh=-19200.00, ds=-59.20)
ThermoResult(structure_found=True, tm=-63.96, dg=-1466.19, dh=-20200.00, ds