In [53]:
import re
import os
import glob
import pandas as pd
from tqdm import tqdm
from scipy.stats import zscore
from Bio import SeqIO

In [3]:
# for record in SeqIO.parse('../oligo/data/fastas/prey.fasta', "fasta"):
#     print('>'+record.id+'_comp')
#     print(record.seq.complement())
#     with open('../oligo/data/fastas/prey_comp.fasta', 'w') as f:
#         f.write('>'+record.id+'_comp'+'\n')
#         f.write(str(record.seq.complement())+'\n')

In [19]:
def get_truth_set(truth_file):
    """Load an edge-list CSV and return its edges as unordered node pairs.

    Parameters
    ----------
    truth_file : str or path-like
        CSV file that has 'Node1' and 'Node2' columns.

    Returns
    -------
    set of frozenset
        One frozenset per distinct row pair; (a, b) and (b, a) collapse
        to the same element because the pair is stored unordered.
    """
    edges = pd.read_csv(truth_file)
    return {frozenset(pair) for pair in zip(edges['Node1'], edges['Node2'])}

def get_library_set(lib_file):
    """Load a library edge-list CSV and return its unambiguous edges.

    Rows in which either node contains the character 'N' are skipped.

    Parameters
    ----------
    lib_file : str or path-like
        CSV file that has 'Node1' and 'Node2' columns.

    Returns
    -------
    set of frozenset
        One frozenset per retained row holding that row's two node values;
        the pair is unordered, so (a, b) and (b, a) are the same element.
    """
    # Read from the argument; the previous body ignored `lib_file` and
    # silently depended on a global named `library_file`.
    edges = pd.read_csv(lib_file)
    return {
        frozenset({node1, node2})
        for node1, node2 in zip(edges['Node1'], edges['Node2'])
        if 'N' not in node1 and 'N' not in node2
    }

def calc_zscore(results_file):
    """Read a results CSV and add a z-score column for its 'count' column.

    The z-score is (count - mean) / std, where std is computed with
    ddof=0 over the whole 'count' column.

    Parameters
    ----------
    results_file : str or path-like
        CSV file that has a 'count' column.

    Returns
    -------
    pandas.DataFrame
        The file contents with an added 'zscore' column.
    """
    results = pd.read_csv(results_file)
    counts = results['count']
    centered = counts - counts.mean()
    results['zscore'] = centered / counts.std(ddof=0)
    return results

def label_exp(df, truth_set, lib_set, exp):
    """Tag each row of `df` with a match label and an experiment name.

    A row's unordered ('node1', 'node2') pair is checked first against
    `truth_set`, then against `lib_set`:
      - in `truth_set`      -> 'oligo match'
      - else in `lib_set`   -> 'library match'
      - otherwise           -> 'off target'

    `df` is modified in place ('label' and 'exp' columns are written) and
    the same object is returned.
    """
    labels = []
    for first, second in zip(df['node1'], df['node2']):
        pair = frozenset({first, second})
        if pair in truth_set:
            labels.append('oligo match')
        elif pair in lib_set:
            labels.append('library match')
        else:
            labels.append('off target')
    df['label'] = labels
    df['exp'] = exp
    return df

In [47]:
# k-mer lengths to process: 11 through 15 inclusive.
kmers = [*range(11, 16)]
kmers

[11, 12, 13, 14, 15]

In [55]:
# For every k-mer length and every oligo (1-3), label each "*_unique.csv"
# result file and write an "*_enrichment.csv" file next to it.
for k in kmers:
    # The library edge set depends only on k, so it is loaded once per k.
    library_file = f'../oligo/data/bait_kmers/library_{k}mers.csv'
    lib_set = get_library_set(library_file)
    for i in range(1,4):
        files = glob.glob(f'../oligo/results/{k}mer/oligo_{i}*{k}mers_unique.csv')
        print(files)
        if not files:
            # Nothing to label for this oligo/k; skip reading the truth file.
            continue
        truth_file = f'../oligo/data/bait_kmers/oligo_{i}_{k}mers.csv'
        outfile_prefix = f'oligo_{i}'
        # The truth set depends only on (i, k); read it once instead of
        # re-parsing the same CSV for every result file.
        truth_set = get_truth_set(truth_file)
        for f in tqdm(files):
            # Experiment tag: the first two word characters flanked by
            # underscores in the path (e.g. '1w' or 'bg'). Raw string so
            # that \w is a regex escape, not an invalid string escape.
            exp = re.search(r'(?<=_)\w\w(?=_)', f)[0]
            df = calc_zscore(f)
            df_out = label_exp(df, truth_set, lib_set, exp)
            df_out.to_csv(f'../oligo/results/{k}mer/{outfile_prefix}_{exp}_{k}mer_enrichment.csv', index=False)

['../oligo/results/11mer/oligo_1_1w_11mers_unique.csv', '../oligo/results/11mer/oligo_1_bg_11mers_unique.csv', '../oligo/results/11mer/oligo_1_3w_11mers_unique.csv', '../oligo/results/11mer/oligo_1_0w_11mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████| 4/4 [01:15<00:00, 18.87s/it]


['../oligo/results/11mer/oligo_2_0w_11mers_unique.csv', '../oligo/results/11mer/oligo_2_3w_11mers_unique.csv', '../oligo/results/11mer/oligo_2_1w_11mers_unique.csv', '../oligo/results/11mer/oligo_2_bg_11mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████| 4/4 [01:07<00:00, 16.90s/it]


['../oligo/results/11mer/oligo_3_0w_11mers_unique.csv', '../oligo/results/11mer/oligo_3_1w_11mers_unique.csv', '../oligo/results/11mer/oligo_3_bg_11mers_unique.csv', '../oligo/results/11mer/oligo_3_3w_11mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████| 4/4 [01:10<00:00, 17.75s/it]


['../oligo/results/12mer/oligo_1_0w_12mers_unique.csv', '../oligo/results/12mer/oligo_1_3w_12mers_unique.csv', '../oligo/results/12mer/oligo_1_1w_12mers_unique.csv', '../oligo/results/12mer/oligo_1_bg_12mers_unique.csv']


 50%|█████████████████████████████████████                                     | 2/4 [02:11<02:11, 65.66s/it]


KeyboardInterrupt: 

In [65]:
# k-mer lengths to process: 5 through 15 inclusive.
kmers = [*range(5, 16)]
kmers

[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

In [66]:
# Same labelling pass as the non-"comp" run, but the truth set comes from the
# "oligo_{i}_comp_{k}mers.csv" files and outputs carry a "_comp_" infix.
# NOTE(review): the library is still read from library_{k}mers.csv, not
# library_comp_{k}mers.csv -- confirm this is intended.
for k in kmers:
    # The library edge set depends only on k, so it is loaded once per k.
    library_file = f'../oligo/data/bait_kmers/library_{k}mers.csv'
    lib_set = get_library_set(library_file)
    for i in range(1,4):
        files = glob.glob(f'../oligo/results/{k}mer/oligo_{i}*{k}mers_unique.csv')
        print(files)
        if not files:
            # Nothing to label for this oligo/k; skip reading the truth file.
            continue
        truth_file = f'../oligo/data/bait_kmers/oligo_{i}_comp_{k}mers.csv'
        outfile_prefix = f'oligo_{i}'
        # The truth set depends only on (i, k); read it once instead of
        # re-parsing the same CSV for every result file.
        truth_set = get_truth_set(truth_file)
        for f in tqdm(files):
            # Experiment tag: the first two word characters flanked by
            # underscores in the path (e.g. '1w' or 'bg'). Raw string so
            # that \w is a regex escape, not an invalid string escape.
            exp = re.search(r'(?<=_)\w\w(?=_)', f)[0]
            df = calc_zscore(f)
            df_out = label_exp(df, truth_set, lib_set, exp)
            df_out.to_csv(f'../oligo/results/{k}mer/{outfile_prefix}_{exp}_comp_{k}mer_enrichment.csv', index=False)

['../oligo/results/5mer/oligo_1_3w_5mers_unique.csv', '../oligo/results/5mer/oligo_1_bg_5mers_unique.csv', '../oligo/results/5mer/oligo_1_1w_5mers_unique.csv', '../oligo/results/5mer/oligo_1_0w_5mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [00:03<00:00,  1.03it/s]


['../oligo/results/5mer/oligo_2_3w_5mers_unique.csv', '../oligo/results/5mer/oligo_2_bg_5mers_unique.csv', '../oligo/results/5mer/oligo_2_0w_5mers_unique.csv', '../oligo/results/5mer/oligo_2_1w_5mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [00:00<00:00, 55.22it/s]


['../oligo/results/5mer/oligo_3_0w_5mers_unique.csv', '../oligo/results/5mer/oligo_3_1w_5mers_unique.csv', '../oligo/results/5mer/oligo_3_3w_5mers_unique.csv', '../oligo/results/5mer/oligo_3_bg_5mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [00:00<00:00, 44.69it/s]


['../oligo/results/6mer/oligo_1_bg_6mers_unique.csv', '../oligo/results/6mer/oligo_1_3w_6mers_unique.csv', '../oligo/results/6mer/oligo_1_1w_6mers_unique.csv', '../oligo/results/6mer/oligo_1_0w_6mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [00:00<00:00, 12.97it/s]


['../oligo/results/6mer/oligo_2_bg_6mers_unique.csv', '../oligo/results/6mer/oligo_2_3w_6mers_unique.csv', '../oligo/results/6mer/oligo_2_0w_6mers_unique.csv', '../oligo/results/6mer/oligo_2_1w_6mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [00:00<00:00, 12.97it/s]


['../oligo/results/6mer/oligo_3_0w_6mers_unique.csv', '../oligo/results/6mer/oligo_3_1w_6mers_unique.csv', '../oligo/results/6mer/oligo_3_bg_6mers_unique.csv', '../oligo/results/6mer/oligo_3_3w_6mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [00:00<00:00, 19.43it/s]


['../oligo/results/7mer/oligo_1_0w_7mers_unique.csv', '../oligo/results/7mer/oligo_1_1w_7mers_unique.csv', '../oligo/results/7mer/oligo_1_3w_7mers_unique.csv', '../oligo/results/7mer/oligo_1_bg_7mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [00:00<00:00,  5.65it/s]


['../oligo/results/7mer/oligo_2_1w_7mers_unique.csv', '../oligo/results/7mer/oligo_2_0w_7mers_unique.csv', '../oligo/results/7mer/oligo_2_3w_7mers_unique.csv', '../oligo/results/7mer/oligo_2_bg_7mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [00:00<00:00,  7.18it/s]


['../oligo/results/7mer/oligo_3_3w_7mers_unique.csv', '../oligo/results/7mer/oligo_3_bg_7mers_unique.csv', '../oligo/results/7mer/oligo_3_1w_7mers_unique.csv', '../oligo/results/7mer/oligo_3_0w_7mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [00:00<00:00,  7.45it/s]


['../oligo/results/8mer/oligo_1_1w_8mers_unique.csv', '../oligo/results/8mer/oligo_1_0w_8mers_unique.csv', '../oligo/results/8mer/oligo_1_bg_8mers_unique.csv', '../oligo/results/8mer/oligo_1_3w_8mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [00:01<00:00,  2.23it/s]


['../oligo/results/8mer/oligo_2_0w_8mers_unique.csv', '../oligo/results/8mer/oligo_2_1w_8mers_unique.csv', '../oligo/results/8mer/oligo_2_bg_8mers_unique.csv', '../oligo/results/8mer/oligo_2_3w_8mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [00:01<00:00,  2.35it/s]


['../oligo/results/8mer/oligo_3_bg_8mers_unique.csv', '../oligo/results/8mer/oligo_3_3w_8mers_unique.csv', '../oligo/results/8mer/oligo_3_0w_8mers_unique.csv', '../oligo/results/8mer/oligo_3_1w_8mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [00:01<00:00,  2.45it/s]


['../oligo/results/9mer/oligo_1_0w_9mers_unique.csv', '../oligo/results/9mer/oligo_1_1w_9mers_unique.csv', '../oligo/results/9mer/oligo_1_3w_9mers_unique.csv', '../oligo/results/9mer/oligo_1_bg_9mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [00:06<00:00,  1.51s/it]


['../oligo/results/9mer/oligo_2_1w_9mers_unique.csv', '../oligo/results/9mer/oligo_2_0w_9mers_unique.csv', '../oligo/results/9mer/oligo_2_3w_9mers_unique.csv', '../oligo/results/9mer/oligo_2_bg_9mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [00:05<00:00,  1.27s/it]


['../oligo/results/9mer/oligo_3_3w_9mers_unique.csv', '../oligo/results/9mer/oligo_3_bg_9mers_unique.csv', '../oligo/results/9mer/oligo_3_1w_9mers_unique.csv', '../oligo/results/9mer/oligo_3_0w_9mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [00:04<00:00,  1.21s/it]


['../oligo/results/10mer/oligo_1_1w_10mers_unique.csv', '../oligo/results/10mer/oligo_1_bg_10mers_unique.csv', '../oligo/results/10mer/oligo_1_3w_10mers_unique.csv', '../oligo/results/10mer/oligo_1_0w_10mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [00:19<00:00,  4.81s/it]


['../oligo/results/10mer/oligo_2_0w_10mers_unique.csv', '../oligo/results/10mer/oligo_2_3w_10mers_unique.csv', '../oligo/results/10mer/oligo_2_1w_10mers_unique.csv', '../oligo/results/10mer/oligo_2_bg_10mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [00:19<00:00,  4.87s/it]


['../oligo/results/10mer/oligo_3_0w_10mers_unique.csv', '../oligo/results/10mer/oligo_3_1w_10mers_unique.csv', '../oligo/results/10mer/oligo_3_bg_10mers_unique.csv', '../oligo/results/10mer/oligo_3_3w_10mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [00:19<00:00,  4.96s/it]


['../oligo/results/11mer/oligo_1_1w_11mers_unique.csv', '../oligo/results/11mer/oligo_1_bg_11mers_unique.csv', '../oligo/results/11mer/oligo_1_3w_11mers_unique.csv', '../oligo/results/11mer/oligo_1_0w_11mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [01:17<00:00, 19.37s/it]


['../oligo/results/11mer/oligo_2_0w_11mers_unique.csv', '../oligo/results/11mer/oligo_2_3w_11mers_unique.csv', '../oligo/results/11mer/oligo_2_1w_11mers_unique.csv', '../oligo/results/11mer/oligo_2_bg_11mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [01:09<00:00, 17.32s/it]


['../oligo/results/11mer/oligo_3_0w_11mers_unique.csv', '../oligo/results/11mer/oligo_3_1w_11mers_unique.csv', '../oligo/results/11mer/oligo_3_bg_11mers_unique.csv', '../oligo/results/11mer/oligo_3_3w_11mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [01:12<00:00, 18.05s/it]


['../oligo/results/12mer/oligo_1_0w_12mers_unique.csv', '../oligo/results/12mer/oligo_1_3w_12mers_unique.csv', '../oligo/results/12mer/oligo_1_1w_12mers_unique.csv', '../oligo/results/12mer/oligo_1_bg_12mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [03:48<00:00, 57.14s/it]


['../oligo/results/12mer/oligo_2_1w_12mers_unique.csv', '../oligo/results/12mer/oligo_2_bg_12mers_unique.csv', '../oligo/results/12mer/oligo_2_3w_12mers_unique.csv', '../oligo/results/12mer/oligo_2_0w_12mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [02:35<00:00, 38.92s/it]


['../oligo/results/12mer/oligo_3_3w_12mers_unique.csv', '../oligo/results/12mer/oligo_3_1w_12mers_unique.csv', '../oligo/results/12mer/oligo_3_bg_12mers_unique.csv', '../oligo/results/12mer/oligo_3_0w_12mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [02:56<00:00, 44.19s/it]


['../oligo/results/13mer/oligo_1_3w_13mers_unique.csv', '../oligo/results/13mer/oligo_1_bg_13mers_unique.csv', '../oligo/results/13mer/oligo_1_1w_13mers_unique.csv', '../oligo/results/13mer/oligo_1_0w_13mers_unique.csv']


100%|████████████████████████████████████████| 4/4 [07:00<00:00, 105.19s/it]


['../oligo/results/13mer/oligo_2_0w_13mers_unique.csv', '../oligo/results/13mer/oligo_2_bg_13mers_unique.csv', '../oligo/results/13mer/oligo_2_1w_13mers_unique.csv', '../oligo/results/13mer/oligo_2_3w_13mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [03:51<00:00, 57.81s/it]


['../oligo/results/13mer/oligo_3_0w_13mers_unique.csv', '../oligo/results/13mer/oligo_3_3w_13mers_unique.csv', '../oligo/results/13mer/oligo_3_bg_13mers_unique.csv', '../oligo/results/13mer/oligo_3_1w_13mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [04:41<00:00, 70.37s/it]


['../oligo/results/14mer/oligo_1_1w_14mers_unique.csv', '../oligo/results/14mer/oligo_1_bg_14mers_unique.csv', '../oligo/results/14mer/oligo_1_3w_14mers_unique.csv', '../oligo/results/14mer/oligo_1_0w_14mers_unique.csv']


100%|████████████████████████████████████████| 4/4 [09:39<00:00, 144.85s/it]


['../oligo/results/14mer/oligo_2_0w_14mers_unique.csv', '../oligo/results/14mer/oligo_2_3w_14mers_unique.csv', '../oligo/results/14mer/oligo_2_1w_14mers_unique.csv', '../oligo/results/14mer/oligo_2_bg_14mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [04:46<00:00, 71.69s/it]


['../oligo/results/14mer/oligo_3_0w_14mers_unique.csv', '../oligo/results/14mer/oligo_3_1w_14mers_unique.csv', '../oligo/results/14mer/oligo_3_bg_14mers_unique.csv', '../oligo/results/14mer/oligo_3_3w_14mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [05:58<00:00, 89.57s/it]


['../oligo/results/15mer/oligo_1_0w_15mers_unique.csv', '../oligo/results/15mer/oligo_1_1w_15mers_unique.csv', '../oligo/results/15mer/oligo_1_bg_15mers_unique.csv', '../oligo/results/15mer/oligo_1_3w_15mers_unique.csv']


100%|████████████████████████████████████████| 4/4 [13:59<00:00, 209.94s/it]


['../oligo/results/15mer/oligo_2_3w_15mers_unique.csv', '../oligo/results/15mer/oligo_2_1w_15mers_unique.csv', '../oligo/results/15mer/oligo_2_bg_15mers_unique.csv', '../oligo/results/15mer/oligo_2_0w_15mers_unique.csv']


100%|█████████████████████████████████████████| 4/4 [05:33<00:00, 83.47s/it]


['../oligo/results/15mer/oligo_3_1w_15mers_unique.csv', '../oligo/results/15mer/oligo_3_bg_15mers_unique.csv', '../oligo/results/15mer/oligo_3_3w_15mers_unique.csv', '../oligo/results/15mer/oligo_3_0w_15mers_unique.csv']


100%|████████████████████████████████████████| 4/4 [06:59<00:00, 104.85s/it]


In [60]:
# Re-run the labelling for three specific oligo_1 result files, one per k.
redo_files = [
    '../oligo/results/13mer/oligo_1_0w_13mers_unique.csv',
    '../oligo/results/14mer/oligo_1_bg_14mers_unique.csv',
    '../oligo/results/15mer/oligo_1_bg_15mers_unique.csv',
]
redo_k = [13, 14, 15]
i=1
for k, f in zip(redo_k, redo_files):
    print(k, '\t', f)
    library_file = f'../oligo/data/bait_kmers/library_{k}mers.csv'
    lib_set = get_library_set(library_file)
    truth_file = f'../oligo/data/bait_kmers/oligo_{i}_{k}mers.csv'
    # Load the truth set that matches this (i, k). Previously `truth_file`
    # was built but never read, so labelling used whatever `truth_set`
    # an earlier cell had left in the kernel.
    truth_set = get_truth_set(truth_file)
    outfile_prefix = f'oligo_{i}'
    # Experiment tag, e.g. '0w' or 'bg'; raw string keeps \w a regex escape.
    exp = re.search(r'(?<=_)\w\w(?=_)', f)[0]
    df = calc_zscore(f)
    df_out = label_exp(df, truth_set, lib_set, exp)
    df_out.to_csv(f'../oligo/results/{k}mer/{outfile_prefix}_{exp}_{k}mer_enrichment.csv', index=False)

13 	 ../oligo/results/13mer/oligo_1_0w_13mers_unique.csv
14 	 ../oligo/results/14mer/oligo_1_bg_14mers_unique.csv
15 	 ../oligo/results/15mer/oligo_1_bg_15mers_unique.csv


In [63]:
# Fix a single k-mer size and load the matching "library_comp" edge set.
# `k` and `lib_set` are plain globals, so later cells that read them pick up
# these values.
k=11
# presumably the complement-strand library (file name carries "_comp_") -- TODO confirm
library_file = f'../oligo/data/bait_kmers/library_comp_{k}mers.csv'
lib_set = get_library_set(library_file)

In [64]:
# Display the loaded library edge set (a set of frozenset node pairs).
lib_set

{frozenset({'ACGCTATCGA', 'CGCTATCGAC'}),
 frozenset({'TATTCTTAGG', 'TTATTCTTAG'}),
 frozenset({'CTTAGGTTCG', 'TTAGGTTCGT'}),
 frozenset({'TAGGTTCGTC', 'TTAGGTTCGT'}),
 frozenset({'TTTTATTCTT', 'TTTTTATTCT'}),
 frozenset({'TTTTTGTTTT', 'TTTTTTGTTT'}),
 frozenset({'TTTATTCTTA', 'TTTTATTCTT'}),
 frozenset({'TCTGCGCTCC', 'TTCTGCGCTC'}),
 frozenset({'CGACTCTCTC', 'GACTCTCTCT'}),
 frozenset({'TTGTTTTTAT', 'TTTGTTTTTA'}),
 frozenset({'CGTCGTCGTT', 'TCGTCGTCGT'}),
 frozenset({'CGACTCTCTC', 'TCGACTCTCT'}),
 frozenset({'TTATTCTTAG', 'TTTATTCTTA'}),
 frozenset({'CTGCGCTCCC', 'TCTGCGCTCC'}),
 frozenset({'GTTTTTATTC', 'TTTTTATTCT'}),
 frozenset({'CTCTTCTGCG', 'TCTTCTGCGC'}),
 frozenset({'CGCTATCGAC', 'GCTATCGACT'}),
 frozenset({'TCTTAGGTTC', 'TTCTTAGGTT'}),
 frozenset({'CTTTTGTTTT', 'TTTTGTTTTT'}),
 frozenset({'ATTCTTAGGT', 'TATTCTTAGG'}),
 frozenset({'ATCGACTCTC', 'TCGACTCTCT'}),
 frozenset({'CTCTCTTCTG', 'TCTCTTCTGC'}),
 frozenset({'CTTCTGCGCT', 'TCTTCTGCGC'}),
 frozenset({'AACGCTATCG', 'ACGCTAT

In [None]:
# Label the result files of every oligo (1-3) for the current global `k`,
# using the global `lib_set` loaded by an earlier cell.
# NOTE(review): if `lib_set` came from library_comp_{k}mers.csv, it is being
# combined here with the non-"comp" truth file -- confirm this is intended.
for i in range(1,4):
    files = glob.glob(f'../oligo/results/{k}mer/oligo_{i}*{k}mers_unique.csv')
    print(files)
    if not files:
        # Nothing to label for this oligo; skip reading the truth file.
        continue
    truth_file = f'../oligo/data/bait_kmers/oligo_{i}_{k}mers.csv'
    outfile_prefix = f'oligo_{i}'
    # The truth set depends only on (i, k); read it once instead of
    # re-parsing the same CSV for every result file.
    truth_set = get_truth_set(truth_file)
    for f in tqdm(files):
        # Experiment tag, e.g. '1w' or 'bg'; raw string keeps \w a regex escape.
        exp = re.search(r'(?<=_)\w\w(?=_)', f)[0]
        df = calc_zscore(f)
        df_out = label_exp(df, truth_set, lib_set, exp)
        # Writing is disabled: as written, each df_out is discarded on the
        # next iteration. Re-enable the line below to persist the results.
        #df_out.to_csv(f'../oligo/results/{k}mer/{outfile_prefix}_{exp}_{k}mer_enrichment.csv', index=False)