In [53]:
import re
import os
import glob

from tqdm import tqdm
from scipy.stats import zscore
from Bio import SeqIO

In [2]:
# for record in SeqIO.parse('../oligo/data/fastas/prey.fasta', "fasta"):
#     print('>'+record.id+'_comp')
#     print(record.seq.complement())
#     with open('../oligo/data/fastas/prey_comp.fasta', 'w') as f:
#         f.write('>'+record.id+'_comp'+'\n')
#         f.write(str(record.seq.complement())+'\n')

In [77]:
import math
import pandas as pd
import time

In [104]:
def merge_data(exp_file, bg_file):
    exp_df = pd.read_csv(exp_file).rename(columns={'count': 'exp_count'})
    bg_df = pd.read_csv(bg_file).rename(columns={'count': 'bg_count'})
    exp_label = re.search('(?<=_)\w\w(?=_)', exp_file)[0]
    join_cols = ['node1','node2']
    join_df = exp_df.merge(bg_df, how='outer', left_on=join_cols, right_on=join_cols).reset_index(drop=True)
    join_df['exp_count'] = enr_df['exp_count'].fillna(0).astype(int)
    join_df['bg_count'] = enr_df['bg_count'].fillna(0).astype(int)
    join_df['exp'] = exp_label
    return(join_df)

def get_oligo_set(truth_file):
    df = pd.read_csv(truth_file)
    truth_set = [frozenset({i,j}) for i,j in zip(df['Node1'],df['Node2'])]
    truth_set = set(truth_set)
    return(truth_set)

def get_library_set(lib_file):
    df = pd.read_csv(lib_file)
    lib_set = [frozenset({i,j}) for i,j in zip(df['Node1'],df['Node2']) if 'N' not in i and 'N' not in j]
    lib_set = set(lib_set)
    return(lib_set)

def calc_fc(exp, bg):
    if bg > exp:
        fc = -bg/exp
        log2fc = -(math.log2(abs(fc)))
    elif bg <= exp:
        fc = exp/bg
        log2fc = math.log2(fc)
    return(fc, log2fc)

def calc_pseudo_fc(df):
    # pseudocounts
    df['exp_pc'] = df['exp_count'] + 1
    df['bg_pc'] = df['bg_count'] + 1
    df['f0_exp_pc'] = df['exp_pc']/df['exp_pc'].sum()
    df['f0_bg_pc'] = df['bg_pc']/df['bg_pc'].sum()
    # fold change cols
    df[['fc','log2fc']] = [calc_fc(i,j) for i,j in zip(df['f0_exp_pc'], df['f0_bg_pc'])]
    # drop extra cols
    df.reset_index(drop=True, inplace=True)
    df.drop(['exp_pc','bg_pc','f0_exp_pc','f0_bg_pc'], axis=1, inplace=True)
    df.sort_values(by='log2fc', ascending=False, inplace=True)
    return(df)

def calc_apex_zscore(df):
    # null distribution approximation
    df['f0_exp'] = df['exp_count']/df['exp_count'].sum()
    df['f0_bg'] = df['bg_count']/df['bg_count'].sum()
    # apex zscore calculation
    df['f1'] = (df['exp_count']+df['bg_count'])/(exp_sum+bg_sum)
    apex_zscore = (
        (f0_exp - f0_bg)/math.sqrt(
        (((f1*(1-f1))/exp_sum) + ((f1*(1-f1))/bg_sum)))
                  )
    return(apex_zscore)

In [16]:
exp_file = '../oligo/results/10mer/kmer_counts_q04cut/oligo_1_0w_10mers_q04.csv'
bg_file = '../oligo/results/10mer/kmer_counts_q04cut/oligo_1_bg_10mers_q04.csv'
lib_file = '../oligo/data/bait_kmers/library_10mers.csv'
oligo_file = '../oligo/data/bait_kmers/oligo_1_comp_10mers.csv'

In [105]:
t0 = time.time()
lib_kmers = get_library_set(lib_file)
oligo_kmers = get_oligo_set(oligo_file)
df = merge_data(exp_file, bg_file)
df['label'] = (['oligo match' if frozenset({i,j}) in oligo_kmers \
                else 'library match' if frozenset({i,j}) in lib_kmers \
                else 'no match' for i, j in zip(df['node1'],df['node2'])])
df = calc_pseudo_fc(df)
print(df)
print(f'Total run time: {round((time.time()-t0)/60, 2)} minutes.')

             node1      node2  exp_count  bg_count exp     label           fc  \
1995     AGAGAGAGA  GAGAGAGAG       4351         0  0w  no match  4618.279094   
52751    ATATATATA  TATATATAT        174         0  0w  no match   185.707454   
86570    GCGCGCGCG  CGCGCGCGC        118         0  0w  no match   126.281069   
308006   TGTGAGAGG  GTGAGAGGG         52         0  0w  no match    56.242829   
355467   GTGTGGTGG  TGTGGTGGG         47         0  0w  no match    50.936902   
...            ...        ...        ...       ...  ..       ...          ...   
1006200  ACCTGTCAG  CCTGTCAGA          0        42  0w  no match   -40.520721   
1006199  AAGCCACGT  AGCCACGTG          0        43  0w  no match   -41.463064   
1006198  CGCGCGCGC  GCGCGCGCG          0       142  0w  no match  -134.754957   
1006197  TATATATAT  ATATATATA          0       173  0w  no match  -163.967570   
1006196  GAGAGAGAG  AGAGAGAGA          0      4173  0w  no match -3933.336992   

            log2fc  
1995  

In [110]:
df.reset_index(inplace=True, drop=True)
for i in range(0,99):
    node1 = df.node1[i]
    node2 = df.node2[i]
    print(f'{node1},{node2}')

AGAGAGAGA,GAGAGAGAG
ATATATATA,TATATATAT
GCGCGCGCG,CGCGCGCGC
TGTGAGAGG,GTGAGAGGG
GTGTGGTGG,TGTGGTGGG
TTGGAAGAT,TGGAAGATG
ATTGCGACT,TTGCGACTC
AAGGTAGGT,AGGTAGGTA
TAGGTGAGG,AGGTGAGGG
CTGGGTAGG,TGGGTAGGG
AGGTGCGAG,GGTGCGAGA
GTGTAAGGG,TGTAAGGGG
ATGTAGAGG,TGTAGAGGG
TGGGTAAGG,GGGTAAGGG
AAAAGTAGG,AAAGTAGGG
TGGGTCAAA,GGGTCAAAT
GTGATGTGG,TGATGTGGG
TGAGGTGTG,GAGGTGTGG
CATGTGGGT,ATGTGGGTT
ATGTGGGGT,TGTGGGGTA
AATGGTAGG,ATGGTAGGT
AAGGTAGGT,AGGTAGGTT
GTAGGTGGG,TAGGTGGGG
CATGTGAAG,ATGTGAAGA
CCGTAACGA,CGTAACGAT
GGAGGGCGG,GAGGGCGGA
AAGTGGTAG,AGTGGTAGG
AAGAGGTGG,AGAGGTGGG
CCACCTGTG,CACCTGTGT
TTGTGCCGG,TGTGCCGGG
TAAGGGACA,AAGGGACAG
GAGGTAGGG,AGGTAGGGA
ATGTCGGAT,TGTCGGATA
CCGCCCCTC,CGCCCCTCT
ATGATCCAG,TGATCCAGA
GTGAAGGTG,TGAAGGTGA
TCGTGGTAG,CGTGGTAGG
TTGGCTGTG,TGGCTGTGA
CAGACGAGT,AGACGAGTT
GCTGGACGA,CTGGACGAA
CAGGGAAAT,AGGGAAATG
AAACCGATG,AACCGATGG
CATTGGGGT,ATTGGGGTA
AAGGGGGGG,AGGGGGGGC
TGTGAGTAG,GTGAGTAGA
TGAAGTAGG,GAAGTAGGG
TGTACTGGG,GTACTGGGG
GTGGGGTAG,TGGGGTAGT
TGGCATGGG,GGCATGGGG
TAAATGGGG,AAATGGGGT


In [109]:
df

Unnamed: 0,node1,node2,exp_count,bg_count,exp,label,fc,log2fc
1995,AGAGAGAGA,GAGAGAGAG,4351,0,0w,no match,4618.279094,12.173140
52751,ATATATATA,TATATATAT,174,0,0w,no match,185.707454,7.536888
86570,GCGCGCGCG,CGCGCGCGC,118,0,0w,no match,126.281069,6.980495
308006,TGTGAGAGG,GTGAGAGGG,52,0,0w,no match,56.242829,5.813597
355467,GTGTGGTGG,TGTGGTGGG,47,0,0w,no match,50.936902,5.670639
...,...,...,...,...,...,...,...,...
1006200,ACCTGTCAG,CCTGTCAGA,0,42,0w,no match,-40.520721,-5.340588
1006199,AAGCCACGT,AGCCACGTG,0,43,0w,no match,-41.463064,-5.373755
1006198,CGCGCGCGC,GCGCGCGCG,0,142,0w,no match,-134.754957,-7.074195
1006197,TATATATAT,ATATATATA,0,173,0w,no match,-163.967570,-7.357267


In [87]:
df['zscore_trad'] = (df['exp_count'] - df['bg_count'].mean())/(df['bg_count'].std(ddof=1))
df[df.label == 'oligo match']

Unnamed: 0,node1,node2,exp_count,bg_count,exp,label,fc,log2fc,zscore_trad
943787,ATGGAGTGT,TGGAGTGTA,15,0,0w,oligo match,16.978967,4.085677,-0.010038
1002349,TGGATGGAG,GGATGGAGT,11,0,0w,oligo match,12.734225,3.670639,-0.010152
14308,GTTTTTTTT,TTTTTTTTT,562,349,0w,oligo match,1.706993,0.771457,0.005477
459473,TTCTGGATG,TCTGGATGG,39,28,0w,oligo match,1.463704,0.549624,-0.009358
31750,TGTTTTTTT,GTTTTTTTT,275,201,0w,oligo match,1.449937,0.53599,-0.002663
292744,AGCCTGTTT,GCCTGTTTT,53,42,0w,oligo match,1.332651,0.4143,-0.008961
59848,CTGTTTTTT,TGTTTTTTT,157,128,0w,oligo match,1.299747,0.37823,-0.006011
514268,ACCGGTTGT,CCGGTTGTG,36,30,0w,oligo match,1.266576,0.340934,-0.009443
880731,TGGAGTGTA,GGAGTGTAC,18,15,0w,oligo match,1.260158,0.333604,-0.009953
85133,GGTTGTGAG,GTTGTGAGC,120,106,0w,oligo match,1.200032,0.263073,-0.00706


In [54]:
df['zscore_apex'] = ([calc_apex_zscore(i, j, int(df['exp_count'].sum()), int(df['bg_count'].sum())) \
                      for i, j in zip(df.exp_count, df.bg_count)])


KeyboardInterrupt: 

# code archive below here

In [4]:
kmers = list(range(11,16))
kmers

[11, 12, 13, 14, 15]

In [5]:
# for k in kmers:
#     library_file = f'../oligo/data/bait_kmers/library_{k}mers.csv'
#     lib_set = get_library_set(library_file)
#     for i in range(1,4):
#         files = glob.glob(f'../oligo/results_pruned/{k}mer/oligo_{i}*{k}mers_unique.csv')
#         print(files)
#         truth_file = f'../oligo/data/bait_kmers/oligo_{i}_{k}mers.csv'
#         outfile_prefix = f'oligo_{i}'
#         for f in tqdm(files):
#             truth_set = get_truth_set(truth_file)
#             exp = re.search('(?<=_)\w\w(?=_)', f)[0]
#             df = calc_zscore(f)
#             df_out = label_exp(df, truth_set, lib_set, exp)
#             df_out.to_csv(f'../oligo/results_pruned/{k}mer/{outfile_prefix}_{exp}_{k}mer_enrichment.csv', index=False)

In [6]:
kmers = list(range(5,16))
kmers

[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

In [7]:
for k in kmers:
    library_file = f'../oligo/data/bait_kmers/library_{k}mers.csv'
    lib_set = get_library_set(library_file)
    for i in range(1,4):
        files = glob.glob(f'../oligo/results_pruned/{k}mer/oligo_{i}*{k}mers_unique.csv')
        print(files)
        truth_file = f'../oligo/data/bait_kmers/oligo_{i}_comp_{k}mers.csv'
        outfile_prefix = f'oligo_{i}'
        for f in tqdm(files):
            truth_set = get_truth_set(truth_file)
            exp = re.search('(?<=_)\w\w(?=_)', f)[0]
            df = calc_zscore(f)
            df_out = label_exp(df, truth_set, lib_set, exp)
            df_out.to_csv(f'../oligo/results_pruned/{k}mer/{outfile_prefix}_{exp}_comp_{k}mer_enrichment.csv', index=False)

['../oligo/results_pruned/5mer/oligo_1_3w_3mm_5mers_unique.csv', '../oligo/results_pruned/5mer/oligo_1_0w_3mm_5mers_unique.csv', '../oligo/results_pruned/5mer/oligo_1_bg_3mm_5mers_unique.csv', '../oligo/results_pruned/5mer/oligo_1_1w_3mm_5mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 24.06it/s]


['../oligo/results_pruned/5mer/oligo_2_3w_3mm_5mers_unique.csv', '../oligo/results_pruned/5mer/oligo_2_1w_3mm_5mers_unique.csv', '../oligo/results_pruned/5mer/oligo_2_0w_3mm_5mers_unique.csv', '../oligo/results_pruned/5mer/oligo_2_bg_3mm_5mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 51.12it/s]


['../oligo/results_pruned/5mer/oligo_3_1w_3mm_5mers_unique.csv', '../oligo/results_pruned/5mer/oligo_3_0w_3mm_5mers_unique.csv', '../oligo/results_pruned/5mer/oligo_3_bg_3mm_5mers_unique.csv', '../oligo/results_pruned/5mer/oligo_3_3w_3mm_5mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 50.70it/s]


['../oligo/results_pruned/6mer/oligo_1_3w_3mm_6mers_unique.csv', '../oligo/results_pruned/6mer/oligo_1_1w_3mm_6mers_unique.csv', '../oligo/results_pruned/6mer/oligo_1_bg_3mm_6mers_unique.csv', '../oligo/results_pruned/6mer/oligo_1_0w_3mm_6mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 17.89it/s]


['../oligo/results_pruned/6mer/oligo_2_3w_3mm_6mers_unique.csv', '../oligo/results_pruned/6mer/oligo_2_bg_3mm_6mers_unique.csv', '../oligo/results_pruned/6mer/oligo_2_0w_3mm_6mers_unique.csv', '../oligo/results_pruned/6mer/oligo_2_1w_3mm_6mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 26.56it/s]


['../oligo/results_pruned/6mer/oligo_3_bg_3mm_6mers_unique.csv', '../oligo/results_pruned/6mer/oligo_3_0w_3mm_6mers_unique.csv', '../oligo/results_pruned/6mer/oligo_3_1w_3mm_6mers_unique.csv', '../oligo/results_pruned/6mer/oligo_3_3w_3mm_6mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 20.95it/s]


['../oligo/results_pruned/7mer/oligo_1_3w_3mm_7mers_unique.csv', '../oligo/results_pruned/7mer/oligo_1_0w_3mm_7mers_unique.csv', '../oligo/results_pruned/7mer/oligo_1_bg_3mm_7mers_unique.csv', '../oligo/results_pruned/7mer/oligo_1_1w_3mm_7mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  8.25it/s]


['../oligo/results_pruned/7mer/oligo_2_3w_3mm_7mers_unique.csv', '../oligo/results_pruned/7mer/oligo_2_1w_3mm_7mers_unique.csv', '../oligo/results_pruned/7mer/oligo_2_0w_3mm_7mers_unique.csv', '../oligo/results_pruned/7mer/oligo_2_bg_3mm_7mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  7.47it/s]


['../oligo/results_pruned/7mer/oligo_3_1w_3mm_7mers_unique.csv', '../oligo/results_pruned/7mer/oligo_3_0w_3mm_7mers_unique.csv', '../oligo/results_pruned/7mer/oligo_3_bg_3mm_7mers_unique.csv', '../oligo/results_pruned/7mer/oligo_3_3w_3mm_7mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00,  9.19it/s]


['../oligo/results_pruned/8mer/oligo_1_1w_3mm_8mers_unique.csv', '../oligo/results_pruned/8mer/oligo_1_0w_3mm_8mers_unique.csv', '../oligo/results_pruned/8mer/oligo_1_bg_3mm_8mers_unique.csv', '../oligo/results_pruned/8mer/oligo_1_3w_3mm_8mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  2.97it/s]


['../oligo/results_pruned/8mer/oligo_2_0w_3mm_8mers_unique.csv', '../oligo/results_pruned/8mer/oligo_2_bg_3mm_8mers_unique.csv', '../oligo/results_pruned/8mer/oligo_2_1w_3mm_8mers_unique.csv', '../oligo/results_pruned/8mer/oligo_2_3w_3mm_8mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  3.04it/s]


['../oligo/results_pruned/8mer/oligo_3_3w_3mm_8mers_unique.csv', '../oligo/results_pruned/8mer/oligo_3_0w_3mm_8mers_unique.csv', '../oligo/results_pruned/8mer/oligo_3_bg_3mm_8mers_unique.csv', '../oligo/results_pruned/8mer/oligo_3_1w_3mm_8mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:01<00:00,  3.08it/s]


['../oligo/results_pruned/9mer/oligo_1_0w_3mm_9mers_unique.csv', '../oligo/results_pruned/9mer/oligo_1_bg_3mm_9mers_unique.csv', '../oligo/results_pruned/9mer/oligo_1_1w_3mm_9mers_unique.csv', '../oligo/results_pruned/9mer/oligo_1_3w_3mm_9mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00,  1.17s/it]


['../oligo/results_pruned/9mer/oligo_2_1w_3mm_9mers_unique.csv', '../oligo/results_pruned/9mer/oligo_2_0w_3mm_9mers_unique.csv', '../oligo/results_pruned/9mer/oligo_2_bg_3mm_9mers_unique.csv', '../oligo/results_pruned/9mer/oligo_2_3w_3mm_9mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00,  1.22s/it]


['../oligo/results_pruned/9mer/oligo_3_3w_3mm_9mers_unique.csv', '../oligo/results_pruned/9mer/oligo_3_1w_3mm_9mers_unique.csv', '../oligo/results_pruned/9mer/oligo_3_0w_3mm_9mers_unique.csv', '../oligo/results_pruned/9mer/oligo_3_bg_3mm_9mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00,  1.17s/it]


['../oligo/results_pruned/10mer/oligo_1_3w_3mm_10mers_unique.csv', '../oligo/results_pruned/10mer/oligo_1_0w_3mm_10mers_unique.csv', '../oligo/results_pruned/10mer/oligo_1_1w_3mm_10mers_unique.csv', '../oligo/results_pruned/10mer/oligo_1_bg_3mm_10mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:18<00:00,  4.72s/it]


['../oligo/results_pruned/10mer/oligo_2_3w_3mm_10mers_unique.csv', '../oligo/results_pruned/10mer/oligo_2_0w_3mm_10mers_unique.csv', '../oligo/results_pruned/10mer/oligo_2_1w_3mm_10mers_unique.csv', '../oligo/results_pruned/10mer/oligo_2_bg_3mm_10mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:18<00:00,  4.64s/it]


['../oligo/results_pruned/10mer/oligo_3_3w_3mm_10mers_unique.csv', '../oligo/results_pruned/10mer/oligo_3_0w_3mm_10mers_unique.csv', '../oligo/results_pruned/10mer/oligo_3_1w_3mm_10mers_unique.csv', '../oligo/results_pruned/10mer/oligo_3_bg_3mm_10mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:18<00:00,  4.71s/it]


['../oligo/results_pruned/11mer/oligo_1_3w_3mm_11mers_unique.csv', '../oligo/results_pruned/11mer/oligo_1_1w_3mm_11mers_unique.csv', '../oligo/results_pruned/11mer/oligo_1_0w_3mm_11mers_unique.csv', '../oligo/results_pruned/11mer/oligo_1_bg_3mm_11mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:15<00:00, 18.84s/it]


['../oligo/results_pruned/11mer/oligo_2_3w_3mm_11mers_unique.csv', '../oligo/results_pruned/11mer/oligo_2_1w_3mm_11mers_unique.csv', '../oligo/results_pruned/11mer/oligo_2_0w_3mm_11mers_unique.csv', '../oligo/results_pruned/11mer/oligo_2_bg_3mm_11mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:05<00:00, 16.34s/it]


['../oligo/results_pruned/11mer/oligo_3_3w_3mm_11mers_unique.csv', '../oligo/results_pruned/11mer/oligo_3_1w_3mm_11mers_unique.csv', '../oligo/results_pruned/11mer/oligo_3_0w_3mm_11mers_unique.csv', '../oligo/results_pruned/11mer/oligo_3_bg_3mm_11mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [01:08<00:00, 17.11s/it]


['../oligo/results_pruned/12mer/oligo_1_bg_3mm_12mers_unique.csv', '../oligo/results_pruned/12mer/oligo_1_3w_3mm_12mers_unique.csv', '../oligo/results_pruned/12mer/oligo_1_0w_3mm_12mers_unique.csv', '../oligo/results_pruned/12mer/oligo_1_1w_3mm_12mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [03:37<00:00, 54.27s/it]


['../oligo/results_pruned/12mer/oligo_2_bg_3mm_12mers_unique.csv', '../oligo/results_pruned/12mer/oligo_2_3w_3mm_12mers_unique.csv', '../oligo/results_pruned/12mer/oligo_2_0w_3mm_12mers_unique.csv', '../oligo/results_pruned/12mer/oligo_2_1w_3mm_12mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [02:24<00:00, 36.08s/it]


['../oligo/results_pruned/12mer/oligo_3_bg_3mm_12mers_unique.csv', '../oligo/results_pruned/12mer/oligo_3_3w_3mm_12mers_unique.csv', '../oligo/results_pruned/12mer/oligo_3_0w_3mm_12mers_unique.csv', '../oligo/results_pruned/12mer/oligo_3_1w_3mm_12mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [02:45<00:00, 41.42s/it]


['../oligo/results_pruned/13mer/oligo_1_3w_3mm_13mers_unique.csv', '../oligo/results_pruned/13mer/oligo_1_1w_3mm_13mers_unique.csv', '../oligo/results_pruned/13mer/oligo_1_0w_3mm_13mers_unique.csv', '../oligo/results_pruned/13mer/oligo_1_bg_3mm_13mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [06:23<00:00, 95.80s/it]


['../oligo/results_pruned/13mer/oligo_2_3w_3mm_13mers_unique.csv', '../oligo/results_pruned/13mer/oligo_2_1w_3mm_13mers_unique.csv', '../oligo/results_pruned/13mer/oligo_2_0w_3mm_13mers_unique.csv', '../oligo/results_pruned/13mer/oligo_2_bg_3mm_13mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [03:29<00:00, 52.49s/it]


['../oligo/results_pruned/13mer/oligo_3_3w_3mm_13mers_unique.csv', '../oligo/results_pruned/13mer/oligo_3_1w_3mm_13mers_unique.csv', '../oligo/results_pruned/13mer/oligo_3_0w_3mm_13mers_unique.csv', '../oligo/results_pruned/13mer/oligo_3_bg_3mm_13mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [04:20<00:00, 65.22s/it]


['../oligo/results_pruned/14mer/oligo_1_3w_3mm_14mers_unique.csv', '../oligo/results_pruned/14mer/oligo_1_0w_3mm_14mers_unique.csv', '../oligo/results_pruned/14mer/oligo_1_1w_3mm_14mers_unique.csv']


100%|█████████████████████████████████████████████████████████████████████████████████████████| 3/3 [06:18<00:00, 126.15s/it]


['../oligo/results_pruned/14mer/oligo_2_bg_3mm_14mers_unique.csv', '../oligo/results_pruned/14mer/oligo_2_3w_3mm_14mers_unique.csv', '../oligo/results_pruned/14mer/oligo_2_0w_3mm_14mers_unique.csv', '../oligo/results_pruned/14mer/oligo_2_1w_3mm_14mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [04:16<00:00, 64.17s/it]


['../oligo/results_pruned/14mer/oligo_3_bg_3mm_14mers_unique.csv', '../oligo/results_pruned/14mer/oligo_3_3w_3mm_14mers_unique.csv', '../oligo/results_pruned/14mer/oligo_3_0w_3mm_14mers_unique.csv', '../oligo/results_pruned/14mer/oligo_3_1w_3mm_14mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [05:23<00:00, 80.98s/it]


['../oligo/results_pruned/15mer/oligo_1_0w_3mm_15mers_unique.csv', '../oligo/results_pruned/15mer/oligo_1_1w_3mm_15mers_unique.csv', '../oligo/results_pruned/15mer/oligo_1_3w_3mm_15mers_unique.csv']


100%|█████████████████████████████████████████████████████████████████████████████████████████| 3/3 [07:24<00:00, 148.29s/it]


['../oligo/results_pruned/15mer/oligo_2_bg_3mm_15mers_unique.csv', '../oligo/results_pruned/15mer/oligo_2_0w_3mm_15mers_unique.csv', '../oligo/results_pruned/15mer/oligo_2_1w_3mm_15mers_unique.csv', '../oligo/results_pruned/15mer/oligo_2_3w_3mm_15mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [04:56<00:00, 74.01s/it]


['../oligo/results_pruned/15mer/oligo_3_bg_3mm_15mers_unique.csv', '../oligo/results_pruned/15mer/oligo_3_0w_3mm_15mers_unique.csv', '../oligo/results_pruned/15mer/oligo_3_1w_3mm_15mers_unique.csv', '../oligo/results_pruned/15mer/oligo_3_3w_3mm_15mers_unique.csv']


100%|██████████████████████████████████████████████████████████████████████████████████████████| 4/4 [06:19<00:00, 94.99s/it]


In [8]:
# redo_files = ['../oligo/results/13mer/oligo_1_0w_13mers_unique.csv','../oligo/results/14mer/oligo_1_bg_14mers_unique.csv','../oligo/results/15mer/oligo_1_bg_15mers_unique.csv']
# redo_k = [13, 14, 15]
# i=1
# for k, f in zip(redo_k, redo_files):
#     print(k, '\t', f)
#     library_file = f'../oligo/data/bait_kmers/library_{k}mers.csv'
#     lib_set = get_library_set(library_file)
#     truth_file = f'../oligo/data/bait_kmers/oligo_{i}_{k}mers.csv'
#     outfile_prefix = f'oligo_{i}'
#     exp = re.search('(?<=_)\w\w(?=_)', f)[0]
#     df = calc_zscore(f)
#     df_out = label_exp(df, truth_set, lib_set, exp)
#     df_out.to_csv(f'../oligo/results/{k}mer/{outfile_prefix}_{exp}_{k}mer_enrichment.csv', index=False)

In [9]:
# k=11
# library_file = f'../oligo/data/bait_kmers/library_comp_{k}mers.csv'
# lib_set = get_library_set(library_file)

In [10]:
# for i in range(1,4):
#     files = glob.glob(f'../oligo/results/{k}mer/oligo_{i}*{k}mers_unique.csv')
#     print(files)
#     truth_file = f'../oligo/data/bait_kmers/oligo_{i}_{k}mers.csv'
#     outfile_prefix = f'oligo_{i}'
#     for f in tqdm(files):
#         truth_set = get_truth_set(truth_file)
#         exp = re.search('(?<=_)\w\w(?=_)', f)[0]
#         df = calc_zscore(f)
#         df_out = label_exp(df, truth_set, lib_set, exp)
#         #df_out.to_csv(f'../oligo/results/{k}mer/{outfile_prefix}_{exp}_{k}mer_enrichment.csv', index=False)

## streptavidin

In [4]:
kmers = list(range(5,16))
for k in kmers:
    library_file = f'../oligo/data/bait_kmers/library_{k}mers.csv'
    lib_set = get_library_set(library_file)
    files = glob.glob(f'../oligo/results/hfog1/streptavidin*_{k}mers_unique.csv')
    print(files)
    truth_file = f'../oligo/data/bait_kmers/oligo_all_comp_{k}mers.csv'
    outfile_prefix = f'streptavidin'
    for f in tqdm(files):
        truth_set = get_truth_set(truth_file)
        exp = re.search('(?<=_)\w\w(?=_)', f)[0]
        df = calc_zscore(f)
        df_out = label_exp(df, truth_set, lib_set, exp)
        df_out.to_csv(f'../oligo/results/{k}mer/{outfile_prefix}_{exp}_comp_{k}mer_enrichment.csv', index=False)

['../oligo/results/hfog1/streptavidin_bg_5mers_unique.csv', '../oligo/results/hfog1/streptavidin_3w_5mers_unique.csv', '../oligo/results/hfog1/streptavidin_bg_15mers_unique.csv', '../oligo/results/hfog1/streptavidin_3w_15mers_unique.csv', '../oligo/results/hfog1/streptavidin_7w_5mers_unique.csv', '../oligo/results/hfog1/streptavidin_7w_15mers_unique.csv', '../oligo/results/hfog1/streptavidin_0w_5mers_unique.csv', '../oligo/results/hfog1/streptavidin_0w_15mers_unique.csv']


100%|███████████████████████████████████████| 8/8 [05:49<00:00, 43.66s/it]


['../oligo/results/hfog1/streptavidin_7w_6mers_unique.csv', '../oligo/results/hfog1/streptavidin_0w_6mers_unique.csv', '../oligo/results/hfog1/streptavidin_bg_6mers_unique.csv', '../oligo/results/hfog1/streptavidin_3w_6mers_unique.csv']


100%|███████████████████████████████████████| 4/4 [00:00<00:00,  5.34it/s]


['../oligo/results/hfog1/streptavidin_bg_7mers_unique.csv', '../oligo/results/hfog1/streptavidin_3w_7mers_unique.csv', '../oligo/results/hfog1/streptavidin_0w_7mers_unique.csv', '../oligo/results/hfog1/streptavidin_7w_7mers_unique.csv']


100%|███████████████████████████████████████| 4/4 [00:00<00:00,  7.26it/s]


['../oligo/results/hfog1/streptavidin_7w_8mers_unique.csv', '../oligo/results/hfog1/streptavidin_0w_8mers_unique.csv', '../oligo/results/hfog1/streptavidin_3w_8mers_unique.csv', '../oligo/results/hfog1/streptavidin_bg_8mers_unique.csv']


100%|███████████████████████████████████████| 4/4 [00:01<00:00,  2.60it/s]


['../oligo/results/hfog1/streptavidin_3w_9mers_unique.csv', '../oligo/results/hfog1/streptavidin_bg_9mers_unique.csv', '../oligo/results/hfog1/streptavidin_0w_9mers_unique.csv', '../oligo/results/hfog1/streptavidin_7w_9mers_unique.csv']


100%|███████████████████████████████████████| 4/4 [00:05<00:00,  1.31s/it]


['../oligo/results/hfog1/streptavidin_3w_10mers_unique.csv', '../oligo/results/hfog1/streptavidin_bg_10mers_unique.csv', '../oligo/results/hfog1/streptavidin_7w_10mers_unique.csv', '../oligo/results/hfog1/streptavidin_0w_10mers_unique.csv']


100%|███████████████████████████████████████| 4/4 [00:20<00:00,  5.08s/it]


['../oligo/results/hfog1/streptavidin_0w_11mers_unique.csv', '../oligo/results/hfog1/streptavidin_7w_11mers_unique.csv', '../oligo/results/hfog1/streptavidin_bg_11mers_unique.csv', '../oligo/results/hfog1/streptavidin_3w_11mers_unique.csv']


100%|███████████████████████████████████████| 4/4 [01:10<00:00, 17.75s/it]


['../oligo/results/hfog1/streptavidin_bg_12mers_unique.csv', '../oligo/results/hfog1/streptavidin_3w_12mers_unique.csv', '../oligo/results/hfog1/streptavidin_7w_12mers_unique.csv', '../oligo/results/hfog1/streptavidin_0w_12mers_unique.csv']


100%|███████████████████████████████████████| 4/4 [02:42<00:00, 40.65s/it]


['../oligo/results/hfog1/streptavidin_0w_13mers_unique.csv', '../oligo/results/hfog1/streptavidin_7w_13mers_unique.csv', '../oligo/results/hfog1/streptavidin_3w_13mers_unique.csv', '../oligo/results/hfog1/streptavidin_bg_13mers_unique.csv']


100%|███████████████████████████████████████| 4/4 [04:13<00:00, 63.30s/it]


['../oligo/results/hfog1/streptavidin_0w_14mers_unique.csv', '../oligo/results/hfog1/streptavidin_7w_14mers_unique.csv', '../oligo/results/hfog1/streptavidin_3w_14mers_unique.csv', '../oligo/results/hfog1/streptavidin_bg_14mers_unique.csv']


100%|███████████████████████████████████████| 4/4 [05:03<00:00, 75.96s/it]


['../oligo/results/hfog1/streptavidin_bg_15mers_unique.csv', '../oligo/results/hfog1/streptavidin_3w_15mers_unique.csv', '../oligo/results/hfog1/streptavidin_7w_15mers_unique.csv', '../oligo/results/hfog1/streptavidin_0w_15mers_unique.csv']


100%|███████████████████████████████████████| 4/4 [05:48<00:00, 87.15s/it]


In [5]:
k = 5
library_file = f'../oligo/data/bait_kmers/library_{k}mers.csv'
lib_set = get_library_set(library_file)
files = glob.glob(f'../oligo/results/hfog1/streptavidin*_{k}mers_unique.csv')
print(files)
truth_file = f'../oligo/data/bait_kmers/oligo_all_comp_{k}mers.csv'
outfile_prefix = f'streptavidin'
for f in tqdm(files):
    truth_set = get_truth_set(truth_file)
    exp = re.search('(?<=_)\w\w(?=_)', f)[0]
    df = calc_zscore(f)
    df_out = label_exp(df, truth_set, lib_set, exp)
    df_out.to_csv(f'../oligo/results/{k}mer/{outfile_prefix}_{exp}_comp_{k}mer_enrichment.csv', index=False)

['../oligo/results/hfog1/streptavidin_bg_5mers_unique.csv', '../oligo/results/hfog1/streptavidin_3w_5mers_unique.csv', '../oligo/results/hfog1/streptavidin_7w_5mers_unique.csv', '../oligo/results/hfog1/streptavidin_0w_5mers_unique.csv']


100%|███████████████████████████████████████| 4/4 [00:00<00:00,  4.29it/s]


In [6]:
kmers = list(range(5,16))
for k in kmers:
    library_file = f'../oligo/data/bait_kmers/library_{k}mers.csv'
    lib_set = get_library_set(library_file)
    files = glob.glob(f'../oligo/results/hfog1/streptavidin*_{k}mers_unique.csv')
    print(files)
    truth_file = f'../oligo/data/bait_kmers/oligo_all_{k}mers.csv'
    outfile_prefix = f'streptavidin'
    for f in tqdm(files):
        truth_set = get_truth_set(truth_file)
        exp = re.search('(?<=_)\w\w(?=_)', f)[0]
        df = calc_zscore(f)
        df_out = label_exp(df, truth_set, lib_set, exp)
        df_out.to_csv(f'../oligo/results/{k}mer/{outfile_prefix}_{exp}_{k}mer_enrichment.csv', index=False)

['../oligo/results/hfog1/streptavidin_bg_5mers_unique.csv', '../oligo/results/hfog1/streptavidin_3w_5mers_unique.csv', '../oligo/results/hfog1/streptavidin_7w_5mers_unique.csv', '../oligo/results/hfog1/streptavidin_0w_5mers_unique.csv']


100%|███████████████████████████████████████| 4/4 [00:00<00:00, 82.06it/s]


['../oligo/results/hfog1/streptavidin_7w_6mers_unique.csv', '../oligo/results/hfog1/streptavidin_0w_6mers_unique.csv', '../oligo/results/hfog1/streptavidin_bg_6mers_unique.csv', '../oligo/results/hfog1/streptavidin_3w_6mers_unique.csv']


100%|███████████████████████████████████████| 4/4 [00:00<00:00, 40.07it/s]


['../oligo/results/hfog1/streptavidin_bg_7mers_unique.csv', '../oligo/results/hfog1/streptavidin_3w_7mers_unique.csv', '../oligo/results/hfog1/streptavidin_0w_7mers_unique.csv', '../oligo/results/hfog1/streptavidin_7w_7mers_unique.csv']


100%|███████████████████████████████████████| 4/4 [00:00<00:00, 12.98it/s]


['../oligo/results/hfog1/streptavidin_7w_8mers_unique.csv', '../oligo/results/hfog1/streptavidin_0w_8mers_unique.csv', '../oligo/results/hfog1/streptavidin_3w_8mers_unique.csv', '../oligo/results/hfog1/streptavidin_bg_8mers_unique.csv']


100%|███████████████████████████████████████| 4/4 [00:01<00:00,  3.41it/s]


['../oligo/results/hfog1/streptavidin_3w_9mers_unique.csv', '../oligo/results/hfog1/streptavidin_bg_9mers_unique.csv', '../oligo/results/hfog1/streptavidin_0w_9mers_unique.csv', '../oligo/results/hfog1/streptavidin_7w_9mers_unique.csv']


100%|███████████████████████████████████████| 4/4 [00:04<00:00,  1.17s/it]


['../oligo/results/hfog1/streptavidin_3w_10mers_unique.csv', '../oligo/results/hfog1/streptavidin_bg_10mers_unique.csv', '../oligo/results/hfog1/streptavidin_7w_10mers_unique.csv', '../oligo/results/hfog1/streptavidin_0w_10mers_unique.csv']


100%|███████████████████████████████████████| 4/4 [00:19<00:00,  4.94s/it]


['../oligo/results/hfog1/streptavidin_0w_11mers_unique.csv', '../oligo/results/hfog1/streptavidin_7w_11mers_unique.csv', '../oligo/results/hfog1/streptavidin_bg_11mers_unique.csv', '../oligo/results/hfog1/streptavidin_3w_11mers_unique.csv']


100%|███████████████████████████████████████| 4/4 [01:09<00:00, 17.35s/it]


['../oligo/results/hfog1/streptavidin_bg_12mers_unique.csv', '../oligo/results/hfog1/streptavidin_3w_12mers_unique.csv', '../oligo/results/hfog1/streptavidin_7w_12mers_unique.csv', '../oligo/results/hfog1/streptavidin_0w_12mers_unique.csv']


100%|███████████████████████████████████████| 4/4 [02:40<00:00, 40.08s/it]


['../oligo/results/hfog1/streptavidin_0w_13mers_unique.csv', '../oligo/results/hfog1/streptavidin_7w_13mers_unique.csv', '../oligo/results/hfog1/streptavidin_3w_13mers_unique.csv', '../oligo/results/hfog1/streptavidin_bg_13mers_unique.csv']


100%|███████████████████████████████████████| 4/4 [04:02<00:00, 60.71s/it]


['../oligo/results/hfog1/streptavidin_0w_14mers_unique.csv', '../oligo/results/hfog1/streptavidin_7w_14mers_unique.csv', '../oligo/results/hfog1/streptavidin_3w_14mers_unique.csv', '../oligo/results/hfog1/streptavidin_bg_14mers_unique.csv']


100%|███████████████████████████████████████| 4/4 [05:02<00:00, 75.56s/it]


['../oligo/results/hfog1/streptavidin_bg_15mers_unique.csv', '../oligo/results/hfog1/streptavidin_3w_15mers_unique.csv', '../oligo/results/hfog1/streptavidin_7w_15mers_unique.csv', '../oligo/results/hfog1/streptavidin_0w_15mers_unique.csv']


100%|███████████████████████████████████████| 4/4 [05:47<00:00, 86.89s/it]
