# Count all spaced k-mers from KMC output 

In [4]:
import numpy as np
import os
import pandas as pd
from matplotlib import pyplot as plt
from Bio import SeqIO
import sys
import glob
import matplotlib.patches as mpatches
import pickle
import itertools 
from numpy import arange
import argparse

In [5]:
def parse_fastq(file_name):
    """Return the sequence of every record in a FASTQ file as a list of strings."""
    sequences = []
    for entry in SeqIO.parse(file_name, "fastq"):
        sequences.append(str(entry.seq))
    return sequences

def parse_fasta(file_name):
    """Return the sequence of every record in a FASTA file as a list of strings."""
    sequences = []
    for entry in SeqIO.parse(file_name, "fasta"):
        sequences.append(str(entry.seq))
    return sequences

def parse_seq(file_name):
    """Read a text file and return its lines with trailing whitespace removed.

    Every line of the file yields one list entry (blank lines become '').
    """
    with open(file_name,'r') as handle:
        return list(map(str.rstrip, handle))

In [6]:
# Load the sequence of every record in the toy positives FASTA file.
sequences = parse_fasta('positives_toy.fasta')
# Two 3-base strings; presumably the two halves of a bipartite motif core.
# NOTE(review): neither is used in the cells visible here — confirm they are still needed.
core1 = 'TAG'
core2 = 'ACT'

In [7]:
# Directory count_bipartite_stats writes its pickled count table into.
pickle_dir = 'pickles'
# Directory count_bipartite_stats searches for '<prefix>*_<k>mer.txt' count files.
data_dir = 'data'

#normalize = '0' #normalize with 'input' or 'mock'
# Core length used to derive the smallest k-mer size below.
core_size = 3
# k-mer sizes from 2*core_size (6) up to and including 34.
# NOTE(review): the call further down passes its own k_range, so this value
# is not used in the cells visible here.
k_range = range(core_size*2,35) #kmer size


def _find_kmer_file(prefix, k):
    """Return the alphabetically first '<prefix>*_<k>mer.txt' file in data_dir."""
    pattern = os.path.join(data_dir, f'{prefix}*_{k}mer.txt')
    # Sort so that the chosen file is deterministic when several match.
    matches = sorted(glob.glob(pattern))
    if not matches:
        raise FileNotFoundError(f'no k-mer count file matches {pattern!r}')
    return matches[0]


def _spaced_core_counts(kmer_file, k, core_size, core_index):
    """Sum the k-mer counts of one file per spaced core, ordered like core_index."""
    counts = pd.read_table(kmer_file, header=None, sep='\t')
    counts.columns = ['kmer', 'count']

    # Only proper length-k words over A/C/G/T contribute to the table.
    kmers = counts['kmer'].astype(str)
    valid = kmers.str.fullmatch(f'[ACGT]{{{k}}}')
    kmers = kmers[valid]

    # Spaced core = first core_size bases + last core_size bases of the k-mer.
    spaced = kmers.str[:core_size] + kmers.str[-core_size:]
    per_core = counts.loc[valid, 'count'].groupby(spaced).sum()

    # Cores that never occur in the file get a count of 0.
    return per_core.reindex(core_index, fill_value=0).to_numpy()


def count_bipartite_stats(file_name, input_name, k_range, core_size):
    """Collapse k-mer counts onto their spaced cores for a range of k.

    For every k in ``k_range`` one tab-separated ``kmer<TAB>count`` file is
    located for the positives (``file_name``) and one for the negatives
    (``input_name``) via the glob ``<prefix>*_<k>mer.txt`` inside the
    module-level ``data_dir``. Each k-mer is reduced to its spaced core (the
    first ``core_size`` and last ``core_size`` bases) and the counts of all
    k-mers sharing a core are summed.

    The sums are taken directly over the k-mers listed in the file, so the
    memory needed depends on the file size rather than on 4**k.

    Parameters
    ----------
    file_name : str
        File-name prefix of the positive k-mer count files.
    input_name : str
        File-name prefix of the negative k-mer count files.
    k_range : iterable of int
        k-mer sizes to process; each must be at least ``2 * core_size``.
    core_size : int
        Number of bases kept at each end of a k-mer.

    Returns
    -------
    pandas.DataFrame
        Indexed by all ``4 ** (2 * core_size)`` spaced cores in sorted order.
        For every k it holds, in this order, the columns
        ``count_positives_k<k>``, ``frac_positives_k<k>``,
        ``count_negatives_k<k>`` and ``frac_negatives_k<k>``, where ``frac``
        is the count divided by the column total.

    Raises
    ------
    ValueError
        If ``core_size`` is smaller than 1 or a k is smaller than
        ``2 * core_size``.
    FileNotFoundError
        If no count file matches for one of the prefixes at some k.

    Notes
    -----
    The table is pickled into the module-level ``pickle_dir`` after every k,
    so partial results survive an interrupted run. The output file name uses
    the fixed label 'toy'.
    """
    if core_size < 1:
        raise ValueError('core_size must be at least 1')

    bases = ['A', 'C', 'G', 'T']
    core_kmers = [''.join(p) for p in itertools.product(bases, repeat=2 * core_size)]
    count_table = pd.DataFrame(index=np.sort(core_kmers))

    os.makedirs(pickle_dir, exist_ok=True)
    pickle_path = os.path.join(pickle_dir, '%s_spaced_counts_%dcore.pkl' % ('toy', core_size))

    for k in k_range:
        if k < 2 * core_size:
            raise ValueError(
                f'k={k} is too small for two non-overlapping cores of size {core_size}'
            )
        print(k)

        # Resolve both inputs before reading so a missing file fails early.
        sources = (
            ('positives', _find_kmer_file(file_name, k)),
            ('negatives', _find_kmer_file(input_name, k)),
        )

        for name, kmer_file in sources:
            count_col = f'count_{name}_k{k}'
            # assign each kmer count to spaced core version
            count_table[count_col] = _spaced_core_counts(
                kmer_file, k, core_size, count_table.index
            )
            # calculate motif fraction
            count_table[f'frac_{name}_k{k}'] = (
                count_table[count_col] / count_table[count_col].sum()
            )

        # Checkpoint after every k.
        with open(pickle_path, 'wb') as f:
            pickle.dump(count_table, f)

    return count_table

In [None]:
# Build the spaced-core count table for the toy positives/negatives over k = 6..19 with 3-base cores.
count_table = count_bipartite_stats(file_name='positives_toy', input_name='negatives_toy', k_range=np.arange(6,20), core_size=3)

6
7
8
9
10
11
12
13
14
15


In [57]:
# NOTE(review): in the cells shown here `counts` is only assigned inside
# count_bipartite_stats, so this cell relies on a module-level `counts` left
# over from another run (e.g. via the commented-out `return counts`) — confirm.
counts['count']

0        17
192      17
128      25
64       25
1        25
         ..
16382    17
16319    20
16191    20
16255    17
16383    43
Name: count, Length: 16384, dtype: int64