# K-mer
The goal of this notebook is to reproduce the k-mer model.
While investigating the plotting notebook in the original repository, it was found that the 11-mer model is actually the best Markov model. The config file for the best Markov model in the results folder shows that it is a bidirectional Markov model of order 5 (K = 2 × order + 1 = 11).


In [1]:
%load_ext autoreload
%autoreload 2

# The star import comes first so that the explicit imports below always take
# precedence over any same-named objects it re-exports.
from markov_model import *

import os
import pickle
import re
from collections import defaultdict

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Order of the Markov model used in this run.
markov_order = 3
# Matching k-mer length: twice the order plus one (order 3 -> 7-mer, order 5 -> 11-mer).
K = 2 * markov_order + 1

In [3]:
# Root directory of the project data: the input FASTA files are read from, and the
# prediction pickles are written to, sub-folders of this path.
data_dir = '/lustre/groups/epigenereg01/workspace/projects/vale/mlm/'

In [6]:
def read_fasta(fasta_fa, N_seqs=None):
    """Read a FASTA file into a DataFrame with one row per record.

    Parameters
    ----------
    fasta_fa : str or path-like
        Path of the FASTA file to read.
    N_seqs : int, optional
        Stop after this many header lines have been read. ``None`` (default)
        reads the whole file.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``seq`` (all sequence lines of the record, right-stripped,
        upper-cased and concatenated) and ``seq_name`` (the raw header line,
        including the leading ``'>'`` and the trailing newline). Records that
        share an identical header line are merged into one row; a header that
        is followed by no line at all before the next header is omitted.

    Raises
    ------
    ValueError
        If non-blank sequence data appears before the first header line.
    """
    records = defaultdict(str)

    seq_name = None  # header of the record currently being read
    n_headers = 0

    with open(fasta_fa, 'r') as f:
        for line in f:
            if line.startswith('>'):
                n_headers += 1
                if N_seqs is not None and n_headers > N_seqs:
                    break
                seq_name = line
                continue

            chunk = line.rstrip().upper()
            if seq_name is None:
                # Nothing to attach this line to yet: tolerate blank lines,
                # but refuse to silently drop real sequence data.
                if chunk:
                    raise ValueError(
                        f'{fasta_fa}: sequence data found before the first FASTA header'
                    )
                continue
            records[seq_name] += chunk

    # Materialise the dict views as lists so that construction is well defined
    # even when no record was read.
    return pd.DataFrame({'seq': list(records.values()),
                         'seq_name': list(records.keys())})

In [8]:
# Input FASTA files. 3'UTRs on the negative strand should already be reversed
# in the human file.
human_fasta = f'{data_dir}fasta/Homo_sapiens_rna.fa'
other_species_fasta = f'{data_dir}fasta/241_mammals.shuffled.fa'

# Training data: the first 100000 records of the multi-species file.
train_df = read_fasta(other_species_fasta, N_seqs=100000)

# Test data: every record of the human file, with the leading '>' and the
# trailing whitespace removed from the header lines.
test_df = read_fasta(human_fasta)
test_df['seq_name'] = test_df['seq_name'].map(lambda header: header[1:].rstrip())

In [13]:
# Get the frequency counts of the motifs in the training sequences. The k-mer
# length is K (2*markov_order + 1), so this is only an 11-mer count when
# markov_order is 5. A pseudocount of 0.1 is passed to KmerCount.
kmer_train = KmerCount(K,pseudocount=0.1)

kmer_train.compute_counts(train_df.seq)

100%|██████████| 100000/100000 [10:51<00:00, 153.49it/s]


In [14]:
# Build the model object from the k-mer counts. BiMarkov is presumably the
# bidirectional Markov model mentioned in the introduction — see markov_model.py.
markov_model = BiMarkov(kmer_train)

In [15]:
# Calculate the Markov matrix from the K-mer counts (K as configured above,
# not necessarily 11).
# NOTE(review): the recorded run printed a warning pointing at the row
# normalisation inside compile_from_counts — presumably rows whose counts sum
# to zero; confirm that the resulting entries are handled as intended.
markov_model.compile_from_counts()

  self.markov_matrix[order,:,:] = self.markov_matrix[order,:,:]/np.sum(self.markov_matrix[order,:,:],axis=1)[:,np.newaxis]


In [16]:
# Turn the test DataFrame into a Series of sequences indexed by sequence name.
# The isinstance guard makes the cell safe to re-run: once test_df is a Series
# it no longer has set_index, so an unguarded second run would raise.
if isinstance(test_df, pd.DataFrame):
    test_df = test_df.set_index('seq_name').seq #dataframe to series

In [17]:
# Inference: run the Markov model over every test sequence and collect
# (name, sequence, model output) triples.

# Ordinal encoding of the four bases: A=0, C=1, G=2, T=3.
mapping = {base: code for code, base in enumerate('ACGT')}

motif_probas = []

for seq_name, seq in tqdm(test_df.items(), total=len(test_df)):
    prbs = markov_model.impute_for_seq(seq, order=markov_order)
    motif_probas.append((seq_name, seq, prbs))

100%|██████████| 18134/18134 [01:30<00:00, 201.37it/s]


In [19]:
# Persist the predictions of the current K-mer model under a K-specific folder.
output_dir = data_dir + f'human_3utr/probs/K-mer/{K}_mer/'
os.makedirs(output_dir, exist_ok=True)

# Unpack before opening the file for writing: if this fails (e.g. motif_probas
# is empty), an existing predictions.pickle is not truncated.
seq_names, seqs, probs = zip(*motif_probas)

with open(output_dir + 'predictions.pickle', 'wb') as f:
    pickle.dump({'seq_names':seq_names, 'seqs':seqs, 'probs':probs},f)

# Performance for different K

In [44]:
# K-mer lengths to compare. Each one needs a predictions.pickle written by the
# cells above, i.e. the upper part of the notebook must have been run once per
# corresponding markov_order.
kmers = [7,9,11,13]

# model_scores[k][seq_name] -> stored model output for that sequence
model_scores = {}

for k in kmers:
    # The path must use the loop variable k, not the global K; otherwise the
    # same file is loaded for every entry of `kmers`.
    # NOTE: pickle.load can execute arbitrary code — only load files that were
    # written by this notebook.
    with open(data_dir + f'human_3utr/probs/K-mer/{k}_mer/predictions.pickle', 'rb') as f:
        data = pickle.load(f)
    model_scores[k] = dict(zip(data['seq_names'], data['probs']))

In [45]:
# For every K, pool one hit/miss flag per sequence position over all test sequences.
res = {k: [] for k in kmers}

for seq_name, seq in tqdm(test_df.items(), total=len(test_df)):
    # true bases in ordinal encoding
    seq_mapping = np.array([mapping[base] for base in seq])
    for k in kmers:
        # predicted base = column with the highest score in each row
        preds = np.argmax(model_scores[k][seq_name], axis=1)
        res[k].extend(seq_mapping == preds)

100%|██████████| 18134/18134 [00:07<00:00, 2413.63it/s]


In [46]:
# Mean and standard deviation of the pooled hit/miss flags, per K.
mean_acc = {k: np.mean(res[k]) for k in res}
std_acc = {k: np.std(res[k]) for k in res}

In [52]:
# Fraction of sequence positions predicted correctly, per k-mer length.
mean_acc

{7: 0.37839834126653354,
 9: 0.3973033119437702,
 11: 0.4121199421714091,
 13: 0.43742221225283656}

In [53]:
# Standard deviation of the per-position hit/miss flags, per k-mer length.
std_acc

{7: 0.48498766643417823,
 9: 0.48933974931767193,
 11: 0.49221651276246636,
 13: 0.49606856429396046}