In [10]:
from kmer_counter2.count_kmers import mer_count
from fasta_parser import parse_fasta

In [135]:
from collections import defaultdict
from more_itertools import sliced
from functools import reduce

In [38]:
import numpy as np

In [29]:
filename = "/media/paulosschlogl/Paulo/pauloscchlogl/Genome_kmers/Metadata/Bacteria/Pseudomonadota/Gammaproteobacteria/CHR/GCA_003790525.1_ASM379052v1_genomic.fna.gz"

In [30]:
for Id, sequence in parse_fasta(filename):
    seq = sequence
    seq_len = len(seq)

In [133]:
kmer_counts_1_10 = mer_count(seq, 1, 10)

In [150]:
kmer_freqs_1_10 = defaultdict(float)
for k, v in kmer_counts_1_10.items():
    kmer_freqs_2_10[k] = kmer_freqs_1_10.get(km, 0.0) + v/seq_len

In [153]:
kmer_counts_1_10

defaultdict(int,
            {'C': 59682,
             'T': 67675,
             'A': 67353,
             'G': 59083,
             'CT': 14781,
             'TC': 15874,
             'TA': 13671,
             'AT': 18194,
             'AA': 20784,
             'TT': 21153,
             'TG': 16977,
             'GC': 16239,
             'GT': 13547,
             'CG': 13649,
             'GG': 13693,
             'AG': 14764,
             'AC': 13611,
             'CA': 17295,
             'GA': 15603,
             'CC': 13957,
             'CTC': 3141,
             'TCT': 3906,
             'CTA': 2049,
             'TAT': 4064,
             'ATA': 4001,
             'TAA': 4448,
             'AAT': 4920,
             'ATT': 5041,
             'TTG': 4438,
             'TGC': 4310,
             'GCT': 4294,
             'CTG': 5057,
             'TGT': 3500,
             'GTT': 4333,
             'TTT': 7245,
             'TTA': 4448,
             'TTC': 5022,
             'TCG': 3299,

In [201]:
bases = {k:v/seq_len for k,v in kmer_counts_1_10.items() if k in "AGCT" and len(k) == 1}
bases

{'C': 0.2351601502011482,
 'T': 0.26665432064714156,
 'A': 0.2653855701299878,
 'G': 0.23279995902172243}

In [213]:
def expected_kmer_by_zom(kmer, base_freqs, len_seq):
    """
    Calculates the expected number of a substring of length k (k-mer)
    based in a zero order Markov model (zom).
    
    Inputs:
    
        kmer - a substring representing the word/k-mer (a string of length k).
        base_freqs - a dictionary-like object mapping the frequency of the
                     nucleotides/basesand their counted values normalized by
                     the length of the sequence/genome.
        len_seq - a integer representing the length of the sequence/genome where
                  the kmers were counted.
    
    Outputs:
    
        expected - a float representing the kmer/substring of length k in the
                sequence/genome of length N (len_seq - len_kmer + 1).
    N = len(seq) - len(kmer) + 1
    E(w) = N*(nuc1*nuc2*nuc3)
    """
    # number position in the genome where the
    # kmer was counted
    n = len_seq - len(kmer) + 1
    cnt = 1
    for base in kmer:
        # multiply all the bases values that
        # are found in the kmer
        cnt *= base_freqs[base]
    # return the expected value of the kmer
    print(cnt)
    exp = n * cnt
    return exp

In [214]:
def make_kmers(seq, k):
    N = range(0, len(seq) - k + 1)
    for idx in N:
        yield seq[idx:idx+k]

def zero_markov(kmer, kmer_data, seq_len):
    km_spl = list(sliced(kmer, n = 1))
    N = seq_len - len(kmer) - 1
    freqs = [kmer_data[km]/seq_len for km in km_spl]
    kmer_zero = reduce(lambda x, y: x*y, freqs)
    return kmer_zero * N

def estimate_prob_by_markov(kmer, kmer_data, seq_len, mo):
    nume = 1
    deno = 1
    # zro order markov
    if mo == 0:
        return zero_markov(kmer, seq_len)
    # first order markov
    elif mo == 1:
        kmax = make_kmers(kmer[1:-1], 2)
        kmin = [kmer[0], kmer[-1]]
    # second order markov
    elif mo == 2:
        kmax = make_kmers(kmer, 3)
        kmin = make_kmers(kmer[1:-1], 2)
    # thr order markov
    elif mo == 3:
        kmax = make_kmers(kmer, 4)
        kmin = make_kmers(kmer[1:-1], 3)
    for k1 in kmax:
        for k2 in kmin:
            nume *= kmer_data[k1]
            deno *= kmer_data[k2]
    return nume/deno


In [215]:
zero_markov("AAAGGGGAGC", kmer_counts_1_10, seq_len)

0.20241759611031426

In [216]:
expected_kmer_by_zom("AAAGGGGAGC", bases, seq_len)

7.976042276848408e-07


0.20241919131876965

In [191]:
lst = [0.26539707307846894, 0.26539707307846894, 0.26539707307846894, 0.26539707307846894, 0.23517034305033455, 0.23281004957010348, 0.23281004957010348, 0.23281004957010348, 0.23281004957010348, 0.23281004957010348]
reduce(lambda x, y: x*y, lst)

7.97950010994765e-07

In [192]:
np.prod(lst)

7.97950010994765e-07

In [188]:
(0.26539707307846894**4) * (0.23517034305033455**5) * 0.26666587858871

9.516225573672542e-07

In [187]:
exp = estimate_prob_by_markov("AAAGGGGAGC", kmer_counts_1_10, seq_len, 0)
#exp2 = th3_order_markov_long_kmers("AAAGGGGAGC", 4, kmer_freqs_1_10)

['A', 'A', 'A', 'G', 'G', 'G', 'G', 'A', 'G', 'C']
[0.26539707307846894, 0.26539707307846894, 0.26539707307846894, 0.26539707307846894, 0.23517034305033455, 0.23281004957010348, 0.23281004957010348, 0.23281004957010348, 0.23281004957010348, 0.23281004957010348]
7.97950010994765e-07


In [142]:
exp, #exp2

(0.0,)

In [119]:
2*(2 * np.log(2/exp2))-( 2 - exp2)

17.42774523116948

In [124]:
2*(exp)

0.0018305057293965502

In [111]:
2*(2 * np.log(2/exp2)), (2 - exp2), 2*(2 * np.log(2/exp2))-(2 - exp2)

(19.412135906311782, 1.984390675142301, 17.42774523116948)

In [112]:
2 * (2 * np.log(2/exp2)) - (2 - exp2)

17.42774523116948

In [113]:
kmer_counts_2_10["AACCTGGGGT"], kmer_counts_2_10["AAAGGGGAGC"]

(0, 2)

In [114]:
def e2(obs, exp):
    if obs > 0:
        return 2 * (obs * np.log(obs/exp2)) - (obs - exp2)
    else:
        return 2 * exp

In [115]:
e2(0, exp), e2(2, exp2)

(0.0018305057293965502, 17.42774523116948)