In [1]:
import sys
from Bio import SeqRecord, SeqIO

import gc
from dataclasses import dataclass
import sys
import copy
from aadg_genomics_class.monitoring.logs import LOGS
from aadg_genomics_class.monitoring.task_reporter import TaskReporter, monitor_mem_snapshot
from aadg_genomics_class import click_utils as click
from aadg_genomics_class.new_aligner2 import align_seq
from aadg_genomics_class.new_aligner2np import doit
from aadg_genomics_class.edit_check import levenshtein

from typing import Dict, Any, Set, Optional
from itertools import chain
from numpy.lib.stride_tricks import sliding_window_view

import csv
import tracemalloc

import math
import numpy as np
from typing import Iterable

In [9]:
# Cache of bit masks keyed by k-mer length; stays None until generate_mask() fills it.
MASKS: Optional[Dict[int, int]] = None
# Largest key stored in the MASKS cache.
MAX_KMER_SIZE = 64

# Case-insensitive 2-bit nucleotide codes: C -> 0, A -> 1, T -> 2, G -> 3.
MAPPING = {
    "C": 0, "c": 0,
    "A": 1, "a": 1,
    "T": 2, "t": 2,
    "G": 3, "g": 3,
}

# Inverse of MAPPING: the list index is the code, the value is the upper-case base.
RR_MAPPING = ["C", "A", "T", "G"]

# Code of the complementary base for every code (C <-> G, A <-> T).
COMPLEMENT_MAPPING = {
    0: 3,  # C -> G
    1: 2,  # A -> T
    2: 1,  # T -> A
    3: 0,  # G -> C
}

# Element-wise versions of the two lookups, usable directly on numpy arrays.
MAPPING_FN = np.vectorize(MAPPING.get)
COMPLEMENT_MAPPING_FN = np.vectorize(COMPLEMENT_MAPPING.get)

def format_sequences(src: Iterable[SeqRecord]):
    """Eagerly encode every record of ``src`` with ``MAPPING_FN``.

    Args:
        src: iterable of records exposing ``.id`` and ``.seq``.

    Returns:
        A pair ``(encoded, ids)`` where ``encoded`` maps each record id to its
        encoded numpy array and ``ids`` lists the dict keys in insertion order.
        A record id seen more than once keeps only the last sequence.
    """
    encoded = {}
    for record in src:
        encoded[record.id] = MAPPING_FN(np.array(record.seq))
    return encoded, list(encoded)

def iter_sequences(src: Iterable[SeqRecord]):
    """Lazily encode the records of ``src``.

    Args:
        src: iterable of records exposing ``.id`` and ``.seq``.

    Returns:
        A generator yielding ``(record.id, encoded_array)`` pairs, where the
        array is produced by ``MAPPING_FN``; each record is encoded only when
        the generator reaches it.
    """
    encoded_pairs = (
        (record.id, MAPPING_FN(np.array(record.seq)))
        for record in src
    )
    return encoded_pairs

def sequence_complement(seq):
    """Replace every base code in ``seq`` by the code of its complementary base.

    The lookup is delegated element-wise to ``COMPLEMENT_MAPPING_FN``; the
    order of the elements is kept as is (the sequence is not reversed).
    """
    complemented = COMPLEMENT_MAPPING_FN(seq)
    return complemented

def generate_mask(
    kmer_len: int,
) -> int:
    """Return the cached bit mask registered for ``kmer_len``.

    On first use the module-level ``MASKS`` cache is filled for every key in
    ``0..MAX_KMER_SIZE``; later calls are plain dictionary lookups.

    NOTE(review): the mask stored under key ``k`` has ``2 * (k + 2)`` low bits
    set (key 0 -> 0b1111), i.e. it spans ``k + 2`` two-bit nucleotides rather
    than ``k``. Confirm whether that is intended before changing it: existing
    k-mer keys depend on these exact values.

    Raises:
        KeyError: if ``kmer_len`` is not an integer in ``0..MAX_KMER_SIZE``.
    """
    global MASKS
    if not MASKS:
        MASKS = {
            size: (1 << (2 * (size + 2))) - 1
            for size in range(MAX_KMER_SIZE + 1)
        }
    return MASKS[kmer_len]


In [11]:

def _get_kmers_min_pos(
    sequence_len,
    mask,
    r_seq_arr,
    kmers,
    r_kmers,
    window_len,
    kmer_len,
):
    kmers_min_pos = np.add(np.argmin(sliding_window_view(kmers, window_shape=window_len), axis=1), np.arange(0, sequence_len - window_len + 1))
    #r_kmers_min_pos = np.add(np.argmin(sliding_window_view(r_kmers, window_shape=window_len), axis=1), np.arange(0, sequence_len - window_len + 1))
    return kmers_min_pos#, r_kmers_min_pos


def get_minimizers_target(
    seq_arr,
    kmer_len,
    window_len,
):
    """Build a minimizer index for an encoded sequence.

    Every position gets a rolling k-mer value (previous value shifted by two
    bits, OR-ed with the new base code and AND-ed with ``generate_mask(kmer_len)``).
    For each window of ``window_len`` consecutive values the position of the
    smallest one is selected; the first ``kmer_len`` windows are discarded.

    Args:
        seq_arr: 1-D array of integer base codes.
        kmer_len: k-mer length passed to ``generate_mask``.
        window_len: number of consecutive k-mer values per window.

    Returns:
        dict mapping a k-mer value to a 2-D integer array with one row
        ``[position, 1]`` per distinct position at which that k-mer was
        selected. The trailing ``1`` is constant (presumably a placeholder for
        a strand flag -- TODO confirm). An empty dict is returned when no
        minimizer can be selected, including when the sequence is shorter
        than one window.
    """
    sequence_len = len(seq_arr)

    # A sequence shorter than a single window has no minimizers; without this
    # guard sliding_window_view raises instead of yielding the empty index.
    if sequence_len < window_len:
        return dict()

    mask = generate_mask(kmer_len)

    # Rolling update: previous k-mer value shifted by one base, plus the new base.
    uadd = np.frompyfunc(lambda x, y: ((x << 2) | y) & mask, 2, 1)

    # Object dtype keeps the accumulation in Python ints before casting back.
    kmers = uadd.accumulate(seq_arr, dtype=object).astype(int)
    # Clamp the bound: for kmer_len < 2 a negative slice end would zero
    # almost the whole array instead of nothing.
    kmers[:max(kmer_len - 2, 0)] = 0

    # Absolute position of the minimal k-mer in every window.
    kmers_min_pos = _get_kmers_min_pos(
        sequence_len=sequence_len,
        mask=mask,
        r_seq_arr=seq_arr,
        kmers=kmers,
        r_kmers=kmers,
        window_len=window_len,
        kmer_len=kmer_len,
    )

    # Columns: k-mer value, position, constant 1. The leading kmer_len windows are dropped.
    selected_kmers = np.column_stack((
        kmers[kmers_min_pos],
        kmers_min_pos,
        np.ones(len(kmers_min_pos), dtype=bool),
    ))[kmer_len:]

    if len(selected_kmers) == 0:
        # If we have no minimizers we return nothing
        return dict()

    # np.unique(axis=0) both removes duplicate rows and returns them sorted
    # by (value, position), so no separate sort is needed beforehand.
    selected_kmers_unique = np.unique(selected_kmers, axis=0)

    # Group by k-mer value: index of the first row of every distinct value.
    group_starts = np.unique(selected_kmers_unique[:, 0], return_index=True)[1]
    entries_per_kmer = np.split(selected_kmers_unique[:, 1:], group_starts[1:])

    return dict(zip(selected_kmers_unique[group_starts, 0], entries_per_kmer))

In [3]:
# Parse the reference FASTA and encode every record; also keep the record ids.
reference_records, reference_ids = format_sequences(SeqIO.parse("../data/reference.fasta", "fasta"))

In [5]:
# Show which record ids were found in the reference file.
reference_ids

['reference']

In [15]:
# Index parameters: k-mer length and number of consecutive k-mer values per window.
kmer_len = 18
window_len = 8
# Build the minimizer index for the first reference record.
min_index_target = get_minimizers_target(
    reference_records[reference_ids[0]],
    kmer_len=kmer_len,
    window_len=window_len,
)

In [18]:
# Spot-check three k-mer keys; each entry holds one [position, 1] row per occurrence.
min_index_target[291873], min_index_target[2100232], min_index_target[278396374]

(array([[729317,      1]]),
 array([[932737,      1]]),
 array([[645111,      1],
        [645150,      1]]))

In [29]:
# Module-level copy of the last (k-mer value, position) table, kept for inspection in later cells.
selected_kmers = None
def get_minimizers_target2(
    seq_arr,
    kmer_len,
    window_len,
):
    """Build a minimizer index mapping each k-mer value to its positions.

    Every position gets a rolling k-mer value (previous value shifted by two
    bits, OR-ed with the new base code and AND-ed with ``generate_mask(kmer_len)``).
    For each window of ``window_len`` consecutive values the position of the
    smallest one is selected; the first ``kmer_len`` windows are discarded.

    Side effect: the module-level ``selected_kmers`` is overwritten with the
    table of (k-mer value, position) rows computed by this call.

    Args:
        seq_arr: 1-D array of integer base codes.
        kmer_len: k-mer length passed to ``generate_mask``.
        window_len: number of consecutive k-mer values per window.

    Returns:
        dict mapping a k-mer value to a 1-D array of the distinct positions,
        in ascending order, at which it was selected. An empty dict is
        returned when no minimizer can be selected, including when the
        sequence is shorter than one window.
    """
    global selected_kmers
    sequence_len = len(seq_arr)

    # A sequence shorter than a single window has no minimizers; without this
    # guard sliding_window_view raises instead of yielding the empty index.
    if sequence_len < window_len:
        selected_kmers = np.empty((0, 2), dtype=int)
        return dict()

    mask = generate_mask(kmer_len)

    # Rolling update: previous k-mer value shifted by one base, plus the new base.
    uadd = np.frompyfunc(lambda x, y: ((x << 2) | y) & mask, 2, 1)

    # Object dtype keeps the accumulation in Python ints before casting back.
    kmers = uadd.accumulate(seq_arr, dtype=object).astype(int)
    # Clamp the bound: for kmer_len < 2 a negative slice end would zero
    # almost the whole array instead of nothing.
    kmers[:max(kmer_len - 2, 0)] = 0
    del seq_arr

    # Absolute position of the minimal k-mer in every window (leftmost on ties).
    kmers_min_pos = np.add(
        np.argmin(sliding_window_view(kmers, window_shape=window_len), axis=1),
        np.arange(0, sequence_len - window_len + 1),
    )

    # Columns: k-mer value, position. The leading kmer_len windows are dropped.
    selected_kmers = np.column_stack((
        kmers[kmers_min_pos],
        kmers_min_pos,
    ))[kmer_len:]
    # Drop the local references to the large intermediates before grouping.
    del kmers_min_pos
    del kmers

    if len(selected_kmers) == 0:
        # If we have no minimizers we return nothing
        return dict()

    # np.unique(axis=0) both removes duplicate rows and returns them sorted
    # by (value, position), so no separate sort is needed beforehand.
    selected_kmers = np.unique(selected_kmers, axis=0)

    # Group by k-mer value: index of the first row of every distinct value.
    group_starts = np.unique(selected_kmers[:, 0], return_index=True)[1]
    positions_per_kmer = np.split(selected_kmers[:, 1], group_starts[1:])

    return dict(zip(selected_kmers[group_starts, 0], positions_per_kmer))

In [30]:
# Rebuild the index with get_minimizers_target2 and look up the same three keys
# that were checked against get_minimizers_target earlier in the notebook.
min_index_target = get_minimizers_target2(
    reference_records[reference_ids[0]],
    kmer_len=kmer_len,
    window_len=window_len,
)
min_index_target[291873], min_index_target[2100232], min_index_target[278396374]

(array([729317]), array([932737]), array([645111, 645150]))

In [23]:
# Inspect the module-level table written by get_minimizers_target2.
# NOTE(review): the stored output has three columns while the function above
# builds two, so it appears to come from an earlier revision -- re-run to refresh.
selected_kmers

array([[       291873,        729317,             1],
       [       291873,        729317,             1],
       [       291873,        729317,             1],
       ...,
       [1095278559004,        533929,             1],
       [1095666675286,        674229,             1],
       [1098436833773,        716915,             1]])

In [24]:
# First column of the table: the k-mer values.
selected_kmers[:,0]

array([       291873,        291873,        291873, ..., 1095278559004,
       1095666675286, 1098436833773])