In [289]:
import sys

import gc
import sys
import copy
import os

from typing import Dict, Any, Set, Optional, Tuple
from itertools import chain
from numpy.lib.stride_tricks import sliding_window_view
from operator import itemgetter
from collections import defaultdict

import csv
import numpy as np

# Debug slots: run_aligner_pipeline() stores the encoded reference chunk, the
# encoded query and the selected target window here for later inspection.
GLO_Q = None
GLO_T = None
GLO_CHUNK = None

# Lazily built table of bit masks; populated on first call to generate_mask().
MASKS: Optional[Dict[int, int]] = None
MAX_KMER_SIZE = 64

# Integer codes for nucleotides. Code 0 is reserved for the "$" terminator.
# (The author left a commented-out alternative encoding here:
#  a=1, c=0, g=3, G=4, t=2, T=3.)
MAPPING_DOLLAR = 0
MAPPING = {"C": 1, "A": 2, "T": 3, "G": 4}

ALPHABET = set(MAPPING.values())
ALPHABET_DOLLAR = ALPHABET | {MAPPING_DOLLAR}

# NOTE(review): RR_MAPPING is indexed 0..3 and COMPLEMENT_MAPPING is keyed
# 0..3, while MAPPING produces codes 1..4 -- these tables look like leftovers
# from a 0-based encoding; confirm before using them to decode sequences.
RR_MAPPING = ["C", "A", "T", "G"]

COMPLEMENT_MAPPING = {1: 2, 0: 3, 3: 0, 2: 1}

# Element-wise encoders usable directly on arrays of characters / codes.
MAPPING_FN = np.vectorize(MAPPING.get, otypes=[np.uint8])
COMPLEMENT_MAPPING_FN = np.vectorize(COMPLEMENT_MAPPING.get)

# DP algorithm adapted from Langmead's notebooks
def align_dp_trace(D, x, y):
    """Walk the edit-distance matrix D back from its bottom-right corner.

    D is indexed as D[row, col] with one row per prefix of x and one column
    per prefix of y. The walk stops as soon as row 0 is reached and returns
    the column reached there, i.e. the offset in y at which the alignment of
    x begins.
    """
    row, col = len(x), len(y)
    while row > 0:
        # Moves that are not available keep the "infinite" cost.
        via_diag = via_left = sys.maxsize
        via_up = D[row-1, col] + 1
        if col > 0:
            substitution = 0 if x[row-1] == y[col-1] else 1
            via_diag = D[row-1, col-1] + substitution
            via_left = D[row, col-1] + 1

        if via_diag <= via_up and via_diag <= via_left:
            # Diagonal step: consume one symbol of each sequence.
            row -= 1
            col -= 1
        elif via_up <= via_left:
            # Vertical step: consume a symbol of x only.
            row -= 1
        else:
            # Horizontal step: consume a symbol of y only.
            col -= 1
    return col

def align_dp_k_edit(p, t):
    """Align p against the best-matching substring of t.

    Returns a triple (edits, start, end): the smallest edit distance found in
    the last row of the DP matrix, and the start/end offsets in t of the
    substring that achieves it (start comes from align_dp_trace).
    """
    n_rows, n_cols = len(p) + 1, len(t) + 1
    D = np.zeros((n_rows, n_cols), dtype=int)
    # Row 0 stays all-zero so the match may begin anywhere in t; column 0 is
    # the usual cost of deleting a prefix of p.
    D[1:, 0] = range(1, n_rows)
    for i in range(1, n_rows):
        for j in range(1, n_cols):
            substitution = 0 if p[i-1] == t[j-1] else 1
            D[i, j] = min(D[i-1, j-1] + substitution,
                          D[i-1, j] + 1,
                          D[i, j-1] + 1)

    # Leftmost minimum of the last row; stays (None, len(p)+len(t)) when no
    # cell is strictly below that initial bound.
    best_end, best_cost = None, len(p) + len(t)
    for j, cost in enumerate(D[len(p)]):
        if cost < best_cost:
            best_end, best_cost = j, cost

    # The backtrace stops as soon as it reaches the first row.
    start = align_dp_trace(D, p, t[:best_end])
    return best_cost, start, best_end


def run_match_align_dp(target, query, align_mode=1):
    """Seed-and-extend search for the tail of `query` inside `target`.

    For each (k, step) setting, k-mer hashes of a target suffix are sampled
    into a lookup table, sampled query-suffix hashes are looked up in it, and
    every seed hit is verified with align_dp_k_edit inside a window widened
    by the allowed edit distance (len(query)//9).

    Returns a pair (0, pad) where pad is the distance from the end of the
    target suffix to the end of the best hit, or (0, 0) when nothing is found.
    With align_mode == 1 only the first (k, step) setting is tried.
    """
    for k, step in ((15, 11), (10, 11), (8, 5)):
        print(f"align_seq(): ALIGNER_MODE {align_mode}")
        suffix_len = round(len(query) * 0.4)

        target_suffix = target[suffix_len-k:]
        query_suffix = query[(len(query)-suffix_len):]
        max_edits = len(query)//9

        mask = generate_mask(k)
        rolling = np.frompyfunc(lambda acc, sym: ((acc << 2) | sym) & mask, 2, 1)

        # Sampled rolling hashes of the target suffix -> positions.
        target_hashes = rolling.accumulate(target_suffix, dtype=object).astype(int)
        seed_table = defaultdict(list)
        for t_pos in range(0, len(target_hashes), step):
            seed_table[target_hashes[t_pos]].append(t_pos)

        query_hashes = rolling.accumulate(query_suffix, dtype=object).astype(int)
        hits = []
        for q_pos in range(0, len(query_hashes), step+1):
            for t_pos in seed_table[query_hashes[q_pos]]:
                window_lo = max(0, t_pos-q_pos-max_edits)
                window_hi = min(len(target_suffix),
                                t_pos-q_pos+len(query_suffix)+max_edits)
                edits, start, end = align_dp_k_edit(
                    query_suffix, target_suffix[window_lo:window_hi])
                # Convert window-relative offsets to suffix-relative ones.
                start += window_lo
                end += window_lo
                if edits <= max_edits:
                    hits.append((edits, start, end))

        if hits:
            # Smallest (edits, start, end) tuple wins.
            best_hit = min(hits)
            return 0, len(target_suffix)-best_hit[2]
        if align_mode == 1:
            return 0, 0
    return 0, 0

def merge(x: np.array, SA12: np.array, SA3: np.array) -> np.array:
    "Interleave the sorted suffix lists SA12 and SA3 into one suffix array."
    total = len(x)
    # Rank of every SA12 suffix inside SA12, looked up by `less`.
    rank_in_sa12 = np.zeros((total,), dtype='int')
    for position, suffix in enumerate(SA12):
        rank_in_sa12[suffix] = position

    merged = np.zeros((total,), dtype='int')
    n12, n3 = len(SA12), len(SA3)
    a = b = out = 0
    while a < n12 and b < n3:
        if less(x, SA12[a], SA3[b], rank_in_sa12):
            merged[out] = SA12[a]
            a += 1
        else:
            merged[out] = SA3[b]
            b += 1
        out += 1

    # Exactly one of the inputs may still have a tail to copy.
    if a < n12:
        merged[out:] = SA12[a:]
    elif b < n3:
        merged[out:] = SA3[b:]
    return merged


def u_idx(i: int, m: int) -> int:
    "Translate position i of the u string (sentinel at m) to a position in x."
    before_sentinel = i < m
    return 1 + 3 * i if before_sentinel else 2 + 3 * (i - m - 1)


def safe_idx(x: np.array, i: int) -> int:
    "Return x[i], or 0 when i lies at or past the end of x."
    if i < len(x):
        return x[i]
    return 0

def symbcount(x: np.array, asize: int) -> np.array:
    "Tally the occurrences of every symbol 0..asize-1 found in x."
    tally = np.zeros((asize,), dtype="int")
    for symbol in x:
        tally[symbol] += 1
    return tally

def cumsum(counts: np.array) -> np.array:
    "Exclusive prefix sums of counts: entry i is the sum of counts[:i]."
    offsets = np.zeros((len(counts, )), dtype='int')
    # Shift the inclusive running total right by one; slot 0 stays 0.
    offsets[1:] = np.cumsum(counts)[:-1]
    return offsets

def bucket_sort(x: np.array, asize: int,
                idx: np.array, offset: int = 0) -> np.array:
    "Stable counting sort of the indices in idx, keyed on x[i + offset]."
    # Key of every index; positions past the end of x get key 0.
    keys = [safe_idx(x, i + offset) for i in idx]
    next_slot = cumsum(symbcount(np.array(keys), asize))
    ordered = np.zeros((len(idx),), dtype='int')
    for i, key in zip(idx, keys):
        ordered[next_slot[key]] = i
        next_slot[key] += 1
    return ordered

def radix3(x: np.array, asize: int, idx: np.array) -> np.array:
    "Radix-sort the indices in idx by the three symbols of x starting there."
    # Least-significant symbol first; each pass is a stable bucket sort.
    for offset in (2, 1, 0):
        idx = bucket_sort(x, asize, idx, offset)
    return idx

def triplet(x: np.array, i: int) -> Tuple[int, int, int]:
    "Return (x[i], x[i+1], x[i+2]), with 0 for positions past the end."
    return tuple(safe_idx(x, i + step) for step in range(3))

def collect_alphabet(x: np.array, idx: np.array) -> Tuple[np.array, int]:
    """Assign a letter to every index in idx based on its triplet.

    Consecutive indices of idx with equal triplets share a letter. Letters
    start at 2 (`build_u` uses 1 as its central sentinel). Returns the letter
    table, indexed by position in x, and the number of letters handed out.
    """
    alpha = np.zeros((len(x),), dtype='int')
    letter = 1
    previous = (-1, -1, -1)
    for position in idx:
        current = triplet(x, position)
        if current != previous:
            letter, previous = letter + 1, current
        alpha[position] = letter
    return alpha, letter - 1

def build_u(x: np.array, alpha: np.array) -> np.array:
    "Build the u string: letters at i%3==1, the sentinel 1, letters at i%3==2."
    length = len(x)
    mod1_letters = [alpha[i] for i in range(1, length, 3)]
    mod2_letters = [alpha[i] for i in range(2, length, 3)]
    return np.array([*mod1_letters, 1, *mod2_letters])

def less(x: np.array, i: int, j: int, ISA: np.array) -> bool:
    "Decide whether suffix x[i:] sorts before x[j:], using SA12 ranks in ISA."
    while True:
        a: int = safe_idx(x, i)
        b: int = safe_idx(x, j)
        if a < b:
            return True
        if a > b:
            return False
        # Equal symbols: once both positions belong to SA12 their ranks decide.
        if i % 3 != 0 and j % 3 != 0:
            return ISA[i] < ISA[j]
        i += 1
        j += 1

def skew_rec(x: np.array, asize: int) -> np.array:
    "Build the suffix array of x with the recursive skew (DC3) scheme."
    length = len(x)

    # Suffixes at positions with i % 3 != 0, sorted by their first triplet.
    SA12 = radix3(x, asize, np.array([i for i in range(length) if i % 3]))
    new_alpha, new_asize = collect_alphabet(x, SA12)
    if new_asize < len(SA12):
        # Some triplets repeat, so triplets alone do not order SA12:
        # sort the reduced string recursively and map positions back.
        sa_u = skew_rec(build_u(x, new_alpha), new_asize + 2)
        m = len(sa_u) // 2
        SA12 = np.array([u_idx(i, m) for i in sa_u if i != m])

    # Positions with i % 3 == 0, taken in the order of their successors in SA12.
    sa3_seed = [i - 1 for i in SA12 if i % 3 == 1]
    if length % 3 == 1:
        # The last position has no successor in SA12; put it first.
        sa3_seed = [length - 1] + sa3_seed
    SA3 = bucket_sort(x, asize, np.array(sa3_seed))
    return merge(x, SA12, SA3)

# Module-level state shared by the BWT inexact-search helpers
# (original author's note, translated from Polish: "a lot of junk").
C, O = {}, {}
D = []

# Scores applied to the remaining difference budget during the search.
gap_open, gap_ext = 3, 1
mismatch, match = 1, 0

# Optional per-substitution weights; an empty dict disables weighting.
sub_mat = dict()

# Counter incremented by the search whenever a branch is pruned.
num_prunes = 0

# prev_type codes passed through inexact_recursion:
#   0 = match, 1 = insertion, 2 = deletion, 3 = mismatch, 4 = start

def compute_C(totals):
    """For every symbol, count the reference symbols that sort before it.

    `totals` maps each alphabet symbol to its number of occurrences. The
    result has an entry for every symbol of ALPHABET_DOLLAR; the terminator
    entry is always 0.
    """
    smaller_counts = {symbol: 0 for symbol in ALPHABET_DOLLAR}
    for symbol in ALPHABET:
        smaller_counts[symbol] += sum(
            totals[other] for other in ALPHABET if other < symbol
        )
    return smaller_counts


def compute_D(s, C, Oprime, bw):
    """Lower bound on the number of differences needed for each prefix of s.

    Entry i of the result is the number of times the exact-match interval,
    tracked over the reverse-reference ranks `Oprime`, became empty while
    consuming s[0..i].
    """
    lo = 1
    # NOTE(review): the interval starts at len(bw)-2 but is reset to
    # len(bw)-1 below -- kept exactly as found; confirm which is intended.
    hi = len(bw)-2
    restarts = 0
    bounds = []

    for symbol in s:
        lo = C[symbol] + Oprime[symbol][lo-1] + 1
        hi = C[symbol] + Oprime[symbol][hi]
        if lo > hi:
            # Empty interval: at least one more difference is required.
            lo, hi = 1, len(bw)-1
            restarts += 1
        bounds.append(restarts)

    return bounds


def get_D(i):
    """Read the global D[i], treating any negative index as a bound of 0."""
    return D[i] if i >= 0 else 0


def get_O(char, index):
    """Read the global O[char][index], treating a negative index as 0."""
    return O[char][index] if index >= 0 else 0


def inexact_recursion(s, i, diff, k, l, prev_type):
    """Backward-search s[0..i] over the BWT while tolerating differences.

    `diff` is the remaining difference budget, [k, l] the current suffix
    array interval and `prev_type` the kind of the previous edit (0 match,
    1 insertion, 2 deletion, 3 mismatch, 4 start). Returns a set of
    (suffix array row, remaining budget) pairs.
    """
    global num_prunes

    # Prune when the budget is below the lower bound for this prefix.
    if diff < get_D(i):
        num_prunes += 1
        return set()

    # Whole query consumed: every row of the interval is a hit.
    if i < 0:
        return {(row, diff) for row in range(k, l+1)}

    found = set()

    # Insertion: skip s[i]; continuing an insertion costs only the extension.
    if prev_type == 1:
        found |= inexact_recursion(s, i-1, diff-gap_ext, k, l, 1)
    else:
        found |= inexact_recursion(s, i-1, diff-gap_ext-gap_open, k, l, 1)

    for char in ALPHABET:
        next_k = C[char] + get_O(char, k-1) + 1
        next_l = C[char] + get_O(char, l)
        if next_k > next_l:
            # char does not extend the current interval.
            continue

        # Deletion: consume char from the reference, stay on s[i].
        if prev_type == 2:
            found |= inexact_recursion(s, i, diff-gap_ext, next_k, next_l, 2)
        else:
            found |= inexact_recursion(s, i, diff-gap_ext-gap_open, next_k, next_l, 2)

        if char == s[i]:
            # Match
            found |= inexact_recursion(s, i-1, diff+match, next_k, next_l, 0)
        elif sub_mat:
            # Mismatch weighted by the substitution matrix
            found |= inexact_recursion(s, i-1, diff-mismatch*sub_mat[(s[i], char)],
                                       next_k, next_l, 3)
        else:
            # Mismatch with the flat penalty
            found |= inexact_recursion(s, i-1, diff-mismatch, next_k, next_l, 3)

    return found


def estimate_substitution_mat(ref, r):
    """Estimate relative substitution frequencies between ref and r.

    Every position of `ref` is compared with every position of `r`; each
    unequal pair (ref symbol, r symbol) is counted, and the counts are scaled
    so that the most frequent pair gets weight 1.0.

    Returns a dict mapping (ref symbol, r symbol) to a float in (0, 1].
    An empty dict is returned when there is no mismatching pair at all
    (either input empty, or both made of one identical symbol); previously
    this case raised ValueError from max() on an empty sequence.
    """
    mismatches = {}

    for ref_symbol in ref:
        for read_symbol in r:
            if ref_symbol != read_symbol:
                pair = (ref_symbol, read_symbol)
                mismatches[pair] = mismatches.get(pair, 0) + 1

    if not mismatches:
        # Nothing to scale; an empty matrix means "no weighting" to callers.
        return mismatches

    scale = max(mismatches.values())
    return {pair: float(count)/scale for pair, count in mismatches.items()}

def rank(bw):
    """Running occurrence counts of every alphabet symbol along bw.

    Returns (ranks, totals): ranks[c][i] is the number of occurrences of c
    in bw[:i+1], and totals[c] is the count over all of bw. The terminator
    symbol is not counted.
    """
    totals = {symbol: 0 for symbol in ALPHABET if symbol != MAPPING_DOLLAR}
    ranks = {symbol: [] for symbol in totals}

    for symbol in bw:
        if symbol != MAPPING_DOLLAR:  # '$'
            totals[symbol] += 1
        # Snapshot the running count of every symbol at this position.
        for key, seen in totals.items():
            ranks[key].append(seen)

    return ranks, totals

def inexact_search(bw, bwr, s, diff):
    """Find suffix array rows where s matches with a budget of `diff`.

    Rebuilds the global tables C, O and D from the BWT `bw` and the BWT of
    the reversed reference `bwr`, runs the recursive search and returns a
    list of (row, remaining budget) pairs sorted by remaining budget,
    highest first. Each row appears once, with its highest remaining budget.
    """
    global C, O, D, num_prunes

    # Occurrence tables for the forward and the reversed reference.
    O, symbol_totals = rank(bw)
    Oprime, _ = rank(bwr)

    # C[a]: number of reference symbols that sort before a.
    C = compute_C(symbol_totals)

    # D[i]: lower bound on the differences needed for s[0..i].
    D = compute_D(s, C, Oprime, bw)

    candidates = inexact_recursion(s, len(s)-1, diff, 0, len(bw)-1, 4)

    best_per_row = {}
    for row, remaining in candidates:
        if row not in best_per_row:
            best_per_row[row] = remaining
        elif best_per_row[row] < remaining:
            # Same row reached with more budget left: keep the better one.
            best_per_row[row] = remaining
            num_prunes += 1

    return sorted(best_per_row.items(), key=lambda entry: entry[1], reverse=True)


def best_match_position(bw, bwr, s, diff, sa):
    """Return (sa[row] + 1, score) for the top inexact hit, or (-1, -1) if none."""
    ranked_hits = inexact_search(bw, bwr, s, diff)
    if not ranked_hits:
        return -1, -1
    top_row, top_score = ranked_hits[0]
    return sa[top_row]+1, top_score

def normalize_pos(pos, len):
    """Clamp pos into the closed range [0, len]."""
    # `len` shadows the builtin here; the parameter name is kept for callers.
    if 0 > pos:
        pos = 0
    if len < pos:
        return len
    return pos


def generate_mask(
    kmer_len: int,
) -> int:
    """Return the bit mask used when hashing k-mers of length kmer_len.

    The table for all lengths 0..MAX_KMER_SIZE is built once, on first use,
    and cached in the global MASKS. The mask for length n has its lowest
    2*n + 4 bits set.
    """
    global MASKS
    if not MASKS:
        MASKS = {
            size: (1 << (2 * size + 4)) - 1
            for size in range(MAX_KMER_SIZE+1)
        }
    return MASKS[kmer_len]


def get_minimizers(
    seq_arr,
    kmer_len,
    window_len,
):
    """Index the window minimizers of an encoded sequence.

    Rolling k-mer hashes are computed over seq_arr, the smallest hash of
    every window of `window_len` consecutive hashes is selected, and the
    selections are grouped by hash value.

    Returns a dict mapping hash value -> array of positions at which that
    hash was selected (empty dict when nothing was selected).
    """
    n_symbols = len(seq_arr)
    mask = generate_mask(kmer_len)

    # Rolling hash: shift in the next symbol and keep only the masked bits.
    def _extend(prev_hash, symbol):
        return ((prev_hash << 2) | symbol) & mask

    rolling = np.frompyfunc(_extend, 2, 1)

    hashes = rolling.accumulate(seq_arr, dtype=object).astype(int)
    hashes[:kmer_len-2] = 0
    del seq_arr

    # Absolute position of the minimum inside every sliding window.
    windows = sliding_window_view(hashes, window_shape=window_len)
    window_starts = np.arange(0, n_symbols - window_len + 1)
    min_positions = np.add(np.argmin(windows, axis=1), window_starts)

    # (hash, position) rows; the first kmer_len windows are discarded.
    table = np.column_stack((hashes[min_positions], min_positions))[kmer_len:]
    del min_positions
    del hashes

    # Sort by hash and drop duplicate rows.
    table = table[table[:, 0].argsort()]
    table = np.unique(table, axis=0)

    # Group positions by hash: split where a new hash value starts.
    group_starts = np.unique(table[:, 0], return_index=True)[1][1:]
    position_groups = np.split(table[:, 1], group_starts)

    if len(table) == 0:
        # No minimizers were selected.
        return dict()

    group_keys = chain([table[0, 0]], table[group_starts, 0])
    return dict(zip(group_keys, position_groups))


def cartesian_product(*arrays):
    """Return all combinations of the given 1-D arrays, one per row.

    The result has shape (product of lengths, number of arrays); the last
    array varies fastest.
    """
    n_axes = len(arrays)
    common_dtype = np.result_type(*arrays)
    grid_shape = [len(axis_values) for axis_values in arrays]
    grid = np.empty(grid_shape + [n_axes], dtype=common_dtype)
    # np.ix_ yields one broadcastable view per input; broadcasting fills
    # the matching slot of every grid cell.
    for axis, open_mesh in enumerate(np.ix_(*arrays)):
        grid[..., axis] = open_mesh
    return grid.reshape(-1, n_axes)

def run_aligner_pipeline(
    reference_file_path: str,
    reads_file_path: str,
    output_file_path: str,
    kmer_len: int,
    window_len: int,
):
    """Index a reference FASTA by minimizers and map reads against it.

    Steps performed, in order:

    1. Disable the garbage collector (it is triggered manually below) and
       print the command line.
    2. Load expected read coordinates from a hard-coded tab-separated file
       (read id, start, end).
    3. Read the first record of ``reference_file_path`` in chunks of at least
       ``CHUNK_SIZE`` characters. Each chunk is encoded with ``MAPPING_FN``,
       appended to ``target_seq`` and its window minimizers are merged into
       ``ref_index`` (hash -> array of absolute positions).
    4. Read ``reads_file_path`` record by record. For a processed read, shared
       minimizers are collected, the longest chain increasing in query
       position is extracted, and a window of that chain is scored to choose
       a target region.

    NOTE(review): in its current state this is debugging code. Only the read
    with id ``read_25`` is processed, and when its region is accepted the
    function stores the encoded query in ``GLO_Q`` and the target window in
    ``GLO_T`` and returns ``None`` immediately; everything after that
    ``return`` is unreachable and refers to names that are never assigned
    (``should_realign_right``, ``t_begin_pad``, ``t_end_pad``).

    Args:
        reference_file_path: FASTA file with the reference sequence.
        reads_file_path: FASTA file with the reads.
        output_file_path: File opened for writing; receives buffered result
            lines when the read loop runs to completion.
        kmer_len: K-mer length for minimizers; clamped to ``MAX_KMER_SIZE``.
        window_len: Number of consecutive k-mer hashes per minimizer window.

    Returns:
        None.
    """
    global GLO_Q
    global GLO_T
    global GLO_CHUNK
    
    gc.disable()
    #tracemalloc.start()
    np.set_printoptions(threshold=sys.maxsize)
    print(f"Invoked CLI with the following args: {' '.join(sys.argv)}")
    
    # NOTE(review): the expected-coordinates path is hard-coded and not
    # derived from the function arguments.
    expected_coords = {}
    with open('../data_big/reads20Mb.txt', mode ='r')as file:
        csvFile = csv.reader(file, delimiter='\t')
        expected_coords = {line[0]: (int(line[1]), int(line[2])) for line in csvFile}
    if kmer_len > MAX_KMER_SIZE:
        kmer_len = MAX_KMER_SIZE

    target_seq = None
    ref_loaded = False
    all_seq = ""
    all_seq_len = 0
    index_offset = 0
    CHUNK_SIZE = 1000000 # Chunk size should be around 1000000

    ref_index = dict()
    with open(reference_file_path) as ref_fh:
        # The extra ">" acts as a sentinel header so the last chunk is flushed.
        for line in chain(ref_fh, [">"]):
            if line[0] != '>':
                fasta_line = line.rstrip()
                all_seq += fasta_line
                all_seq_len  += len(fasta_line)
            # Flush when enough sequence is buffered or a header is reached.
            if (all_seq_len >= CHUNK_SIZE or line[0] == '>') and all_seq_len > 0:
                print(f"PROCESS CHUNK {all_seq_len}")
                
                # all_seq_len = 0
                # all_seq = 0

                seq_arr = MAPPING_FN(np.array(list(all_seq)))
                GLO_CHUNK = seq_arr
                if target_seq is None:
                   target_seq = seq_arr
                else:
                   target_seq = np.concatenate((target_seq, seq_arr), axis=0, dtype=np.uint8)
                del all_seq

                # Target index building
                # (same steps as get_minimizers, plus the position shift below;
                # hashes are computed per chunk, independently of earlier chunks)
                sequence_len = len(seq_arr)
                mask = generate_mask(kmer_len)

                # Function to compute kmer value based on the previous (on the left side) kmer value and new nucleotide
                uadd = np.frompyfunc(lambda x, y: ((x << 2) | y) & mask, 2, 1)

                # This computes values for kmers
                kmers = uadd.accumulate(seq_arr, dtype=object).astype(int)
                kmers[:kmer_len-2] = 0
                del seq_arr
                
                # Do sliding window and get min kmers positions
                kmers_min_pos = np.add(np.argmin(sliding_window_view(kmers, window_shape=window_len), axis=1), np.arange(0, sequence_len - window_len + 1))
                
                # Now collect all selected mimumum and kmers into single table
                selected_kmers = np.column_stack((
                    kmers[kmers_min_pos],
                    kmers_min_pos,
                    #np.ones(len(kmers_min_pos), dtype=bool)
                ))[kmer_len:]
                del kmers_min_pos
                del kmers
                gc.collect()

                # Remove duplicates
                selected_kmers = selected_kmers[selected_kmers[:, 0].argsort()]
                selected_kmers = np.unique(selected_kmers, axis=0)

                # Shift all indices according to what we loaded already
                selected_kmers[:,1] += index_offset

                # This part performs group by using the kmer value
                selected_kmers_unique_idx = np.unique(selected_kmers[:, 0], return_index=True)[1][1:]
                selected_kmers_entries_split = np.split(selected_kmers[:, 1], selected_kmers_unique_idx)

                if len(selected_kmers) > 0:
                    # We zip all kmers into a dict
                    i = 0
                    for k, v in zip(chain([selected_kmers[0, 0]], selected_kmers[selected_kmers_unique_idx, 0]), selected_kmers_entries_split):
                        i += 1
                        # TODO: REMOVE SOME FROM INDEX
                        # Thinning: once the counter reaches 20, the next hash
                        # with a single position is dropped and the counter resets.
                        if i >= 20 and len(v) == 1:
                            i = 0
                            continue
                        if k in ref_index:
                            ref_index[k] = np.concatenate((ref_index[k], v), axis=0)
                        else:
                            ref_index[k] = v
                else:
                    # If we have no minimizers we return nothing, sorry
                    pass

                index_offset += all_seq_len
                all_seq_len = 0
                all_seq = ""
                print(f"PROCESSED ENTIRE CHUNK! offset={index_offset}")
                del selected_kmers_unique_idx
                del selected_kmers_entries_split
                gc.collect()
            if line[0] == '>':
                # The first header starts the record; the second one (or the
                # sentinel) ends reading, so only the first record is indexed.
                if ref_loaded:
                    break
                ref_loaded = True
                continue

    # Counter used to run a manual gc.collect() once every 300 reads.
    gc_collect_cnt = 300
    output_buf = []
    with open(output_file_path, 'w') as output_file:
        query_id = ""
        query_seq = ""
        with open(reads_file_path) as reads_fh:
            # The extra ">" is a sentinel header that flushes the last read.
            for line in chain(reads_fh, [">"]):
                if line[0] == '>' and len(query_seq) > 0:
                    query_seq = MAPPING_FN(np.array(list(query_seq)))
                    # Process
                    if gc_collect_cnt > 299:
                       gc_collect_cnt = 0
                       gc.collect()
                    gc_collect_cnt += 1
                    
                    # if query_id not in ['read_937', 'read_961', 'read_972', 'read_96', 'read_126', 'read_394', 'read_561', 'read_693', 'read_771', 'read_794', 'read_817', 'read_903', 'read_910', 'read_937', 'read_972', 'read_961']:
                    #    continue
                    # NOTE(review): debugging filter -- only read_25 is processed.
                    if query_id == 'read_25': #int(query_id.split('_')[1]) < 100 or query_id in ['read_937', 'read_961', 'read_972', 'read_96', 'read_126', 'read_394', 'read_561', 'read_693', 'read_771', 'read_794', 'read_817', 'read_903', 'read_910', 'read_937', 'read_972', 'read_961']:
                        try:
                            max_diff = round(len(query_seq)*1.3)
                            min_index_query = get_minimizers(
                                query_seq,
                                kmer_len=kmer_len,
                                window_len=window_len,
                            )

                            # Minimizer hashes present in both the read and the reference.
                            common_kmers = []
                            for key in min_index_query:
                                if key in ref_index:
                                    common_kmers.append(key)

                            # Rows are (target position, query position); the
                            # [-1, -1] seed row is removed again after sorting.
                            matches = np.array([[-1, -1]])
                            for kmer in common_kmers:
                                kmer_entries_target, kmer_entries_query = ref_index[kmer], min_index_query[kmer]
                                matches = np.concatenate((
                                    matches,
                                    cartesian_product(
                                        kmer_entries_target,
                                        kmer_entries_query,
                                    )),
                                    axis=0,
                                )
                            matches = matches[matches[:, 0].argsort()]
                            matches = matches[1:]
                            n = len(matches)
                            
                            match_score, match_start_t, match_end_t, match_start_q, match_end_q = -max_diff, 0, 0, 0, 0

                            # print("ALL MATCH:")
                            # print(matches)
                            # print("END")

                            if n == 0:
                                pass
                            elif n == 1:
                                match_score, match_start_t, match_end_t, match_start_q, match_end_q = 0, matches[0, 0], matches[0, 0], matches[0, 1], matches[0, 1]
                            else:
                                # Longest subsequence of matches (already ordered by
                                # target position) with increasing query position,
                                # found by binary search over tail indices.
                                longest_seq_len = 0
                                parent = [999999999]*(n+1)
                                increasingSub = [999999999]*(n+1)
                                for i in range(n):
                                    start = 1
                                    end = longest_seq_len
                                    while start <= end:
                                        middle = (start + end) // 2
                                        if matches[increasingSub[middle], 1] < matches[i, 1]:
                                            start = middle + 1
                                        else:
                                            end = middle - 1
                                    parent[i] = increasingSub[start-1]
                                    increasingSub[start] = i

                                    if start > longest_seq_len:
                                        longest_seq_len = start

                                # Follow parent links back from the last chain element.
                                current_node = increasingSub[longest_seq_len]
                                q = [current_node]*longest_seq_len 
                                for j in range(longest_seq_len-1, 0, -1):
                                    current_node = parent[current_node]
                                    q[j-1] = current_node

                                lis = np.take(matches, q, axis=0)
                                # For every chain element i, binary-search the end of
                                # the window of chain elements starting at i and score it.
                                for i in range(longest_seq_len):
                                    start = i
                                    end = longest_seq_len
                                    while start <= end:
                                        middle = (start + end) // 2
                                        if middle == longest_seq_len:
                                            start = longest_seq_len
                                            break
                                        if lis[middle, 0] < lis[i, 0] + max_diff - lis[i, 1]:
                                            start = middle + 1
                                        else:
                                            end = middle - 1
                                    # Window is i till end
                                    # print(f"Start from {i} (till {start} whcih has value") #[{lis[start, 0]}, {lis[start, 1]}])
                                    estimated_matches_q = (lis[start, 1] if start < longest_seq_len else max_diff) - lis[i, 1]
                                    estimated_matches_t = (lis[start, 0] if start < longest_seq_len else lis[start-1, 0]) - lis[i, 0]
                                    score = min(estimated_matches_q, estimated_matches_t)*min(estimated_matches_q, estimated_matches_t) - np.sum(np.diff(lis[i:start, 0], axis=0))
                                    # print(lis[i:start])
                                    # print(f"LAST ELEMENT IS {lis[i:start][-1]} where start={start} and l-1={longest_seq_len-1}")
                                    # print(f"score = {score}")
                                    if score > match_score:
                                        match_end_index_pos = max(i, min(start-1, longest_seq_len-1))
                                        match_score, match_start_t, match_end_t, match_start_q, match_end_q = score, lis[i, 0], lis[match_end_index_pos, 0], lis[i, 1], lis[match_end_index_pos, 1]
                                        #print(f"ACCEPTED SCORE: {match_start_t} - {match_end_t}")
                                    if start == longest_seq_len:
                                        break

                            #print(f"SCORE: Match score is {match_score}")
                            #print(f"SCORE: Match around {match_start_t} - {match_end_t}")
                            #sys.exit(1)

                            # q_begin, q_end, t_begin, t_end, list_length

                            relative_extension = kmer_len // 2 + 1

                            if abs(match_end_t - match_start_t) > max_diff + relative_extension:
                                # FAILED MAPPING!
                                #print(f"Failed sequence, reason: {match_start_t} - {match_end_t} ({abs(match_end_t - match_start_t)})")
                                output_buf.append(f"{query_id} status=FAIL\n")
                            else:
                                # Extend the matched span to cover the whole read plus
                                # a margin, then clamp to the sequence bounds.
                                q_begin, q_end = 0, len(query_seq)
                                t_begin, t_end = match_start_t - match_start_q - relative_extension, match_end_t + (len(query_seq)-match_end_q) + relative_extension

                                q_begin, q_end = normalize_pos(q_begin, len(query_seq)), normalize_pos(q_end, len(query_seq))
                                t_begin, t_end = normalize_pos(t_begin, len(target_seq)), normalize_pos(t_end, len(target_seq))

                                realign_mode = 0
                                # t_begin_pad, t_end_pad, should_realign_right = run_match_align_bwt(
                                #     query_seq,
                                #     target_seq[t_begin:t_end],
                                # )
                                GLO_Q = query_seq
                                GLO_T = target_seq[t_begin:t_end]
                                # NOTE(review): debugging early exit. The function ends
                                # here, so output_buf is never written on this path and
                                # all code below, up to the end of this else-branch,
                                # is unreachable.
                                return None
                                    
                                # NOTE(review): should_realign_right, t_begin_pad and
                                # t_end_pad are only assigned by the commented-out call
                                # above; removing the return without restoring it
                                # would raise NameError here.
                                if should_realign_right:
                                    realign_mode = 1
                                if abs(t_end-(t_end_pad or 0)-t_begin-(t_begin_pad or 0)) > len(query_seq)*1.05:
                                    realign_mode = 2
                                    if t_begin_pad is not None:
                                        t_begin += t_begin_pad
                                    if t_end_pad is not None:
                                        t_end -= t_end_pad

                                if realign_mode > 0:
                                    t_begin_pad, t_end_pad = run_match_align_dp(
                                        target_seq[t_begin:t_end],
                                        query_seq,
                                        align_mode=realign_mode,
                                    )

                                if not should_realign_right and t_begin_pad is None:
                                    t_begin_pad = relative_extension
                                if not should_realign_right and t_end_pad is None:
                                    t_end_pad = relative_extension

                                if t_begin_pad is not None:
                                    t_begin += t_begin_pad
                                if t_end_pad is not None:
                                    t_end -= t_end_pad

                                # print("TARGET!!!!")
                                # print("".join([RR_MAPPING[i] for i in target_seq[t_begin:t_end].tolist()]))
                                # print("QUERY!!!")
                                # print("".join([RR_MAPPING[i] for i in query_seq.tolist()]))
                                #print(f"ALIGNED: {t_begin} - {t_end} (pd: {t_begin_pad}, {t_end_pad} query: {q_begin} - {q_end})")
                                # sys.exit(1)

                                # print("TARGET!!!!")
                                # print("".join([RR_MAPPING[i] for i in target_seq[t_begin:t_end].tolist()]))
                                # print("QUERY!!!")
                                # print("".join([RR_MAPPING[i] for i in query_seq.tolist()]))

                                #est_edit_dist = estimate_distance(target_seq[t_begin:t_end], query_seq) #levenshtein("".join([RR_MAPPING[i] for i in target_seq[t_begin:t_end].tolist()]), "".join([RR_MAPPING[i] for i in query_seq.tolist()]))
                                # est_edit_dist = levenshtein(
                                #    "".join([RR_MAPPING[i] for i in target_seq[t_begin:t_end].tolist()]),
                                #    "".join([RR_MAPPING[i] for i in query_seq.tolist()]),
                                #    177, 2, 2, 1
                                # )

                                # Compare against the expected coordinates when known,
                                # otherwise just record the mapped interval.
                                if query_id in expected_coords:
                                   diff_start = expected_coords[query_id][0]-t_begin
                                   diff_end = expected_coords[query_id][1]-t_end
                                   #print(f"TOTAL DIFF: {max(abs(diff_start), abs(diff_end))}")
                                   status = "OK" if max(abs(diff_start), abs(diff_end)) < 20 else "BAD"
                                   qual = "AA" if abs(diff_start)+abs(diff_end) < 10 else ("AB" if abs(diff_start)+abs(diff_end) < 20 else ("C" if max(abs(diff_start), abs(diff_end)) < 20 else "D"))
                                   #output_buf.append
                                   output_file.write(f"{query_id} status={status} qual={qual} diff=<{diff_start}, {diff_end}>  | {t_begin} {t_end} | pad: {t_begin_pad}, {t_end_pad} | {'REALIGNED'+str(realign_mode) if realign_mode != 0 else ''} \n")
                                else:
                                    output_buf.append(f"{query_id} {t_begin} {t_end}\n")
                        except Exception as e:
                            # TODO?
                            # The error is printed and then re-raised unchanged.
                            print(e)
                            raise e
                if line[0] == '>':
                    # Process end
                    # Header line: reset the buffer and remember the new read id.
                    query_seq = ""
                    query_id = line[1:].strip()
                else:
                    query_seq += line.rstrip()
        if len(output_buf) > 0:
            output_file.writelines(output_buf)
        print(f"Wrote records to {output_file_path}")
        #os._exit(0) # Faster exit than normally

# Run the aligner pipeline on the 20M reference / reads FASTA files and write
# the results to output.txt, using k-mers of length 16 and a window of length 5.
run_aligner_pipeline(
    reference_file_path="../data_big/reference20M.fasta",
    reads_file_path="../data_big/reads20Mb.fasta",
    output_file_path="output.txt",
    kmer_len=16,
    window_len=5,
)

Invoked CLI with the following args: /Users/styczynski/Library/Caches/pypoetry/virtualenvs/aadg-genomics-class-g6v9BdZu-py3.11/lib/python3.11/site-packages/ipykernel_launcher.py -f /Users/styczynski/Library/Jupyter/runtime/kernel-038f444b-18f2-47cb-b801-04020e02259a.json
PROCESS CHUNK 1000020
PROCESSED ENTIRE CHUNK! offset=1000020
PROCESS CHUNK 1000020
PROCESSED ENTIRE CHUNK! offset=2000040
PROCESS CHUNK 1000020
PROCESSED ENTIRE CHUNK! offset=3000060
PROCESS CHUNK 1000020
PROCESSED ENTIRE CHUNK! offset=4000080
PROCESS CHUNK 1000020
PROCESSED ENTIRE CHUNK! offset=5000100
PROCESS CHUNK 1000020
PROCESSED ENTIRE CHUNK! offset=6000120
PROCESS CHUNK 1000020
PROCESSED ENTIRE CHUNK! offset=7000140
PROCESS CHUNK 1000020
PROCESSED ENTIRE CHUNK! offset=8000160
PROCESS CHUNK 1000020
PROCESSED ENTIRE CHUNK! offset=9000180
PROCESS CHUNK 1000020
PROCESSED ENTIRE CHUNK! offset=10000200
PROCESS CHUNK 1000020
PROCESSED ENTIRE CHUNK! offset=11000220
PROCESS CHUNK 1000020
PROCESSED ENTIRE CHUNK! offset=12

In [290]:
len(GLO_CHUNK)

999620

In [292]:

# Used to create k-mer binary masks
MAX_KMER_SIZE = 64

# Mapping of the special character used by suffix tables and similar structures
MAPPING_DOLLAR = 0
# Mapping of nucleotides to integer codes
MAPPING = dict(
    C=1,
    A=2,
    T=3,
    G=4,
)
# Alphabet (set of nucleotide codes)
ALPHABET = set(MAPPING.values())
# Alphabet including the special character
ALPHABET_DOLLAR = set([*ALPHABET, MAPPING_DOLLAR])

# Pairs of alphabet symbols (a, b) such that a > b
# Useful for O(a^2) iteration over the alphabet
ALPHABET_LT_PAIRS = [(a, b) for a in ALPHABET for b in ALPHABET if b < a]

# Vectorized numpy function to map nucleotides from a string
# NOTE(review): `vectorize` and `uint8` are used unqualified here; presumably
# imported from numpy in an earlier cell - confirm.
MAPPING_FN = vectorize(MAPPING.get, otypes=[uint8])

# Remove every n-th k-mer from the reference index
REFERENCE_NTH_KMER_REMOVAL = 15 # was: 20
# Load the reference in chunks of this size
REFERENCE_INDEX_CHUNK_SIZE = 1000000

# Length of the k-mer used to generate (k,w)-minimizer indices
KMER_LEN = 16 # was: 16
# Length of the window used to generate (k,w)-minimizer indices
WINDOW_LEN = 8 # was: 5

# A match from the k-mer index will be resized by kmer_len times this factor
FACT_KMER_TO_RELATIVE_EXTENSION_LEN = 0.5

# Minimal length of subsequent k-mers that form a valid match
MIN_LIS_EXTENSION_WINDOW_LEN = 3
# Max distance between the starting k-mer of the match and the ending k-mer of the match
# Used when filtering the extended seed
# Should be higher than FACT_TARGET_TO_QUERY_MAX_RELATIVE_LENGTH
FACT_LIS_MAX_QUERY_DISTANCE = 1.3

# Max difference between query and target to conclude that it's a valid match
#
# if len(t_region) > len(q_region) * FACT_TARGET_TO_QUERY_MAX_RELATIVE_LENGTH:
#   invalid_match()
#
FACT_TARGET_TO_QUERY_MAX_RELATIVE_LENGTH = 1.05
# When using the BWT aligner this is the maximum extra padding that we consider valid (as a fraction of len(query))
FACT_BWT_QUERY_MAX_OFFSET = 0.04
# When using the BWT aligner this is the length of the query part that we consider for fast matching (as a fraction of len(query))
FACT_BWT_QUERY_FRAGMENT_SIZE = 0.1
# When using the BWT aligner this is the maximum number of errors that we should encounter (to speed up searching)
# The threshold is calculated as follows:
#
#    fragment_len = len(query) * FACT_BWT_QUERY_FRAGMENT_SIZE
#    max_errors = fragment_len * FACT_BWT_FRAGMENT_REL_ERRORS_THRESHOLD
#
FACT_BWT_FRAGMENT_REL_ERRORS_THRESHOLD = 0.08

# When using the DP aligner (more accurate than BWT, but also slower)
# we consider pairs (kmer_len, kmer_skip)
# kmer_len is the length of the k-mer we consider
# kmer_skip means we skip every n-th k-mer
# Those values can be significantly lower than the global KMER_LEN, because we run the DP aligner only in specific situations
# If no match is found we use the next configuration until we find anything
DP_K_STEP_SEQUENCE = [(15, 11), (10, 11), (8, 5)]
# Length of the query suffix we use for the DP aligner as a fraction of query length
FACT_DP_QUERY_SUFFIX_REL_LEN = 0.4
# For the DP aligner we set a maximum edit distance
# This value is a fraction of the length of the query
FACT_DP_QUERY_REL_MAX_E_DISTANCE = 0.1111

# Cost when a gap is opened (DP aligner uses only COST_GAP_EXTEND for gaps)
COST_GAP_OPEN = 3
# Cost when a gap is extended (DP aligner uses only COST_GAP_EXTEND for gaps)
COST_GAP_EXTEND = 1
# Cost of a mismatch
COST_MISMATCH = 1
# Cost of a match
COST_MATCH = 0

# Globals used to cache values and speed up calculations (no need to pass around many references in call frames)
# C[a] := number of lexicographically smaller letters than a in bw/reference
_global_bwt_c = {}
# O is a dictionary with keys $,A,C,G,T, and values are arrays of counts
_global_bwt_o = {}
# D[i] := lower bound on number of differences in substring s[1:i]
_global_bwt_d = []
# mask[k] := mask used to calculate hash for k-mer of length k
_global_masks: Optional[Dict[int, int]] = None

In [371]:
L = [[420197, 98], [1042765, 581], [1425562, 79], [1425564, 81], [1425565, 82], [1425566, 83], [1425569, 86], [1425570, 87], [1425757, 278], [1425762, 283], [1425768, 289], [1425770, 291], [1425789, 310], [1425839, 359], [1425842, 362], [1425865, 385], [1425867, 387], [1425869, 389], [1425874, 394], [1425875, 395], [1425944, 465], [1426498, 982], [1523483, 47], [1523516, 79], [1523573, 136], [1523651, 217], [1523654, 220], [1523655, 221], [1523656, 222], [1523794, 359], [1523797, 362], [2335456, 23], [2609553, 755], [3378065, 35], [3378069, 39], [3378071, 41], [3378075, 45], [3378077, 47], [3378082, 52], [3378085, 55], [3378113, 82], [3378117, 86], [3378118, 87], [3378210, 179], [3378240, 212], [3378245, 217], [3378248, 220], [3378249, 221], [3378250, 222], [3378256, 228], [3378257, 229], [3378263, 235], [3378264, 236], [3378268, 240], [3378319, 291], [3378320, 292], [3378323, 295], [3378325, 297], [3378329, 301], [3378333, 305], [3378336, 308], [3378338, 310], [3378343, 315], [3378345, 317], [3378364, 335], [3378388, 359], [3378391, 362], [3378396, 367], [3378399, 370], [3378400, 371], [3378403, 374], [3378408, 379], [3378482, 454], [3378487, 459], [3378493, 465], [3378524, 496], [3378530, 502], [3378533, 505], [3378539, 511], [3378540, 512], [3378542, 514], [3378547, 519], [3378550, 522], [3378551, 523], [3378558, 530], [3378562, 534], [3779543, 987], [4168612, 19], [4168616, 23], [4168621, 28], [4168623, 30], [4168773, 179], [4168803, 212], [4168808, 217], [4168811, 220], [4168812, 221], [4168813, 222], [4168820, 229], [4168906, 315], [4168951, 359], [4168959, 367], [4168962, 370], [4168963, 371], [4168966, 374], [4168986, 394], [4168987, 395], [4169045, 454], [4169050, 459], [4169056, 465], [4169096, 505], [4169102, 511], [4169103, 512], [4169120, 529], [4169125, 534], [4169130, 539], [4169134, 543], [4169157, 566], [4169158, 567], [4169228, 634], [4169262, 668], [4169264, 670], [4169292, 698], [4169294, 700], [4169295, 701], [4169316, 722], [4169319, 725], 
[4169442, 848], [4169447, 853], [4169448, 854], [4169503, 908], [4169504, 909], [4169505, 910], [4169506, 911], [4169509, 914], [4169513, 918], [4169538, 943], [4169541, 946], [4169566, 971], [4169567, 972], [4169572, 977], [4169575, 980], [4169577, 982], [4380717, 35], [4380721, 39], [4380723, 41], [4380727, 45], [4380729, 47], [4380734, 52], [4380737, 55], [4380862, 179], [4381133, 454], [4381138, 459], [4381144, 465], [4381597, 908], [4381598, 909], [4381599, 910], [4381600, 911], [4381603, 914], [4504318, 133], [4504564, 382], [4504567, 385], [4504569, 387], [4504571, 389], [4504576, 394], [4504577, 395], [4638744, 52], [4638747, 55], [4638829, 136], [4638833, 140], [5120913, 308], [5120915, 310], [5120965, 359], [5120968, 362], [5120988, 382], [5120991, 385], [5120993, 387], [5120995, 389], [5121000, 394], [5121001, 395], [5121059, 454], [5121064, 459], [5121070, 465], [5121626, 982], [5342322, 52], [5342325, 55], [5342350, 79], [5342352, 81], [5342353, 82], [5342354, 83], [5342358, 87], [5342404, 133], [5342437, 166], [5342441, 170], [5342480, 212], [5342485, 217], [5342488, 220], [5342489, 221], [5342490, 222], [5342496, 228], [5342497, 229], [5342631, 362], [5343269, 982], [5417492, 963], [6200708, 19], [6200712, 23], [6200717, 28], [6200719, 30], [6200741, 52], [6200744, 55], [6200769, 79], [6200771, 81], [6200772, 82], [6200773, 83], [6200776, 86], [6200777, 87], [6200823, 133], [6200869, 179], [6200899, 212], [6200904, 217], [6200907, 220], [6200908, 221], [6200909, 222], [6200916, 229], [6200922, 235], [6200923, 236], [6200927, 240], [6200965, 278], [6200970, 283], [6200976, 289], [6200978, 291], [6200979, 292], [6200982, 295], [6200984, 297], [6200988, 301], [6200992, 305], [6200995, 308], [6200997, 310], [6201045, 359], [6201048, 362], [6201053, 367], [6201056, 370], [6201057, 371], [6201060, 374], [6201065, 379], [6201068, 382], [6201071, 385], [6201073, 387], [6201075, 389], [6201080, 394], [6201081, 395], [6201110, 424], [6201168, 483], [6201173, 
488], [6201176, 491], [6201181, 496], [6201187, 502], [6201190, 505], [6201196, 511], [6201197, 512], [6201199, 514], [6201204, 519], [6201207, 522], [6201208, 523], [6201214, 529], [6201215, 530], [6201219, 534], [6201224, 539], [6201251, 566], [6201252, 567], [6201257, 572], [6201260, 575], [6201263, 578], [6201266, 581], [6201319, 634], [6201353, 668], [6201355, 670], [6201378, 693], [6201395, 710], [6201399, 714], [6201404, 719], [6201407, 722], [6201429, 744], [6201435, 750], [6201436, 751], [6201517, 829], [6201521, 833], [6201526, 838], [6201531, 843], [6201533, 845], [6201536, 848], [6201597, 908], [6201598, 909], [6201599, 910], [6201600, 911], [6201607, 918], [6201632, 943], [6201635, 946], [6201660, 971], [6201661, 972], [6201666, 977], [6201669, 980], [6201671, 982], [6212289, 52], [6212292, 55], [6404861, 263], [7001090, 35], [7001094, 39], [7001096, 41], [7001100, 45], [7001102, 47], [7001107, 52], [7001110, 55], [7001135, 79], [7001137, 81], [7001138, 82], [7001139, 83], [7001142, 86], [7001143, 87], [7001189, 133], [7001235, 179], [7001265, 212], [7001270, 217], [7001273, 220], [7001274, 221], [7001275, 222], [7001281, 228], [7001282, 229], [7001288, 235], [7001289, 236], [7001293, 240], [7001331, 278], [7001336, 283], [7001342, 289], [7001344, 291], [7001345, 292], [7001348, 295], [7001350, 297], [7001354, 301], [7001358, 305], [7001361, 308], [7001363, 310], [7001368, 315], [7001370, 317], [7001389, 335], [7001413, 359], [7001416, 362], [7001421, 367], [7001424, 370], [7001425, 371], [7001428, 374], [7001433, 379], [7001436, 382], [7001439, 385], [7001441, 387], [7001443, 389], [7001448, 394], [7001449, 395], [7001512, 459], [7001518, 465], [7001541, 488], [7001544, 491], [7001549, 496], [7001555, 502], [7001558, 505], [7001564, 511], [7001565, 512], [7001567, 514], [7001572, 519], [7001575, 522], [7001576, 523], [7001582, 529], [7001583, 530], [7001587, 534], [7001592, 539], [7001619, 566], [7001620, 567], [7001625, 572], [7001628, 575], 
[7001631, 578], [7001634, 581], [7001687, 634], [7001721, 668], [7001723, 670], [7001751, 698], [7001753, 700], [7001754, 701], [7001755, 702], [7001761, 708], [7001762, 709], [7001763, 710], [7001767, 714], [7001772, 719], [7001775, 722], [7001778, 725], [7001782, 729], [7001788, 735], [7001789, 736], [7001793, 740], [7001797, 744], [7001803, 750], [7001804, 751], [7001885, 829], [7001889, 833], [7001894, 838], [7001899, 843], [7001901, 845], [7001904, 848], [7001909, 853], [7001910, 854], [7001913, 857], [7001916, 860], [7001921, 865], [7001965, 908], [7001966, 909], [7001967, 910], [7001968, 911], [7001971, 914], [7001975, 918], [7002000, 943], [7002003, 946], [7002029, 972], [7550776, 454], [7550781, 459], [7550787, 465], [7550810, 488], [7550813, 491], [7550818, 496], [7550824, 502], [7550827, 505], [7550833, 511], [7550834, 512], [7550836, 514], [7550841, 519], [7550844, 522], [7550845, 523], [7550851, 529], [7550852, 530], [7550856, 534], [7550861, 539], [7550888, 566], [7550889, 567], [7550894, 572], [7550897, 575], [7550900, 578], [7550903, 581], [7550956, 634], [7612425, 496], [7692155, 133], [7692162, 140], [7692167, 145], [7692236, 217], [7692310, 291], [7692405, 385], [7692407, 387], [7692409, 389], [7692414, 394], [7692415, 395], [7693039, 982], [7694387, 982], [7829867, 41], [7829871, 45], [7829873, 47], [7836431, 35], [7836435, 39], [7836437, 41], [7836441, 45], [7836443, 47], [7836448, 52], [7836451, 55], [7836476, 79], [7836478, 81], [7836479, 82], [7836480, 83], [7836483, 86], [7836484, 87], [7836576, 179], [7836622, 228], [7836623, 229], [7836672, 278], [7836730, 335], [7836754, 359], [7836757, 362], [7836762, 367], [7836765, 370], [7836766, 371], [7836769, 374], [7836774, 379], [7836777, 382], [7836780, 385], [7836782, 387], [7836784, 389], [7836789, 394], [7836790, 395], [7837870, 502], [7837873, 505], [7838292, 911], [7838295, 914], [7838388, 982], [7993775, 19], [7993779, 23], [7993808, 52], [7993811, 55], [7993840, 83], [7993843, 86], 
[7993844, 87], [7993936, 179], [7993966, 212], [7993971, 217], [7993974, 220], [7993975, 221], [7993976, 222], [7993982, 228], [7993983, 229], [7993989, 235], [7993990, 236], [7993994, 240], [7994046, 292], [7994069, 315], [7994071, 317], [7994114, 359], [7994117, 362], [7994122, 367], [7994125, 370], [7994126, 371], [7994129, 374], [7994134, 379], [7994137, 382], [7994140, 385], [7994142, 387], [7994144, 389], [7994149, 394], [7994150, 395], [7994208, 454], [7994213, 459], [7994219, 465], [7994250, 496], [7994256, 502], [7994259, 505], [7994265, 511], [7994266, 512], [7994268, 514], [7994273, 519], [7994276, 522], [7994277, 523], [7994283, 529], [7994284, 530], [7994288, 534], [7994293, 539], [7994335, 581], [7994388, 634], [7994422, 668], [7994424, 670], [7994463, 709], [7994464, 710], [7994468, 714], [7994473, 719], [7994476, 722], [7994479, 725], [7994483, 729], [7994489, 735], [7994490, 736], [7994494, 740], [7994498, 744], [7994504, 750], [7994505, 751], [7994586, 829], [7994590, 833], [7994595, 838], [7994600, 843], [7994602, 845], [7994605, 848], [7994610, 853], [7994611, 854], [7994614, 857], [7994617, 860], [7994622, 865], [7994666, 908], [7994667, 909], [7994668, 910], [7994669, 911], [7994672, 914], [7994676, 918], [7994701, 943], [7994704, 946], [7994740, 982], [8264434, 45], [8264441, 52], [8264444, 55], [8264608, 221], [8264695, 308], [8264697, 310], [8265404, 982], [8479987, 19], [8479991, 23], [8479996, 28], [8479998, 30], [8480003, 35], [8480007, 39], [8480009, 41], [8480013, 45], [8480015, 47], [8480020, 52], [8480023, 55], [8480048, 79], [8480050, 81], [8480051, 82], [8480052, 83], [8480055, 86], [8480056, 87], [8480102, 133], [8480105, 136], [8480109, 140], [8480114, 145], [8480120, 151], [8480125, 156], [8480127, 158], [8480132, 163], [8480135, 166], [8480139, 170], [8480143, 174], [8480148, 179], [8480178, 212], [8480183, 217], [8480186, 220], [8480187, 221], [8480188, 222], [8480194, 228], [8480195, 229], [8480202, 236], [8480206, 240], 
[8480249, 283], [8480255, 289], [8480257, 291], [8480258, 292], [8480261, 295], [8480263, 297], [8480267, 301], [8480271, 305], [8480274, 308], [8480276, 310], [8480281, 315], [8480283, 317], [8480302, 335], [8480326, 359], [8480329, 362], [8480334, 367], [8480337, 370], [8480338, 371], [8480341, 374], [8480346, 379], [8480349, 382], [8480352, 385], [8480354, 387], [8480356, 389], [8480361, 394], [8480362, 395], [8480420, 454], [8480425, 459], [8480431, 465], [8480449, 483], [8480454, 488], [8480457, 491], [8480462, 496], [8480468, 502], [8480471, 505], [8480477, 511], [8480478, 512], [8480480, 514], [8480485, 519], [8480488, 522], [8480489, 523], [8480495, 529], [8480496, 530], [8480500, 534], [8480505, 539], [8480509, 543], [8480532, 566], [8480533, 567], [8480538, 572], [8480541, 575], [8480544, 578], [8480547, 581], [8480600, 634], [8480634, 668], [8480636, 670], [8480664, 698], [8480666, 700], [8480667, 701], [8480668, 702], [8480674, 708], [8480675, 709], [8480676, 710], [8480685, 719], [8480688, 722], [8480691, 725], [8480695, 729], [8480701, 735], [8480706, 740], [8480710, 744], [8480716, 750], [8480717, 751], [8480798, 829], [8480802, 833], [8480812, 843], [8480814, 845], [8480817, 848], [8480822, 853], [8480823, 854], [8480826, 857], [8480829, 860], [8480834, 865], [8480878, 908], [8480879, 909], [8480880, 910], [8480881, 911], [8480884, 914], [8480888, 918], [8480913, 943], [8480916, 946], [8480941, 971], [8480942, 972], [8480947, 977], [8480950, 980], [8480952, 982], [9026308, 539], [9205729, 751], [11031680, 28], [11031687, 35], [11031691, 39], [11031793, 140], [11031832, 179], [11031879, 229], [11031928, 278], [11031933, 283], [11031939, 289], [11031941, 291], [11031958, 308], [11031960, 310], [11032040, 389], [11032045, 394], [11032046, 395], [11244038, 359], [11244041, 362], [11775006, 126], [12008589, 19], [12008593, 23], [12008598, 28], [12008600, 30], [12008605, 35], [12008609, 39], [12008611, 41], [12008615, 45], [12008617, 47], [12008622, 52], 
[12008625, 55], [12008657, 86], [12008658, 87], [12008704, 133], [12008767, 212], [12008772, 217], [12008775, 220], [12008776, 221], [12008777, 222], [12008783, 228], [12008784, 229], [12008790, 235], [12008791, 236], [12008795, 240], [12008833, 278], [12008838, 283], [12008844, 289], [12008846, 291], [12008863, 308], [12008865, 310], [12008870, 315], [12008872, 317], [12008891, 335], [12008923, 367], [12008926, 370], [12008927, 371], [12008930, 374], [12008935, 379], [12009009, 454], [12009078, 523], [12009084, 529], [12009085, 530], [12009089, 534], [12009094, 539], [12009098, 543], [12009121, 566], [12009122, 567], [12009127, 572], [12009130, 575], [12009133, 578], [12009136, 581], [12009189, 634], [12009223, 668], [12009225, 670], [12009253, 698], [12009255, 700], [12009256, 701], [12009257, 702], [12009263, 708], [12009264, 709], [12009265, 710], [12009269, 714], [12009274, 719], [12009277, 722], [12009280, 725], [12009284, 729], [12009290, 735], [12009291, 736], [12009295, 740], [12009299, 744], [12009305, 750], [12009401, 843], [12009403, 845], [12009467, 908], [12009468, 909], [12009469, 910], [12009470, 911], [12009473, 914], [12009477, 918], [12009502, 943], [12009505, 946], [12009531, 972], [12009536, 977], [12009539, 980], [12009541, 982], [12307579, 35], [12307585, 41], [12307589, 45], [12307591, 47], [12307596, 52], [12307599, 55], [12307624, 79], [12307626, 81], [12307627, 82], [12307628, 83], [12307631, 86], [12307632, 87], [12307678, 133], [12307734, 189], [12307738, 193], [12307754, 212], [12307759, 217], [12307762, 220], [12307763, 221], [12307764, 222], [12307770, 228], [12307771, 229], [12307852, 310], [12328011, 19], [12328015, 23], [12328020, 28], [12328022, 30], [12328027, 35], [12328031, 39], [12328033, 41], [12328037, 45], [12328039, 47], [12328044, 52], [12328047, 55], [12328072, 79], [12328074, 81], [12328075, 82], [12328076, 83], [12328079, 86], [12328080, 87], [12328126, 133], [12328197, 212], [12328202, 217], [12328205, 220], 
[12328206, 221], [12328207, 222], [12328213, 228], [12328214, 229], [12328220, 235], [12328221, 236], [12328225, 240], [12328300, 315], [12328302, 317], [12328321, 335], [12328345, 359], [12328348, 362], [12328353, 367], [12328371, 385], [12328373, 387], [12328375, 389], [12328380, 394], [12328381, 395], [12328439, 454], [12328444, 459], [12328450, 465], [12328476, 491], [12328481, 496], [12328487, 502], [12328490, 505], [12328496, 511], [12328497, 512], [12328514, 529], [12328551, 566], [12328552, 567], [12328557, 572], [12328560, 575], [12328563, 578], [12328566, 581], [12328618, 634], [12328652, 668], [12328654, 670], [12328703, 719], [12328706, 722], [12328709, 725], [12328713, 729], [12328719, 735], [12328720, 736], [12328724, 740], [12328728, 744], [12328734, 750], [12328735, 751], [12328830, 843], [12328832, 845], [12328835, 848], [12328840, 853], [12328841, 854], [12328896, 908], [12328897, 909], [12328898, 910], [12328899, 911], [12328902, 914], [12328906, 918], [12328931, 943], [12328934, 946], [12328959, 971], [12328960, 972], [12328965, 977], [12328968, 980], [12328970, 982], [12685909, 19], [12685913, 23], [12685918, 28], [12685920, 30], [12685925, 35], [12685929, 39], [12685931, 41], [12685935, 45], [12685937, 47], [12685942, 52], [12685945, 55], [12685970, 79], [12685972, 81], [12685973, 82], [12685974, 83], [12685977, 86], [12685978, 87], [12686024, 133], [12686100, 212], [12686105, 217], [12686108, 220], [12686109, 221], [12686110, 222], [12686116, 228], [12686117, 229], [12686123, 235], [12686124, 236], [12686128, 240], [12686166, 278], [12686171, 283], [12686177, 289], [12686179, 291], [12686180, 292], [12686183, 295], [12686185, 297], [12686189, 301], [12686193, 305], [12686196, 308], [12686198, 310], [12686203, 315], [12686205, 317], [12686224, 335], [12686248, 359], [12686251, 362], [12686256, 367], [12686259, 370], [12686260, 371], [12686263, 374], [12686268, 379], [12686271, 382], [12686274, 385], [12686276, 387], [12686278, 389], [12686283, 
394], [12686284, 395], [12686342, 454], [12686347, 459], [12686353, 465], [12686371, 483], [12686376, 488], [12686379, 491], [12686384, 496], [12686390, 502], [12686393, 505], [12686399, 511], [12686400, 512], [12686402, 514], [12686407, 519], [12686410, 522], [12686411, 523], [12686417, 529], [12686418, 530], [12686422, 534], [12686427, 539], [12686454, 566], [12686455, 567], [12686460, 572], [12686463, 575], [12686466, 578], [12686469, 581], [12686522, 634], [12686556, 668], [12686558, 670], [12686586, 698], [12686588, 700], [12686589, 701], [12686590, 702], [12686596, 708], [12686597, 709], [12686598, 710], [12686602, 714], [12686607, 719], [12686610, 722], [12686613, 725], [12686617, 729], [12686623, 735], [12686624, 736], [12686628, 740], [12686632, 744], [12686638, 750], [12686639, 751], [12686720, 829], [12686724, 833], [12686729, 838], [12686734, 843], [12686736, 845], [12686739, 848], [12686744, 853], [12686745, 854], [12686748, 857], [12686751, 860], [12686756, 865], [12686800, 908], [12686801, 909], [12686802, 910], [12686803, 911], [12686806, 914], [12686810, 918], [12686835, 943], [12686838, 946], [13247610, 964], [13259995, 52], [13259998, 55], [13260023, 79], [13260025, 81], [13260026, 82], [13260027, 83], [13260030, 86], [13260031, 87], [13260089, 145], [13260232, 291], [13260329, 387], [13260331, 389], [13260336, 394], [13260337, 395], [13497288, 491], [13907246, 35], [13907250, 39], [13907252, 41], [13907256, 45], [13907258, 47], [13907263, 52], [13907266, 55], [13907294, 82], [13907295, 83], [13907298, 86], [13907299, 87], [13907345, 133], [13907352, 140], [13907391, 179], [13907776, 454], [14135629, 19], [14135633, 23], [14135638, 28], [14135640, 30], [14135645, 35], [14135649, 39], [14135651, 41], [14135655, 45], [14135657, 47], [14135662, 52], [14135665, 55], [14135751, 145], [14135785, 179], [14135815, 212], [14346420, 19], [14346424, 23], [14346429, 28], [14346431, 30], [14346436, 35], [14346440, 39], [14346442, 41], [14346446, 45], 
[14346448, 47], [14346453, 52], [14346456, 55], [14346483, 81], [14346488, 86], [14346489, 87], [14346581, 179], [14346611, 212], [14346616, 217], [14346619, 220], [14346620, 221], [14346621, 222], [14346627, 228], [14346628, 229], [14346634, 235], [14346635, 236], [14346639, 240], [14346677, 278], [14346735, 335], [14346759, 359], [14346762, 362], [14346767, 367], [14346770, 370], [14346771, 371], [14346774, 374], [14346779, 379], [14346782, 382], [14346785, 385], [14346787, 387], [14346789, 389], [14346794, 394], [14346795, 395], [14346853, 454], [14346890, 491], [14346895, 496], [14346901, 502], [14346904, 505], [14346965, 566], [14347069, 670], [14347100, 701], [14347245, 843], [14347247, 845], [14347346, 943], [14347349, 946], [14347374, 971], [14347375, 972], [14347380, 977], [14347383, 980], [14347385, 982], [14349929, 133], [14349936, 140], [14350010, 217], [14350013, 220], [14350014, 221], [14350015, 222], [14350244, 454], [14350249, 459], [14350255, 465], [14350811, 982], [15611110, 52], [15611113, 55], [15737333, 607], [16672756, 52], [16672759, 55], [16672845, 140], [16672871, 166], [16672875, 170], [16672879, 174], [16672884, 179], [16672914, 212], [16672919, 217], [16673010, 308], [16673012, 310], [16673062, 359], [16673097, 394], [16673156, 454], [16673161, 459], [16673167, 465], [16673204, 502], [16673207, 505], [16673232, 530], [16673268, 566], [16673269, 567], [16792324, 28], [16792331, 35], [16792351, 55], [16792379, 82], [16792380, 83], [16792383, 86], [16792384, 87], [16792476, 179], [16792517, 221], [16792518, 222], [16792524, 228], [16792525, 229], [16792531, 235], [16792532, 236], [16792536, 240], [16792574, 278], [16792579, 283], [16792585, 289], [16792587, 291], [16792588, 292], [16792684, 387], [16792686, 389], [16792691, 394], [16792761, 465], [16792798, 502], [16792801, 505], [16792818, 522], [16792825, 529], [16792826, 530], [16793208, 909], [16793209, 910], [16793210, 911], [16793213, 914], [16793271, 972], [16793276, 977], [16793279, 
980], [16793281, 982], [16999680, 28], [16999682, 30], [16999687, 35], [16999691, 39], [16999693, 41], [16999697, 45], [16999699, 47], [16999704, 52], [16999707, 55], [16999732, 79], [16999734, 81], [16999735, 82], [16999736, 83], [16999739, 86], [16999740, 87], [16999832, 179], [16999862, 212], [16999867, 217], [16999870, 220], [16999871, 221], [16999872, 222], [16999878, 228], [16999879, 229], [16999885, 235], [16999886, 236], [16999890, 240], [16999945, 295], [16999947, 297], [16999951, 301], [16999955, 305], [16999958, 308], [16999960, 310], [16999965, 315], [16999967, 317], [16999986, 335], [17000021, 370], [17000022, 371], [17000025, 374], [17000030, 379], [17000033, 382], [17000036, 385], [17000038, 387], [17000040, 389], [17000045, 394], [17000046, 395], [17000104, 454], [17000109, 459], [17000115, 465], [17000141, 491], [17000146, 496], [17000152, 502], [17000155, 505], [17000161, 511], [17000180, 530], [17000184, 534], [17000189, 539], [17000193, 543], [17000216, 566], [17000217, 567], [17000284, 634], [17000360, 710], [17000364, 714], [17000369, 719], [17000372, 722], [17000375, 725], [17000496, 843], [17000501, 848], [17000560, 908], [17000561, 909], [17000562, 910], [17000563, 911], [17000566, 914], [17000570, 918], [17000595, 943], [17000598, 946], [17000623, 971], [17000624, 972], [17000629, 977], [17000632, 980], [17000634, 982], [18640119, 19], [18640123, 23], [18640128, 28], [18640130, 30], [18640152, 52], [18640155, 55], [18640182, 81], [18640187, 86], [18640188, 87], [18640280, 179], [18640310, 212], [18640315, 217], [18640318, 220], [18640319, 221], [18640320, 222], [18640326, 228], [18640327, 229], [18640333, 235], [18640334, 236], [18640338, 240], [18640395, 297], [18640399, 301], [18640403, 305], [18640406, 308], [18640408, 310], [18640434, 335], [18640458, 359], [18640461, 362], [18640466, 367], [18640488, 389], [18640493, 394], [18640494, 395], [18640552, 454], [18640557, 459], [18640563, 465], [18640594, 496], [18640600, 502], [18640603, 
505], [18640620, 522], [18640627, 529], [18640628, 530], [18640664, 566], [18640665, 567], [18640768, 670], [18640949, 848], [18641074, 972], [18641079, 977], [18641082, 980], [18641084, 982], [18746097, 19], [18746119, 41], [18746123, 45], [18746125, 47], [18746130, 52], [18746133, 55], [18746160, 81], [18746165, 86], [18746166, 87], [18746212, 133], [18746215, 136], [18746219, 140], [18746224, 145], [18746258, 179], [18746288, 212], [18746305, 229], [18746312, 236], [18746316, 240], [18746365, 289], [18746367, 291], [18746384, 308], [18746386, 310], [18746412, 335], [18746447, 370], [18746448, 371], [18746451, 374], [18746456, 379], [18746459, 382], [18746462, 385], [18746464, 387], [18746466, 389], [18746471, 394], [18746472, 395], [18746564, 488], [18746567, 491], [18746572, 496], [18746578, 502], [18746581, 505], [18746605, 529], [18746606, 530], [18746643, 567], [18746744, 668], [18746746, 670], [18746927, 848], [18746932, 853], [18746933, 854], [18746988, 908], [18746989, 909], [18746990, 910], [18746991, 911], [18746994, 914], [18747060, 980], [18747062, 982], [18752347, 79], [18752349, 81], [18752350, 82], [18752351, 83], [18752354, 86], [18752355, 87], [18752408, 140], [18752413, 145], [18752447, 179], [18752570, 308], [18752572, 310], [18752645, 382], [18752648, 385], [18752650, 387], [18752758, 496], [18816456, 136], [18816460, 140], [18820761, 45], [18820763, 47], [18820768, 52], [18820771, 55], [18820799, 81], [18820800, 82], [18820801, 83], [18820804, 86], [18820805, 87], [18820851, 133], [18820858, 140], [18820863, 145], [18821006, 291], [18821023, 308], [18821025, 310], [18821075, 359], [18821167, 454], [18821172, 459], [18821178, 465], [18834865, 28], [18834867, 30], [18834872, 35], [18834876, 39], [18834878, 41], [18834882, 45], [18834884, 47], [18834889, 52], [18834892, 55], [18834917, 79], [18834919, 81], [18834920, 82], [18834921, 83], [18834924, 86], [18834925, 87], [19046551, 35], [19046555, 39], [19046557, 41], [19046561, 45], [19046563, 
47], [19046568, 52], [19046596, 79], [19046599, 82], [19046603, 86], [19046604, 87], [19046696, 179], [19046743, 229], [19046850, 335], [19046874, 359], [19046877, 362], [19046980, 465], [19047435, 908], [19047534, 982]]

In [372]:
len(L)

1482

In [373]:
from sys import maxsize


# LIS-style pass over the rows of L: chain rows so that column 1 strictly
# increases from each row to its successor, then gather the chain into `lis`.
# The later scoring cell unpacks such rows as (t, q), so column 0 is presumably
# a target position and column 1 a query position -- TODO confirm.
matches = L
n = len(matches)
# Distance budget; FACT_LIS_MAX_QUERY_DISTANCE is defined outside this cell.
# NOTE(review): 1001 looks like a hard-coded query length -- confirm.
max_diff = round(1001*FACT_LIS_MAX_QUERY_DISTANCE)

longest_seq_len = 0
# parent[i]: index of the row preceding row i in its chain (maxsize = no parent).
parent = [maxsize]*(n+1)
# increasingSub[k]: index of the row currently ending a chain of length k.
# Slot 0 is never written, so it keeps the maxsize sentinel.
increasingSub = [maxsize]*(n+1)
for i in range(n):
    # Binary search over chain lengths 1..longest_seq_len for the slot row i takes.
    start = 1
    end = longest_seq_len
    while start <= end:
        middle = (start + end) // 2
        # Move left when the tail at `middle` has a column-1 value that is not
        # smaller than row i's, or when the tail at `start` is more than
        # max_diff behind row i in column 0.
        # NOTE(review): the distance test indexes with `start`, not `middle`;
        # confirm this is intended.
        if matches[increasingSub[middle]][1] >= matches[i][1] or matches[increasingSub[start]][0] + max_diff < matches[i][0]:
            end = middle - 1
        else:
            start = middle + 1    
    # Crawl
    # while start > 1:
    #     if increasingSub[start] == maxsize or matches[increasingSub[start]][0] + max_diff > matches[i][0]:
    #         break
    #     start -= 1
    # Row i becomes the tail of the length-`start` chain; its predecessor is the
    # current tail of the length-(start-1) chain (the sentinel when start == 1).
    parent[i] = increasingSub[start-1]
    increasingSub[start] = i
    if start > longest_seq_len:
        longest_seq_len = start

# Walk the parent links back from the tail of the longest chain, filling q from
# its last slot to its first so q lists the row indices in chain order.
current_node = increasingSub[longest_seq_len]
q = [current_node]*longest_seq_len 
for j in range(longest_seq_len-1, 0, -1):
    current_node = parent[current_node]
    q[j-1] = current_node

# `take` is not defined in this cell (presumably numpy.take -- confirm); the
# displayed result is a 2-D array holding the rows of `matches` selected by q.
lis = take(matches, q, axis=0)

In [374]:
# Display the rows of `matches` that were gathered into `lis`.
lis

array([[ 8479987,       19],
       [ 8479991,       23],
       [ 8479996,       28],
       [ 8479998,       30],
       [ 8480003,       35],
       [ 8480007,       39],
       [ 8480009,       41],
       [ 8480013,       45],
       [ 8480015,       47],
       [ 8480020,       52],
       [ 8480023,       55],
       [ 8480048,       79],
       [ 8480050,       81],
       [ 8480051,       82],
       [ 8480052,       83],
       [ 8480055,       86],
       [ 8480056,       87],
       [ 8480102,      133],
       [ 8480105,      136],
       [ 8480109,      140],
       [ 8480114,      145],
       [ 8480120,      151],
       [ 8480125,      156],
       [ 8480127,      158],
       [ 8480132,      163],
       [ 8480135,      166],
       [ 8480139,      170],
       [ 8480143,      174],
       [12307738,      193],
       [12307754,      212],
       [12307759,      217],
       [12307762,      220],
       [12307763,      221],
       [12686110,      222],
       [126861

In [411]:
# Score windows of the increasing subsequence `lis` and remember the best one.
#
# Reads (defined in other cells): lis, longest_seq_len, max_diff,
# MIN_LIS_EXTENSION_WINDOW_LEN, KMER_LEN.
# Writes: match_score and the match_start_*/match_end_* coordinates of the best
# window, lis_accepted, and `scores` (the three largest local-maximum scores).
# NOTE(review): match_start_*/match_end_*/lis_accepted are only assigned when
# some window beats the initial match_score -- confirm callers handle that.
match_score = -max_diff
rekt = None
scores = []
# Sliding triple of the last three window scores, used to spot local maxima.
score_1, score_2, score_3 = -max_diff, -max_diff, -max_diff
for i in range(longest_seq_len):
    # Binary search for the first row at or after i whose column 0 reaches
    # lis[i, 0] + max_diff - lis[i, 1]; `start` ends up as that row index, or
    # as longest_seq_len when no such row exists.
    start = i
    end = longest_seq_len
    while start <= end:
        middle = (start + end) // 2
        if middle == longest_seq_len:
            # One past the last row: stop before indexing out of bounds.
            start = longest_seq_len
            break
        if lis[middle, 0] < lis[i, 0] + max_diff - lis[i, 1]:
            start = middle + 1
        else:
            end = middle - 1
    # The window is the slice lis[i:start].
    # NOTE(review): `end - i` equals len(window) - 1 when the search ends
    # normally but len(window) when it exits through the break above; confirm
    # which length the threshold is meant to test.
    lis_ext_window_len = end - i
    if lis_ext_window_len > MIN_LIS_EXTENSION_WINDOW_LEN:
        window_src = lis[i:start, :].tolist()
        # Thin the window: drop a row when it is closer than KMER_LEN to the
        # last kept row in both columns. Iterating the rows directly keeps the
        # outer loop index `i` intact (the index-based loop used to reuse it).
        window = [window_src[0]]
        diff_sum = 0
        for t2, q2 in window_src[1:]:
            t1, q1 = window[-1]
            if t2-t1 < KMER_LEN and q2-q1 < KMER_LEN:
                continue
            diff_sum += t2-t1
            window.append([t2, q2])

        # Span covered by the thinned window in column 1 (q) and column 0 (t).
        estimated_matches_q = window[-1][1] - window[0][1]
        estimated_matches_t = window[-1][0] - window[0][0]
        score = (min(estimated_matches_q, estimated_matches_t) - diff_sum/KMER_LEN)/1000

        # Shift the score history; score_2 is now the previous window's score.
        score_1 = score_2
        score_2 = score_3
        score_3 = score

        if score_2 > score_1 and score_2 > score_3:
            # Local maximum
            scores.append(score_2)

        if score > match_score:
            match_score = score
            match_start_t, match_end_t = window[0][0], window[-1][0]
            match_start_q, match_end_q = window[0][1], window[-1][1]
            lis_accepted = True
            print(f"score={match_score} start={match_start_t}")
        if start == longest_seq_len:
            # This window already reaches the end of lis; stop scanning.
            break

# Keep only the three best local maxima, best first.
scores = sorted(scores, reverse=True)[:3]

score=0.1415 start=8479987
score=0.642875 start=12686110


In [412]:
# Display the (up to three) largest local-maximum window scores, best first.
scores

[0.642875, 0.1415, 0.035625]