In [1]:
import json
with open('matches.json', 'r') as f:
    matches = json.load(f)

In [2]:
len(matches)

116

In [146]:
from sys import maxsize 
from numpy import frompyfunc, vectorize, uint8, uint32, array, concatenate, add, argmin, arange, column_stack, unique, split, empty, ix_, take


# Length of k-mer used to generate (k,w)-minimizer indices
KMER_LEN = 16 # was: 16
# Length of window to generate (k,w)-minimizer indices
WINDOW_LEN = 7 # was: 5

# Match from k-mer index with be resized by kmer_len times this factor
FACT_KMER_TO_RELATIVE_EXTENSION_LEN = 0.5

# Miminal length of subsequent k-mers in LIS (longest increasing subsequence) that form a valid match
MIN_LIS_EXTENSION_WINDOW_LEN = 3
# Mimimum score for the extension window that is considered a match
MIN_LIS_EXTENSION_WINDOW_SCORE = 0.1
# Number of locally maximum scores for extension windows that we consider
MAX_ACCEPTED_LIS_EXTENSION_WINDOWS_COUNT = 5
# Max distance between starting k-mer of the match and the ending k-mer of match
# Used when filtering the extended seed
# Should be higher than FACT_TARGET_TO_QUERY_MAX_RELATIVE_LENGTH
FACT_LIS_MAX_QUERY_DISTANCE = 1.3

def track_block(*args, **kwargs):
    pass

def track_counter(*args, **kwargs):
    pass

def get_scores_current(matches):
    query_len = 1000
    max_diff = round(query_len*FACT_LIS_MAX_QUERY_DISTANCE)
    n = len(matches)

    # HERE:
    relative_extension = round(KMER_LEN * FACT_KMER_TO_RELATIVE_EXTENSION_LEN) + 1
    lis_accepted = False
    match_start_t, match_end_t, match_start_q, match_end_q = 0, 0, 0, 0

    if n < MIN_LIS_EXTENSION_WINDOW_LEN:
        if __debug__:
            track_block('read_lis', end=True)
        pass
    else:
        longest_seq_len = 0
        parent = [maxsize]*(n+1)
        increasingSub = [maxsize]*(n+1)
        for i in range(n):
            start = 1
            end = longest_seq_len
            while start <= end:
                middle = (start + end) // 2
                if matches[increasingSub[middle]][1] >= matches[i][1] or matches[increasingSub[start]][0] + max_diff < matches[i][0]:
                    end = middle - 1
                else:
                    start = middle + 1    
            parent[i] = increasingSub[start-1]
            increasingSub[start] = i
            if start > longest_seq_len:
                longest_seq_len = start

        current_node = increasingSub[longest_seq_len]
        q = [current_node]*longest_seq_len 
        for j in range(longest_seq_len-1, 0, -1):
            current_node = parent[current_node]
            q[j-1] = current_node

        if __debug__:
            track_block('read_lis', end=True)
            track_block('read_lis_cutoff')

        lis = take(matches, q, axis=0)
        if __debug__:
            track_counter('lis_length', len(lis))
        # lis_len = len(lis)
        # if lis_len >= MIN_LIS_EXTENSION_WINDOW_LEN:
        #     match_start_t, match_end_t, match_start_q, match_end_q = lis[0, 0], lis[lis_len-1, 0], lis[0, 1], lis[lis_len-1, 1]
        #     if abs(match_end_t - match_start_t) < max_diff + relative_extension:
        #         lis_accepted = True
        # if not lis_accepted:
        #     # Backup lis!
        #     match_score = -max_diff
        #     longest_seq_len = 0
        #     parent = [maxsize]*(n+1)
        #     increasingSub = [maxsize]*(n+1)
        #     for i in range(n):
        #         start = 1
        #         end = longest_seq_len
        #         while start <= end:
        #             middle = (start + end) // 2
        #             if matches[increasingSub[middle]][1] < matches[i][1]:
        #                 start = middle + 1
        #             else:
        #                 end = middle - 1
        #         parent[i] = increasingSub[start-1]
        #         increasingSub[start] = i

        #         if start > longest_seq_len:
        #             longest_seq_len = start

        #     current_node = increasingSub[longest_seq_len]
        #     q = [current_node]*longest_seq_len 
        #     for j in range(longest_seq_len-1, 0, -1):
        #         current_node = parent[current_node]
        #         q[j-1] = current_node

        #lis = take(matches, q, axis=0)
        scores = []
        score_1, score_2, score_3 = -max_diff, -max_diff, -max_diff
        for i in range(longest_seq_len):
            start = i
            end = longest_seq_len
            while start <= end:
                middle = (start + end) // 2
                if middle == longest_seq_len:
                    start = longest_seq_len
                    break
                if lis[middle, 0] < lis[i, 0] + max_diff - lis[i, 1]:
                    start = middle + 1
                else:
                    end = middle - 1
            # Window is i till end
            # Window is i till end
            lis_ext_window_len = end - i
            if lis_ext_window_len > MIN_LIS_EXTENSION_WINDOW_LEN:
                window_src = lis[i:start, :].tolist()
                window = [window_src[0]]
                diff_sum = 0
                for i in range(1, len(window_src)):
                    t1, q1 = window[len(window)-1]
                    t2, q2 = window_src[i]
                    if t2-t1 < KMER_LEN and q2-q1 < KMER_LEN:
                        continue
                    diff_sum += t2-t1
                    window.append([t2, q2])
                #print(f"window {len(window_src)} -> {len(window)}")
                #print(window)

                
                estimated_matches_q = window[len(window)-1][1] - window[0][1] #(lis[start, 1] if start < longest_seq_len else max_diff) - lis[i, 1]
                estimated_matches_t = window[len(window)-1][0] - window[0][0] #(lis[start, 0] if start < longest_seq_len else lis[start-1, 0]) - lis[i, 0]
                score = (min(estimated_matches_q, estimated_matches_t) - diff_sum/KMER_LEN)/query_len
                print(f"score={score} diff_sum={diff_sum}")
                score_1 = score_2
                score_2 = score_3
                score_3 = score

                if score_2 > score_1 and score_2 > score_3:
                    # Local maximum
                    if score_2 > MIN_LIS_EXTENSION_WINDOW_SCORE:
                        scores.append((score_2, window_src[0][0], window_src[len(window_src)-1][0], window_src[0][1], window_src[len(window_src)-1][1]))
                # if score > match_score:
                #     match_score, match_start_t, match_end_t, match_start_q, match_end_q = score, window[0][0], window[len(window)-1][0], window[0][1], window[len(window)-1][1]
                #     lis_accepted = True

                    #print(f"score={match_score} start={match_start_t}")
                if start == longest_seq_len:
                    break
        
        if score_3 > score_2 and score_3 > score_1:
            if score_3 > MIN_LIS_EXTENSION_WINDOW_SCORE:
                scores.append((score_3, window_src[0][0], window_src[len(window_src)-1][0], window_src[0][1], window_src[len(window_src)-1][1]))
            
        scores = sorted(scores, reverse=True)[:MAX_ACCEPTED_LIS_EXTENSION_WINDOWS_COUNT]
        return scores

In [147]:
get_scores_current(matches)

score=0.8951875 diff_sum=957


[(0.8951875, 11622851, 11623815, 16, 978)]

In [116]:
from time import time_ns
from numpy import searchsorted

def doit(fn, samples=100, prepare=None):
    r = None
    samples = max(samples, 1)
    samples_t = []
    for i in range(samples):
        data = None
        if prepare:
            data = prepare()
        _start = time_ns()
        if prepare:
            r = fn(data)
        else:
            r = fn()
        _end = time_ns()
        samples_t.append(_end-_start)
    t_avg = sum(samples_t)/len(samples_t)
    print(f"Time[samples={samples}]: {t_avg//10000/100} ms")
    return r

def get_scores_np(matches):
    query_len = 1000
    max_diff = round(query_len*FACT_LIS_MAX_QUERY_DISTANCE)
    n = len(matches)

    # HERE:
    relative_extension = round(KMER_LEN * FACT_KMER_TO_RELATIVE_EXTENSION_LEN) + 1
    lis_accepted = False
    match_start_t, match_end_t, match_start_q, match_end_q = 0, 0, 0, 0

    if n < MIN_LIS_EXTENSION_WINDOW_LEN:
        if __debug__:
            track_block('read_lis', end=True)
        pass
    else:
        longest_seq_len = 0
        parent = [maxsize]*(n+1)
        increasingSub = [maxsize]*(n+1)
        for i in range(n):
            start = 1
            end = longest_seq_len
            while start <= end:
                middle = (start + end) // 2
                if matches[increasingSub[middle]][1] >= matches[i][1] or matches[increasingSub[start]][0] + max_diff < matches[i][0]:
                    end = middle - 1
                else:
                    start = middle + 1    
            parent[i] = increasingSub[start-1]
            increasingSub[start] = i
            if start > longest_seq_len:
                longest_seq_len = start

        current_node = increasingSub[longest_seq_len]
        q = [current_node]*longest_seq_len
        for j in range(longest_seq_len-1, 0, -1):
            current_node = parent[current_node]
            q[j-1] = current_node
        lis = take(matches, q, axis=0)

        
        if __debug__:
            track_block('read_lis', end=True)
            track_block('read_lis_cutoff')

        #lis = take(matches, q, axis=0)
        if __debug__:
            track_counter('lis_length', len(lis))
        # lis_len = len(lis)
        # if lis_len >= MIN_LIS_EXTENSION_WINDOW_LEN:
        #     match_start_t, match_end_t, match_start_q, match_end_q = lis[0, 0], lis[lis_len-1, 0], lis[0, 1], lis[lis_len-1, 1]
        #     if abs(match_end_t - match_start_t) < max_diff + relative_extension:
        #         lis_accepted = True
        # if not lis_accepted:
        #     # Backup lis!
        #     match_score = -max_diff
        #     longest_seq_len = 0
        #     parent = [maxsize]*(n+1)
        #     increasingSub = [maxsize]*(n+1)
        #     for i in range(n):
        #         start = 1
        #         end = longest_seq_len
        #         while start <= end:
        #             middle = (start + end) // 2
        #             if matches[increasingSub[middle]][1] < matches[i][1]:
        #                 start = middle + 1
        #             else:
        #                 end = middle - 1
        #         parent[i] = increasingSub[start-1]
        #         increasingSub[start] = i

        #         if start > longest_seq_len:
        #             longest_seq_len = start

        #     current_node = increasingSub[longest_seq_len]
        #     q = [current_node]*longest_seq_len 
        #     for j in range(longest_seq_len-1, 0, -1):
        #         current_node = parent[current_node]
        #         q[j-1] = current_node

        scores = []
        score_1, score_2, score_3 = -max_diff, -max_diff, -max_diff

        window_indices = searchsorted(lis[:, 0], lis[:, 0] + max_diff - lis[:, 1], side='right').tolist()
        
        for i in range(longest_seq_len):
            #start = i
            end = window_indices[i]
            # start = i
            # end = longest_seq_len
            # while start <= end:
            #     middle = (start + end) // 2
            #     if middle == longest_seq_len:
            #         start = longest_seq_len
            #         break
            #     if lis[middle][0] < lis[i][0] + max_diff - lis[i][1]:
            #         start = middle + 1
            #     else:
            #         end = middle - 1
            start = i
            #end = searchsorted(lis[i:, 0], lis[i, 0] + max_diff - lis[i, 1], side='right')#.tolist()
            if end - start > MIN_LIS_EXTENSION_WINDOW_LEN:
                window = lis[start:end]
                estimated_matches_q = window[len(window)-1][1] - window[0][1]
                estimated_matches_t = window[len(window)-1][0] - window[0][0]
                
                score = (min(estimated_matches_q, estimated_matches_t) - estimated_matches_q/KMER_LEN)/query_len
                score_1 = score_2
                score_2 = score_3
                score_3 = score

                if score_2 > score_1 and score_2 > score_3:
                    # Local maximum
                    if score_2 > MIN_LIS_EXTENSION_WINDOW_SCORE:
                        scores.append((score_2, window[0][0], window[len(window)-1][0], window[0][1], window[len(window)-1][1]))
                # if score > match_score:
                #     match_score, match_start_t, match_end_t, match_start_q, match_end_q = score, window[0][0], window[len(window)-1][0], window[0][1], window[len(window)-1][1]
                #     lis_accepted = True

                    #print(f"score={match_score} start={match_start_t}")
                if start == longest_seq_len:
                    break
        
        if score_3 > score_2 and score_3 > score_1:
            if score_3 > MIN_LIS_EXTENSION_WINDOW_SCORE:
                scores.append((score_3, window[0][0], window[len(window)-1][0], window[0][1], window[len(window)-1][1]))
            
        scores = sorted(scores, reverse=True)[:MAX_ACCEPTED_LIS_EXTENSION_WINDOWS_COUNT]
        return scores

doit(lambda x: get_scores_np(x), prepare=lambda: array(matches), samples=1000)

Time[samples=1000]: 0.49 ms


[(0.901875, 11622854, 11623815, 19, 978)]

In [120]:
doit(lambda: get_scores_current(matches), samples=1000)

Time[samples=1000]: 0.18 ms


[(0.8951875, 11622851, 11623815, 16, 978)]

In [164]:
from time import time_ns
from numpy import searchsorted
from bisect import bisect_right

def get_scores_pure_lists(matches):
    query_len = 1000
    max_diff = round(query_len*FACT_LIS_MAX_QUERY_DISTANCE)
    n = len(matches)

    # HERE:
    relative_extension = round(KMER_LEN * FACT_KMER_TO_RELATIVE_EXTENSION_LEN) + 1
    lis_accepted = False
    match_start_t, match_end_t, match_start_q, match_end_q = 0, 0, 0, 0

    if n < MIN_LIS_EXTENSION_WINDOW_LEN:
        if __debug__:
            track_block('read_lis', end=True)
        pass
    else:
        longest_seq_len = 0
        parent = [maxsize]*(n+1)
        increasingSub = [maxsize]*(n+1)
        for i in range(n):
            start = 1
            end = longest_seq_len
            while start <= end:
                middle = (start + end) // 2
                if matches[increasingSub[middle]][1] >= matches[i][1] or matches[increasingSub[start]][0] + max_diff < matches[i][0]:
                    end = middle - 1
                else:
                    start = middle + 1    
            parent[i] = increasingSub[start-1]
            increasingSub[start] = i
            if start > longest_seq_len:
                longest_seq_len = start

        current_node = increasingSub[longest_seq_len]
        lis_t = [matches[current_node][0]]*longest_seq_len
        lis_q = [matches[current_node][1]]*longest_seq_len
        for j in range(longest_seq_len-1, 0, -1):
            current_node = parent[current_node]
            lis_t[j-1] = matches[current_node][0]
            lis_q[j-1] = matches[current_node][1]

        
        if __debug__:
            track_block('read_lis', end=True)
            track_block('read_lis_cutoff')

        if __debug__:
            track_counter('lis_length', longest_seq_len)

        scores = []
        score_1, score_2, score_3 = -max_diff, -max_diff, -max_diff

        start = 0
        end = bisect_right(lis_t, lis_t[0] + max_diff - lis_q[0]) - 1
        spaces = 0
        for i in range(1, end+1):
            spaces += max(lis_t[i] - lis_t[i-1], KMER_LEN) - KMER_LEN
        print(f"spaces_0 = {spaces} <{start} - {end}> ({longest_seq_len})")
        for start in range(0, longest_seq_len):
            print(f"remove {start}")
            #0 1 [S]->A B
            if start+1 < longest_seq_len:
                spaces -= max(lis_t[start+1] - lis_t[start], KMER_LEN) - KMER_LEN
            new_end = bisect_right(lis_t, lis_t[start] + max_diff - lis_q[start], lo=end) - 1
            for i in range(end+1, new_end+1):
                print(f"add {i-1}")
                spaces += max(lis_t[i] - lis_t[i-1], KMER_LEN) - KMER_LEN
            end = new_end
            wnd_len = end - start + 1
            if wnd_len > MIN_LIS_EXTENSION_WINDOW_LEN:
                estimated_matches_q = lis_q[end] - lis_q[start]
                estimated_matches_t = lis_t[end] - lis_t[start]
                score = (min(estimated_matches_q, estimated_matches_t) - spaces/KMER_LEN)/query_len
                print(f"score={score} diff_sum={spaces}")
                score_1 = score_2
                score_2 = score_3
                score_3 = score

                if score_2 > score_1 and score_2 > score_3:
                    # Local maximum
                    if score_2 > MIN_LIS_EXTENSION_WINDOW_SCORE:
                        scores.append((score_2, lis_t[start], lis_t[end], lis_q[start], lis_q[end]))
                    #print(f"score={match_score} start={match_start_t}")
                if start == longest_seq_len:
                    break
        if score_3 > score_2 and score_3 > score_1:
            if score_3 > MIN_LIS_EXTENSION_WINDOW_SCORE:
                scores.append((score_3, lis_t[start], lis_t[end], lis_q[start], lis_q[end]))
            
        scores = sorted(scores, reverse=True)[:MAX_ACCEPTED_LIS_EXTENSION_WINDOWS_COUNT]
        return scores

doit(lambda: get_scores_pure_lists(matches), samples=1)

spaces_0 = 295 <0 - 105> (106)
remove 0
score=0.9435625 diff_sum=295
remove 1
score=0.9405625 diff_sum=295
remove 2
score=0.9360625 diff_sum=287
remove 3
score=0.9120625 diff_sum=287
remove 4
score=0.9100625 diff_sum=287
remove 5
score=0.9030625 diff_sum=287
remove 6
score=0.9015 diff_sum=280
remove 7
score=0.8785 diff_sum=280
remove 8
score=0.874125 diff_sum=270
remove 9
score=0.848125 diff_sum=270
remove 10
score=0.845125 diff_sum=270
remove 11
score=0.844125 diff_sum=270
remove 12
score=0.840125 diff_sum=270
remove 13
score=0.833125 diff_sum=270
remove 14
score=0.831875 diff_sum=258
remove 15
score=0.803875 diff_sum=258
remove 16
score=0.79925 diff_sum=236
remove 17
score=0.76125 diff_sum=236
remove 18
score=0.75825 diff_sum=236
remove 19
score=0.75325 diff_sum=236
remove 20
score=0.74925 diff_sum=236
remove 21
score=0.74625 diff_sum=236
remove 22
score=0.74325 diff_sum=236
remove 23
score=0.74025 diff_sum=236
remove 24
score=0.73225 diff_sum=236
remove 25
score=0.72725 diff_sum=236

[(0.9435625, 11622854, 11623815, 19, 978),
 (0.6116875, 11623240, 11623815, 405, 978),
 (0.127625, 11623750, 11623815, 913, 978)]