In [2]:
# Default scoring parameters. similarity() uses them as the default values of
# its match_reward, mismatch_penalty and indel_penalty arguments.
match, mismatch, gap = 2, -1, -2

In [3]:
# helper function: global alignment for similarity score

def lcs_backtrack(v, w, match, mismatch, indel):
    """Build the global-alignment score table for ``v`` against ``w``.

    Despite the name, no backtracking happens here: only the dynamic
    programming table is filled in and returned.

    Args:
        v: first sequence; its characters index the rows.
        w: second sequence; its characters index the columns.
        match: value added when ``v[i-1] == w[j-1]``.
        mismatch: value subtracted when the two characters differ.
        indel: value subtracted for every gap step.

    Returns:
        A list of ``len(v) + 1`` rows with ``len(w) + 1`` entries each; the
        bottom-right entry is the score of aligning all of ``v`` with all
        of ``w``.
    """
    # Row 0: a prefix of w aligned against nothing is one gap per character.
    first_row = [0]
    for _ in w:
        first_row.append(first_row[-1] - indel)

    table = [first_row]
    for row, v_char in enumerate(v, start=1):
        above = table[row - 1]
        # Column 0: a prefix of v aligned against nothing.
        current = [above[0] - indel]
        for col, w_char in enumerate(w, start=1):
            diagonal = match if v_char == w_char else -mismatch
            current.append(
                max(
                    above[col] - indel,
                    current[col - 1] - indel,
                    above[col - 1] + diagonal,
                )
            )
        table.append(current)
    return table

# Insert your global_alignment function here, along with any subroutines you need
def similarity(s, t, match_reward=match, mismatch_penalty=mismatch, indel_penalty=gap):
    """Globally align ``s`` with ``t`` and return the score and the alignment.

    Args:
        s: first string.
        t: second string.
        match_reward: score added for a pair of equal characters.
        mismatch_penalty: cost of a pair of different characters. Only the
            magnitude is used, so ``-1`` and ``1`` both mean "lose one point".
        indel_penalty: cost of one gap character. Only the magnitude is used.

    Returns:
        A tuple ``(score, aligned_s, aligned_t)``. The two aligned strings
        have equal length, use ``"-"`` for gaps, and cover all of ``s`` and
        all of ``t``.
    """
    # lcs_backtrack *subtracts* both penalties, so it must be handed
    # magnitudes; the module-level defaults are written as negative numbers
    # and would otherwise turn mismatches and gaps into rewards.
    mismatch_cost = abs(mismatch_penalty)
    indel_cost = abs(indel_penalty)

    grid = lcs_backtrack(s, t, match_reward, mismatch_cost, indel_cost)
    score = grid[-1][-1]

    i, j = len(s), len(t)
    top, bottom = [], []
    # Walk back to the origin. Continuing until BOTH indices reach zero keeps
    # the leading gap columns when one string is exhausted before the other.
    while i > 0 or j > 0:
        cell = grid[i][j]
        if j > 0 and (i == 0 or grid[i][j - 1] - indel_cost == cell):
            # Gap in s: consume one character of t.
            j -= 1
            top.append("-")
            bottom.append(t[j])
        elif i > 0 and (j == 0 or grid[i - 1][j] - indel_cost == cell):
            # Gap in t: consume one character of s.
            i -= 1
            top.append(s[i])
            bottom.append("-")
        else:
            # Neither gap move explains this cell, so it came diagonally.
            i -= 1
            j -= 1
            top.append(s[i])
            bottom.append(t[j])

    # Columns were collected from the end of the alignment backwards.
    return score, "".join(reversed(top)), "".join(reversed(bottom))


In [5]:
# helper function: splits s and t after common subsequence alignment
def split_string(s, t, string):
    """Split ``s`` and ``t`` around the first occurrence of ``string`` in each.

    Args:
        s: first string.
        t: second string.
        string: anchor substring expected to occur in both ``s`` and ``t``.

    Returns:
        ``(s_left, s_right, t_left, t_right)``: the text before and after the
        anchor in ``s``, then the text before and after the anchor in ``t``.

    Raises:
        ValueError: if ``string`` does not occur in ``s`` or in ``t``.
    """
    s_idx = s.find(string)
    t_idx = t.find(string)

    # str.find signals "not found" with -1; using that as a slice index would
    # silently return wrong pieces instead of failing.
    if s_idx < 0 or t_idx < 0:
        raise ValueError("%r must occur in both input strings" % (string,))

    width = len(string)
    return s[:s_idx], s[s_idx + width:], t[:t_idx], t[t_idx + width:]

In [None]:
# helper function: find the best shared substring of s and t
# premise
#   uni-alignments are more evolutionarily relevant
#   longer substrings tend to be evolutionarily conserved

def lcp_array(comb, suffix_a):
    """Generate the LCP (longest common prefix) array for ``comb``.

    Args:
        comb (str): the text the suffix array was built from; ``lus`` passes
            both strings joined with sentinels (``s + "$" + t + "#"``).
        suffix_a (list): suffix array of ``comb``, i.e. the start index of
            every suffix, listed in sorted suffix order.

    Returns:
        list: ``lcp`` where ``lcp[k]`` is the length of the common prefix of
        the suffixes at ``suffix_a[k]`` and ``suffix_a[k - 1]``; entry 0
        stays 0.
    """
    size = len(comb)

    # Inverse of the suffix array: position of each suffix in sorted order.
    position = [0] * size
    for order, start in enumerate(suffix_a):
        position[start] = order

    result = [0] * size
    matched = 0
    for start in range(size):
        order = position[start]
        if order == 0:
            # No predecessor in sorted order to compare against.
            continue
        neighbour = suffix_a[order - 1]
        while (
            start + matched < size
            and neighbour + matched < size
            and comb[start + matched] == comb[neighbour + matched]
        ):
            matched += 1
        result[order] = matched
        # Carry the match length, minus one, over to the next start index.
        matched = max(matched - 1, 0)
    return result


def lus(s, t, k1):
    """Pick the longest substring shared by ``s`` and ``t`` to anchor on.

    Candidates are read off the suffix array / LCP array of
    ``s + "$" + t + "#"``: a candidate is kept when two neighbouring suffixes
    start on opposite sides of the ``"$"`` and their LCP value is strictly
    larger than both neighbouring LCP values.

    Args:
        s: string S.
        t: string T.
        k1: int; the returned substring must be strictly longer than ``k1``
            (see tandem_transform_lcs, where ``k1 < k2``).

    Returns:
        The longest candidate substring. When several candidates share the
        maximum length, the one whose left and right flanks give the highest
        combined ``similarity`` score is returned (the first such one on a
        tie). Returns ``None`` when there is no candidate longer than ``k1``.
    """
    comb = s + "$" + t + "#"

    # Suffix array: start indices ordered by the suffix they begin.
    suffix_array = sorted(range(len(comb)), key=lambda start: comb[start:])
    lcp = lcp_array(comb, suffix_array)

    boundary = len(s)  # index of the "$" separator inside comb
    candidates = []
    for i in range(1, len(comb)):
        here = suffix_array[i]
        before = suffix_array[i - 1]

        # Keep only neighbours where one suffix starts in s and the other
        # starts after the "$".
        if not (here < boundary < before or before < boundary < here):
            continue
        if lcp[i] <= 0:
            continue

        prev_lcp = lcp[i - 1]
        next_lcp = lcp[i + 1] if i + 1 < len(lcp) else 0
        if lcp[i] > max(prev_lcp, next_lcp):
            candidates.append(comb[here:here + lcp[i]])

    max_len = max(map(len, candidates), default=0)
    if max_len <= k1:
        return None

    longest = None
    best_score = float("-inf")
    for candidate in candidates:
        if len(candidate) != max_len:
            continue
        sl, sr, tl, tr = split_string(s, t, candidate)
        score = similarity(sl, tl)[0] + similarity(sr, tr)[0]
        if score > best_score:
            # Remember the running best; without this update every candidate
            # would replace the previous one and the last would always win.
            best_score = score
            longest = candidate
    return longest


In [None]:
# helper function: determine search space
# all kmers in left or right edge of alignment that can lead to tandem duplication / deletion
def get_search_space_right(s_sub, t_sub, k1, k2, common_alignment):
    """Collect tandem candidates at the right edge of the shared anchor.

    For every length from ``k1`` to ``k2`` (inclusive) the trailing slice of
    ``common_alignment`` of that length is taken, spaces are stripped from
    it, and it is compared with the start of ``s_sub`` and ``t_sub``.

    Args:
        s_sub: text of s that follows the anchor.
        t_sub: text of t that follows the anchor.
        k1: smallest k-mer length to try.
        k2: largest k-mer length to try.
        common_alignment: the shared anchor string.

    Returns:
        A list of ``(kmer, tag)`` tuples: tag ``'del'`` when only ``s_sub``
        starts with the k-mer, ``'dup'`` when only ``t_sub`` does.
    """
    anchor_len = len(common_alignment)
    hits = []
    for size in range(k1, k2 + 1):
        kmer = common_alignment[anchor_len - size:].replace(" ", "")
        leads_s = s_sub.startswith(kmer)
        leads_t = t_sub.startswith(kmer)

        if leads_s and not leads_t:
            # tandem deletion
            hits.append((kmer, 'del'))
        elif leads_t and not leads_s:
            # tandem duplication
            hits.append((kmer, 'dup'))
    return hits

def get_search_space_left(s_sub, t_sub, k1, k2, common_alignment):
    """List the leading k-mers of the shared anchor.

    Args:
        s_sub: text of s that precedes the anchor (currently unused).
        t_sub: text of t that precedes the anchor (currently unused).
        k1: smallest k-mer length.
        k2: largest k-mer length.
        common_alignment: the shared anchor string.

    Returns:
        The prefixes of ``common_alignment`` of length ``k1`` through ``k2``
        inclusive, shortest first. Unlike the right-hand variant, no
        filtering or tagging is applied.
    """
    return [common_alignment[:size] for size in range(k1, k2 + 1)]
        

In [8]:
def tandem_transform_lcs(s: str, t: str, k1: int, k2: int, v: int):
    """Work-in-progress driver for the tandem-transform search.

    At the moment this only finds the anchor substring shared by ``s`` and
    ``t`` (via ``lus``), prints it, and computes the candidate k-mers at both
    edges of the anchor. The transformation steps in the TODO list below are
    not implemented, so the function always returns ``None``.

    Args:
        s: string to be transformed.
        t: target string.
        k1: smallest k-mer length; the anchor must be longer than this.
        k2: largest k-mer length.
        v: similarity threshold named in the plan below; not used yet.
    """
    # Find the best shared substring of s and t longer than k1 (greedy).
    common_alignment = lus(s, t, k1)
    print(common_alignment)

    # lus returns None when nothing longer than k1 is shared; without an
    # anchor there is nothing to split on, and split_string would fail.
    if common_alignment is None:
        return None

    # Split s and t around the anchor to get the left and right flanks.
    sl, sr, tl, tr = split_string(s, t, common_alignment)

    # Candidate k-mers at the left and right edges of the anchor. They are
    # not consumed yet; the TODO steps below are meant to use them.
    search_space_l = get_search_space_left(sl, tl, k1, k2, common_alignment)
    search_space_r = get_search_space_right(sr, tr, k1, k2, common_alignment)

    # TODO: test every candidate and keep the one giving the highest global
    #       alignment score once applied.
    # TODO: apply that transformation.
    # TODO: score the ENTIRE transformed s against t; if the global alignment
    #       passes the threshold, return the transformed s and t.
    # TODO: otherwise apply the transformation to s and recurse on the
    #       sequence before the transformation site (left flank) and on the
    #       sequence after it.
    # The end result should be a fully transformed s whose similarity score
    # against t is above the threshold v.
    return None


# Other example inputs, kept for manual experimentation:
# tandem_transform_lcs("ACCATCTT", "TACCATCATC", 2, 3, 6)
# tandem_transform_lcs("GGACACTT", "AAGGACT", 2, 3, 0)

# Demo run: prints the anchor substring found for this pair of strings.
tandem_transform_lcs(
    s="CCTAAAACTCTA",
    t="CCTAACTCTCTA",
    k1=2,
    k2=3,
    v=12,
)


AACTCT
