# Longest Common Subsequence

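Заданы две строки s1 и s2. Требуется найти длину самой длинной общей подпоследовательности. Если у строк нет ни одного общего символа (общей является только пустая подпоследовательность), выдать 0. Реализации ниже возвращают не саму длину, а позиции совпавших символов в обеих строках; длина — это количество таких позиций.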

Подпоследовательность - это строка, сгенерированная из исходной строки путем удаления 0 или более символов и без изменения относительного порядка остальных символов.

Например, подпоследовательностями “ABC” являются “”, “A”, “B”, “C”, “AB”, “AC”, “BC” и “ABC”.

В общем случае строка длины $n$ имеет до $2^n$ различных подпоследовательностей (ровно $2^n$, если все её символы различны).

In [1]:
using Test

"Наивный" алгоритм. Время - $O(2^{\min(m,n)})$, память - $O(\min(m, n))$

In [2]:
# A Naive recursive solution of LCS problem

"""
    lcs(i, j, s1, s2)

Return `(p, q)`, where `p` and `q` are tuples of positions in `s1` and `s2`
that form a longest common subsequence of the prefixes `s1[1:i]` and `s2[1:j]`.

`s1` and `s2` are indexed as `s1[i]` / `s2[j]` with positions counted from 1,
so they must be indexable by element position (e.g. vectors of characters).
This is the naive recursion without memoization.
"""
function lcs(i, j, s1, s2)
    if i == 0 || j == 0
        # An empty prefix shares nothing with the other sequence.
        return ((), ())
    elseif s1[i] == s2[j]
        # The last elements match: extend the answer for both shorter prefixes.
        (p0, q0) = lcs(i - 1, j - 1, s1, s2)
        return ((p0..., i), (q0..., j))
    else
        # Drop the last element of either side and keep the longer answer;
        # on a tie the answer obtained by shortening `s2` wins.
        (p1, q1) = lcs(i, j - 1, s1, s2)
        (p2, q2) = lcs(i - 1, j, s1, s2)
        return length(p1) >= length(p2) ? (p1, q1) : (p2, q2)
    end
end

"""
    lcs(s1, s2)

Return the positions of a longest common subsequence of `s1` and `s2` as a pair
of tuples `(positions in s1, positions in s2)`.

Both inputs are materialized with `collect` first. For strings this makes the
positions character positions: `length` counts characters while `s[i]` uses
byte indices, so indexing a non-ASCII string directly would throw.
"""
function lcs(s1, s2)
    a1, a2 = collect(s1), collect(s2)
    return lcs(length(a1), length(a2), a1, a2)
end

lcs (generic function with 2 methods)

In [3]:
# Each expected value is a pair: (positions in s1, positions in s2) of the matched characters.
# "ABC" vs "ACD": the matched characters spell "AC".
@test lcs("ABC", "ACD") == ((1, 3), (1, 2))
# The matched characters spell "GTAB".
@test lcs("AGGTAB", "GXTXAYB") == ((3, 4, 5, 6), (1, 3, 5, 7))
# Only one character can be matched; the expected answer pairs s1[3] with s2[1] ("C").
@test lcs("ABC", "CBA") == ((3,), (1,))

[32m[1mTest Passed[22m[39m

## Мемоизация

Алгоритм с мемоизацией. Время - $O(m \cdot n)$, память - $O(m \cdot n)$.

In [4]:
# A recursive solution of LCS problem with memoization

"""
    lcs_memo(i, j, d, s1, s2)

Memoized recursion for the longest common subsequence of the prefixes
`s1[1:i]` and `s2[1:j]`.

Returns `(p, q)`: tuples of matched positions in `s1` and `s2`. `d` is a
dictionary keyed by `(i, j)`; a stored entry is returned as is, and every newly
computed result is written into `d` before returning.
"""
function lcs_memo(i, j, d, s1, s2)
    key = (i, j)
    # Reuse the answer if this pair of prefixes was already solved.
    haskey(d, key) && return d[key]

    result = if i == 0 || j == 0
        ((), ())
    elseif s1[i] == s2[j]
        (diag_p, diag_q) = lcs_memo(i - 1, j - 1, d, s1, s2)
        ((diag_p..., i), (diag_q..., j))
    else
        (left_p, left_q) = lcs_memo(i, j - 1, d, s1, s2)
        (up_p, up_q) = lcs_memo(i - 1, j, d, s1, s2)
        # On a tie the answer obtained by shortening `s2` is kept.
        length(left_p) >= length(up_p) ? (left_p, left_q) : (up_p, up_q)
    end

    d[key] = result
    return result
end

lcs_memo (generic function with 1 method)

In [5]:
"""
    lcs_memo(s1, s2)

Return the positions of a longest common subsequence of `s1` and `s2` as a pair
of tuples `(positions in s1, positions in s2)`, using a fresh memo table.

Both inputs are materialized with `collect` first. For strings this makes the
positions character positions: `length` counts characters while `s[i]` uses
byte indices, so indexing a non-ASCII string directly would throw.
"""
function lcs_memo(s1, s2)
    a1, a2 = collect(s1), collect(s2)
    d = Dict{Tuple{Int,Int},Tuple{Tuple{Vararg{Int}},Tuple{Vararg{Int}}}}()
    return lcs_memo(length(a1), length(a2), d, a1, a2)
end

lcs_memo (generic function with 2 methods)

In [6]:
# Each expected value is a pair: (positions in s1, positions in s2) of the matched characters.
# "ABC" vs "ACD": the matched characters spell "AC".
@test lcs_memo("ABC", "ACD") == ((1, 3), (1, 2))
# The matched characters spell "GTAB".
@test lcs_memo("AGGTAB", "GXTXAYB") == ((3, 4, 5, 6), (1, 3, 5, 7))
# Only one character can be matched; the expected answer pairs s1[3] with s2[1] ("C").
@test lcs_memo("ABC", "CBA") == ((3,), (1,))

[32m[1mTest Passed[22m[39m

## Подготовка к генерации специализированных программ

При генерации специализированных программ мемоизация переносится с времени исполнения на время генерации.

Чтобы не образовывались глубоко вложенные условные выражения, переупорядочим вычисления в "наивном" алгоритме. При этом получатся лишние вычисления, которые потом можно будет убрать с помощью мемоизации.

In [7]:
"""
    lcs2(i, j, s1, s2)

Return `(p, q)`: tuples of positions in `s1` and `s2` forming a longest common
subsequence of the prefixes `s1[1:i]` and `s2[1:j]`.

Unlike a recursion that first compares `s1[i]` with `s2[j]`, this variant
computes all three sub-results up front and only then chooses between them.
`s1` and `s2` must be indexable by element position counted from 1.
"""
function lcs2(i, j, s1, s2)
    local p, q
    if i == 0 || j == 0
        p = ()
        q = ()
    else
        # All three sub-problems are solved unconditionally, so the comparison
        # below never encloses a recursive call.
        (p0, q0) = lcs2(i - 1, j - 1, s1, s2)
        (p1, q1) = lcs2(i, j - 1, s1, s2)
        (p2, q2) = lcs2(i - 1, j, s1, s2)

        if s1[i] == s2[j]
            p = (p0..., i)
            q = (q0..., j)
        elseif length(p1) >= length(p2)
            # On a tie the answer obtained by shortening `s2` is kept.
            p = p1
            q = q1
        else
            p = p2
            q = q2
        end
    end
    return (p, q)
end

"""
    lcs2(s1, s2)

Return the positions of a longest common subsequence of `s1` and `s2` as a pair
of tuples `(positions in s1, positions in s2)`.

Both inputs are materialized with `collect` first. For strings this makes the
positions character positions: `length` counts characters while `s[i]` uses
byte indices, so indexing a non-ASCII string directly would throw.
"""
function lcs2(s1, s2)
    a1, a2 = collect(s1), collect(s2)
    return lcs2(length(a1), length(a2), a1, a2)
end

lcs2 (generic function with 2 methods)

In [8]:
# Each expected value is a pair: (positions in s1, positions in s2) of the matched characters.
# "ABC" vs "ACD": the matched characters spell "AC".
@test lcs2("ABC", "ACD") == ((1, 3), (1, 2))
# The matched characters spell "GTAB".
@test lcs2("AGGTAB", "GXTXAYB") == ((3, 4, 5, 6), (1, 3, 5, 7))
# Only one character can be matched; the expected answer pairs s1[3] with s2[1] ("C").
@test lcs2("ABC", "CBA") == ((3,), (1,))

[32m[1mTest Passed[22m[39m

## Специализация по `i` и `j`

- Peter Thiemann. 1999. **Combinators for program generation.** J. Funct. Program. 9, 5 (September 1999), 483–525. https://doi.org/10.1017/S0956796899003469

- Kedar Swadi, Walid Taha, Oleg Kiselyov. 2005. **Staging dynamic programming algorithms**. Unpublished manuscript (April 2005), available from: <http://www.cs.rice.edu/~taha/publications.html>.

- Yukiyoshi Kameyama, Oleg Kiselyov, and Chung-chieh Shan. 2009. **Shifting the stage: staging with delimited control.** In Proceedings of the 2009 ACM SIGPLAN workshop on Partial evaluation and program manipulation (PEPM '09). Association for Computing Machinery, New York, NY, USA, 111–120. <https://doi.org/10.1145/1480945.1480962>

- Oleg Kiselyov. 2010. **Delimited control in OCaml, abstractly and concretely: system description.** In Proceedings of the 10th international conference on Functional and Logic Programming (FLOPS'10). Springer-Verlag, Berlin, Heidelberg, 304–320. https://doi.org/10.1007/978-3-642-12251-4_22

In [9]:
using MacroTools: prettify

"Наивный" генератор. Получается одно громадное выражение, в котором есть совпадающие подвыражения.

In [10]:
"""
    lcs_gen_impl1!(es, d, i, j)

Emit code that computes the LCS positions for the prefixes of length `i` and `j`.

Statements are appended to `es`; they refer to the free variables `s1` and `s2`
and assign the variables `p_i_j` / `q_i_j`. `d` maps those variable names to the
expression that denotes their value and doubles as a generation-time memo table,
so the code for every `(i, j)` is emitted at most once.

Returns the pair of expressions denoting the result: the two variable names, or
two empty-tuple expressions when `i == 0` or `j == 0`.
"""
function lcs_gen_impl1!(es, d, i, j)
    pvar = Symbol("p_", i, "_", j)
    qvar = Symbol("q_", i, "_", j)

    # Already generated: reuse the stored expressions, emit nothing.
    haskey(d, pvar) && return (d[pvar], d[qvar])

    if i == 0 || j == 0
        # Border case: no variable is declared, the value is the literal `()`.
        d[pvar] = :(())
        d[qvar] = :(())
        return (d[pvar], d[qvar])
    end

    # The order of these calls fixes the order of the emitted statements.
    (pd, qd) = lcs_gen_impl1!(es, d, i - 1, j - 1)
    (pl, ql) = lcs_gen_impl1!(es, d, i, j - 1)
    (pu, qu) = lcs_gen_impl1!(es, d, i - 1, j)

    push!(es, :(local $pvar, $qvar))
    d[pvar] = pvar
    d[qvar] = qvar

    stmt = quote
        if (s1[$i] == s2[$j])
            $pvar = ($pd..., $i)
            $qvar = ($qd..., $j)
        else
            if length($pl) >= length($pu)
                $pvar = $pl
                $qvar = $ql
            else
                $pvar = $pu
                $qvar = $qu
            end
        end
    end
    push!(es, stmt)

    return (d[pvar], d[qvar])
end

"""
    lcs_gen_impl1(i, j)

Build the body of a function specialized for inputs of lengths `i` and `j`:
all statements produced by `lcs_gen_impl1!`, followed by a `return` of the
final `(p, q)` pair. The code refers to the free variables `s1` and `s2`.
"""
function lcs_gen_impl1(i, j)
    stmts = Expr[]
    memo = Dict{Symbol,Any}()
    (p_res, q_res) = lcs_gen_impl1!(stmts, memo, i, j)

    return quote
        $(stmts...)
        return ($p_res, $q_res)
    end
end

lcs_gen_impl1 (generic function with 1 method)

In [11]:
# Display the code generated for i = 2, j = 3, cleaned up for reading by MacroTools' `prettify`.
lcs_gen_impl1(2, 3) |> prettify

quote
    local p_1_1, q_1_1
    if s1[1] == s2[1]
        p_1_1 = (()..., 1)
        q_1_1 = (()..., 1)
    else
        if length(()) >= length(())
            p_1_1 = ()
            q_1_1 = ()
        else
            p_1_1 = ()
            q_1_1 = ()
        end
    end
    local p_1_2, q_1_2
    if s1[1] == s2[2]
        p_1_2 = (()..., 1)
        q_1_2 = (()..., 2)
    else
        if length(p_1_1) >= length(())
            p_1_2 = p_1_1
            q_1_2 = q_1_1
        else
            p_1_2 = ()
            q_1_2 = ()
        end
    end
    local p_2_1, q_2_1
    if s1[2] == s2[1]
        p_2_1 = (()..., 2)
        q_2_1 = (()..., 1)
    else
        if length(()) >= length(p_1_1)
            p_2_1 = ()
            q_2_1 = ()
        else
            p_2_1 = p_1_1
            q_2_1 = q_1_1
        end
    end
    local p_2_2, q_2_2
    if s1[2] == s2[2]
        p_2_2 = (p_1_1..., 2)
        q_2_2 = (q_1_1..., 2)
    else
        if length(p_2_1) >= length(p_1_2)
            p

In [12]:
"""
    lcs_gen1(::Val{I}, ::Val{J}, s1, s2)

Generated function: its body is the code built by `lcs_gen_impl1(I, J)`, i.e. it
is specialized for the two lengths carried in the `Val` type parameters.
"""
@generated function lcs_gen1(::Val{I}, ::Val{J}, s1, s2) where {I,J}
    return lcs_gen_impl1(I, J)
end

lcs_gen1 (generic function with 1 method)

In [13]:
"""
    lcs_gen1(s1, s2)

Return the positions of a longest common subsequence of `s1` and `s2` as a pair
of tuples, by dispatching to the method specialized for the two lengths.

Both inputs are materialized with `collect` first. The specialized code indexes
`s1[i]` / `s2[j]` with `i`, `j` up to `length`, and for strings `length` counts
characters while indexing uses byte indices, so a non-ASCII string passed
directly would throw.
"""
function lcs_gen1(s1, s2)
    a1, a2 = collect(s1), collect(s2)
    return lcs_gen1(Val(length(a1)), Val(length(a2)), a1, a2)
end

lcs_gen1 (generic function with 2 methods)

In [14]:
# Each expected value is a pair: (positions in s1, positions in s2) of the matched characters.
# "ABC" vs "ACD": the matched characters spell "AC".
@test lcs_gen1("ABC", "ACD") == ((1, 3), (1, 2))
# The longest common subsequence is “GTAB”.
@test lcs_gen1("AGGTAB", "GXTXAYB") == ((3, 4, 5, 6), (1, 3, 5, 7))
# Only one character can be matched; the expected answer pairs s1[3] with s2[1] ("C").
@test lcs_gen1("ABC", "CBA") == ((3,), (1,))

[32m[1mTest Passed[22m[39m

Теперь - оптимизируем выражения...

## Упрощение выражений через переписывание (Metatheory.jl)

In [15]:
using Metatheory, Metatheory.Rewriters

In [16]:
# Rewrite rules used to simplify the generated code; `x` and `y` are pattern variables.
opt_rules = @theory x y begin

    # A two-element tuple whose first element is the splat of the empty tuple,
    # `(()..., y)`, becomes the one-element tuple `(y,)`.
    (x, y) => :($y,) where {x==:(()...)}

    # Comparisons against the length of the empty tuple:
    # any length is >= 0, and 0 >= length(y) holds exactly when `y` is empty.
    length(x) >= length(()) --> true
    length(()) >= length(y) --> y == ()

    # A conditional whose condition is the literal `true` reduces to its first branch.
    (true ? x : y) --> x

end;

In [17]:
# Rewriting strategy: `Chain` over the rule set, applied through `Postwalk`.
# Wrapping the whole strategy in `Fixpoint` was left disabled by the author.
strategy = Postwalk ∘ Chain

"""
    opt_expr(e)

Simplify the expression `e` by applying `opt_rules` with `strategy`.
"""
function opt_expr(e)
    rewriter = strategy(opt_rules)
    return rewriter(e)
end

opt_expr (generic function with 1 method)

In [18]:
# Splatting the empty tuple disappears; splatting a non-empty tuple is left alone.
@test opt_expr(:((()..., 10))) == :((10,))
@test opt_expr(:(((1, 2)..., 10))) == :(((1, 2)..., 10))

# Length comparisons against the empty tuple are folded.
@test opt_expr(:(length((1, 2)) >= length(()))) == true
@test opt_expr(:(length(()) >= length((1, 2)))) == :((1, 2) == ())

# A conditional on the literal `true` reduces to its first branch.
@test opt_expr(:(true ? 10 : 20)) == 10

[32m[1mTest Passed[22m[39m

In [19]:
"""
    lcs_gen_impl2!(es, d, i, j)

Emit code that computes the LCS positions for the prefixes of length `i` and
`j`, passing every emitted conditional through `opt_expr` before it is stored.

Statements are appended to `es`; they refer to the free variables `s1` and `s2`
and assign the variables `p_i_j` / `q_i_j`. `d` maps those variable names to the
expression that denotes their value and doubles as a generation-time memo table,
so the code for every `(i, j)` is emitted at most once.

Returns the pair of expressions denoting the result: the two variable names, or
two empty-tuple expressions when `i == 0` or `j == 0`.
"""
function lcs_gen_impl2!(es, d, i, j)
    pvar = Symbol("p_", i, "_", j)
    qvar = Symbol("q_", i, "_", j)

    # Already generated: reuse the stored expressions, emit nothing.
    haskey(d, pvar) && return (d[pvar], d[qvar])

    if i == 0 || j == 0
        # Border case: no variable is declared, the value is the literal `()`.
        d[pvar] = :(())
        d[qvar] = :(())
        return (d[pvar], d[qvar])
    end

    # The order of these calls fixes the order of the emitted statements.
    (pd, qd) = lcs_gen_impl2!(es, d, i - 1, j - 1)
    (pl, ql) = lcs_gen_impl2!(es, d, i, j - 1)
    (pu, qu) = lcs_gen_impl2!(es, d, i - 1, j)

    push!(es, :(local $pvar, $qvar))
    d[pvar] = pvar
    d[qvar] = qvar

    raw = quote
        if (s1[$i] == s2[$j])
            $pvar = ($pd..., $i)
            $qvar = ($qd..., $j)
        else
            if length($pl) >= length($pu)
                $pvar = $pl
                $qvar = $ql
            else
                $pvar = $pu
                $qvar = $qu
            end
        end
    end
    # Simplify the freshly built statement before storing it.
    push!(es, opt_expr(raw))

    return (d[pvar], d[qvar])
end

lcs_gen_impl2! (generic function with 1 method)

In [20]:
"""
    lcs_gen_impl2(i, j)

Build the body of a function specialized for inputs of lengths `i` and `j`:
all statements produced by `lcs_gen_impl2!`, followed by a `return` of the
final `(p, q)` pair. The code refers to the free variables `s1` and `s2`.
"""
function lcs_gen_impl2(i, j)
    stmts = Expr[]
    memo = Dict{Symbol,Any}()
    (p_res, q_res) = lcs_gen_impl2!(stmts, memo, i, j)

    return quote
        $(stmts...)
        return ($p_res, $q_res)
    end
end

lcs_gen_impl2 (generic function with 1 method)

In [21]:
# Display the simplified code generated for i = 2, j = 3, cleaned up for reading by MacroTools' `prettify`.
lcs_gen_impl2(2, 3) |> prettify

quote
    local p_1_1, q_1_1
    if s1[1] == s2[1]
        p_1_1 = (1,)
        q_1_1 = (1,)
    else
        p_1_1 = ()
        q_1_1 = ()
    end
    local p_1_2, q_1_2
    if s1[1] == s2[2]
        p_1_2 = (1,)
        q_1_2 = (2,)
    else
        p_1_2 = p_1_1
        q_1_2 = q_1_1
    end
    local p_2_1, q_2_1
    if s1[2] == s2[1]
        p_2_1 = (2,)
        q_2_1 = (1,)
    else
        if p_1_1 == ()
            p_2_1 = ()
            q_2_1 = ()
        else
            p_2_1 = p_1_1
            q_2_1 = q_1_1
        end
    end
    local p_2_2, q_2_2
    if s1[2] == s2[2]
        p_2_2 = (p_1_1..., 2)
        q_2_2 = (q_1_1..., 2)
    else
        if length(p_2_1) >= length(p_1_2)
            p_2_2 = p_2_1
            q_2_2 = q_2_1
        else
            p_2_2 = p_1_2
            q_2_2 = q_1_2
        end
    end
    local p_1_3, q_1_3
    if s1[1] == s2[3]
        p_1_3 = (1,)
        q_1_3 = (3,)
    else
        p_1_3 = p_1_2
        q_1_3 = q_1_2
    end
    local p_2

In [22]:
"""
    lcs_gen2(::Val{I}, ::Val{J}, s1, s2)

Generated function: its body is the code built by `lcs_gen_impl2(I, J)`, i.e. it
is specialized for the two lengths carried in the `Val` type parameters.
"""
@generated function lcs_gen2(::Val{I}, ::Val{J}, s1, s2) where {I,J}
    return lcs_gen_impl2(I, J)
end

lcs_gen2 (generic function with 1 method)

In [23]:
"""
    lcs_gen2(s1, s2)

Return the positions of a longest common subsequence of `s1` and `s2` as a pair
of tuples, by dispatching to the method specialized for the two lengths.

Both inputs are materialized with `collect` first. The specialized code indexes
`s1[i]` / `s2[j]` with `i`, `j` up to `length`, and for strings `length` counts
characters while indexing uses byte indices, so a non-ASCII string passed
directly would throw.
"""
function lcs_gen2(s1, s2)
    a1, a2 = collect(s1), collect(s2)
    return lcs_gen2(Val(length(a1)), Val(length(a2)), a1, a2)
end

lcs_gen2 (generic function with 2 methods)

In [24]:
# Each expected value is a pair: (positions in s1, positions in s2) of the matched characters.
# "ABC" vs "ACD": the matched characters spell "AC".
@test lcs_gen2("ABC", "ACD") == ((1, 3), (1, 2))
# The longest common subsequence is “GTAB”.
@test lcs_gen2("AGGTAB", "GXTXAYB") == ((3, 4, 5, 6), (1, 3, 5, 7))
# Only one character can be matched; the expected answer pairs s1[3] with s2[1] ("C").
@test lcs_gen2("ABC", "CBA") == ((3,), (1,))

[32m[1mTest Passed[22m[39m