# Odległość edycyjna

In [1]:
import numpy as np

In [2]:
def delta1(a,b):
    """Unit substitution cost: 0 when both elements are equal, 1 otherwise."""
    return 0 if a == b else 1
    
def calc_edit_table(x,y,delta=delta1):
    """Build the dynamic-programming table of edit costs between `x` and `y`.

    Cell [r, c] holds the cost of turning the first r elements of `x` into the
    first c elements of `y`. Removing or inserting an element costs 1;
    replacing x[r-1] with y[c-1] costs delta(x[r-1], y[c-1]).

    Returns a float NumPy array of shape (len(x)+1, len(y)+1).
    """
    n_rows, n_cols = len(x) + 1, len(y) + 1
    table = np.empty((n_rows, n_cols))
    # Borders: reaching/leaving the empty sequence takes one step per element.
    table[:, 0] = np.arange(n_rows)
    table[0, :] = np.arange(n_cols)
    for row, x_item in enumerate(x, start=1):
        for col, y_item in enumerate(y, start=1):
            removal = table[row-1, col] + 1
            insertion = table[row, col-1] + 1
            substitution = table[row-1, col-1] + delta(x_item, y_item)
            table[row, col] = min(removal, insertion, substitution)
    return table

def edit_disance(a, b, delta=delta1):
    """Return the edit distance between `a` and `b` (bottom-right table cell)."""
    table = calc_edit_table(a, b, delta)
    return table[-1, -1]

In [3]:
# Sanity check: print one distance, then the full edit tables for two pairs of words.
print(edit_disance('wojtk','wjeek'))
print(calc_edit_table('wojtk','wjeek'))
print(calc_edit_table('cbabac','abcabbbaa'))

3.0
[[0. 1. 2. 3. 4. 5.]
 [1. 0. 1. 2. 3. 4.]
 [2. 1. 1. 2. 3. 4.]
 [3. 2. 1. 2. 3. 4.]
 [4. 3. 2. 2. 3. 4.]
 [5. 4. 3. 3. 3. 3.]]
[[0. 1. 2. 3. 4. 5. 6. 7. 8. 9.]
 [1. 1. 2. 2. 3. 4. 5. 6. 7. 8.]
 [2. 2. 1. 2. 3. 3. 4. 5. 6. 7.]
 [3. 2. 2. 2. 2. 3. 4. 5. 5. 6.]
 [4. 3. 2. 3. 3. 2. 3. 4. 5. 6.]
 [5. 4. 3. 3. 3. 3. 3. 4. 4. 5.]
 [6. 5. 4. 3. 4. 4. 4. 4. 5. 5.]]


In [4]:
def edit_sequence(a,b,delta=delta1):
    """Return the sequence of edit operations that turns `a` into `b`.

    The edit table is traced back from its bottom-right cell (the sink) to
    cell (0, 0), and the collected steps are returned in forward order.

    Operation codes:
        0 - diagonal step (replace a[i] with b[j], or keep it when equal),
        1 - vertical step (remove an element of `a`),
        2 - horizontal step (insert an element of `b`).

    Inside the table the neighbour with the smallest value is followed, with
    ties resolved in the order 0, 1, 2. On the first row only horizontal steps
    are possible and on the first column only vertical ones.

    NOTE(review): following the smallest neighbour is only guaranteed to trace
    an optimal path for unit substitution costs such as `delta1`; with other
    `delta` functions the chosen diagonal step may not be part of an optimal
    alignment.
    """
    edit_table = calc_edit_table(a,b,delta)
    operations = []
    i, j = edit_table.shape[0] - 1, edit_table.shape[1] - 1 # starting from sink
    while (i, j) != (0, 0):
        if i > 0 and j > 0:
            # Strict bounds matter: with i == 0 or j == 0 the index -1 would
            # silently wrap around to the last row/column of the table.
            operation = int(np.argmin([edit_table[i-1, j-1],
                                       edit_table[i-1, j],
                                       edit_table[i, j-1]]))
        elif i > 0:
            operation = 1 # first column: only removals are left
        else:
            operation = 2 # first row: only insertions are left
        if operation in (0, 1):
            i -= 1
        if operation in (0, 2):
            j -= 1
        operations.append(operation)
    return operations[::-1]

In [5]:
# Operation codes in the result: 0 - diagonal, 1 - vertical, 2 - horizontal.
edit_sequence('cbabac','abcabbbaa')

[0, 0, 2, 0, 0, 2, 2, 0, 0]

In [6]:
# Edit table for the same pair of strings, for comparison with the operation sequence.
calc_edit_table('cbabac','abcabbbaa')

array([[0., 1., 2., 3., 4., 5., 6., 7., 8., 9.],
       [1., 1., 2., 2., 3., 4., 5., 6., 7., 8.],
       [2., 2., 1., 2., 3., 3., 4., 5., 6., 7.],
       [3., 2., 2., 2., 2., 3., 4., 5., 5., 6.],
       [4., 3., 2., 3., 3., 2., 3., 4., 5., 6.],
       [5., 4., 3., 3., 3., 3., 3., 4., 4., 5.],
       [6., 5., 4., 3., 4., 4., 4., 4., 5., 5.]])

In [7]:
def show_operations(a,b,delta=delta1):
    """Print, step by step, the edits that turn `a` into `b`.

    Each printed line has the form ``<done>*<change>*<rest> | (<description>)``:
    `done` is the already produced part of `b`, `rest` is the still unprocessed
    part of `a`, and `change` is the inserted/replacing element (or ``#`` for a
    removal). Diagonal steps over equal elements are applied silently.
    """
    steps = edit_sequence(a,b,delta)

    done = ""
    rest = a
    print("starting from:", rest)
    print("operations:\n")

    def report(change, description):
        # Reads the current values of `done` and `rest` at call time.
        print(f"{done}*{change}*{rest} | ({description})")

    pos_a = 0
    pos_b = 0
    for step in steps:
        if step == 0: # diagonal (swap/nothing)
            rest = rest[1:]
            old_item, new_item = a[pos_a], b[pos_b]
            if old_item != new_item: # swap; equal elements need no edit
                report(new_item, f"swap {old_item}=>{new_item}")
            done += new_item
            pos_a += 1
            pos_b += 1
        elif step == 2: # horizontal (add from b)
            new_item = b[pos_b]
            report(new_item, f"add {new_item}")
            done += new_item
            pos_b += 1
        elif step == 1: # vertical (remove from a)
            rest = rest[1:]
            report("#", f"remove {a[pos_a]}")
            pos_a += 1
    print("\nresult:", done)

In [8]:
# Demo: print the edit steps for several pairs of strings, separated by 80-dash rules.
show_operations("aalos", "kloc")
print("-"*80)
show_operations("Łódź", "Lodz")
print("-"*80)
show_operations("kwintesencja", "quintessence")
print("-"*80)
show_operations("ATGAATCTTACCGCCTCG", "ATGAGGCTCTGGCCCCTG")

starting from: aalos
operations:

*#*alos | (remove a)
*k*los | (swap a=>k)
klo*c* | (swap s=>c)

result: kloc
--------------------------------------------------------------------------------
starting from: Łódź
operations:

*L*ódź | (swap Ł=>L)
L*o*dź | (swap ó=>o)
Lod*z* | (swap ź=>z)

result: Lodz
--------------------------------------------------------------------------------
starting from: kwintesencja
operations:

*q*wintesencja | (swap k=>q)
q*u*intesencja | (swap w=>u)
quintes*s*encja | (add s)
quintessenc*#*a | (remove j)
quintessenc*e* | (swap a=>e)

result: quintessence
--------------------------------------------------------------------------------
starting from: ATGAATCTTACCGCCTCG
operations:

ATGA*G*TCTTACCGCCTCG | (swap A=>G)
ATGAG*G*CTTACCGCCTCG | (swap T=>G)
ATGAGGCT*C*TACCGCCTCG | (add C)
ATGAGGCTCT*G*CCGCCTCG | (swap A=>G)
ATGAGGCTCTG*G*CGCCTCG | (swap C=>G)
ATGAGGCTCTGGC*C*CCTCG | (swap G=>C)
ATGAGGCTCTGGCCCCT*#*G | (remove C)

result: ATGAGGCTCTGGCCCCTG


# Najdłuższy wspólny podciąg

In [9]:
from spacy.tokenizer import Tokenizer
from spacy.lang.xx import MultiLanguage
from random import random

In [10]:
def delta_no_swap(a,b):
    """Substitution cost that forbids replacing one element with another.

    Returns 0 for equal elements and infinity otherwise, so an edit table
    built with this cost only ever uses insertions, removals and exact matches.
    """
    if a==b:
        return 0
    else:
        # `np.inf` is the canonical spelling; the `np.infty` alias was removed in NumPy 2.0.
        return np.inf

In [11]:
def lcs(a,b):
    """Return the length (as a float) of the longest common subsequence of `a` and `b`.

    Computed from the edit distance with substitutions disabled: every element
    not in the common subsequence costs exactly one insertion or removal.
    """
    combined_length = len(a) + len(b)
    indel_distance = edit_disance(a, b, delta_no_swap)
    return (combined_length - indel_distance) / 2

def find_lcs(a,b):
    """Return a longest common subsequence of `a` and `b` as a list of elements.

    The edit table built with `delta_no_swap` is traced back from its
    bottom-right cell. At each cell the neighbour with the smallest value is
    followed (ties resolved diagonal, vertical, horizontal). A diagonal step
    is taken only when it is free, i.e. a[i-1] and b[j-1] match, and that
    element is then recorded; otherwise the walk moves vertically.
    """
    table = calc_edit_table(a, b, delta=delta_no_swap)
    common = []
    i = table.shape[0] - 1
    j = table.shape[1] - 1
    # Once the first row or column is reached one of the prefixes is empty and
    # nothing more can be matched, so the walk stops there. Strict bounds also
    # keep i-1 / j-1 from becoming -1, which NumPy would silently wrap around
    # to the last row/column.
    while i > 0 and j > 0:
        diagonal = table[i-1, j-1]
        operation = np.argmin([diagonal,
                               table[i-1, j],
                               table[i, j-1]])
        if operation == 0 and table[i, j] == diagonal:
            # free diagonal step: the elements match
            common.append(a[i-1])
            i -= 1
            j -= 1
        elif operation == 2:
            j -= 1
        else:
            # vertical step, also used when the diagonal would be a swap (not allowed)
            i -= 1

    return common[::-1]

# Quick checks: strings without common characters, then the LCS length and the LCS itself.
print(find_lcs('abc','def'))
print(lcs('cbabac','abcabba'))
find_lcs('cbabac','abcabba')

[]
4.0


['c', 'a', 'b', 'a']

In [12]:
# Read the whole text into memory (relative path, resolved against the current working directory).
# NOTE(review): no explicit encoding is passed, so the platform default is used — confirm it matches the file.
with open("romeo-i-julia.txt", "r") as file:
    text = file.read()
text[:100]  # preview of the first 100 characters

'William Shakespeare\n\nRomeo i Julia\ntłum. Józef Paszkowski\n\nISBN 978-83-288-2903-9\n\n\n\nOSOBY:\n * ESKAL'

In [13]:
# Tokenize the text with a spaCy Tokenizer built on the MultiLanguage vocabulary.
tokenizer = Tokenizer(MultiLanguage().vocab)
tokens = tokenizer(text)
# Two "damaged" copies of the token list: a token is kept only when random() > 0.03,
# so each copy independently loses roughly 3% of its tokens.
# NOTE(review): no random seed is set, so the copies differ between runs.
damaged_texts = [[str(token) for token in tokens if random() > 0.03] for i in range(2)]

In [61]:
# LCS length for the first n tokens of the two damaged copies.
n=2000
lcs(damaged_texts[0][:n], damaged_texts[1][:n])

1934.0

In [62]:
# Reconstruct the subsequence itself; its length can be compared with the value returned by lcs().
result = find_lcs(damaged_texts[0][:n], damaged_texts[1][:n])
len(result)

1934

In [14]:
def show_content(files):
    """Print each file's name followed by its content framed by 40-dash rules."""
    rule = "-"*40
    for filename in files:
        with open(filename, "r") as handle:
            print(filename)
            body = handle.read()
            print(rule, "\n" + body, rule)

def diff(file_a, file_b):
    """Print a line-based diff of two text files (lines are enumerated from 0).

    The longest common subsequence of the two line lists serves as a chain of
    anchors. For every stretch of lines between consecutive anchors, and for
    the tail after the last one, a hunk is printed:

        <line number of the first differing line>
        <'line only in file_a'      (zero or more)
        --------------------        (only when both files contribute lines)
        >'line only in file_b'      (zero or more)
        <empty line>

    The number is an index into file_a when file_a contributes lines to the
    hunk, otherwise an index into file_b.
    """
    texts = []
    for filename in [file_a, file_b]:
        with open(filename, "r") as file:
            texts.append(file.read().split("\n"))

    idx = [0, 0] # current line index in each file

    def take_until(i, anchor):
        # Consume lines of file i up to (not including) the next line equal to
        # `anchor`; with anchor=None nothing matches, so the rest of the file is taken.
        start = idx[i]
        while idx[i] < len(texts[i]) and texts[i][idx[i]] != anchor:
            idx[i] += 1
        return texts[i][start:idx[i]]

    def show_lines(arrow, lines):
        for line in lines:
            print(f"{arrow}{repr(line)}")

    sequence = find_lcs(texts[0], texts[1])
    # The trailing None is a sentinel that flushes whatever follows the last common line.
    for anchor in sequence + [None]:
        start_a, start_b = idx
        only_a = take_until(0, anchor)
        only_b = take_until(1, anchor)
        if only_a or only_b:
            print(start_a if only_a else start_b)
            show_lines("<", only_a)
            if only_a and only_b:
                print("-"*20)
            show_lines(">", only_b)
            print()
        # Step over the common line in both files.
        idx[0] += 1
        idx[1] += 1
        
# Demo on two small files: show both contents first, then their diff.
show_content(["diff1.txt", "diff2.txt"])
print("\ndiff:")
diff("diff1.txt", "diff2.txt")

diff1.txt
---------------------------------------- 
Gdybym jabłka 
jadł 
to 
bym jałbka jadł.
Erlang to taka funkcyjna Java.
Haskell <3
>3
>2
>1
 ----------------------------------------
diff2.txt
---------------------------------------- 
Gdybym jabłka 
jadł 
to 
bym gruszki jadł.
Haskell <3
>1
>2
>3
 ----------------------------------------

diff:
3
<'bym jałbka jadł.'
<'Erlang to taka funkcyjna Java.'
--------------------
>'bym gruszki jadł.'

5
>'>1'
>'>2'

7
<'>2'
<'>1'



In [41]:
# Build two damaged copies of the text line by line, so that line boundaries are preserved.
damaged_texts_lines = [[],[]]
for line in text.split("\n")[:3000]:  # using 3/7 of the text
    tokens = tokenizer(line)  # note: rebinds the global `tokens` defined in an earlier cell
    # each copy keeps a token only when random() > 0.03 (about 3% of tokens are dropped)
    damaged_lines = [[str(token) for token in tokens if random() > 0.03] for i in range(2)]
    for i in range(2):
        damaged_texts_lines[i].append(damaged_lines[i])

In [43]:
# Write each damaged copy to tmp0.txt / tmp1.txt, one output line per input line.
for i in range(2):
    with open(f"tmp{i}.txt", "w+") as file:
        for line in damaged_texts_lines[i]:
            # "\n" is joined as the last element, so a non-empty line ends with a space before the newline
            file.write(" ".join(line + ["\n"]))

In [44]:
# Diff the two damaged copies line by line.
diff("tmp0.txt", "tmp1.txt")

0
<'William Shakespeare '
--------------------
>'Shakespeare '

3
<'tłum. Józef '
--------------------
>'tłum. Józef Paszkowski '

10
<'  * ESKALUS — książę panujący Weronie '
--------------------
>'  * ESKALUS — książę panujący w Weronie '

18
<'  * LAURENTY ojciec '
--------------------
>'  * LAURENTY — ojciec franciszkanin '

20
<'BALTAZAR — służący Romea '
--------------------
>'  * BALTAZAR — służący Romea '

25
<'  PAŹ PARYSA '
<'  * PIOTR '
--------------------
>'  * PAŹ PARYSA '
>'  * '

37
<'Rzecz odbywa się przez większą część sztuki w Weronie, przez część piątego aktu Mantui. '
--------------------
>'Rzecz odbywa się przez większą część sztuki w Weronie, przez część piątego aktu w Mantui. '

46
<'Tam, gdzie się rzecz ta rozgrywa, w Weronie, '
<'Do nowej zbrodni pchają złości dawne, '
--------------------
>'gdzie się rzecz ta rozgrywa, w Weronie, '
>'Do zbrodni pchają złości dawne, '

50
<'Z łon tych dwu wrogów wzięło bowiem życie, '
--------------------
>'łon tych dwu wrogów