# Levenshtein distance

In [1]:
import numpy as np

In [2]:
def delta(a: str, b: str):
    """Return the substitution cost of two elements: 0 if equal, otherwise 1."""
    return 0 if a == b else 1

def edit_distance(x: str, y: str, delta):
    """Compute the edit distance between ``x`` and ``y`` by dynamic programming.

    Insertions and deletions cost 1; aligning ``x[i]`` with ``y[j]`` costs
    ``delta(x[i], y[j])``.

    Args:
        x: Source sequence. Only ``len`` and iteration are used, so lists of
            tokens work as well as strings.
        y: Target sequence, same requirements as ``x``.
        delta: Callable taking one element of ``x`` and one element of ``y``
            and returning the cost of aligning them.

    Returns:
        A tuple ``(distance, edit_table)``. ``edit_table`` is a float NumPy
        array of shape ``(len(x) + 1, len(y) + 1)`` whose cell ``[k, l]`` holds
        the distance between ``x[:k]`` and ``y[:l]``; ``distance`` is its
        bottom-right cell.
    """
    n_rows, n_cols = len(x) + 1, len(y) + 1

    # Every cell is written below, so an uninitialised array is sufficient.
    edit_table = np.empty((n_rows, n_cols))

    # Turning a prefix into the empty sequence (or back) costs its length.
    edit_table[:, 0] = np.arange(n_rows)
    edit_table[0, :] = np.arange(n_cols)

    for k, x_item in enumerate(x, start=1):
        for l, y_item in enumerate(y, start=1):
            edit_table[k, l] = min(
                edit_table[k - 1, l] + 1,  # delete x_item
                edit_table[k, l - 1] + 1,  # insert y_item
                edit_table[k - 1, l - 1] + delta(x_item, y_item),  # match / substitute
            )

    return edit_table[len(x), len(y)], edit_table

In [3]:
def get_edits(x, y, edit_table):
    """Trace back through ``edit_table`` and describe the edits turning ``x`` into ``y``.

    The walk starts at cell ``[len(x), len(y)]`` and repeatedly moves to the
    cheapest neighbouring cell (diagonal, left or up; the diagonal wins ties,
    then left) until it reaches ``[0, 0]``.

    Args:
        x: Source sequence (indexed and sliced; strings in the examples).
        y: Target sequence.
        edit_table: Table indexed as ``edit_table[k, l]`` with shape
            ``(len(x) + 1, len(y) + 1)``, such as the one returned by
            ``edit_distance``.

    Returns:
        A list of human-readable edit descriptions ordered from the start of
        the sequences to their end. Matching elements produce no entry.
    """
    i, j = len(x), len(y)
    edits = []
    unreachable = float("inf")

    while i > 0 or j > 0:
        # Neighbours outside the table must never be selected. Without these
        # guards an index of -1 silently wraps to the last row/column and can
        # be mistaken for the cheapest move.
        diagonal = edit_table[i - 1, j - 1] if i > 0 and j > 0 else unreachable
        left = edit_table[i, j - 1] if j > 0 else unreachable
        up = edit_table[i - 1, j] if i > 0 else unreachable
        cheapest = min(diagonal, left, up)

        if diagonal == cheapest:
            i -= 1
            j -= 1
            # A diagonal step that increases the cost by one is a substitution;
            # otherwise the elements matched and nothing is reported.
            if edit_table[i, j] + 1 == edit_table[i + 1, j + 1]:
                edits.append(f'{y[:j]}*{y[j]}*{x[i + 1:]} - substitute {x[i]} -> {y[j]}')
        elif left == cheapest:
            j -= 1
            edits.append(f'{y[:j]}*{y[j]}*{x[i:]} - insert {y[j]}')
        else:
            i -= 1
            edits.append(f'{y[:j]}**{x[i + 1:]} - delete {x[i]}')

    # Edits were collected back-to-front.
    return edits[::-1]

In [4]:
# Example: edit distance and edit script for a short pair of words.
a, b = "los", "kloc"
res, edit_table = edit_distance(a, b, delta)
res_edits = get_edits(a, b, edit_table)

print(f'Distance: {res}')
for edit in res_edits:
    print(edit)

Distance: 2.0
*k*los - insert k
klo*c* - substitute s -> c


In [5]:
# Example: words that differ only in their diacritics.
a, b = "Łódź", "Lodz"
res, edit_table = edit_distance(a, b, delta)
res_edits = get_edits(a, b, edit_table)

print(f'Distance: {res}')
for edit in res_edits:
    print(edit)

Distance: 3.0
*L*ódź - substitute Ł -> L
L*o*dź - substitute ó -> o
Lod*z* - substitute ź -> z


In [6]:
# Example: a longer word pair.
a, b = "kwintesencja", "quintessence"
res, edit_table = edit_distance(a, b, delta)
res_edits = get_edits(a, b, edit_table)

print(f'Distance: {res}')
for edit in res_edits:
    print(edit)

Distance: 5.0
*q*wintesencja - substitute k -> q
q*u*intesencja - substitute w -> u
quintes*s*encja - insert s
quintessenc**a - delete j
quintessenc*e* - substitute a -> e


In [7]:
# Example: two sequences over the alphabet A/C/G/T.
a, b = "ATGAATCTTACCGCCTCG", "ATGAGGCTCTGGCCCCTG"
res, edit_table = edit_distance(a, b, delta)
res_edits = get_edits(a, b, edit_table)

print(f'Distance: {res}')
for edit in res_edits:
    print(edit)

Distance: 7.0
ATGA*G*TCTTACCGCCTCG - substitute A -> G
ATGAG*G*CTTACCGCCTCG - substitute T -> G
ATGAGGCT*C*TACCGCCTCG - insert C
ATGAGGCTCT*G*CCGCCTCG - substitute A -> G
ATGAGGCTCTG*G*CGCCTCG - substitute C -> G
ATGAGGCTCTGGC*C*CCTCG - substitute G -> C
ATGAGGCTCTGGCCCCT**G - delete C


# Longest common subsequence

In [8]:
def lcs(x, y, edit_table, return_as_list = False):
    """Return the longest run of consecutive matches on the edit-table traceback.

    The table is walked back from cell ``[len(x), len(y)]`` to ``[0, 0]``,
    always moving to the cheapest neighbour (diagonal first on ties, then
    left). Diagonal steps that do not change the cost are matches; the longest
    uninterrupted run of such steps is returned. On equal lengths the run
    nearest the end of ``x`` wins.

    NOTE: the result is a contiguous slice of ``x`` taken from one particular
    alignment, so it can be shorter than a true longest common subsequence.

    Args:
        x: First sequence (string or list of tokens).
        y: Second sequence.
        edit_table: Table indexed as ``edit_table[k, l]`` with shape
            ``(len(x) + 1, len(y) + 1)``, such as the one returned by
            ``edit_distance``.
        return_as_list: If true, return the matched elements as a list;
            otherwise return them concatenated into one string.

    Returns:
        The matched run as a list or a string; empty (``[]`` or ``""``) when
        no element matches.
    """
    i, j = len(x), len(y)
    run_len = 0      # length of the run of matches currently being extended
    best_len = 0     # length of the longest run seen so far
    best_start = 0   # index in x where that longest run begins
    unreachable = float("inf")

    while i > 0 or j > 0:
        # Neighbours outside the table must never be selected. Without these
        # guards an index of -1 silently wraps to the last row/column.
        diagonal = edit_table[i - 1, j - 1] if i > 0 and j > 0 else unreachable
        left = edit_table[i, j - 1] if j > 0 else unreachable
        up = edit_table[i - 1, j] if i > 0 else unreachable
        cheapest = min(diagonal, left, up)

        if diagonal == cheapest:
            i -= 1
            j -= 1
            if edit_table[i, j] == edit_table[i + 1, j + 1]:
                # Cost unchanged on a diagonal step: x[i] matched y[j].
                run_len += 1
                if run_len > best_len:
                    best_len = run_len
                    best_start = i
            else:
                run_len = 0
        elif left == cheapest:
            j -= 1
            run_len = 0
        else:
            i -= 1
            run_len = 0

    matched = [x[best_start + offset] for offset in range(best_len)]
    if return_as_list:
        return matched
    # An empty run joins to "", instead of falling back to x[-1].
    return "".join(matched)

In [9]:
# Example: longest matched run for a short pair of words.
a, b = "los", "kloc"
res, edit_table = edit_distance(a, b, delta)
lcs_res = lcs(a, b, edit_table)

print(f'LCS: {lcs_res}')

LCS: lo


In [10]:
# Example: longest matched run for two sequences over A/C/G/T.
a, b = "ATGAATCTTACCGCCTCG", "ATGAGGCTCTGGCCCCTG"
res, edit_table = edit_distance(a, b, delta)
lcs_res = lcs(a, b, edit_table)

print(f'LCS: {lcs_res}')

LCS: ATGA


In [11]:
# Example: longest matched run for a longer word pair.
a, b = "kwintesencja", "quintessence"
res, edit_table = edit_distance(a, b, delta)
lcs_res = lcs(a, b, edit_table)

print(f'LCS: {lcs_res}')

LCS: intes


In [12]:
# Example on word tokens: b keeps the first three and the last two words of a.
string = "ala ma kota i psa i jest super"
a = string.split()
b = [*a[:3], *a[-2:]]
res, edit_table = edit_distance(a, b, delta)
lcs_res = lcs(a, b, edit_table, return_as_list=True)

print(f'LCS: {lcs_res}')

LCS: ['ala', 'ma', 'kota']


# Custom 'diff' tool

In [13]:
from spacy.tokenizer import Tokenizer
from spacy.lang.pl import Polish
from copy import copy
nlp = Polish()
# Create a Tokenizer from the Polish pipeline's vocab only (no other arguments are passed)
tokenizer = Tokenizer(nlp.vocab)

2022-05-12 13:54:11.414319: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/przemek/projects/space_systems/ws_core/devel/lib:/opt/ros/noetic/lib:/opt/ros/noetic/lib/x86_64-linux-gnu
2022-05-12 13:54:11.414338: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [14]:
# Read the text and split it into space-separated tokens. Each newline is
# padded with spaces first so that "\n" becomes a token of its own.
# The encoding is given explicitly so that reading the Polish text does not
# depend on the platform's default encoding.
with open("romeo-i-julia-700.txt", "r", encoding="utf-8") as file:
    text = file.read()

text = text.replace("\n", " \n ")
tokens = text.split(' ')

In [15]:
# Remove up to about 3% of the tokens (tokens, not whole lines). Indices are
# drawn with replacement, and "\n" tokens are always kept, so the edited text
# has the same number of lines as the original.
# randint's upper bound is exclusive: len(tokens) makes the last token eligible.
idx_to_del = np.random.randint(0, len(tokens), len(tokens)*3//100)
# A set gives constant-time membership tests instead of scanning the array per token.
idx_to_del_lookup = set(idx_to_del.tolist())

edited_tokens = [
    token
    for idx, token in enumerate(tokens)
    if idx not in idx_to_del_lookup or token == "\n"
]

edited_text = " ".join(edited_tokens)

In [16]:
def diff(text, other_text):
    """Print a report for every pair of corresponding lines that differ.

    Both texts are split on newlines and compared line by line, pairing line
    ``idx`` of ``text`` with line ``idx`` of ``other_text``. Pairing stops at
    the end of the shorter text. Each line is split on single spaces, the
    common run of tokens is computed with ``edit_distance`` and ``lcs``, and
    every token of either line that is not in that common run is reported.

    Args:
        text: Original text.
        other_text: Text to compare against ``text``.

    Returns:
        None. Differences are printed to standard output.
    """
    line_pairs = zip(text.split('\n'), other_text.split('\n'))
    for idx, (line, line_edited) in enumerate(line_pairs):
        # Split each line once and reuse the token lists below.
        line_tokens = line.split(' ')
        edited_line_tokens = line_edited.split(' ')

        _, edit_table = edit_distance(line_tokens, edited_line_tokens, delta)
        lcs_res = lcs(line_tokens, edited_line_tokens, edit_table, return_as_list=True)
        common = set(lcs_res)

        difference = (
            {el for el in line_tokens if el not in common}
            | {el for el in edited_line_tokens if el not in common}
        )
        if difference:
            print(f'DIFF DETECTED IN LINE {idx}:\nline1:\n{line}\nline2:\n{line_edited}\ndiff: {difference}\n\n')

DIFF DETECTED IN LINE 16:
line1:
  * BENWOLIO — synowiec Montekiego 
line2:
  * BENWOLIO — Montekiego 
diff: {'synowiec', 'Montekiego'}


DIFF DETECTED IN LINE 21:
line1:
  * SAMSON, GRZEGORZ — słudzy Kapuleta 
line2:
  SAMSON, GRZEGORZ — słudzy Kapuleta 
diff: {'*'}


DIFF DETECTED IN LINE 30:
line1:
  * JULIA — córka Kapuletów 
line2:
  * JULIA — Kapuletów 
diff: {'córka', 'Kapuletów'}


DIFF DETECTED IN LINE 32:
line1:
  * Obywatele weroneńscy, różne osoby płci obojej, liczący się do przyjaciół obu domów, maski, straż wojskowa i inne osoby. 
line2:
  * Obywatele weroneńscy, różne osoby płci obojej, liczący się do obu domów, maski, straż wojskowa i inne osoby. 
diff: {'wojskowa', 'obu', 'i', 'osoby.', 'maski,', 'przyjaciół', 'inne', 'domów,', 'straż'}


DIFF DETECTED IN LINE 56:
line1:
 I jak się ojców nienawiść nie zmienia, 
line2:
 I jak się ojców nie zmienia, 
diff: {'nie', 'nienawiść', 'zmienia,'}


DIFF DETECTED IN LINE 75:
line1:
 SAMSON 
line2:
 
diff: {'SAMSON'}


DIFF DETECT