### Imports

In [17]:
import numpy as np
from bisect import bisect
from unidecode import unidecode
from spacy.tokenizer import Tokenizer
from spacy.lang.pl import Polish
import random

### Edit distance

In [18]:
def delta_standard(a, b):
    """ Unit cost of replacing one element (letter, token, line, ...) with another

    :arg
        a,b: the two elements to compare

    :returns
        0 when the elements compare equal, 1 otherwise
    """

    return 0 if a == b else 1
    

def delta_unidecode(a, b):
    """ Replacement cost that treats a character and its ASCII transliteration
            (for example polish diacritics) as "half different"

    :arg
        a,b: the two elements (strings) to compare

    :returns
        0 when the elements are equal; 0.5 when they differ but their unidecode()
        transliterations are equal; 1 otherwise
    """

    # identical elements never need transliteration
    if a == b:
        return 0

    return 0.5 if unidecode(a) == unidecode(b) else 1


def get_changes_sequence(parent):
    """ Rebuilding the path of cells leading from (0, 0) to the bottom-right cell

     :arg
        parent: 2d array in which every cell holds a tuple with the position of the cell
            it was reached from

    :returns
        list of positions (tuples) ordered from (0, 0) to the bottom-right corner of the array
     """

    # start in the bottom-right corner and follow the parent pointers backwards
    row = len(parent) - 1
    col = len(parent[0]) - 1
    path = [(row, col)]

    # the walk ends once both coordinates are zero
    while row or col:
        row, col = parent[row][col]
        path.append((row, col))

    # the path was collected end-to-start
    return path[::-1]


def edit_distance(x, y, delta=delta_standard, get_distance=True):
    """ Calculating edit distance

    :arg
        x, y:         chains between which edit distance is calculated
        delta:        function returning the cost of replacing an element of x with an element
                      of y (for example letters); insertions and deletions always cost 1
        get_distance: whether function is returning edit distance and parent array or just parent array

    :returns
        edit distance and parent 2d array if get_distance == True; just parent 2d array otherwise.
        parent[i][j] is the position of the cell from which cell (i, j) is reached at minimal cost;
        every cell other than (0, 0) points to a direct neighbour (up, left or up-left)
    """

    rows, cols = len(x) + 1, len(y) + 1

    # first column: i deletions turn x[:i] into the empty chain,
    # first row: j insertions turn the empty chain into y[:j]
    edit_table = np.empty((rows, cols))
    edit_table[:, 0] = np.arange(rows)
    edit_table[0, :] = np.arange(cols)

    parent = [[(0, 0) for _ in range(cols)] for _ in range(rows)]

    # border cells can only be reached along the border, one step at a time; without these
    # pointers a traceback would jump from (i, 0) or (0, j) straight to (0, 0) and lose steps
    for i in range(1, rows):
        parent[i][0] = (i - 1, 0)
    for j in range(1, cols):
        parent[0][j] = (0, j - 1)

    # filling the inner part of the 2d array
    for i in range(1, rows):
        for j in range(1, cols):
            # cheapest way to reach this cell together with the cell it comes from;
            # on equal cost the tuple comparison picks the smallest position
            minimum, position = min((edit_table[i-1, j] + 1, (i-1, j)),
                                    (edit_table[i, j-1] + 1, (i, j-1)),
                                    (edit_table[i-1, j-1] + delta(x[i-1], y[j-1]), (i-1, j-1)))
            edit_table[i, j] = minimum
            parent[i][j] = position

    # return depends on get_distance variable
    if get_distance:
        return edit_table[len(x), len(y)], parent
    else:
        return parent


def visualize(x, y, delta=delta_standard):
    """ Printing, step by step, how chain x is turned into chain y

        Args:
            x, y:  the source chain and the target chain (strings)
            delta: function used to get edit distance between two subchains (for example letters)
    """

    # edit distance and the path of cells describing the cheapest transformation
    distance, parent = edit_distance(x, y, delta)
    steps = get_changes_sequence(parent)

    print(f"Edit distance: {distance}")
    print(x)

    # "word" is the partially transformed chain, "cursor" the index of the next element to handle
    word = x
    cursor = 0

    # every pair of consecutive cells on the path is one operation
    for (prev_row, prev_col), (row, col) in zip(steps, steps[1:]):
        # already transformed part followed by the opening marker
        prefix = word[:cursor] + "*"
        row_delta = row - prev_row
        col_delta = col - prev_col

        if row_delta + col_delta == 2:
            # diagonal step: keep the element or replace it
            old_symbol = word[cursor]
            new_symbol = y[prev_col]
            tail = word[(cursor + 1):]

            # only a real replacement is reported
            if old_symbol != new_symbol:
                print(f"{prefix}{new_symbol}*{tail}  "
                      f"| changed {old_symbol}->{new_symbol}")

            word = word[:cursor] + new_symbol + tail
            cursor += 1
        elif row_delta == 1:
            # vertical step: element of x is dropped, cursor stays in place
            tail = word[(cursor + 1):]
            print(f"{prefix}*{tail}  | removed {word[cursor]}")
            word = word[:cursor] + tail
        else:
            # horizontal step: element of y is inserted before the cursor
            new_symbol = y[prev_col]
            tail = word[cursor:]
            print(f"{prefix}{new_symbol}*{tail}  | added {new_symbol}")

            word = word[:cursor] + new_symbol + tail
            cursor += 1

    print()

### Longest common subsequence

In [19]:
def delete_part_of_tokens(destination_path, source_path="romeo-and-juliet.txt", percent=3):
    """ Deleting a random part of tokens from a text file and saving result to another file

        :arg
            destination_path: file to which result of the removal will be saved
            source_path:      file with the original text (default: romeo-and-juliet.txt)
            percent:          percentage of tokens drawn for deletion (default: 3)

        :returns
            tokens which were kept, as list of strings

        :raises
            ValueError: if percent is not between 0 and 100
    """

    if not 0 <= percent <= 100:
        raise ValueError("percent must be between 0 and 100")

    # preparing tokenizer
    nlp = Polish()
    tokenizer = Tokenizer(nlp.vocab)

    # reading source file
    # NOTE(review): no explicit encoding is passed, so the platform default is used for both
    # the source and the destination file - confirm this is intended
    with open(source_path, "r") as file:
        text = file.read()

    tokens = tokenizer(text)

    # getting random indices of the requested part of all tokens
    to_delete = set(random.sample(range(len(tokens)), int(percent * len(tokens) // 100)))

    # pieces of the new text (joined once at the end) and the kept tokens as strings
    kept_chunks = []
    new_tokens = []

    # tokens starting with a newline are never deleted; when one of them was drawn,
    # the "debt" is paid by deleting the next token which was not drawn itself
    debt = 0

    for i, token in enumerate(tokens):
        drawn = i in to_delete

        if (not drawn and debt == 0) or token.text[0] == "\n":
            kept_chunks.append(token.text_with_ws)
            new_tokens.append(token.text)

            # a drawn newline token was kept, so one more token has to go later
            if drawn:
                debt += 1
        elif not drawn:
            # this token is deleted in place of an earlier kept newline token
            debt -= 1

    # saving new text to the file
    with open(destination_path, "w") as dest_file:
        dest_file.write("".join(kept_chunks))

    return new_tokens


def get_file_lines(file_path):
    """ Reading a text file and splitting it into lines

        :arg
            file_path: path of the file to read

        :returns
            list of lines without newline characters; a file ending with a newline
            gives an empty string as the last element
    """

    with open(file_path, "r") as handle:
        return handle.read().split("\n")


def lcs(x, y):
    """ Computing the length of the longest common subsequence of two chains

        :arg
            x, y: chains to compare (strings, lists of tokens, ...)

        :returns
            length of longest common subsequence in x and y
    """

    # sorted list of positions in y; len(y) at the end is a sentinel which is never replaced,
    # so the result is the number of entries in front of it
    thresholds = [len(y)]

    for symbol in x:
        # all positions in y holding the current element of x
        matches = [index for index, candidate in enumerate(y) if candidate == symbol]

        # going from the right so that one element of x cannot extend a subsequence twice
        for position in reversed(matches):
            slot = bisect(thresholds, position)

            # different results mean that the position is already stored
            if slot != bisect(thresholds, position - 1):
                continue

            if slot < len(thresholds) - 1:
                # a smaller position replaces the stored one
                thresholds[slot] = position
            else:
                # only the sentinel is bigger: the list grows by one entry
                thresholds.insert(slot, position)

    return len(thresholds) - 1


def _print_diff_hunk(x, y, x_range, y_range):
    """ Printing elements present only in x ("<") followed by elements present only in y (">") """

    for x_element in x_range:
        print(f"< ({x_element})  {x[x_element]}")
    for y_element in y_range:
        print(f"> ({y_element})  {y[y_element]}")


def diff(x, y):
    """ Function which is similar to system command "diff" (printing result)

        Elements are reported with their 0-based index: lines starting with "<" come
        from x, lines starting with ">" come from y.

        :arg
            x, y: like system's "diff x y"
    """

    # getting parent array and extracting the path of cells from it
    parent = edit_distance(x, y, get_distance=False)
    sequence = get_changes_sequence(parent)

    # position right after the last pair of equal elements; everything before it
    # was either equal or already printed
    anchor_row, anchor_col = sequence[0]

    for (last_row, last_col), (row, col) in zip(sequence, sequence[1:]):
        # a match is a step by exactly one row and one column between equal elements;
        # checking both axes separately makes sure that a longer jump along one axis
        # is never mistaken for a match
        is_diagonal = row - last_row == 1 and col - last_col == 1

        if is_diagonal and x[last_row] == y[last_col]:
            # printing all different elements between the previous match and this one
            _print_diff_hunk(x, y, range(anchor_row, last_row), range(anchor_col, last_col))
            anchor_row, anchor_col = row, col

    # elements after the last match
    end_row, end_col = sequence[-1]
    _print_diff_hunk(x, y, range(anchor_row, end_row), range(anchor_col, end_col))

### Tests - edit distance

In [20]:
# one insertion ("k") and one substitution ("s" -> "c") with the default unit costs
visualize("los", "kloc")

Edit distance: 2.0
los
*k*los  | added k
klo*c*  | changed s->c



In [21]:
# default delta: every letter with a diacritic differs fully from its ASCII counterpart
visualize("Łódź", "Lodz")

Edit distance: 3.0
Łódź
*L*ódź  | changed Ł->L
L*o*dź  | changed ó->o
Lod*z*  | changed ź->z



In [22]:
# delta_unidecode: replacing a letter by its ASCII transliteration costs only 0.5
visualize("Łódź", "Lodz", delta_unidecode)

Edit distance: 1.5
Łódź
*L*ódź  | changed Ł->L
L*o*dź  | changed ó->o
Lod*z*  | changed ź->z



In [23]:
# longer words: substitutions, an insertion and a removal in a single transformation
visualize("kwintesencja", "quintessence")

Edit distance: 5.0
kwintesencja
*q*wintesencja  | changed k->q
q*u*intesencja  | changed w->u
quinte*s*sencja  | added s
quintessenc**a  | removed j
quintessenc*e*  | changed a->e



In [24]:
# two chains of equal length over the alphabet A, C, G, T
visualize("ATGAATCTTACCGCCTCG", "ATGAGGCTCTGGCCCCTG")

Edit distance: 7.0
ATGAATCTTACCGCCTCG
ATGA*G*TCTTACCGCCTCG  | changed A->G
ATGAG*G*CTTACCGCCTCG  | changed T->G
ATGAGGCT*C*TACCGCCTCG  | added C
ATGAGGCTCT*G*CCGCCTCG  | changed A->G
ATGAGGCTCTG*G*CGCCTCG  | changed C->G
ATGAGGCTCTGGC*C*CCTCG  | changed G->C
ATGAGGCTCTGGCCCCT**G  | removed C



### Tests - longest common subsequence

In [25]:
# the common subsequence is "lo"
lcs("los", "kloc")

2

In [26]:
# lcs compares elements with ==, so only "d" is shared by both words
lcs("Łódź", "Lodz")

1

In [27]:
# same pair of words as in the edit distance test
lcs("kwintesencja", "quintessence")

8

In [28]:
# same pair of chains as in the edit distance test
lcs("ATGAATCTTACCGCCTCG", "ATGAGGCTCTGGCCCCTG")

13

### Tokenization, deleting 3% of tokens and saving result to file

In [29]:
# two independent random deletions from the same source text; each call writes
# its own output file and returns the tokens which were kept
tokens2 = delete_part_of_tokens("romeo-and-juliet2.txt")
tokens3 = delete_part_of_tokens("romeo-and-juliet3.txt")

### Longest common subsequence for these texts

In [30]:
# longest common subsequence of both token lists and its share in the first list
length_of_lcs = lcs(tokens2, tokens3)
rate = 100 * length_of_lcs / len(tokens2)

print(f"lcs: {length_of_lcs}")
print(f"rate: {rate:.2f}%")

lcs: 2141
rate: 97.14%


### diff romeo-and-juliet2.txt romeo-and-juliet3.txt

In [31]:
# comparing both generated files line by line; "<" marks lines of the first file,
# ">" lines of the second one, the number in brackets is the 0-based line index
lines1 = get_file_lines("romeo-and-juliet2.txt")
lines2 = get_file_lines("romeo-and-juliet3.txt")

diff(lines1, lines2)

< (2)  i Julia
> (2)  Romeo i Julia
< (5)  ISBN 978-83-288-2903-9
> (5)  978-83-288-2903-9
< (9)  
> (9)  OSOBY:
< (12)   * KAPULET — naczelnicy dwóch domów nieprzyjaznych sobie
> (12)   * MONTEKI, KAPULET — naczelnicy dwóch nieprzyjaznych sobie
< (14)   * ROMEO — syn Montekiego
< (15)   * MERKUCJO — krewny księcia
> (14)   * — syn Montekiego
> (15)   * MERKUCJO krewny 
< (17)   * TYBALT — krewny Pani 
> (17)   * TYBALT — krewny Pani Kapulet
< (19)   * JAN — brat tegoż zgromadzenia
> (19)   * JAN — brat z tegoż zgromadzenia
< (25)   * PAŹ 
> (25)   * PAŹ PARYSA
< (28)   * PANI MONTEKI — małżonka Montekiego
> (28)   PANI MONTEKI — małżonka Montekiego
< (30)   * JULIA — Kapuletów
> (30)   * JULIA — córka Kapuletów
< (32)   * Obywatele weroneńscy, różne osoby płci obojej, liczący się do przyjaciół obu domów, maski, straż wojskowa i inne osoby.
> (32)   * Obywatele różne osoby płci obojej, liczący się do przyjaciół obu domów, maski, straż wojskowa i inne osoby.
< (45)  Dwa rody, zacne i sł