# Imports


In [1]:
import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import joblib
import random
import sys
import os
import re

from rapidfuzz import fuzz
from rapidfuzz.distance import Levenshtein
import unicodedata

from ann_levenshtein import LevenshteinIndex

# Data


### Random Strings


In [661]:
strings = [
    "AAA", "AAB", "AAC", "ABA", 
    "ABB", "ABC", "ACA", "ACB",
    "ACC", "BAA", "BAB", "BAC", 
    "BBA", "BBB", "BBC", "BCA",
    "BCB", "BCC", "CAA", "CAB", 
    "CAC", "CBA", "CBB", "CBC",
    "CCA", "CCB", "CCC", "ABCA", 
    "BABC", "CABA", "ACBC","CBAC"
]

In [690]:
indices = [18]
selected = [strings[i] for i in indices]
selected


['CAA']

### function


In [None]:
def Levenshtein_dist_vector(s1, s2):
        insertions, deletions, substitutions = 0, 0, 0
        for tag, _, _ in Levenshtein.editops(s1, s2):
            if tag == 'insert':
                insertions += 1
            elif tag == 'delete':
                deletions += 1
            elif tag == 'replace':
                substitutions += 1
        return (insertions, deletions, substitutions)

# Build tree


### Initialize


In [1512]:
np.random.seed(0)
indices = np.arange(len(strings))

In [1513]:
# Initialize
estimated_nodes = 8 * len(strings) + 8

tree_s1 = np.full(shape=estimated_nodes, fill_value=-1, dtype=np.int32)
tree_s2 = np.full(shape=estimated_nodes, fill_value=-1, dtype=np.int32)

tree_ppp = np.full(shape=estimated_nodes, fill_value=-1, dtype=np.int32)
tree_ppm = np.full(shape=estimated_nodes, fill_value=-1, dtype=np.int32)
tree_pmp = np.full(shape=estimated_nodes, fill_value=-1, dtype=np.int32)
tree_pmm = np.full(shape=estimated_nodes, fill_value=-1, dtype=np.int32)
tree_mpp = np.full(shape=estimated_nodes, fill_value=-1, dtype=np.int32)
tree_mpm = np.full(shape=estimated_nodes, fill_value=-1, dtype=np.int32)
tree_mmp = np.full(shape=estimated_nodes, fill_value=-1, dtype=np.int32)
tree_mmm = np.full(shape=estimated_nodes, fill_value=-1, dtype=np.int32)
leaf_value = np.full(shape=estimated_nodes, fill_value=-1, dtype=np.int32)

In [1514]:
tree_s1

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1

In [1515]:
# queue
queue = [(0, np.arange(len(strings)))]  # (idx, indices)
node_counter = 1


In [1516]:
queue

[(0,
  array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
         17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]))]

### node 0


In [1517]:
current_node_id, indices = queue.pop(0)

In [1518]:
print(current_node_id, indices)

0 [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31]


In [1519]:
np.random.seed(0)
s1_idx, s2_idx = np.random.choice(indices, 2, replace=False)
s1, s2 = strings[s1_idx], strings[s2_idx]

In [1520]:
print(s1, s2)

BAC CBB


In [1521]:
mask_ppp = np.full(len(indices), False, dtype=bool)
mask_ppm = np.full(len(indices), False, dtype=bool)
mask_pmp = np.full(len(indices), False, dtype=bool)
mask_pmm = np.full(len(indices), False, dtype=bool)
mask_mpp = np.full(len(indices), False, dtype=bool)
mask_mpm = np.full(len(indices), False, dtype=bool)
mask_mmp = np.full(len(indices), False, dtype=bool)
mask_mmm = np.full(len(indices), False, dtype=bool)

In [1522]:
for j, idx in enumerate(indices):
    each_string = strings[idx]
    (a1, a2, a3) = Levenshtein_dist_vector(each_string, s1)
    (b1, b2, b3) = Levenshtein_dist_vector(each_string, s2)
    # split into 8 parts
    if a1>=b1 and a2>=b2 and a3>=b3:  # +++
        mask_ppp[j] = True  # go tree_ppp
    elif a1>=b1 and a2>=b2 and a3<b3:  # ++-
        mask_ppm[j] = True  # go tree_ppm
    elif a1>=b1 and a2<b2 and a3>=b3:  # +-+
        mask_pmp[j] = True  # go tree_pmp
    elif a1>=b1 and a2<b2 and a3<b3:  # +--
        mask_pmm[j] = True  # go tree_pmm
    elif a1<b1 and a2>=b2 and a3>=b3:  # -++
        mask_mpp[j] = True  # go tree_mpp
    elif a1<b1 and a2>=b2 and a3<b3:  # -+-
        mask_mpm[j] = True  # go tree_mpm
    elif a1<b1 and a2<b2 and a3>=b3:  # --+
        mask_mmp[j] = True  # go tree_mmp
    else:  # ---
        mask_mmm[j] = True  # go tree_mmm

In [1523]:
indices_ppp = indices[mask_ppp]
indices_ppm = indices[mask_ppm]
indices_pmp = indices[mask_pmp]
indices_pmm = indices[mask_pmm]
indices_mpp = indices[mask_mpp]
indices_mpm = indices[mask_mpm]
indices_mmp = indices[mask_mmp]
indices_mmm = indices[mask_mmm]

In [1524]:
eight_indices = [indices_ppp, indices_ppm, indices_pmp, indices_pmm, indices_mpp, indices_mpm, indices_mmp, indices_mmm]

In [1525]:
tree_s1[current_node_id] = s1_idx
tree_s2[current_node_id] = s2_idx

In [1526]:
id_ppp = node_counter
id_ppm = node_counter + 1
id_pmp = node_counter + 2
id_pmm = node_counter + 3
id_mpp = node_counter + 4
id_mpm = node_counter + 5
id_mmp = node_counter + 6
id_mmm = node_counter + 7
node_counter += 8

In [1527]:
tree_ppp[current_node_id] = id_ppp
tree_ppm[current_node_id] = id_ppm
tree_pmp[current_node_id] = id_pmp
tree_pmm[current_node_id] = id_pmm
tree_mpp[current_node_id] = id_mpp
tree_mpm[current_node_id] = id_mpm
tree_mmp[current_node_id] = id_mmp
tree_mmm[current_node_id] = id_mmm

eight_ids = [id_ppp, id_ppm, id_pmp, id_pmm, id_mpp, id_mpm, id_mmp, id_mmm]
# end as a leaf
for indices, next_id in zip(eight_indices, eight_ids):
    if len(indices) == 0:
        pass

    elif len(indices) == 1:
        leaf_value[next_id] = indices[0]

    else:
        queue.append((next_id, indices))

In [1528]:
for q in queue:
    print(q)

(1, array([ 1,  4, 13, 16, 18, 19, 22, 23, 25, 26, 29]))
(2, array([ 0,  2,  3,  5,  6,  7,  8, 15, 20, 21, 24, 27, 30, 31]))
(7, array([ 9, 10, 12, 14, 17, 28]))


### until end


In [1529]:
while queue:
    current_node_id, indices = queue.pop(0)

    print('current node:', current_node_id)
    print('indices:', indices)
    print()

    np.random.seed(0)
    s1_idx, s2_idx = np.random.choice(indices, 2, replace=False)
    s1, s2 = strings[s1_idx], strings[s2_idx]
    print('s1:', s1)
    print('s2:', s2)
    print()

    mask_ppp = np.full(len(indices), False, dtype=bool)
    mask_ppm = np.full(len(indices), False, dtype=bool)
    mask_pmp = np.full(len(indices), False, dtype=bool)
    mask_pmm = np.full(len(indices), False, dtype=bool)
    mask_mpp = np.full(len(indices), False, dtype=bool)
    mask_mpm = np.full(len(indices), False, dtype=bool)
    mask_mmp = np.full(len(indices), False, dtype=bool)
    mask_mmm = np.full(len(indices), False, dtype=bool)

    for j, idx in enumerate(indices):
        each_string = strings[idx]
        (a1, a2, a3) = Levenshtein_dist_vector(each_string, s1)
        (b1, b2, b3) = Levenshtein_dist_vector(each_string, s2)
        # split into 8 parts
        if a1>=b1 and a2>=b2 and a3>=b3:  # +++
            mask_ppp[j] = True  # go tree_ppp
        elif a1>=b1 and a2>=b2 and a3<b3:  # ++-
            mask_ppm[j] = True  # go tree_ppm
        elif a1>=b1 and a2<b2 and a3>=b3:  # +-+
            mask_pmp[j] = True  # go tree_pmp
        elif a1>=b1 and a2<b2 and a3<b3:  # +--
            mask_pmm[j] = True  # go tree_pmm
        elif a1<b1 and a2>=b2 and a3>=b3:  # -++
            mask_mpp[j] = True  # go tree_mpp
        elif a1<b1 and a2>=b2 and a3<b3:  # -+-
            mask_mpm[j] = True  # go tree_mpm
        elif a1<b1 and a2<b2 and a3>=b3:  # --+
            mask_mmp[j] = True  # go tree_mmp
        else:  # ---
            mask_mmm[j] = True  # go tree_mmm

    indices_ppp = indices[mask_ppp]
    indices_ppm = indices[mask_ppm]
    indices_pmp = indices[mask_pmp]
    indices_pmm = indices[mask_pmm]
    indices_mpp = indices[mask_mpp]
    indices_mpm = indices[mask_mpm]
    indices_mmp = indices[mask_mmp]
    indices_mmm = indices[mask_mmm]

    eight_indices = [indices_ppp, indices_ppm, indices_pmp, indices_pmm, indices_mpp, indices_mpm, indices_mmp, indices_mmm]

    tree_s1[current_node_id] = s1_idx
    tree_s2[current_node_id] = s2_idx

    id_ppp = node_counter
    id_ppm = node_counter + 1
    id_pmp = node_counter + 2
    id_pmm = node_counter + 3
    id_mpp = node_counter + 4
    id_mpm = node_counter + 5
    id_mmp = node_counter + 6
    id_mmm = node_counter + 7
    node_counter += 8

    tree_ppp[current_node_id] = id_ppp
    tree_ppm[current_node_id] = id_ppm
    tree_pmp[current_node_id] = id_pmp
    tree_pmm[current_node_id] = id_pmm
    tree_mpp[current_node_id] = id_mpp
    tree_mpm[current_node_id] = id_mpm
    tree_mmp[current_node_id] = id_mmp
    tree_mmm[current_node_id] = id_mmm

    eight_ids = [id_ppp, id_ppm, id_pmp, id_pmm, id_mpp, id_mpm, id_mmp, id_mmm]
    # end as a leaf
    for indices, next_id in zip(eight_indices, eight_ids):
        if len(indices) == 0:
            pass

        elif len(indices) == 1:
            leaf_value[next_id] = indices[0]

        else:
            queue.append((next_id, indices))
            
    print('queue')
    for q in queue:
        print(q)

current node: 1
indices: [ 1  4 13 16 18 19 22 23 25 26 29]

s1: CAA
s2: CCC

queue
(2, array([ 0,  2,  3,  5,  6,  7,  8, 15, 20, 21, 24, 27, 30, 31]))
(7, array([ 9, 10, 12, 14, 17, 28]))
(9, array([13, 16, 22, 23, 25, 26]))
(10, array([ 1,  4, 18, 19, 29]))
current node: 2
indices: [ 0  2  3  5  6  7  8 15 20 21 24 27 30 31]

s1: CAC
s2: ACC

queue
(7, array([ 9, 10, 12, 14, 17, 28]))
(9, array([13, 16, 22, 23, 25, 26]))
(10, array([ 1,  4, 18, 19, 29]))
(17, array([ 0,  2,  8, 24, 30]))
(18, array([ 3,  5,  6,  7, 15, 27]))
(23, array([20, 21, 31]))
current node: 7
indices: [ 9 10 12 14 17 28]

s1: BABC
s2: BBA

queue
(9, array([13, 16, 22, 23, 25, 26]))
(10, array([ 1,  4, 18, 19, 29]))
(17, array([ 0,  2,  8, 24, 30]))
(18, array([ 3,  5,  6,  7, 15, 27]))
(23, array([20, 21, 31]))
(25, array([ 9, 12]))
(26, array([14, 17]))
current node: 9
indices: [13 16 22 23 25 26]

s1: CCC
s2: CBB

queue
(10, array([ 1,  4, 18, 19, 29]))
(17, array([ 0,  2,  8, 24, 30]))
(18, array([ 3,  5, 

In [1530]:
queue

[]

### End of queue


# Tree Analysis


In [1250]:
tree = [
    tree_s1, tree_s2, 
    tree_ppp, tree_ppm, tree_pmp, tree_pmm, 
    tree_mpp, tree_mpm, tree_mmp, tree_mmm, 
    leaf_value
]

In [1251]:
tree_s1

array([11, 18, 20, -1, -1, -1, -1, 28, -1, 26, 18, -1, -1, -1, -1, -1, -1,
        8, 27, -1, -1, -1, -1, 31, -1, 12, 17, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1

In [1252]:
tree_s2

array([22, 26,  8, -1, -1, -1, -1, 12, -1, 22,  1, -1, -1, -1, -1, -1, -1,
        0,  6, -1, -1, -1, -1, 21, -1,  9, 14, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1

In [1253]:
tree_ppp

array([ 1,  9, 17, -1, -1, -1, -1, 25, -1, 33, 41, -1, -1, -1, -1, -1, -1,
       49, 57, -1, -1, -1, -1, 65, -1, 73, 81, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1

In [1254]:
leaf_value

array([-1, -1, -1, -1, -1, -1, -1, -1, 11, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1