In [None]:
import numpy as np
import pandas as pd

# Introduction

Goal:

* Evaluate the spelling similarity of two strings



Step:

* initial state : the word we're transforming

* operators : delete,switch,replace,insert (Note that replace is equal to delete + insert)

* goal state : the word we're trying to get to

* path cost : what we want to minimize --- the number of edits


More application :

* DNA, spell correction and more


Detail introduction :

* https://web.stanford.edu/class/cs124/lec/med.pdf

# Algorithms

### Dynamic Programming (Levenshtein)

$$\text{Initialization}$$

\begin{align}
D[0,0] &= 0 \\
D[i,0] &= D[i-1,0] + del\_cost(source[i]) \tag{1}\\
D[0,j] &= D[0,j-1] + ins\_cost(target[j]) \\
\end{align}


$$\text{Per Cell Operations}$$
\begin{align}
 \\
D[i,j] =min
\begin{cases}
D[i-1,j] + del\_cost\\
D[i,j-1] + ins\_cost\\
D[i-1,j-1] + \left\{\begin{matrix}
rep\_cost; & if src[i]\neq tar[j]\\
0 ; & if src[i]=tar[j]
\end{matrix}\right.
\end{cases}
\tag{2}
\end{align}

In [None]:
# With the costs (del, ins, rep) = (1, 1, 2) this is the Levenshtein distance
# variant in which a replacement counts as one delete plus one insert.

def dp_solver(word,target,del_cost=1,ins_cost=1,rep_cost=2):
  """Fill the minimum-edit-distance table between `word` and `target`.

  Args:
    word: source string (rows of the table).
    target: string to transform `word` into (columns of the table).
    del_cost: cost of deleting one source character.
    ins_cost: cost of inserting one target character.
    rep_cost: cost of replacing a source character by a different one.

  Returns:
    (D, min_dis): `D` is a float array of shape (len(word)+1, len(target)+1)
    where D[i, j] is the cheapest way to turn word[:i] into target[:j];
    `min_dis` is D[len(word), len(target)].
  """
  m=len(word)
  n=len(target)
  D=np.zeros((m+1,n+1))

  # Border cells: reaching the empty target takes i deletions, and building
  # target[:j] from the empty source takes j insertions. The configured costs
  # must be used here, otherwise non-unit costs give a wrong table.
  for i in range(1,m+1):
    D[i,0]=i*del_cost
  for j in range(1,n+1):
    D[0,j]=j*ins_cost

  for i in range(1,m+1):
    for j in range(1,n+1):
      # Matching characters are carried over for free.
      rp=0 if word[i-1]==target[j-1] else rep_cost

      D[i,j]=min(D[i-1,j]+del_cost,   # delete word[i-1]
                 D[i,j-1]+ins_cost,   # insert target[j-1]
                 D[i-1,j-1]+rp)       # replace / keep

  min_dis=D[m,n]

  return D,min_dis

In [None]:
# Classic worked example: transform "intention" into "execution".
word, target = 'intention', 'execution'

matrix, min_dis = dp_solver(word, target)
print('minimum distance edit number :', min_dis)

# Rows are labelled with the source characters and columns with the target
# characters; '#' labels the extra leading row/column of the table.
pd.DataFrame(matrix, index=['#', *word], columns=['#', *target])

 
 What can't we read from the DP table alone? The alignment itself, i.e. which edit was applied to each character:
 
 * I N T E & N T I O N

 * & E X E C U T I O N

 * I -> & : delete  (cost : 1)

 * N -> E : replace  (cost : 2)

 * T -> X : replace  (cost : 2)

 * & -> C : insert  (cost : 1)

 * N -> U : replace  (cost : 2)


 Use a backtrace to solve this problem

### BackTrace

* We often need to align each character of the two strings to each other

* Every time we enter a cell, remember where we came from 

* When we reach the end, trace back the path from the lower-right corner (cell $D[m,n]$) to read off the alignment

$$\text{Base Conditions}$$

\begin{align}
D[i,0] &= i \ \ \ D[0,j] = j 
\end{align}

$$\text{Recurrence Relation}$$

\begin{align}
 \\
D[i,j] =min
\begin{cases}
D[i-1,j] + del\_cost\\
D[i,j-1] + ins\_cost\\
D[i-1,j-1] + \left\{\begin{matrix}
rep\_cost; & if src[i]\neq tar[j]\\
0 ; & if src[i]=tar[j]
\end{matrix}\right.
\end{cases}
\tag{2}
\end{align}

\begin{align}
 \\
ptr[i,j] =
\begin{cases}
LEFT(insert)\\
UP(delete)\\
DIAG(replace)
\end{cases}
\end{align}

In [None]:
def BackTraceSolver(src,tar,del_cost=1,ins_cost=1,rep_cost=2):
  """Compute the edit-distance table and one optimal alignment path.

  Args:
    src: source string (rows of the table).
    tar: target string (columns of the table).
    del_cost: cost of deleting one source character.
    ins_cost: cost of inserting one target character.
    rep_cost: cost of replacing a source character by a different one.

  Returns:
    (D, min_dis, trace_back):
      D          -- float array of shape (len(src)+1, len(tar)+1).
      min_dis    -- D[len(src), len(tar)].
      trace_back -- list of (i, j) cells on one minimum-cost path, starting
                    at (len(src), len(tar)) and ending at (0, 0).
  """
  m=len(src)
  n=len(tar)

  D=np.zeros((m+1,n+1))
  # ptr[cell] is the predecessor cell the optimal value of `cell` came from.
  ptr={}

  # Border cells can only be reached by pure deletions (first column) or pure
  # insertions (first row). They get pointers too, so that the trace always
  # runs all the way back to (0, 0).
  for i in range(1,m+1):
    D[i,0]=i*del_cost
    ptr[(i,0)]=(i-1,0)
  for j in range(1,n+1):
    D[0,j]=j*ins_cost
    ptr[(0,j)]=(0,j-1)

  for i in range(1,m+1):
    for j in range(1,n+1):
      rp=0 if src[i-1]==tar[j-1] else rep_cost

      # min() keeps the first minimal entry, so ties are resolved in the
      # order diagonal, delete, insert. The chosen predecessor always has the
      # minimal cost, which keeps the traced path consistent with D.
      candidates=(
        (D[i-1,j-1]+rp,(i-1,j-1)),     # DIAG: replace / keep
        (D[i-1,j]+del_cost,(i-1,j)),   # UP: delete src[i-1]
        (D[i,j-1]+ins_cost,(i,j-1)),   # LEFT: insert tar[j-1]
      )
      best_cost,best_prev=min(candidates,key=lambda cand:cand[0])

      D[i,j]=best_cost
      ptr[(i,j)]=best_prev

  # Follow the pointers from the end cell; (0, 0) is the only cell without one.
  trace_back=[(m,n)]
  while trace_back[-1] in ptr:
    trace_back.append(ptr[trace_back[-1]])

  min_dis=D[m,n]

  return D,min_dis,trace_back

In [None]:
# Same example pair as before, this time also recovering the alignment path.
src, tar = 'intention', 'execution'

matrix, min_ids, trace_back = BackTraceSolver(src, tar)

In [None]:
# Cells visited by the backtrace, listed starting from the end cell (m, n).
trace_back

In [None]:
# Work on a copy so the distance table in `matrix` is left untouched.
trace_matrix = matrix.copy()

# Overwrite every cell on the traced path with a tiny marker value
# (presumably so the path stands out from the real costs when displayed).
for i, j in trace_back:
  trace_matrix[i, j] = 1e-7

In [None]:
# Label the table with the strings that produced `trace_matrix` (`src`/`tar`)
# rather than the `word`/`target` variables of the earlier DP cell, so the
# labels stay correct if the backtrace example pair is changed.
df = pd.DataFrame(trace_matrix, index=['#', *src], columns=['#', *tar])
df

* Show the result by checking the table and print

In [None]:
# Alignment read off the traced table by hand, one step per line.
alignment_steps = [
  'i->#',        # delete
  'n->e',        # replace
  't->x',        # replace
  'e->e,c',      # insert
  'n->u',        # replace
  'tion->tion',  # same
]
print(*alignment_steps, sep='\n')

heatmap

* left : insert

* up : delete

* diag : replace

In [None]:
import seaborn as sns

# Draw the traced table in `df` as a heatmap.
sns.heatmap(df)