In [1]:
import re
import numpy as np
import editdistance

Read each data file and store its lines in a list.

In [2]:
def readfile(filename):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        filename: Path of the file to read.

    Returns:
        list[str]: One entry per line of the file, in file order, each passed
        through ``str.strip()`` (so blank lines become empty strings).
    """
    # The context manager closes the file even if reading raises.
    with open(filename, "r") as f:
        return [line.strip() for line in f]

# Load every word list in one pass; the files are read in the order listed.
(dict_ls,
 wiki_correct_ls,
 wiki_misspell_ls,
 birkbeck_correct_ls,
 birkbeck_misspell_ls) = [
    readfile(path)
    for path in ("dict.txt",
                 "wiki_correct.txt",
                 "wiki_misspell.txt",
                 "birkbeck_correct.txt",
                 "birkbeck_misspell.txt")
]

Implementation of the Needleman-Wunsch algorithm (global edit distance) using Levenshtein costs: a match costs 0; an insertion, deletion, or replacement costs 1.

In [3]:
# transform s1 to s2
def edit_distance(s1, s2, match_cost=0, insert_cost=1, delete_cost=1, replace_cost=1):
    """Global edit distance for transforming ``s1`` into ``s2``.

    Fills a dynamic-programming table in which entry ``[j][k]`` holds the
    cheapest cost of turning the first ``k`` characters of ``s1`` into the
    first ``j`` characters of ``s2``. With the default costs this is the
    Levenshtein distance (the lower the better).

    Args:
        s1: Source sequence.
        s2: Target sequence.
        match_cost: Cost charged when the aligned characters are equal.
        insert_cost: Cost of inserting one character of ``s2``.
        delete_cost: Cost of deleting one character of ``s1``.
        replace_cost: Cost of replacing a character of ``s1`` with a
            different character of ``s2``.

    Returns:
        The minimum total cost. With integer costs (the defaults) this is a
        plain ``int``.
    """
    s1_len = len(s1)
    s2_len = len(s2)

    # Plain Python lists keep integer costs as integers; a float matrix would
    # turn every distance into a float.
    table = [[0] * (s1_len + 1) for _ in range(s2_len + 1)]

    # First column: build s2[:j] from the empty string using only insertions.
    for j in range(1, s2_len + 1):
        table[j][0] = j * insert_cost
    # First row: reduce s1[:k] to the empty string using only deletions.
    for k in range(1, s1_len + 1):
        table[0][k] = k * delete_cost

    # Fill the table row by row.
    for j in range(1, s2_len + 1):
        for k in range(1, s1_len + 1):
            diagonal_cost = match_cost if s1[k - 1] == s2[j - 1] else replace_cost
            table[j][k] = min(table[j][k - 1] + delete_cost,
                              table[j - 1][k] + insert_cost,
                              table[j - 1][k - 1] + diagonal_cost)

    return table[s2_len][s1_len]
    
def equal(ch1,ch2,cost1,cost2):
    """Return ``cost1`` when ``ch1 == ch2``, otherwise ``cost2``."""
    return cost1 if ch1 == ch2 else cost2

Find the best matches for a word in the dictionary according to the global edit distance.

In [4]:
def best_match_edit_distance(target, dictionary=None):
    """Return the dictionary words closest to ``target`` by edit distance.

    Args:
        target: The word to look up.
        dictionary: Optional iterable of candidate words. When omitted, the
            module-level ``dict_ls`` is used.

    Returns:
        list: Every candidate whose ``editdistance.eval`` distance to
        ``target`` equals the minimum found, in dictionary order. Empty when
        there are no candidates.
    """
    candidates = dict_ls if dictionary is None else dictionary
    target_len = len(target)

    min_dist = None  # no candidate scored yet
    best_matches = []

    for word in candidates:
        # The edit distance is at least the length difference, so a word whose
        # length differs by more than the current best cannot match or beat it.
        if min_dist is not None and abs(len(word) - target_len) > min_dist:
            continue
        dist = editdistance.eval(target, word)  # global edit distance
        if min_dist is None or dist < min_dist:
            # Strictly better: restart the list of best matches.
            min_dist = dist
            best_matches = [word]
        elif dist == min_dist:
            best_matches.append(word)

    return best_matches

Find the best matches for the misspelled words in the wiki data set.

In [8]:
# Map each distinct misspelling to its best dictionary matches.
# dict.fromkeys drops repeated words while keeping first-seen order, so each
# word is looked up exactly once.
wiki_correction_dict = {}
for word in dict.fromkeys(wiki_misspell_ls):
    wiki_correction_dict[word] = best_match_edit_distance(word)

In [17]:
# NOTE: the NGram class is bound to the lowercase name `ngram` in this notebook.
from ngram import NGram as ngram
# Compare two short strings with N=2; the recorded result is shown after this cell.
# NOTE(review): N is presumably the n-gram size (2 = bigrams) — confirm against the ngram package docs.
ngram.compare('Ham','Spam',N=2)

0.2857142857142857