Import the required modules.

In [1]:
import re
import numpy as np
import Levenshtein as levenshtein
import json
import fuzzy
import numpy as np
from sklearn.model_selection import train_test_split
import soundex

Read all files and store into lists.

In [3]:
def readfile(filename):
    """Read a text file and return its lines as a list of stripped strings.

    Args:
        filename: path of the file to read (opened in text mode "r").

    Returns:
        A list with one entry per line of the file, in file order, each with
        leading and trailing whitespace (including the newline) removed.
        Blank lines are kept as empty strings.
    """
    # The context manager closes the handle even if reading raises part-way.
    with open(filename, "r") as f:
        return [line.strip() for line in f]

In [4]:
# Load the reference dictionary and the paired correct/misspelled token lists.
# The files are read in the order listed and bound to the names on the left.
(dict_ls,
 wiki_correct_ls,
 wiki_misspell_ls,
 birkbeck_correct_ls,
 birkbeck_misspell_ls) = map(readfile, [
    "dict.txt",
    "wiki_correct.txt",
    "wiki_misspell.txt",
    "birkbeck_correct.txt",
    "birkbeck_misspell.txt",
])

Find the best match(es) for a word in the dictionary according to the global edit distance (Levenshtein distance).

In [5]:
def best_match_levenshtein(target):
    """Return every word of ``dict_ls`` at minimal Levenshtein distance from ``target``.

    Args:
        target: the (possibly misspelled) token to look up.

    Returns:
        A list of the dictionary words whose edit distance to ``target`` is the
        smallest found, in the order they appear in ``dict_ls``.
    """
    # Seed the search with the first dictionary entry.
    candidates = [dict_ls[0]]
    best = levenshtein.distance(target, candidates[0])
    target_len = len(target)

    for entry in dict_ls[1:]:
        # The length difference is a lower bound on the edit distance, so an
        # entry whose length differs by more than `best` can neither beat nor
        # tie the current best and is skipped without computing the distance.
        if abs(len(entry) - target_len) <= best:
            d = levenshtein.distance(target, entry)
            if d == best:
                candidates.append(entry)  # tie: keep alongside the others
            elif d < best:
                best = d  # strictly closer: restart the candidate list
                candidates = [entry]

    return candidates

Find the best matches for the words in the wiki data set according to the Levenshtein distance.

In [6]:
# Map each distinct misspelled wiki token to its list of best dictionary matches.
wiki_correction_dict = {}
for word in wiki_misspell_ls:
    if word in wiki_correction_dict:
        continue  # repeated token: already corrected, skip the search
    wiki_correction_dict[word] = best_match_levenshtein(word)

Save the correction results.

In [20]:
# Persist the Levenshtein corrections as JSON; the `with` block closes the
# file on exit, so no explicit close() is needed.
with open('ged_correction.json', 'w') as fp:
    json.dump(wiki_correction_dict, fp)

In [10]:
# Reload the saved Levenshtein corrections; the `with` block closes the file
# on exit, so no explicit close() is needed.
with open('ged_correction.json', 'r') as fp:
    wiki_correction_dict = json.load(fp)

Save the randomly selected misspelled subset (1335 tokens).

In [30]:
# NOTE(review): the sampling call below is commented out, so this cell does not
# create `misspell_subset`; presumably the subset is meant to be restored from
# 'misspell_subset.json' instead -- confirm before a fresh Restart & Run All.
# misspell_subset, misspell_left = train_test_split(wiki_misspell_ls, 
#                                                   train_size = 0.3,
#                                                   test_size = 0.7,
#                                                   shuffle = True)

In [31]:
# Persist the sampled subset as JSON; the `with` block closes the file on exit.
# NOTE(review): `misspell_subset` must already exist in the kernel -- the
# train_test_split call that would create it is commented out in the preceding
# cell, so on a fresh kernel this cell raises NameError. Confirm intended order.
with open('misspell_subset.json', 'w') as fp:
    json.dump(misspell_subset, fp)

In [None]:
# Reload the saved misspelled subset; the `with` block closes the file on exit,
# so no explicit close() is needed.
with open('misspell_subset.json', 'r') as fp:
    misspell_subset = json.load(fp)

Find the best matches for the words in wiki data subset according to levenshtein.

In [32]:
# Map each distinct token of the sampled subset to its best dictionary matches.
wiki_sub_correction_dict = {}
for word in misspell_subset:
    if word in wiki_sub_correction_dict:
        continue  # repeated token: already corrected, skip the search
    wiki_sub_correction_dict[word] = best_match_levenshtein(word)

Save the correction results for the subset.

In [33]:
# Persist the subset corrections as JSON; the `with` block closes the file on
# exit, so no explicit close() is needed.
with open('ged_sub_correction.json', 'w') as fp:
    json.dump(wiki_sub_correction_dict, fp)

In [34]:
# Reload the saved subset corrections; the `with` block closes the file on
# exit, so no explicit close() is needed.
with open('ged_sub_correction.json', 'r') as fp:
    wiki_sub_correction_dict = json.load(fp)

Evaluation Metric Implementation (Precision and Recall and F-Score)

In [18]:
def wiki_eval(result_dict, misspell_ls=None, correct_ls=None):
    """Score a set of predicted corrections with precision, recall and F-score.

    For every misspelled word in ``result_dict`` the gold correction is the
    entry of ``correct_ls`` at the position where the word FIRST occurs in
    ``misspell_ls``. A word counts as a true positive when the gold correction
    is among its candidates; every other candidate returned counts as a false
    positive; a word whose candidates miss the gold correction counts as a
    false negative.

    Args:
        result_dict: mapping of misspelled word -> list of candidate corrections.
        misspell_ls: misspelled tokens, parallel to ``correct_ls``. Defaults to
            the module-level ``wiki_misspell_ls``.
        correct_ls: gold corrections, parallel to ``misspell_ls``. Defaults to
            the module-level ``wiki_correct_ls``.

    Returns:
        A ``(precision, recall, fscore)`` tuple of floats. A ratio whose
        denominator is zero is reported as 0.0 instead of raising.

    Raises:
        ValueError: if a key of ``result_dict`` does not occur in ``misspell_ls``.
    """
    if misspell_ls is None:
        misspell_ls = wiki_misspell_ls
    if correct_ls is None:
        correct_ls = wiki_correct_ls

    # Index of the first occurrence of each misspelling, built once so that the
    # per-word lookup is O(1) rather than a linear list.index() scan.
    first_index = {}
    for position, misspelt in enumerate(misspell_ls):
        first_index.setdefault(misspelt, position)

    tp = fp = fn = 0
    for word, candidates in result_dict.items():
        if word not in first_index:
            raise ValueError("%r is not in the misspelling list" % (word,))
        true_word = correct_ls[first_index[word]]

        if true_word in candidates:
            tp += 1
            fp += len(candidates) - 1  # every other candidate is wrong
        else:
            fn += 1  # miss
            fp += len(candidates)

    # Guard each ratio so empty or all-miss input yields 0.0, not ZeroDivisionError.
    precision = float(tp) / (tp + fp) if (tp + fp) else 0.0
    recall = float(tp) / (tp + fn) if (tp + fn) else 0.0
    if precision + recall:
        fscore = 2 * (precision * recall) / (precision + recall)
    else:
        fscore = 0.0

    return precision, recall, fscore

Precision, recall, and F-score for the Levenshtein distance (baseline).

In [19]:
# Parenthesised so the same line runs on Python 2 (prints the tuple exactly as
# before) and on Python 3.
print(wiki_eval(wiki_correction_dict))

(0.27312278440219145, 0.8019872249822569, 0.40747641084199776)


GED implementation with customized parameter set

In [37]:
def edit_distance(s1, s2, params):
    """Global edit distance for transforming ``s1`` into ``s2`` with custom costs.

    Args:
        s1: source string.
        s2: destination string.
        params: 4-tuple ``(match, insert, delete, replace)`` of per-operation
            costs; lower totals are better.

    Returns:
        The minimal total cost, read from the last cell of a float numpy table.
    """
    match_cost, insert_cost, delete_cost, replace_cost = params

    # Rows follow s2, columns follow s1; row/column 0 stand for the empty prefix.
    n_rows = len(s2) + 1
    n_cols = len(s1) + 1
    table = np.zeros((n_rows, n_cols))

    # Borders: reaching a prefix of s2 from nothing takes only insertions,
    # reducing a prefix of s1 to nothing takes only deletions.
    table[1:, 0] = np.arange(1, n_rows) * insert_cost
    table[0, 1:] = np.arange(1, n_cols) * delete_cost

    # Fill the interior row by row.
    for row in range(1, n_rows):
        dest_ch = s2[row - 1]
        for col in range(1, n_cols):
            via_delete = table[row, col - 1] + delete_cost
            via_insert = table[row - 1, col] + insert_cost
            via_diagonal = table[row - 1, col - 1] + equal(
                s1[col - 1], dest_ch, match_cost, replace_cost)
            table[row, col] = min(via_delete, via_insert, via_diagonal)

    return table[n_rows - 1, n_cols - 1]


def equal(ch1, ch2, cost1, cost2):
    """Return ``cost1`` when the two characters are equal, otherwise ``cost2``."""
    return cost1 if ch1 == ch2 else cost2

Testing implementation

In [38]:
# Sanity check with match cost 0 and unit insert/delete/replace costs.
edit_distance("crat","arts",(0,1,1,1))

3.0

Find the best match(es) for the word in dictionary according to the soundex table.

In [None]:
def best_match_soundex(target):
    """Return every word of ``dict_ls`` whose Soundex code equals that of ``target``.

    Args:
        target: the (possibly misspelled) token to look up.

    Returns:
        A list of the matching dictionary words, in ``dict_ls`` order; empty
        when no dictionary word shares the target's code.
    """
    # Named `encode` so the imported `soundex` module is not shadowed locally.
    # NOTE(review): the argument 4 is presumably the code length -- confirm
    # against the fuzzy documentation.
    encode = fuzzy.Soundex(4)
    target_code = encode(target)

    # Each dictionary word is encoded exactly once; the per-word debug prints
    # were removed because they flooded the output for every lookup.
    return [word for word in dict_ls if encode(word) == target_code]

In [None]:
# Map each distinct misspelled wiki token to the dictionary words that share
# its Soundex code.
wiki_correction_dict_soundex = {}
for word in wiki_misspell_ls:
    if word in wiki_correction_dict_soundex:
        continue  # repeated token: already looked up
    wiki_correction_dict_soundex[word] = best_match_soundex(word)

In [None]:
# NOTE(review): this rebinds the module-level name `soundex`, hiding the
# `soundex` module imported in the first cell -- confirm nothing later relies
# on that module.
soundex = fuzzy.Soundex(4)
# Ad-hoc check of the encoder on a short sample token.
soundex("aam")

In [None]:
# NOTE(review): `wiki_dict_ls` is not defined in any cell visible here (the
# dictionary list loaded earlier is named `dict_ls`) -- confirm the intended name.
print wiki_dict_ls