Import the required modules.

In [1]:
import re
import numpy as np
import Levenshtein as levenshtein
import json
import fuzzy

Read each data file and store its lines in a list.

In [2]:
def readfile(filename):
    """Read a text file and return its lines as a list of stripped strings.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one entry per line of the file, in file order, each with
        leading and trailing whitespace (including the newline) removed.
    """
    # The context manager closes the file even if reading raises part-way.
    with open(filename, "r") as f:
        return [line.strip() for line in f]

In [3]:
# Load every data file into a list of stripped lines (one entry per line).
# Candidate words searched by the matching functions below.
dict_ls = readfile("dict.txt")
# The two wiki lists are paired by index in wiki_eval below: entry i of the
# misspell list is checked against entry i of the correct list.
wiki_correct_ls = readfile("wiki_correct.txt")
wiki_misspell_ls = readfile("wiki_misspell.txt")
# NOTE(review): the birkbeck lists are not used in the visible cells;
# presumably paired the same way as the wiki lists — confirm.
birkbeck_correct_ls = readfile("birkbeck_correct.txt")
birkbeck_misspell_ls = readfile("birkbeck_misspell.txt")

Find the best match(es) for a word in the dictionary according to global edit distance.

In [4]:
def best_match_levenshtein(target, candidates=None):
    """Return the candidate word(s) closest to ``target`` by edit distance.

    Args:
        target: The word to look up.
        candidates: Optional iterable of words to search. When omitted, the
            module-level ``dict_ls`` is used.

    Returns:
        A list of every candidate whose ``levenshtein.distance`` to
        ``target`` equals the smallest distance found, in the order the
        candidates were visited. Empty when there are no candidates.
    """
    if candidates is None:
        candidates = dict_ls

    target_len = len(target)
    min_dist = None  # no candidate scored yet
    best_matches = []

    for word in candidates:
        # The length difference is a lower bound on the edit distance, so a
        # word whose length differs by more than min_dist cannot tie or win.
        if min_dist is not None and abs(len(word) - target_len) > min_dist:
            continue
        dist = levenshtein.distance(target, word)
        if min_dist is None or dist < min_dist:
            # strictly closer: restart the list of best matches
            min_dist = dist
            best_matches = [word]
        elif dist == min_dist:
            best_matches.append(word)

    return best_matches

Find the best matches for the words in the wiki data set according to Levenshtein distance.

In [6]:
# Map each distinct misspelling to its closest dictionary words.
wiki_correction_dict = {}
for word in wiki_misspell_ls:
    if word in wiki_correction_dict:
        # already looked up; repeated misspellings are only computed once
        continue
    wiki_correction_dict[word] = best_match_levenshtein(word)

Save correction results with global edit distance method.

In [7]:
# Persist the edit-distance corrections as JSON. The with-block closes the
# file on exit, so no explicit close() is needed.
with open('ged_correction.json', 'w') as fp:
    json.dump(wiki_correction_dict, fp)

Evaluation Metric Implementation (Precision and Recall only)

In [None]:
def wiki_eval(result_dict, misspellings=None, corrections=None, limit=500):
    """Score spelling corrections with precision and recall.

    Each misspelling is paired by position with its expected correction. A
    pair counts as a true positive when the expected word is among the
    suggestions in ``result_dict``; every other suggestion counts as a false
    positive, and a pair whose expected word is missing counts as a false
    negative.

    Args:
        result_dict: Maps each misspelled word to a list of suggested
            corrections.
        misspellings: Optional list of misspelled words. Defaults to the
            module-level ``wiki_misspell_ls``.
        corrections: Optional list of expected words, aligned by index with
            ``misspellings``. Defaults to the module-level
            ``wiki_correct_ls``.
        limit: Score at most this many leading pairs (default 500); ``None``
            scores every pair. Lists shorter than ``limit`` are scored in
            full.

    Returns:
        A ``(precision, recall)`` tuple of floats. A ratio whose denominator
        is zero is reported as 0.0.

    Raises:
        KeyError: If a scored misspelling is missing from ``result_dict``.
    """
    if misspellings is None:
        misspellings = wiki_misspell_ls
    if corrections is None:
        corrections = wiki_correct_ls

    # zip stops at the shorter list, so mismatched lengths are truncated.
    pairs = list(zip(misspellings, corrections))
    if limit is not None:
        pairs = pairs[:limit]

    tp = 0
    fp = 0
    fn = 0

    for word, expected in pairs:
        # Use the results passed in, not the global wiki_correction_dict.
        correction = result_dict[word]

        if expected in correction:
            tp += 1
            fp += len(correction) - 1  # every other suggestion is wrong
        else:
            fn += 1  # miss
            fp += len(correction)

    precision = float(tp) / (tp + fp) if (tp + fp) else 0.0
    recall = float(tp) / (tp + fn) if (tp + fn) else 0.0

    return precision, recall

In [9]:
# Reload the saved edit-distance corrections into `data`. The with-block
# closes the file on exit, so no explicit close() is needed.
with open('ged_correction.json', 'r') as fp:
    data = json.load(fp)

In [27]:
# Show the suggestions for the first 500 misspellings, one list per line.
for misspelled in wiki_misspell_ls[:500]:
    print(wiki_correction_dict[misspelled])

['abandoned']
['abbot', 'abbott', 'about']
['aberration', 'aeration']
['abilities']
['abilities']
['ability']
['abandon', 'bondon']
['abandoned']
['abandoning']
['abandons']
['aborigine']
['abortifacient']
['abote']
['abbreviate', 'breviate']
['abbreviated']
['abbreviation']
['arbitrary']
['abseil', 'absoil', 'assail']
['abseiling', 'assailing']
['absconce', 'absence', 'ascence']
['absence']
['absolutely']
['absorbtion']
['absorbtion']
['abidance', 'abundance']
['abundances']
['abundances']
['abundant']
['abuts', 'butts']
['academy']
['academic', 'cadmic']
['academic', 'accademia']
['academy']
['accused']
['acceleration']
['accension']
['accension']
['acceptance']
['acceptable']
['accessories']
['accessable']
['accident']
['accidental', 'accidentally', 'accidentals', 'accidentary', 'accidently']
['accidently']
['acclimatization']
['accomodate']
['accommodated', 'accomodate']
['accommodates', 'accomodate']
['accommodating']
['accommodation']
['accommodations']
['accomodate']
['accomodat

Find the best match(es) for a word in the dictionary according to its Soundex code.

In [None]:
def best_match_soundex(target):
    """Return every dictionary word with the same Soundex code as ``target``.

    Args:
        target: The word to look up.

    Returns:
        A list of the words in the module-level ``dict_ls`` whose code from
        ``fuzzy.Soundex(4)`` equals the code of ``target``, in dictionary
        order. Empty when no word matches.
    """
    soundex = fuzzy.Soundex(4)
    target_soundex = soundex(target)

    # Encode each dictionary word once. The per-word debug prints were
    # removed: they emitted the whole dictionary on every call.
    return [word for word in dict_ls if soundex(word) == target_soundex]

In [None]:
# Map each distinct misspelling to the dictionary words sharing its Soundex code.
wiki_correction_dict_soundex = {}
for word in wiki_misspell_ls:
    if word in wiki_correction_dict_soundex:
        # already looked up; repeated misspellings are only computed once
        continue
    wiki_correction_dict_soundex[word] = best_match_soundex(word)

In [None]:
# Scratch check: build the same encoder best_match_soundex uses and display
# the code it produces for a sample word.
soundex = fuzzy.Soundex(4)
soundex("aam")

In [None]:
# NOTE(review): `wiki_dict_ls` is not assigned in any visible cell, so this
# raises NameError on a fresh kernel unless it is defined elsewhere —
# confirm the intended variable (perhaps `dict_ls` or `wiki_misspell_ls`).
print wiki_dict_ls