# 1. Import modules & Read files

Import required modules.

In [125]:
import re
import numpy as np
import Levenshtein as levenshtein
import json
from sklearn.model_selection import train_test_split
import jellyfish

Define a function to read the files.

In [126]:
def readfile(filename):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one string per line of the file, each passed through
        ``strip()`` (so the trailing newline is dropped and blank lines
        become empty strings).
    """
    # The context manager closes the handle even if reading raises, which the
    # explicit open()/close() pair did not guarantee.
    with open(filename, "r") as f:
        return [line.strip() for line in f]

Read the dictionary and the wiki datasets and store them as variables.

In [127]:
# Load the dictionary entries and the two index-aligned wiki token lists
# (intended spellings and their misspellings) in one pass.
dict_ls, correct_ls, misspell_ls = (
    readfile(path) for path in ("dict.txt", "wiki_correct.txt", "wiki_misspell.txt")
)

# 2. Evaluation Metrics

Define a function to evaluate the spelling correction results. (Precision & Recall & F-Score)

In [128]:
def wiki_eval(result_dict, curr_misspell_ls, curr_correct_ls):
    """Score spelling-correction candidates with precision, recall and F-score.

    Every misspelled token is looked up in ``result_dict`` and its candidate
    list is compared with the intended word at the same index:

    * if the intended word is among the candidates, the token contributes one
      true positive and every other candidate is a false positive;
    * otherwise the token contributes one false negative and every candidate
      is a false positive.

    Args:
        result_dict: Maps each misspelled token to its list of candidates.
        curr_misspell_ls: Sequence of misspelled tokens.
        curr_correct_ls: Sequence of intended words, aligned by index with
            ``curr_misspell_ls``.

    Returns:
        A ``(precision, recall, fscore)`` tuple of floats. Any ratio whose
        denominator is zero is reported as ``0.0`` instead of raising
        ``ZeroDivisionError`` (e.g. empty input, or no correct prediction).

    Raises:
        KeyError: if a misspelled token has no entry in ``result_dict``.
        IndexError: if ``curr_correct_ls`` is shorter than ``curr_misspell_ls``.
    """
    tp = 0
    fp = 0
    fn = 0

    for i, word in enumerate(curr_misspell_ls):
        candidates = result_dict[word]
        if curr_correct_ls[i] in candidates:
            tp += 1
            fp += len(candidates) - 1
        else:
            fn += 1  # miss
            fp += len(candidates)

    predicted = tp + fp
    expected = tp + fn
    precision = float(tp) / predicted if predicted else 0.0
    recall = float(tp) / expected if expected else 0.0

    # Harmonic mean is undefined when both components are zero; report 0.0.
    total = precision + recall
    fscore = 2 * (precision * recall) / total if total else 0.0

    return precision, recall, fscore

# 3. Baseline method: Levenshtein Distance (LD)
## 3.1. Run with the whole Wikipedia Dataset (4453 tokens)

Define a function to find the best matches for the misspelled words according to the levenshtein distance.

In [129]:
def best_match_LD(target):
    """Return every entry of ``dict_ls`` at minimal Levenshtein distance from ``target``.

    The first dictionary entry seeds the running minimum; the remaining
    entries are scanned in order and all entries tied for the smallest
    distance are returned in dictionary order.
    """
    best = levenshtein.distance(target, dict_ls[0])
    candidates = [dict_ls[0]]
    target_len = len(target)

    for entry in dict_ls[1:]:
        # The edit distance can never be smaller than the length difference,
        # so only entries whose length gap is within the current best are scored.
        if abs(len(entry) - target_len) <= best:
            d = levenshtein.distance(target, entry)  # global edit distance
            if d < best:
                best, candidates = d, [entry]
            elif d == best:
                candidates.append(entry)

    return candidates

Find the best matches for the words in wiki data sets according to levenshtein.

In [131]:
# Correct each distinct misspelled token once; repeated tokens reuse the
# candidate list that is already stored.
LD_correction_dict = {}
for word in misspell_ls:
    if word in LD_correction_dict:
        continue  # already corrected
    LD_correction_dict[word] = best_match_LD(word)

Save the correction results to a NumPy (`.npy`) file.

In [132]:
# Save
# The object handed to np.save is the plain dict of candidate lists, not an
# array; the load cell gets the dict back by calling .item() on the result.
np.save('levenshtein_results.npy', LD_correction_dict) 

Load the results from the NumPy (`.npy`) file if required.

In [133]:
# Load
# The file holds a pickled Python dict, so unpickling has to be enabled
# explicitly (np.load defaults to allow_pickle=False since NumPy 1.16.3);
# .item() then unwraps the loaded object back into the dict.
# NOTE: only load .npy files produced by this notebook - unpickling untrusted
# files can execute arbitrary code.
LD_correction_dict = np.load('levenshtein_results.npy', allow_pickle=True).item()

Precision, recall, and F-score for Levenshtein (baseline)

In [134]:
# Score the Levenshtein corrections on the full dataset and report the metrics.
# print(...) with one parenthesised argument behaves the same on Python 2 and
# is also valid on Python 3, unlike the bare print statement.
(precision, recall, fscore) = wiki_eval(LD_correction_dict, misspell_ls, correct_ls)
print("Evaluation Metrics (Levenshtein Distance, 100% Data)")
print("---------------------------------------------------")
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F-score: " + str(fscore))

Evaluation Metrics (Levenshtein Distance, 100% Data)
---------------------------------------------------
Precision: 0.260432080497
Recall: 0.790478329216
F-score: 0.391785853414


## 3.2. Run with only 20% of the Wikipedia Dataset

### 3.2.1. Randomly extract 20% of the Wikipedia Dataset as a subset

Randomly extract the subset with "train_test_split" function.

In [46]:
# Draw the 20% subset with a fixed seed so that re-running this cell on a
# fresh kernel selects the same tokens. Without random_state every run draws
# a different sample and the downstream metrics are not reproducible.
misspell_subset, misspell_left, correct_subset, correct_left = train_test_split(
    misspell_ls,
    correct_ls,
    train_size = 0.2,
    test_size = 0.8,
    shuffle = True,
    random_state = 42)

Combine two subsets into a single list.

In [47]:
# Pair the sampled misspellings (index 0) with their intended words (index 1).
combined_subset = [misspell_subset, correct_subset]

Save the combined subset to numpy file.

In [48]:
# Save
# Persist the sampled pair of lists so the same subset can be loaded again
# later instead of being re-drawn.
np.save('combined_subset.npy', combined_subset) 

Load the combined subset from numpy file and store as variables if required.

In [67]:
# Load
combined_subset = np.load('combined_subset.npy')

# index 0 holds the misspelled tokens, index 1 the aligned intended words
misspell_subset, correct_subset = combined_subset[0], combined_subset[1]

### 3.2.2. Run Levenshtein with the data subset

Find the best matches for the words in data subset according to Levenshtein.

In [50]:
# Correct each distinct token of the 20% subset once with Levenshtein distance.
LD_subcorrection_dict = {}
for word in misspell_subset:
    if word in LD_subcorrection_dict:
        continue  # already corrected
    LD_subcorrection_dict[word] = best_match_LD(word)

Save the correction results.

In [51]:
# Save
# The object handed to np.save is the plain dict of candidate lists, not an
# array; the load cell gets the dict back by calling .item() on the result.
np.save('subset_levenshtein_results.npy', LD_subcorrection_dict)

In [65]:
# Load
# The file holds a pickled Python dict, so unpickling has to be enabled
# explicitly (np.load defaults to allow_pickle=False since NumPy 1.16.3);
# .item() then unwraps the loaded object back into the dict.
# NOTE: only load .npy files produced by this notebook - unpickling untrusted
# files can execute arbitrary code.
LD_subcorrection_dict = np.load('subset_levenshtein_results.npy', allow_pickle=True).item()

Precision, recall, and F-score for Levenshtein (baseline)

In [68]:
# Score the Levenshtein corrections on the 20% subset and report the metrics.
# print(...) with one parenthesised argument behaves the same on Python 2 and
# is also valid on Python 3, unlike the bare print statement.
(precision, recall, fscore) = wiki_eval(LD_subcorrection_dict, misspell_subset, correct_subset)
print("Evaluation Metrics (Levenshtein Distance, 20% Data)")
print("---------------------------------------------------")
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F-score: " + str(fscore))

Evaluation Metrics (Levenshtein Distance, 20% Data)
---------------------------------------------------
Precision: 0.269969278034
Recall: 0.789887640449
F-score: 0.402404121351


# 4. Damerau-Levenshtein Distance (DLD)

Define a function to find the best matches of the misspelled words with DLD.

In [71]:
def _as_text(value):
    """Decode a UTF-8 byte string to text; return already-decoded text unchanged."""
    return value.decode("utf-8") if isinstance(value, bytes) else value


def best_match_DLD(target):
    """Return every entry of ``dict_ls`` at minimal Damerau-Levenshtein distance.

    Args:
        target: The misspelled token, as a UTF-8 byte string or as text.

    Returns:
        The entries of ``dict_ls`` (in their original, undecoded form and in
        dictionary order) that are tied for the smallest distance to ``target``.
    """
    # Decode the target once instead of once per dictionary entry.
    target_text = _as_text(target)
    target_len = len(target_text)

    # init best match as first entry of the dict
    min_dist = jellyfish.damerau_levenshtein_distance(target_text, _as_text(dict_ls[0]))
    best_matches = [dict_ls[0]]

    for word in dict_ls[1:]:
        word_text = _as_text(word)
        # The distance is at least the length difference, so such an entry can
        # not reach min_dist. Lengths are compared in characters - the unit
        # the distance is measured in - rather than in UTF-8 bytes, which
        # could wrongly skip entries containing multi-byte characters.
        if abs(len(word_text) - target_len) > min_dist:
            continue
        dist = jellyfish.damerau_levenshtein_distance(target_text, word_text)  # cal DLD
        # replace if shorter distance
        if dist < min_dist:
            min_dist = dist
            best_matches = [word]
        elif dist == min_dist:
            best_matches.append(word)

    return best_matches

Find the best matches for the wiki misspelled words according to DLD.

In [72]:
# Correct each distinct token of the 20% subset once with Damerau-Levenshtein.
DLD_correction_dict = {}
for word in misspell_subset:
    if word in DLD_correction_dict:
        continue  # already corrected
    DLD_correction_dict[word] = best_match_DLD(word)

Save the DLD correction results to a NumPy (`.npy`) file, and load them back if required.

In [73]:
# Save
# The object handed to np.save is the plain dict of candidate lists, not an
# array; the load cell gets the dict back by calling .item() on the result.
np.save('DLD_results.npy', DLD_correction_dict)

In [74]:
# Load
# The file holds a pickled Python dict, so unpickling has to be enabled
# explicitly (np.load defaults to allow_pickle=False since NumPy 1.16.3);
# .item() then unwraps the loaded object back into the dict.
# NOTE: only load .npy files produced by this notebook - unpickling untrusted
# files can execute arbitrary code.
DLD_correction_dict = np.load('DLD_results.npy', allow_pickle=True).item()

Precision, recall, and F-score for DLD

In [75]:
# Score the Damerau-Levenshtein corrections on the 20% subset and report the metrics.
# print(...) with one parenthesised argument behaves the same on Python 2 and
# is also valid on Python 3, unlike the bare print statement.
(precision, recall, fscore) = wiki_eval(DLD_correction_dict, misspell_subset, correct_subset)
print("Evaluation Metrics (Damerau-Levenshtein Distance, 20% Data)")
print("---------------------------------------------------")
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F-score: " + str(fscore))

Evaluation Metrics (Damerau-Levenshtein Distance, 20% Data)
---------------------------------------------------
Precision: 0.342176258993
Recall: 0.855056179775
F-score: 0.488760436737


# 5. Weighted-Levenshtein Distance (WLD)

# 6. N-Gram Distance (N=2)

Define a function to get all the bigrams of a word as a dictionary.

In [94]:
def get_bigram_dict(word):
    """Count the character bigrams of ``word`` after padding it with ``#``.

    One ``#`` is added on each side of the word, so the first and last
    characters each appear in a boundary bigram.

    Returns:
        A dict mapping every two-character bigram of the padded word to the
        number of times it occurs.
    """
    padded = "#" + word + "#"
    counts = {}

    # Pair every character with its right-hand neighbour.
    for left, right in zip(padded, padded[1:]):
        pair = left + right
        counts[pair] = counts.get(pair, 0) + 1

    return counts

Define a function to calculate the bigram distance between the ngram dictionaries s_dict and t_dict.

In [104]:
def bigram_dist(s_dict, t_dict, s_len, t_len):
    """Bigram distance between two words given their bigram-count dicts.

    Args:
        s_dict: Bigram -> count mapping of the first word.
        t_dict: Bigram -> count mapping of the second word.
        s_len: Value used as the length of the first word.
        t_len: Value used as the length of the second word.

    Returns:
        ``(s_len + 1) + (t_len + 1) - 2 * shared``, where ``shared`` sums, over
        the bigrams present in both dicts, the smaller of the two counts.
    """
    shared = 0

    for gram, s_count in s_dict.items():
        # Look the bigram up in the dict itself (hash lookup) instead of in a
        # keys() list, which is a linear scan per bigram on Python 2.
        if gram in t_dict:
            shared += min(s_count, t_dict[gram])

    return (s_len + 1) + (t_len + 1) - 2 * shared

Get the ngram dictionaries for each misspelled token & dictionary entries.

In [106]:
# n-gram dictionaries: one bigram-count dict per token, kept in the same
# order as the list it was built from so both can be indexed together.
misspell_subset_dict = [get_bigram_dict(word) for word in misspell_subset]
dict_ls_dict = [get_bigram_dict(word) for word in dict_ls]

Define a function to find the best matches for a misspelled token with its index in the subset given.

In [112]:
def best_match_ngram(target_index):
    """Return every entry of ``dict_ls`` at minimal bigram distance from a token.

    Args:
        target_index: Position of the misspelled token in ``misspell_subset``
            (and of its bigram dict in ``misspell_subset_dict``).

    Returns:
        The entries of ``dict_ls``, in dictionary order, that are tied for the
        smallest ``bigram_dist`` to the token.
    """
    target_dict = misspell_subset_dict[target_index]
    # Length of the target word itself. This used to be len(misspell_subset),
    # i.e. the number of tokens in the subset; that added the same constant to
    # every distance for a given target, so the selected matches are the same,
    # but the distance values were wrong.
    target_len = len(misspell_subset[target_index])

    # init best match as first entry of the dict
    min_dist = bigram_dist(target_dict, dict_ls_dict[0], target_len, len(dict_ls[0]))
    best_matches = [dict_ls[0]]

    for i in range(1, len(dict_ls)):
        dist = bigram_dist(target_dict, dict_ls_dict[i], target_len, len(dict_ls[i]))
        # replace if shorter distance
        if dist < min_dist:
            min_dist = dist
            best_matches = [dict_ls[i]]
        elif dist == min_dist:
            best_matches.append(dict_ls[i])

    return best_matches

Find the best matches for the wiki misspelled words according to N-Gram (N=2).

In [118]:
# Correct each distinct token of the subset once; the index of its first
# occurrence is what best_match_ngram uses to find the token's bigram dict.
ngram_correction_dict = {}
for i, word in enumerate(misspell_subset):
    if word in ngram_correction_dict:
        continue  # already corrected
    ngram_correction_dict[word] = best_match_ngram(i)

In [120]:
# Save
# The object handed to np.save is the plain dict of candidate lists, not an
# array; the load cell gets the dict back by calling .item() on the result.
np.save('ngram_results.npy', ngram_correction_dict)

In [121]:
# Load
# The file holds a pickled Python dict, so unpickling has to be enabled
# explicitly (np.load defaults to allow_pickle=False since NumPy 1.16.3);
# .item() then unwraps the loaded object back into the dict.
# NOTE: only load .npy files produced by this notebook - unpickling untrusted
# files can execute arbitrary code.
ngram_correction_dict = np.load('ngram_results.npy', allow_pickle=True).item()

Precision, recall, and F-score for N-Gram.

In [122]:
# Score the bigram corrections on the 20% subset and report the metrics.
# print(...) with one parenthesised argument behaves the same on Python 2 and
# is also valid on Python 3, unlike the bare print statement.
(precision, recall, fscore) = wiki_eval(ngram_correction_dict, misspell_subset, correct_subset)
print("Evaluation Metrics (N-Gram Distance, 20% Data)")
print("---------------------------------------------------")
print("Precision: " + str(precision))
print("Recall: " + str(recall))
print("F-score: " + str(fscore))

Evaluation Metrics (N-Gram Distance, 20% Data)
---------------------------------------------------
Precision: 0.481452249408
Recall: 0.685393258427
F-score: 0.565600370885
