In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Week 1: Edit Distance

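Edit distance is the minimum total cost of the edits needed to turn one string into another. The edit operations are:
- Insert: add a letter
- Delete: remove a letter
- Switch: swap two adjacent letters
- Replace: change one letter into another

Note: the `edit_distance` function below uses only insert, delete, and replace; it does not model switch.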

In [10]:
def edit_distance(source, target, ins_cost=1, del_cost=1, rep_cost=2):
    """Build the minimum-edit-distance table for turning `source` into `target`.

    Only insertions, deletions, and replacements are considered.

    Args:
        source: sequence to transform (indexed along the rows).
        target: sequence to produce (indexed along the columns).
        ins_cost: cost of inserting one element of `target`.
        del_cost: cost of deleting one element of `source`.
        rep_cost: cost of replacing an element of `source` with a different
            element of `target` (matching elements cost 0).

    Returns:
        A list of lists `D` with `len(source) + 1` rows and `len(target) + 1`
        columns, where `D[i][j]` is the minimum cost of converting
        `source[:i]` into `target[:j]`. The overall edit distance is
        `D[-1][-1]`.
    """
    n_rows = len(source) + 1
    n_cols = len(target) + 1
    D = [[0] * n_cols for _ in range(n_rows)]
    for i in range(n_rows):
        for j in range(n_cols):
            if i == 0 and j == 0:
                D[i][j] = 0
            elif i == 0:
                # Empty source prefix: build target[:j] purely by insertions.
                D[i][j] = D[i][j-1] + ins_cost
            elif j == 0:
                # Empty target prefix: erase source[:i] purely by deletions.
                D[i][j] = D[i-1][j] + del_cost
            else:
                # Coming from the left adds target[j-1] -> an insertion.
                left_cost = D[i][j-1] + ins_cost
                # Coming from above drops source[i-1] -> a deletion.
                up_cost = D[i-1][j] + del_cost
                # Diagonal: keep a matching element for free, else replace it.
                diag_cost = D[i-1][j-1] + (0 if source[i-1] == target[j-1] else rep_cost)
                D[i][j] = min(left_cost, up_cost, diag_cost)
    # The whole table is returned (not just D[-1][-1]) so callers can inspect it.
    return D

# Worked example: show the full cost table for turning 'play' into 'stay'.
source, target = 'play', 'stay'
edit_distance(source, target)

[[0, 1, 2, 3, 4],
 [1, 2, 3, 4, 5],
 [2, 3, 4, 5, 6],
 [3, 4, 5, 4, 5],
 [4, 5, 6, 5, 4]]

In [16]:
# Print the 'bay' -> 'day' cost table one row per line.
print(*edit_distance('bay', 'day'), sep='\n')

[0, 1, 2, 3]
[1, 2, 3, 4]
[2, 3, 2, 3]
[3, 4, 3, 2]


Autocorrect Model:
1. Identify a misspelled word: $w$
2. Find all strings within edit distance $n$ of $w$: $s_1,\cdots,s_N$
3. Filter candidate words: $w_1,\cdots,w_n$
4. Calculate word probabilities: $p(w_1),\cdots,p(w_n)$
5. Replace with the highest-probability word: $\hat w = \arg\max_{w_j} p(w_j)$

In [None]:
# Placeholders: `...` stands in for the real data and must be filled in before
# running this cell (as written, set(...) and dict(...) raise TypeError because
# Ellipsis is not iterable).
# `vocab` is tested for membership and iterated by autocorrect();
# `freqs` is indexed by each word of `vocab` to get its count.
vocab = set(...)
freqs = dict(...)
def autocorrect(misspelled, threshold=2):
    """Suggest the most frequent vocabulary word close to `misspelled`.

    Reads the module-level `vocab` (collection of known words) and `freqs`
    (mapping from word to count), and uses `edit_distance` for the cost table.

    Args:
        misspelled: the word to correct.
        threshold: maximum edit distance (inclusive) for a vocabulary word to
            count as a candidate.

    Returns:
        `misspelled` itself if it is already in `vocab`; otherwise the
        candidate with the highest count in `freqs` (the first one found on
        ties); or None when no vocabulary word is within `threshold`.

    Raises:
        KeyError: if a candidate word has no entry in `freqs`.
    """
    if misspelled in vocab:
        return misspelled
    # edit_distance returns the whole cost table; the distance itself is the
    # bottom-right cell, so compare that (not the table) with the threshold.
    candidates = [
        word for word in vocab
        if edit_distance(misspelled, word)[-1][-1] <= threshold
    ]
    if not candidates:
        return None
    # Dividing every count by the same constant does not change the argmax,
    # so rank candidates by their raw counts.
    return max(candidates, key=lambda word: freqs[word])

# Week 2: POS Tagging