### Autocorrect

- Identify a misspelled word
- Find strings that are $n$ edits away from it
- Filter candidates (by keeping only the real words from the previous step)
- Calculate word probabilities by taking the context into consideration

In [1]:
def process_data(file_name):
    """
    Read a text corpus and split it into lower-cased word tokens.

    Input: 
        file_name: path of the text file to read.
    Output: 
        words: a list with every maximal run of word characters found in the
               lower-cased file content, in order of appearance (duplicates kept).
    """
    # The context manager closes the file handle even if read() raises;
    # a bare open(...).read() leaves the handle open until garbage collection.
    with open(file_name, "r") as corpus_file:
        content = corpus_file.read()

    # Lower-case first so that "The" and "the" become the same token.
    return re.findall(r'\w+', content.lower())


def get_count(word_l):
    '''
    Tally how many times each word occurs.

    Input:
        word_l: an iterable of words representing the corpus (repeated words are counted).
    Output:
        word_count_dict: dictionary mapping each word to its number of occurrences,
                         with keys in order of first appearance.
    '''
    word_count_dict = {}

    for token in word_l:
        # A missing key counts as zero, so first and later sightings share one path.
        word_count_dict[token] = word_count_dict.get(token, 0) + 1

    return word_count_dict


def get_probs(word_count_dict):
    '''
    Turn raw word counts into relative frequencies.

    Input:
        word_count_dict: dictionary where the key is a word and the value is its count.
    Output:
        probs: dictionary with the same keys, each mapped to count / (sum of all counts).
    '''
    # Denominator: total number of word occurrences in the corpus.
    total = sum(word_count_dict.values())

    return {word: count / total for word, count in word_count_dict.items()}


def delete_letter(word, verbose=False):
    '''
    Generate every string obtained by removing exactly one character from word.

    Input:
        word: the string to edit
        verbose: when True, print the intermediate splits and the result
    Output:
        delete_l: a list with one entry per character position, in left-to-right order
    '''
    # 'nice' is split into: [('', 'nice'), ('n', 'ice'), ('ni', 'ce'), ('nic', 'e')]
    split_l = [(word[:cut], word[cut:]) for cut in range(len(word))]

    # Dropping the first character of each right half gives: ['ice', 'nce', 'nie', 'nic']
    delete_l = [head + tail[1:] for head, tail in split_l]

    if verbose:
        print(f"input word {word}, \nsplit_l = {split_l}, \ndelete_l = {delete_l}")

    return delete_l


def switch_letter(word, verbose=False):
    '''
    Generate every string obtained by swapping one pair of adjacent characters.

    Input:
        word: input string
        verbose: when True, print the intermediate splits and the result
    Output:
        switches: a list with one entry per adjacent pair, in left-to-right order.
                  Words shorter than two characters yield an empty list.
    '''
    # 'eta' is split into: [('', 'eta'), ('e', 'ta'), ('et', 'a')]
    split_l = [(word[:i], word[i:]) for i in range(len(word))]

    # Swap the first two characters of the right half and keep the WHOLE left
    # half as prefix, so long words are not truncated ('trash' -> 'trsah').
    # Splits whose right half has fewer than two characters have nothing to swap.
    switch_l = [a + b[1] + b[0] + b[2:] for a, b in split_l if len(b) >= 2]

    if verbose: print(f"Input word = {word} \nsplit_l = {split_l} \nswitch_l = {switch_l}") 

    return switch_l


def replace_letter(word, verbose=False):
    '''
    Generate every string obtained by replacing one character of word with a
    different lower-case ASCII letter.

    Input:
        word: the input string/word 
        verbose: when True, print the intermediate splits and the result
    Output:
        replaces: a sorted list of the distinct strings that differ from word
                  in exactly one position; word itself is never included.
    ''' 
    
    letters = 'abcdefghijklmnopqrstuvwxyz'

    # 'can' is split into: [('', 'can'), ('c', 'an'), ('ca', 'n')]
    split_l = [(word[:i], word[i:]) for i in range(len(word))]

    # Substitute the first character of each right half with every letter.
    # A set removes the duplicates that arise when a letter replaces itself.
    replace_set = {a + letter + b[1:] for a, b in split_l for letter in letters}

    # Replacing a letter by itself reproduces the input, which is not an edit.
    replace_set.discard(word)

    # Sorted list for deterministic output and easier viewing.
    replace_l = sorted(replace_set)
    
    if verbose: print(f"Input word = {word} \nsplit_l = {split_l} \nreplace_l {replace_l}")   
    
    return replace_l


def insert_letter(word, verbose=False):
    '''
    Generate every string obtained by inserting one lower-case ASCII letter into word.

    Input:
        word: the input string/word 
        verbose: when True, print the intermediate splits and the result
    Output:
        inserts: a list of 26 * (len(word) + 1) strings, grouped by insertion
                 offset (left to right) and alphabetical within each offset
    ''' 
    letters = 'abcdefghijklmnopqrstuvwxyz'

    # Every offset is covered, including before the first and after the last character.
    split_l = [(word[:offset], word[offset:]) for offset in range(len(word) + 1)]

    insert_l = [left + letter + right for left, right in split_l for letter in letters]

    if verbose:
        print(f"Input word {word} \nsplit_l = {split_l} \ninsert_l = {insert_l}")

    return insert_l


def edit_one_letter(word, allow_switches = True):
    """
    Collect every string that is exactly one edit away from word.

    Input:
        word: the string/word to edit.
        allow_switches: when True, adjacent-character swaps are included in
                        addition to inserts, deletes and replacements.
    Output:
        edit_one_set: a set (not a list) of the edited strings; word itself is excluded.
    """
    edit_one_set = set(insert_letter(word))
    edit_one_set.update(delete_letter(word))
    edit_one_set.update(replace_letter(word))

    if allow_switches:
        edit_one_set.update(switch_letter(word))

    # An edit may reproduce the input (e.g. swapping two equal letters); drop it.
    edit_one_set.discard(word)

    return edit_one_set


def edit_two_letters(word, allow_switches = True):
    '''
    Collect every string reachable from word with one or two edits.

    Input:
        word: the input string/word 
        allow_switches: when True, adjacent-character swaps count as an edit;
                        the flag is applied to both the first and the second edit.
    Output:
        edit_two_set: a set containing the one-edit strings together with all
                      strings one further edit away from any of them. Because a
                      second edit can undo the first, word itself may be included.
    '''
    # Forward allow_switches so callers can really disable swaps.
    edit_one_set = edit_one_letter(word, allow_switches)

    # Start from a copy so that the one-edit strings are part of the result.
    edit_two_set = set(edit_one_set)
    for word_one_edit_away in edit_one_set:
        edit_two_set.update(edit_one_letter(word_one_edit_away, allow_switches))

    return edit_two_set


def get_corrections(word, probs, vocab, n=2, verbose = False):
    '''
    Suggest up to n vocabulary words for a possibly misspelled word.

    Candidates are gathered in tiers of increasing edit distance: the word
    itself, then strings one edit away, then strings up to two edits away.
    Only candidates present in vocab are kept. Closer tiers rank first; inside
    a tier candidates are ordered by descending probability (ties broken
    alphabetically so the result is deterministic).

    Input: 
        word: a user entered string to check for suggestions
        probs: a dictionary that maps each word to its probability in the corpus
        vocab: a set containing all the vocabulary
        n: maximum number of corrections to return
        verbose: when True, print the entered word and the chosen suggestions
    Output: 
        n_best: a list of at most n (candidate, probability) tuples without duplicates.
    Raises:
        KeyError: if a candidate is in vocab but has no entry in probs.
    '''
    # Lazily evaluated tiers: a farther tier is generated only while fewer
    # than n candidates have been found, since it would rank after them anyway.
    tiers = (
        lambda: (word,),
        lambda: edit_one_letter(word, True),
        lambda: edit_two_letters(word, True),
    )

    ranked = []
    seen = set()
    for make_candidates in tiers:
        if len(ranked) >= n:
            break
        # `candidate` is a separate name: reusing `word` as the loop variable
        # would make the next tier start from an arbitrary edited string.
        fresh = {
            candidate for candidate in make_candidates()
            if candidate in vocab and candidate not in seen
        }
        seen |= fresh
        ranked.extend(sorted(fresh, key=lambda candidate: (-probs[candidate], candidate)))

    n_best = [(candidate, probs[candidate]) for candidate in ranked[:max(n, 0)]]
    suggestions = [candidate for candidate, _ in n_best]

    if verbose: print("entered word = ", word, "\nsuggestions = ", suggestions)

    return n_best


def min_edit_distance(source, target, ins_cost = 1, del_cost = 1, rep_cost = 2):
    '''
    Compute the minimum edit distance between two strings by dynamic programming.

    Input: 
        source: a string corresponding to the string you are starting with
        target: a string corresponding to the string you want to end with
        ins_cost: an integer setting the insert cost
        del_cost: an integer setting the delete cost
        rep_cost: an integer setting the replace cost
    Output:
        D: a matrix of len(source)+1 by len(target)+1 containing minimum edit distances;
           D[i, j] is the cost of turning source[:i] into target[:j]
        med: the minimum edit distance (med) required to convert the source string to the target
    '''
    m = len(source) 
    n = len(target) 
    # Initialize cost matrix with zeros and dimensions (m+1, n+1)
    D = np.zeros((m+1, n+1), dtype=int) 
        
    # Column 0: turning a source prefix into the empty string takes only deletions
    for row in range(1,m+1):
        D[row,0] = D[row-1,0]+del_cost
        
    # Row 0: building a target prefix from the empty string takes only insertions
    for col in range(1,n+1):
        D[0,col] = D[0,col-1]+ins_cost
        
    # Loop through row 1 to row m, both inclusive
    for row in range(1,m+1): 
        
        # Loop through column 1 to column n, both inclusive
        for col in range(1,n+1):
            
            # Replacing a character with itself is free; otherwise charge rep_cost.
            # Row/column k of D corresponds to character k-1 of the string.
            r_cost = rep_cost
            if source[row-1] == target[col-1]:
                r_cost = 0
                
            # Cheapest of: delete from source, insert into source, replace/match
            D[row,col] = min(D[row-1,col]+del_cost, D[row,col-1]+ins_cost, D[row-1,col-1]+r_cost)
          
    # Read the result at (m, n) directly: the loop variables row/col are
    # unbound when source or target is empty, which used to raise NameError.
    med = D[m,n]
    
    return D, med

### Markov chains

![2-2-1](images/natural-language-processing/2-2-1.png)

- $Q = \{q_{1}, q_{2}, q_{3}\}$: set of all states in the model

![2-2-2](images/natural-language-processing/2-2-2.png)

- Blue circles: part of speech tags
- Arrows: transition probability from one part of speech to another
- Table $A$ can be constructed from the diagram

![2-2-3](images/natural-language-processing/2-2-3.png)

- $A$ can also be written as transition matrix

![2-2-4](images/natural-language-processing/2-2-4.png)

- Emission probability: probability to go from one state (POS tag) to a specific word

![2-2-5](images/natural-language-processing/2-2-5.png)

- To populate emission matrix $B$, use labeled dataset

![2-2-6](images/natural-language-processing/2-2-6.png)

- $C(t_{(i-1)},t_{(i)})$ is the number of times that tag $t_{(i)}$ shows up after tag $t_{(i-1)}$

![2-2-7](images/natural-language-processing/2-2-7.png)

- $\pi$ is the initial state
- Numbers in the table indicate the number of times that a tag shows up right after another tag
    - For example, Noun shows up after the initial state once, so 1
    - For example, Other shows up after the initial state twice, so 2
    
![2-2-8](images/natural-language-processing/2-2-8.png)

- To deal with the "no occurrence" problem (tag pairs that never appear in the training data), we introduce a smoothing term $\epsilon$ such that

![2-2-9](images/natural-language-processing/2-2-9.png)

![2-2-10](images/natural-language-processing/2-2-10.png)

- We also use smoothing probability when populating Emission matrix
- $P(w_{i}|t_{i}) = \dfrac{C(t_{i},w_{i})+\epsilon}{\displaystyle\sum_{j=1}^{V}C(t_{i},w_{j}) + N * \epsilon} = \dfrac{C(t_{i},w_{i})+\epsilon}{C(t_{i}) + N * \epsilon}$
    - where $C(t_{i},w_{i})$ means how many times tag $t_{i}$ is associated with word $w_{i}$ 
    
    
### Viterbi algorithm

![2-2-11](images/natural-language-processing/2-2-11.png)

- To go from $\pi$ to $O$, multiply the transition probability of 0.3 by the emission probability of 0.5, which gives 0.15

![2-2-12](images/natural-language-processing/2-2-12.png)

### Viterbi initialization

- Populate two matrices $C$ and $D$ such that

![2-2-13](images/natural-language-processing/2-2-13.png)

![2-2-14](images/natural-language-processing/2-2-14.png)


### Viterbi forward pass

![2-2-15](images/natural-language-processing/2-2-15.png)

![2-2-16](images/natural-language-processing/2-2-16.png)


### Viterbi backward pass

![2-2-17](images/natural-language-processing/2-2-17.png)

- This equation gets the top right element of matrix $C$

![2-2-18](images/natural-language-processing/2-2-18.png)