# Generate noisy data for spell checker

The objective of this project is to build a model that can take a sentence with spelling mistakes as input, and output the same sentence, but with the mistakes corrected. The data that we will use for this project will be twenty popular books from [Project Gutenberg](http://www.gutenberg.org/ebooks/search/?sort_order=downloads). 

The sections of the project are:
- Loading the Data
- Preparing the Data

In [2]:
# Display the path of the Python interpreter that is running this notebook
import sys
sys.executable

'/usr/local/Cellar/jupyterlab/2.2.2/libexec/bin/python3.8'

In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
from os import listdir
from os.path import isfile, join
from collections import namedtuple
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
import time
import re
from sklearn.model_selection import train_test_split

## Loading the Data

In [5]:
def load_book(path, encoding=None):
    """Load a book from its file.

    Args:
        path: Path of the book file to read.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform's default encoding, so existing callers
            behave as before; pass an explicit value for results that do
            not depend on the machine's locale.

    Returns:
        The whole content of the file as a single string.
    """
    # The file is closed by the context manager even if reading fails.
    with open(path, encoding=encoding) as f:
        return f.read()

In [10]:
# Collect all of the book file names
path = './books/'
# os.listdir returns entries in arbitrary order, so skipping "the first" entry
# is not a reliable way to leave out a stray non-book file. Select the books
# by their .rtf extension instead.
# NOTE(review): assumes the entry that used to be skipped was not a book
# (e.g. a hidden system file) - confirm against the contents of ./books/.
book_files = [f for f in listdir(path)
              if isfile(join(path, f)) and f.endswith('.rtf')]

In [11]:
# Read the text of every book, keeping the same order as book_files
books = [load_book(path + file_name) for file_name in book_files]

In [12]:
# Report how many whitespace-separated words each book contains
for file_name, text in zip(book_files, books):
    word_count = len(text.split())
    print("There are {} words in {}.".format(word_count, file_name))

There are 126999 words in Pride_and_Prejudice_by_Jane_Austen.rtf.
There are 113452 words in David_Copperfield_by_Charles_Dickens.rtf.
There are 194282 words in The_Romance_of_Lust_by_Anonymous.rtf.
There are 25395 words in Metamorphosis_by_Franz_Kafka.rtf.
There are 191598 words in Great_Expectations_by_Charles_Dickens.rtf.
There are 165188 words in Oliver_Twist_by_Charles_Dickens.rtf.
There are 53211 words in The_Prince_by_Nicolo_Machiavelli.rtf.
There are 96185 words in The_Adventures_of_Tom_Sawyer_by_Mark_Twain.rtf.
There are 480495 words in The_Count_of_Monte_Cristo_by_Alexandre_Dumas.rtf.
There are 78912 words in Frankenstein_by_Mary_Shelley.rtf.
There are 33464 words in Through_the_Looking_Glass_by_Lewis_Carroll.rtf.
There are 9463 words in The_Yellow_Wallpaper_by_Charlotte_Perkins_Gilman.rtf.
There are 166996 words in Dracula_by_Bram_Stoker.rtf.
There are 163109 words in Emma_by_Jane_Austen.rtf.
There are 105428 words in Grimms_Fairy_Tales_by_The_Brothers_Grimm.rtf.
There are 83

In [13]:
# Check to ensure the text looks alright:
# preview the first 500 characters of the first loaded book
books[0][:500]

'{\\rtf1\\ansi\\ansicpg1252\\cocoartf1404\\cocoasubrtf470\n{\\fonttbl\\f0\\fmodern\\fcharset0 Courier;}\n{\\colortbl;\\red255\\green255\\blue255;\\red0\\green0\\blue0;}\n\\margl1440\\margr1440\\vieww10800\\viewh8400\\viewkind0\n\\deftab720\n\\pard\\pardeftab720\\sl280\\partightenfactor0\n\n\\f0\\fs24 \\cf2 \\expnd0\\expndtw0\\kerning0\n\\outl0\\strokewidth0 \\strokec2 The Project Gutenberg EBook of Pride and Prejudice, by Jane Austen\\\n\\\nThis eBook is for the use of anyone anywhere at no cost and with\\\nalmost no restrictions whatsoever.  Y'

## Preparing the Data

In [14]:
def clean_text(text):
    '''Remove unwanted characters and extra spaces from the text.

    The substitutions are applied one after another in the order listed
    below. The order matters: the backslash is stripped first, so the later
    patterns match the backslash-free remnants (e.g. "'92t"), and the
    specific contraction patterns must run before the bare "'92" removal.

    Args:
        text: Raw text of a book.

    Returns:
        The text with newlines turned into spaces, markup characters and
        quote codes removed, exactly one space after '.', '!' and '?', and
        runs of spaces collapsed to a single space.
    '''
    # Raw strings keep regex escapes such as \. and \? from being treated as
    # (invalid) Python string escapes.
    substitutions = (
        (r'\n', ' '),
        (r'[{}@_*>()\\#%+=\[\]]', ''),
        # Removes the literal text "a0" wherever it occurs.
        (r'a0', ''),
        # "'92" followed by t/s/m/ll becomes a plain apostrophe contraction
        # (presumably remnants of RTF \'92 escapes - confirm).
        (r"'92t", "'t"),
        (r"'92s", "'s"),
        (r"'92m", "'m"),
        (r"'92ll", "'ll"),
        # Any remaining quote codes are dropped entirely.
        (r"'91", ''),
        (r"'92", ''),
        (r"'93", ''),
        (r"'94", ''),
        # Guarantee a space after sentence-ending punctuation ...
        (r'\.', '. '),
        (r'!', '! '),
        (r'\?', '? '),
        # ... then collapse the doubled spaces that this may have created.
        (r' +', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [15]:
# Run every loaded book through clean_text, preserving the book order
clean_books = [clean_text(raw_book) for raw_book in books]

In [16]:
# Check to ensure the text has been cleaned properly:
# preview the first 500 characters of the first cleaned book
clean_books[0][:500]

'rtf1ansiansicpg1252cocoartf1404cocoasubrtf470 fonttblf0fmodernfcharset0 Courier; colortbl;red255green255blue255;red0green0blue0; margl1440margr1440vieww10800viewh8400viewkind0 deftab720 pardpardeftab720sl280partightenfactor0 f0fs24 cf2 expnd0expndtw0kerning0 outl0strokewidth0 strokec2 The Project Gutenberg EBook of Pride and Prejudice, by Jane Austen This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it unde'

In [17]:
# Map every distinct character to an integer id. Ids are handed out in order
# of first appearance, so the next free id always equals the current size of
# the dictionary.
vocab_to_int = {}
for book in clean_books:
    for character in book:
        vocab_to_int.setdefault(character, len(vocab_to_int))

# The special tokens receive the ids that follow the last character id
codes = ['<PAD>','<EOS>','<GO>']
count = len(vocab_to_int)
for code in codes:
    vocab_to_int[code] = count
    count += 1

In [18]:
# Check the size of vocabulary and all of the values.
# The count includes the special tokens; sorted() lists the dictionary keys.
vocab_size = len(vocab_to_int)
print("The vocabulary contains {} characters.".format(vocab_size))
print(sorted(vocab_to_int))

The vocabulary contains 78 characters.
[' ', '!', '"', '$', '&', "'", ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<EOS>', '<GO>', '<PAD>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


*Note: We could have made this project a little easier by using only lower case words and fewer special characters ($,&,-...), but I want to make this spell checker as useful as possible.*

In [19]:
# Invert vocab_to_int so that integer ids can be turned back into characters
int_to_vocab = {index: symbol for symbol, index in vocab_to_int.items()}

In [20]:
# Cut each cleaned book at every '. ' and put the period back on each piece
sentences = []
for book in clean_books:
    sentences.extend(piece + '.' for piece in book.split('. '))
print("There are {} sentences.".format(len(sentences)))

There are 127068 sentences.


In [21]:
# Check to ensure the text has been split correctly:
# show the first five sentences
sentences[:5]

['rtf1ansiansicpg1252cocoartf1404cocoasubrtf470 fonttblf0fmodernfcharset0 Courier; colortbl;red255green255blue255;red0green0blue0; margl1440margr1440vieww10800viewh8400viewkind0 deftab720 pardpardeftab720sl280partightenfactor0 f0fs24 cf2 expnd0expndtw0kerning0 outl0strokewidth0 strokec2 The Project Gutenberg EBook of Pride and Prejudice, by Jane Austen This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever.',
 'You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.',
 'gutenberg.',
 'org Title: Pride and Prejudice Author: Jane Austen Posting Date: August 26, 2008 EBook 1342 Release Date: June, 1998 Last Updated: October 17, 2016 Language: English Character set encoding: UTF-8 START OF THIS PROJECT GUTENBERG EBOOK PRIDE AND PREJUDICE Produced by Anonymous Volunteers PRIDE AND PREJUDICE By Jane Austen Chapter 1 It is a truth universally acknowledged, that a single m

*Note: I expect that you have noticed the very ugly text in the first sentence. We do not need to worry about removing it from any of the books because we will be limiting our data to sentences that are shorter than it.*

In [22]:
# Encode every sentence as the list of integer ids of its characters
int_sentences = [[vocab_to_int[character] for character in sentence]
                 for sentence in sentences]

In [23]:
# Collect the length of every encoded sentence in a one-column DataFrame
lengths = pd.DataFrame(list(map(len, int_sentences)), columns=["counts"])

In [24]:
# Summary statistics of the sentence lengths
lengths.describe()

Unnamed: 0,counts
count,127068.0
mean,122.043756
std,118.487983
min,1.0
25%,47.0
50%,93.0
75%,162.0
max,8906.0


In [25]:
# Keep only the sentences whose length lies within [min_length, max_length]
max_length = 92
min_length = 10

good_sentences = [sentence for sentence in int_sentences
                  if min_length <= len(sentence) <= max_length]

print("We will use {} to train and test our model.".format(len(good_sentences)))

We will use 56237 to train and test our model.


*Note: I decided to not use very long or short sentences because they are not as useful for training our model. Shorter sentences are less likely to include an error and the text is more likely to be repetitive. Longer sentences are more difficult to learn due to their length and increase the training time quite a bit. If you are interested in using this model for more than just a personal project, it would be worth using these longer sentences, and much more training data to create a more accurate model.*

In [26]:
# Split the data into training and testing sentences.
# 15% of the sentences are held out for testing; the fixed random_state makes
# the split repeatable.
training, testing = train_test_split(good_sentences, test_size = 0.15, random_state = 2)

print("Number of training sentences:", len(training))
print("Number of testing sentences:", len(testing))

Number of training sentences: 47801
Number of testing sentences: 8436


In [27]:
# Sort the sentences by length to reduce padding, which will allow the model to train faster.
# sorted() is stable, so sentences of equal length keep their relative order.
# This gives the same result as scanning the lists once per length, in a
# single pass. The range filter keeps only lengths within
# [min_length, max_length], as the per-length scan did.
training_sorted = sorted(
    (sentence for sentence in training
     if min_length <= len(sentence) <= max_length),
    key=len)
testing_sorted = sorted(
    (sentence for sentence in testing
     if min_length <= len(sentence) <= max_length),
    key=len)

In [28]:
# Check to ensure the sentences have been selected and sorted correctly.
# Slicing (rather than indexing 0..4) also works when fewer than five
# sentences are available.
for sentence in training_sorted[:5]:
    print(sentence, len(sentence))

[63, 24, 7, 20, 23, 19, 61, 0, 6, 42] 10
[61, 16, 0, 22, 6, 1, 13, 5, 23, 42] 10
[32, 24, 4, 5, 30, 19, 39, 13, 16, 42] 10
[59, 5, 7, 10, 24, 1, 20, 23, 39, 42] 10
[10, 16, 1, 23, 5, 17, 23, 0, 10, 42] 10


In [29]:
# Lower case letters that may be inserted as an extra, wrongly typed character
letters = list('abcdefghijklmnopqrstuvwxyz')


def noise_maker(sentence, threshold):
    '''Relocate, remove, or add characters to create spelling mistakes.

    Each position of the sentence is copied unchanged with probability
    `threshold`. Otherwise one of three mistakes is applied, each about
    equally likely: swap the character with the following one, insert a
    random lower case letter in front of it, or drop it.

    Args:
        sentence: Sequence of integer-encoded characters. It is not modified.
        threshold: Probability (0 to 1) that a character is kept as is.

    Returns:
        A new list of integers holding the noisy sentence.

    Inserted letters are encoded through the module-level `vocab_to_int`
    dictionary.
    '''
    noisy_sentence = []
    last_index = len(sentence) - 1
    i = 0
    while i < len(sentence):
        # Most characters will be correct since the threshold value is high
        if np.random.uniform(0, 1) < threshold:
            noisy_sentence.append(sentence[i])
        else:
            mistake = np.random.uniform(0, 1)
            # ~33% chance characters will swap locations
            if mistake > 0.67:
                if i < last_index:
                    # Swap order with the following character and skip over
                    # it, since it has now been emitted as well
                    noisy_sentence.append(sentence[i + 1])
                    noisy_sentence.append(sentence[i])
                    i += 1
                # The last character has nothing to swap with: it is not
                # typed. Control falls through to the increment below, so
                # the character is really dropped and not rolled again.
            # ~33% chance an extra lower case letter will be added to the sentence
            elif mistake < 0.33:
                random_letter = np.random.choice(letters)
                noisy_sentence.append(vocab_to_int[random_letter])
                noisy_sentence.append(sentence[i])
            # ~33% chance a character will not be typed: nothing is appended
        i += 1
    return noisy_sentence

*Note: The noise_maker function is used to create spelling mistakes that are similar to those we would make. Sometimes we forget to type a letter, type a letter in the wrong location, or add an extra letter.*

In [30]:
# Check to ensure noise_maker is making mistakes correctly.
# For the first five training sentences, print the original followed by a
# noisy copy; with threshold = 0.9 each character is kept with probability 0.9.
threshold = 0.9
for sentence in training_sorted[:5]:
    print(sentence)
    print(noise_maker(sentence, threshold))
    print()

[63, 24, 7, 20, 23, 19, 61, 0, 6, 42]
[63, 24, 7, 20, 23, 19, 28, 61, 0, 6, 42]

[61, 16, 0, 22, 6, 1, 13, 5, 23, 42]
[0, 22, 6, 1, 13, 5, 23, 42]

[32, 24, 4, 5, 30, 19, 39, 13, 16, 42]
[32, 24, 4, 5, 30, 19, 39, 13, 16, 42]

[59, 5, 7, 10, 24, 1, 20, 23, 39, 42]
[59, 5, 7, 10, 24, 1, 20, 23, 39, 42]

[10, 16, 1, 23, 5, 17, 23, 0, 10, 42]
[10, 16, 1, 23, 23, 0, 42, 10]

