## NLP Part B: Spellchecker and Autocorrector Application
____________________________
Created by: Group 4

# Restart

In [4]:
# Importing all required libraries for this task.
import nltk
from nltk.util import ngrams
from nltk.metrics.distance import edit_distance
from nltk.corpus import words
from nltk.tokenize import RegexpTokenizer
from itertools import chain
import json
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import PunktSentenceTokenizer
from nltk.stem import *
from nltk.corpus import wordnet as wn
import time
from tqdm import tqdm
from difflib import SequenceMatcher
lemmatizer = WordNetLemmatizer()

#### Create Dictionary

In [5]:
def parsing(sent):  
    """Parsing the sentence to corrected and original and storing in the dictionary."""
    loriginal = []
    lcorrected = []
    lcorr = []
    indexes = []
    cnt = 0
    
    for i in sent:
        if '|' in i:
            # Splitting the sentence on '|'
            str1 = i.split('|')
            # Previous word to '|' is storing in loriginal list.
            loriginal.append(str1[0])
            # Next word to '|' is storing in lcorrected list.
            lcorrected.append(str1[1])
            #Noting down the index of error.
            indexes.append(cnt)
        
        else:
            # If there is no '|' in sentence, sentence is stored in loriginal and lcorrected as it is.
            loriginal.append(i)
            lcorrected.append(i)
        cnt = cnt+1
        
    #Loading to loriginal, lcorrected and index list to dictionary.      
    dictionary = {'original': loriginal, 'corrected': lcorrected, 'indexes': indexes}
    
    return dictionary


#### Preprocessing

In [9]:
def preprocessing():
    """Loading the data from 'holbrook.txt' and passing to parsing function to get parssed sentences. 
    Returning the whole dictionary as data."""
    data = []
    
    # Reading the txt file
    text_file = open("metamorphosis_cleant", "r")
    lines = []
    for i in text_file:
        lines.append(i.strip())
    
    # Word tokenizing the sentences
    sentences = [nltk.word_tokenize(sent) for sent in lines]
    
    # Calling a parse function to get corrected, original sentences.
    for sent in sentences:
        data.append(parsing(sent))
    
    return data

print(data)

NameError: name 'data' is not defined

#### References:
##### https://www.languagetool.org/

### 1. Data PreProcessing
Data preprocessing stages are split into the following parts:
+ #### Tokenization of Words
+ #### Case Normalization
+ #### Removing the following:
  - Punctuation
  - Stop Words
  - Numeric Characters
  - Special Characters
  - Accented Characters

+ #### Stemming and Lemmatization?
  
<u><i>More Text Cleaning Considerations:</i></u>

- Handling large documents and large collections of text documents that do not fit into memory.
- Extracting text from markup like HTML, PDF, or other structured document formats.
- Transliteration of characters from other languages into English.
- Decoding Unicode characters into a normalized form, such as UTF8.
- Handling of domain specific words, phrases, and acronyms.
- Handling or removing numbers, such as dates and amounts.
- Locating and correcting common typos and misspellings.
- Resolve contractions for casual text.
- References: shorturl.at/pvHS8

-----------------------------------------
### 4. Design Deliverables

b)	Your application must be able to find the spelling errors and suggest a few words to the user to modify the text.

c)	The spelling errors that need to be addressed by your system are:

i.	Non-words (wrong spelling, where the word does not exist)

ii.	Real-words (wrong spelling due to wrong context, but the misspelt word does exist)
    - Grammatical errors, typos e.t.c

d)	The techniques used for the detection of the spelling errors must include: <body>
  <p style="color:rgb(255,0,0);"> - Bigrams</p>
     <p style="color:rgb(255,0,0);"> - Minimum Edit Distance,</p>
     <p style="color:rgb(255,0,0);">- Other suitable popular techniques used in NLP</p>
   </body>

<p>e)	Provide the following functionality in your application: </p>
<p>   Ability to show a sorted list of all words in the corpus with the facility of exploring the list and search for a     specific word.</p>

	Ability to highlight the misspelled words, and right click to suggest the correct words (with their minimum edit     distance from the wrong word)


#### Preriquisites Processes

In [1]:
# Load Corpus
filename = 'metamorphosis_clean.txt'
#Add corpus to your working directory

file = open(filename, 'rt')
text = file.read()
file.close()

# import packages
import nltk
import re
import string
import unicodedata
import heapq                               
import os

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter

#### Tokenization of Words

In [3]:
# split into words by white space
words = text.split()

# split based on words only

words = re.split(r'\W+', text)

# split into words by white space
words = text.split()
print(words[:1000])

['The', '100', 'tradé®†¥mark!', '™', '®', 'Reading', 'Books', 'The', 'Project', 'Gutenberg', 'EBook', 'of', 'Metamorphosis,', 'by', 'Franz', 'Kafka', 'Translated', 'by', 'David', 'Wyllie.', 'This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever.', 'You', 'may', 'copy', 'it,', 'give', 'it', 'away', 'or', 're-use', 'it', 'under', 'the', 'terms', 'of', 'the', 'Project', 'Gutenberg', 'License', 'included', 'with', 'this', 'eBook', 'or', 'online', 'at', 'www.gutenberg.net', '**', 'This', 'is', 'a', 'COPYRIGHTED', 'Project', 'Gutenberg', 'eBook,', 'Details', 'Below', '**', '**', 'Please', 'follow', 'the', 'copyright', 'guidelines', 'in', 'this', 'file.', '**', 'ß', 'Title:', 'Metamorphosis', 'Author:', 'Franz', 'Kafka', 'Translator:', 'David', 'Wyllie', 'Release', 'Date:', 'August', '16,', '2005', '[EBook', '#5200]', 'First', 'posted:', 'May', '13,', '2002', 'Last', 'updated:', 'May', '20,', '2012

# 

#### Case Normalization

In [28]:
# text normalization - convert to lower case
nor_words = [word.lower() for word in words]
#print(nor_words[:100])

#### Removing Punctuation 

In [29]:
# remove punctuation from each word
table = str.maketrans('', '', string.punctuation)
stripped_words = [w.translate(table) for w in nor_words]
#print(stripped_words[:100])

#### Removing Stop Words

In [30]:
# Remove stop words
stop_words = set(stopwords.words('english'))
no_st_words = [w for w in stripped_words if not w in stop_words]
#print(no_st_words[:100])

#### Removing Numeric Characters

In [31]:
#Remove numeric characters
no_numbers = ' '.join(c for c in no_st_words if not c.isdigit())
#print(no_numbers[:100])

#### Remove Special Characters

In [32]:
# function to remove special characters
def remove_s_c(no_numbers):
    # define the pattern to keep
    rem = r'[^a-zA-z0-9.,!?/:;\"\'\s\w+)]' 
    return re.sub(rem, '',no_numbers)
 
# calling the function
no_sc_words = remove_s_c(no_numbers)

print(no_sc_words[:100])

# resulting in double spaces after removing special characters

tradémark   reading books project gutenberg ebook metamorphosis franz kafka translated david wyllie 


#### Remove Accented Characters

In [33]:
# imports
# function to remove accented characters
def remove_a_c(no_sc_words):
    new_text = unicodedata.normalize('NFKD', no_sc_words).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return new_text
# call function
no_ac_words = remove_a_c(no_sc_words)
no_ws_words = (" ".join(no_ac_words.split()))

print(no_ac_words[:100])

trademark   reading books project gutenberg ebook metamorphosis franz kafka translated david wyllie 


### Remove Whitespaces

In [34]:
no_ws_words = (" ".join(no_ac_words.split()))

print(no_ws_words[:100])


trademark reading books project gutenberg ebook metamorphosis franz kafka translated david wyllie eb


### Expanding Contractions

In [35]:
from pycontractions import Contractions

contractions.fix("you're happy now")
# "you are happy now"
contractions.fix("yall're happy now", slang=False) # default: true
# "yall are happy"
contractions.fix("yall're happy now")
# "you all are happy now"

ModuleNotFoundError: No module named 'pycontractions'

In [37]:
pip install contractions
#conda install -c conda-forge spacy

SyntaxError: invalid syntax (<ipython-input-37-d7eab82b0c99>, line 1)

### Write to File


In [54]:
#import pathlib
# write to file
#pathlib.Path("output.txt").write_text((no_sc_words))

#import re

#fin = open("data.txt", "rt")
#fout = open("out.txt", "wt")

#for line in fin:
#	fout.write(re.sub('\s+',' ',line))
#	
#fin.close()
#fout.close()

#### Stemming of Words

In [261]:
# using the SnowballStemmer which is based on The Porter Stemming Algorithm
##snowball_stemmer = SnowballStemmer('english')

##word_tokens = nltk.word_tokenize(no_ac_words)
##stemmed_word = [snowball_stemmer.stem(word) for word in word_tokens]
#print(stemmed_word[:100])

#### Lemmatization of Words

In [262]:
# Lemmatization
##wordnet_lemmatizer = WordNetLemmatizer()

##word_tokens = nltk.word_tokenize(no_ac_words)
##lemmatized_word = [wordnet_lemmatizer.lemmatize(word) for word in word_tokens]
#print(lemmatized_word)

# 2. Design Deliverables

In [25]:
import re
import nltk

from contractions import contractions_dict
def expand_contractions(text, contractions_dict):
    contractions_pattern = re.compile('({})'.format('|'.join(contractions_dict.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)
def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contractions_dict.get(match) \
            if contractions_dict.get(match) \
            else contractions_dict.get(match.lower())
        expanded_contraction = expanded_contraction
        return expanded_contraction
expanded_text = contractions_pattern.sub(expand_match, text)
expanded_text = re.sub("'", "", expanded_text)
return expanded_text
def main():
    text = """I ain't going there. You'll have to go alone."""
    
    text=expand_contractions(text,contractions_dict)
    sentences = nltk.sent_tokenize(text)
    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
    
    print (tokenized_sentences)
if __name__ == '__main__':
    main()

ModuleNotFoundError: No module named 'contractions'