# Prototyping Spell Corrections for CalFresh Application Dataset

Author: Rocio Ng (DSWG Lead)

### Summary:  
* The purpose of this notebook is to test various methods for spell checking/correcting free text entered into Applications for the CalFresh Program (https://www.getcalfresh.org/)

### Resources:
* Spanish Language Corpus - https://www.corpusdata.org/spanish.asp

Eric's notes:
* Replaced "i'm" with "i am"
* Replaced ' (most often in contractions) with ""
* Replaced all other punctuation with " "
* Then replaced all double spaces with single spaces

Do we need to add the whitelists to the enchant dictionary?

## Load Libraries

In [None]:
import numpy as np
import pandas as pd
from langdetect import detect

import warnings
warnings.filterwarnings(action='once') # displays warnings only once

import os
import sys

# Make the local helper modules importable: register the absolute path of
# the 'helper-modules' directory on sys.path (only once).
module_path = os.path.abspath('helper-modules')
if module_path not in sys.path:
    sys.path.append(module_path)

# For Multicore processing
from multiprocessing import Pool

# Helper Modules
from spell_checking_functions import *
from text_processing_functions import *

In [None]:
# Testing spell checker

correction_phrase("helpp meh with calfrsh whil i'm applying for ssi")

In [None]:
print(detect('Hi')) # False Negative Results
print(detect('I currently live in my truck'))
print(detect("estoy embarazada"))

## Load data
* Make sure paths point to where data files are stored locally if you choose to rename/move things

In [None]:
text_df = pd.read_csv("1-Data-Files/500_sample_results.csv")
# text_df = pd.read_csv("1-Data-Files/orig_entRep_300.csv")

In [None]:
text_df.shape

In [None]:
text_df.head()

## Processing

* Light text processing
* Detect Language
* Count Spelling Errors

In [None]:
# demo 
initial_phrase_processing(" PERSON wenT to The Store at CARDINAL place!!!")
initial_phrase_processing(" PERSON wenT to The Store at CARDINAL!!!") # Entities need to be reformatted to have whitespace surrounding them

In [None]:
# Both derived columns are computed from the entity-replaced text column.

# Light text processing of each phrase.
text_df['processed_phrase'] = text_df['with_entity_replacement'].apply(initial_phrase_processing)

# Language detection, using the modified version of the built-in detect function.
text_df['language'] = text_df['with_entity_replacement'].apply(detect_B)

In [None]:
text_df.head()

In [None]:
# Run check_phrase on every processed phrase and store the result per row.
text_df['spelling_errors'] = text_df['processed_phrase'].apply(check_phrase)

# Put the rows with the largest spelling_errors value first.
text_df = text_df.sort_values(by='spelling_errors', ascending=False)

In [None]:
text_df.groupby(by = "language").count()

In [None]:
other_languages = text_df[text_df.language.isin(['None'])]

In [None]:
other_languages

## Apply Spell Checking Functions

* Convert the DataFrame column of phrases to a list to enable multiprocessing
* Run the `correction_phrase` function on the text
* Append the results back to the DataFrame

In [None]:
# Single-process alternative to the multiprocessing cells below: apply the
# spell corrector to every processed phrase directly. The original line was an
# unfinished `lambda` (a SyntaxError). The Pool-based cells further down
# compute the same column in parallel and overwrite this one.
text_df['spelling_corrections'] = text_df['processed_phrase'].apply(correction_phrase)

In [None]:
spelling_error_list = text_df['processed_phrase'].tolist()

In [None]:
# NOTE(review): pandas DataFrames do not provide a .flatten() method, so this
# line is expected to raise AttributeError. If it did succeed it would rebind
# spelling_error_list, which the Pool.map call further down iterates over.
# TODO confirm the intent (pass the language alongside each phrase?) before fixing.
spelling_error_list = text_df[['processed_phrase', 'language']].flatten()

In [None]:
spelling_error_list.head()

In [None]:
# Preview
spelling_error_list[3:6]

In [None]:
my_pool = Pool(processes=4) # change to number of cores in machine

In [None]:
correction("tte")

In [None]:
%time spelling_corrections = my_pool.map(correction_phrase, spelling_error_list)

In [None]:
spelling_corrections[3:6]

In [None]:
text_df['spelling_corrections'] = spelling_corrections

In [None]:
text_df.head()

In [None]:
subset_df = text_df.iloc[10:15]

In [None]:
text_df.to_csv("gcf_circumstances_spell_correct.csv")

## Load White List Documents

In [None]:
# For converting the whitelist text file to CSV (kept commented out for reference)
# import csv 

# txt_file = r"2-Whitelist-Docs/wordsEn.txt"
# csv_file = r"2-Whitelist-Docs/wordsEn.csv"

# in_txt = csv.reader(open(txt_file, "rb"), delimiter = '\t')
# out_csv = csv.writer(open(csv_file, 'wb'))

# out_csv.writerows(in_txt)

In [None]:
# Read both whitelist files, then pull the "word" column of each into a list.
whitelist_df1, whitelist_df2 = (
    pd.read_csv(csv_path)
    for csv_path in ("2-Whitelist-Docs/white-list.csv", "2-Whitelist-Docs/wordsEn.csv")
)
whitelist_list1, whitelist_list2 = (
    frame["word"].tolist() for frame in (whitelist_df1, whitelist_df2)
)

In [None]:
# A notebook cell only echoes its last expression, so the first check used to
# be silently discarded. Report both results together as a tuple:
# (in whitelist 1, in whitelist 2).
"part" in whitelist_list1, "part" in whitelist_list2

In [None]:
# Combine both whitelists into a set. check_whitelist only tests membership
# (`word in whitelist`), and a set answers that without scanning the whole
# concatenated list for every word of every phrase.
whitelist_list = set(whitelist_list1) | set(whitelist_list2)

In [None]:
test_phrase = "This is a Test.   For Rocio. Hello. "


def check_whitelist(phrase, whitelist, method="remove"):
    """Filter the words of a phrase against a whitelist.

    The phrase is lower-cased, stripped, passed through ``removePunctuation``
    and split on single spaces (empty pieces are discarded). Each remaining
    word is then checked with ``word in whitelist``.

    Args:
        phrase: Text to clean.
        whitelist: Container of allowed words; anything supporting ``in``.
            Words are lower-cased before the lookup, so entries need to be
            lower case to match. A set gives the fastest lookups.
        method: ``"remove"`` drops words that are not whitelisted;
            ``"replace"`` substitutes them with ``"[redacted]"``.

    Returns:
        A ``(cleaned_phrase, delta)`` tuple, where ``cleaned_phrase`` is the
        surviving words joined by single spaces and ``delta`` is the number of
        words that were removed or redacted.

    Raises:
        ValueError: If ``method`` is neither ``"remove"`` nor ``"replace"``.
            (Previously this fell through to the ``return`` statement with
            unbound locals and raised UnboundLocalError.)
    """
    # Validate first so a bad method fails fast with a clear message.
    if method not in ("remove", "replace"):
        raise ValueError(
            'method must be "remove" or "replace", got {!r}'.format(method)
        )

    # Clean the phrase and split it into words.
    cleaned = removePunctuation(phrase.lower().strip())
    # Splitting on " " yields empty strings for repeated spaces; drop them.
    word_list = [word for word in cleaned.split(" ") if word != ""]

    if method == "remove":
        kept_words = [word for word in word_list if word in whitelist]
        return " ".join(kept_words), len(word_list) - len(kept_words)

    # method == "replace"
    masked_words = [
        word if word in whitelist else "[redacted]" for word in word_list
    ]
    return " ".join(masked_words), masked_words.count("[redacted]")


def removed_words(phrase, whitelist):
    """Placeholder that is not implemented yet; always returns None.

    NOTE(review): presumably meant to report which words of ``phrase`` are
    not in ``whitelist`` -- TODO confirm the intent and implement.
    """
    return None

In [None]:
check_whitelist(test_phrase, whitelist_list, "replace")

In [None]:
# check_whitelist returns (cleaned_phrase, count); keep only the cleaned
# phrase, with non-whitelisted words shown as "[redacted]".
text_df['whitelisted_phrase'] = [
    check_whitelist(corrected, whitelist_list, "replace")[0]
    for corrected in text_df['spelling_corrections']
]

In [None]:
text_df.head()

In [None]:
text_df["whitelisted_phrase"].to_csv("cleant_data.csv")

In [None]:
text_df.to_csv("gcf_circumstances_spell_correct_whitelist_300.csv")

## Validate Effectiveness of Corrections

In [None]:
# NOTE(review): the two assignments below are commented out, yet the .assign()
# calls further down read the columns they would create
# (words_removed_raw_words / words_removed_spell_corrected). Unless text_df
# already carries those columns, re-enable these lines first -- TODO confirm.
# text_df['words_removed_raw_words'] = text_df.original_additional_information_text\
#     .apply(lambda x: int(check_whitelist(x, whitelist_list)[1]))

# text_df['words_removed_spell_corrected'] = text_df.spelling_corrections\
#     .apply(lambda x: int(check_whitelist(x, whitelist_list)[1]))

# pct_improvement: percentage reduction in removed words after spell correction.
# improvement: absolute reduction (raw count minus spell-corrected count).
# Rows whose raw count is 0 divide by zero, so expect inf/NaN there.
text_df = text_df\
    .assign(pct_improvement = 100*(1 - (text_df.words_removed_spell_corrected/text_df.words_removed_raw_words)))\
    .assign(improvement = text_df.words_removed_raw_words - text_df.words_removed_spell_corrected)

In [None]:
missing_words = ["test", "in", "an", "never", "work", "part", "house"]

In [None]:
text_df.head(20)

In [None]:
text_df.sum(axis = 0)

In [None]:
# baseline to beat 43

## Try Other Versions of the SpellChecker

In [None]:
from spellcheck_v2_RN import *

In [None]:
# spaCy 2+ moved the language classes under spacy.lang; the old `spacy.en`
# path no longer exists there, and the other cells already use the spaCy 2+
# API (spacy.load('en_core_web_sm'), spacy.lang.en.stop_words).
from spacy.lang.en import English

In [None]:
import spacy

In [None]:
# The original line was an incomplete import (a SyntaxError); Tokenizer is a
# class exported by the spacy.tokenizer module.
from spacy.tokenizer import Tokenizer

In [None]:
nlp = spacy.load('en_core_web_sm')

In [None]:
from spacy.lang.en.stop_words import STOP_WORDS

In [None]:
# In spaCy 2+ the shared tokenizer exceptions live under spacy.lang.
from spacy.lang.tokenizer_exceptions import BASE_EXCEPTIONS


In [None]:
nlp = spacy.load('en_core_web_sm')

In [None]:
doc = nlp("This sentence ain't gonna be ORG grammatically correct7. 9 >>{:o) THis sentence about SAR7 PERSON doesn't have mispeled wordz.")
