In [51]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('words')
from nltk.corpus import words
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import contractions
import texthero
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams

[nltk_data] Downloading package wordnet to C:\Users\Zhichao Carton
[nltk_data]     Zeng\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Zhichao Carton
[nltk_data]     Zeng\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package words to C:\Users\Zhichao Carton
[nltk_data]     Zeng\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


<span style="font-size:22px;font-family:sans-serif"> Now do text cleaning... </span>

In [20]:
data_train = pd.read_csv('train.csv')
data_train.head()

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


<span style="font-size:16px;font-family:sans-serif;color:LightSeaGreen">1. Remove space characters (and other special characters here, if any, in the future), also switching all characters to lower case... </span>

In [21]:
def remove_n(ss):
    ss = ss.replace("\t", " ")
    return ss.replace("\n", " ")

data_train['text_n'] = data_train['full_text'].apply(remove_n)

texthero_pipe = [#texthero.preprocessing.fillna,
                    texthero.preprocessing.lowercase
                    #texthero.preprocessing.remove_digits,
                    #texthero.preprocessing.remove_punctuation,
                    #texthero.preprocessing.remove_diacritics
                    ]
data_train['text_n'] = texthero.clean(data_train['text_n'], pipeline = texthero_pipe) 

<span style="font-size:16px;font-family:sans-serif;color:LightSeaGreen">2. Dealing with contractions... </span>

In [22]:
data_train['text_nc'] = data_train['text_n'].apply(contractions.fix)

<span style="font-size:16px;font-family:sans-serif;color:LightSeaGreen">3. Lemmatize: combine ifferent forms of the same word... </span>

In [46]:
tokenizer = RegexpTokenizer(r'\w+')

# tokenize, lemmatize, then detokenize
def lmtz(ss):
    tokens_list = ss.apply(lambda x: tokenizer.tokenize(x))
    new_tokens_list = []
    for tokens in (tokens_list):
        new_tokens = []
        for token in (tokens): 
            new_tokens.append(lemmatizer.lemmatize(token))
        new_tokens_list.append(TreebankWordDetokenizer().detokenize(new_tokens))
    return new_tokens_list

data_train['text_lmtz'] = pd.Series(lmtz(data_train['text_nc']))

0       i think that student would benefit from learni...
1       when a problem is a change you have to let it ...
2       dear principal if you change the school policy...
3       the best time in life is when you become yours...
4       small act of kindness can impact in other peop...
                              ...                        
3906    i believe using cellphone in class for educati...
3907    working alone student do not have to argue wit...
3908    a problem is a chance for you to do your best ...
3909    many people disagree with albert schweitzer s ...
3910    do you think that failure is the main thing fo...
Name: text_lmtz, Length: 3911, dtype: object

<span style="font-size:16px;font-family:sans-serif;color:LightSeaGreen">4. Spelling correction by comparing with nltk. </span>

In [84]:
correct_words = words.words()

def has_numbers(inputString):
    return any(char.isdigit() for char in inputString)

# if a word is not found in words.words(), will return the 'most similar' word in the dictionary using jaccard metric
def correct_spelling(word):  
    if not has_numbers(word):
        try:
            temp = [(jaccard_distance(set(ngrams(word, 2)),
                                      set(ngrams(w, 2))),w)
                    for w in correct_words if w[0]==word[0]]   # so first letter must be correct
            return str(sorted(temp, key = lambda val:val[0])[0][1])
        except Exception as e: 
            print (word)
            print (e)
    else:
        return word
    return word
    
def count_mis_spelling_and_correct(essay):
    essay_tok = tokenizer.tokenize(essay)
    cnt = 0
    essay_new = []
    for wd in essay_tok:
        if len(wd) > 7:                                            # only apply the spelling correction to len > 7 words
            if wd not in correct_words:
                wd_new = correct_spelling(wd)
                cnt += 1
            else:
                wd_new = wd
        else:
            wd_new = wd
        essay_new.append(wd_new) 
    return str(essay), cnt

temp_ls, err_cnt = [], []
for case in data_train['text_nc']:
    txt, err = count_mis_spelling_and_correct(case)
    temp_ls.append(txt)
    err_cnt.append(err)
    
data_train['text_spelling'] = pd.Series(temp_ls)
data_train['no. spelling error'] = pd.Series(err_cnt)

KeyboardInterrupt: 

<span style="font-size:16px;font-family:sans-serif;color:LightSeaGreen">Removing Stopwords... </span>

In [80]:
data_train['text_spelling']

0       i think that students would benefit from learn...
1       when a problem is a change you have to let it ...
2       dear, principal  if you change the school poli...
3       the best time in life is when you become yours...
4       small act of kindness can impact in other peop...
                              ...                        
3906    i believe using cellphones in class for educat...
3907    working alone, students do not have to argue w...
3908    "a problem is a chance for you to do your best...
3909    many people disagree with albert schweitzer's ...
3910    do you think that failure is the main thing fo...
Name: text_spelling, Length: 3911, dtype: object