In [1]:
import spacy
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
from contractions import CONTRACTION_MAP
import unicodedata

In [2]:
# Load the small English spaCy pipeline used by lemmatize_text().
# The parser and named-entity recognizer are switched off through the
# supported `disable` argument; the legacy parse=/tag=/entity= keyword
# switches are not part of the current spacy.load() signature.
# The tagger is left enabled -- presumably lemmatization relies on it; verify
# against the installed spaCy version.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# NOTE: the misspelled name `tokenzier` is kept on purpose because
# remove_stopwords() refers to this global by that exact name.
tokenzier = ToktokTokenizer()

# English stopwords with the negations 'no' and 'not' left out of the list,
# so that remove_stopwords() does not drop them from the text.
stopword_list = [word for word in nltk.corpus.stopwords.words('english')
                 if word not in ('no', 'not')]

In [3]:
# Sample HTML snippet used to demonstrate each cleaning step below. It mixes
# markup tags, accented characters, contractions, punctuation and line breaks.
# The literal is not a raw string, so every \r and \n escape becomes a real
# control character in addition to the physical line breaks of the literal.
document = """<p>Héllo! Héllo! can you hear me! I just heard about <b>Python</b>!<br/>\r\n 
              It's an amazing language which can be used for Scripting, Web development,\r\n\r\n
              Information Retrieval, Natural Language Processing, Machine Learning & Artificial Intelligence!\n
              What are you waiting for? Go and get started.<br/> He's learning, she's learning, they've already\n\n
              got a headstart!</p>
           """
# Bare expression: the cell output is the repr of the string.
document

"<p>Héllo! Héllo! can you hear me! I just heard about <b>Python</b>!<br/>\r\n \n              It's an amazing language which can be used for Scripting, Web development,\r\n\r\n\n              Information Retrieval, Natural Language Processing, Machine Learning & Artificial Intelligence!\n\n              What are you waiting for? Go and get started.<br/> He's learning, she's learning, they've already\n\n\n              got a headstart!</p>\n           "

## Cleaning Text - Strip HTML

In [4]:
def strip_html_tags(text):
    """Return the text content of an HTML fragment with the markup removed.

    The fragment is parsed by BeautifulSoup using the 'html.parser' backend
    and the result of ``get_text()`` on the parsed tree is returned.
    """
    return BeautifulSoup(text, 'html.parser').get_text()

In [5]:
strip_html_tags(document)

"Héllo! Héllo! can you hear me! I just heard about Python!\r\n \n              It's an amazing language which can be used for Scripting, Web development,\r\n\r\n\n              Information Retrieval, Natural Language Processing, Machine Learning & Artificial Intelligence!\n\n              What are you waiting for? Go and get started. He's learning, she's learning, they've already\n\n\n              got a headstart!\n"

## Removing accented characters

In [6]:
def remove_accented_chars(text):
    """Return ``text`` reduced to plain ASCII characters.

    The string is NFKD-normalized, which splits accented letters into a base
    letter plus combining marks. Encoding to ASCII with errors ignored then
    drops every character that has no ASCII representation (including those
    combining marks), and the bytes are decoded back into a ``str``.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [7]:
remove_accented_chars(document)

"<p>Hello! Hello! can you hear me! I just heard about <b>Python</b>!<br/>\r\n \n              It's an amazing language which can be used for Scripting, Web development,\r\n\r\n\n              Information Retrieval, Natural Language Processing, Machine Learning & Artificial Intelligence!\n\n              What are you waiting for? Go and get started.<br/> He's learning, she's learning, they've already\n\n\n              got a headstart!</p>\n           "

## Expanding Contractions

In [21]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand contractions in ``text`` and drop any leftover apostrophes.

    Parameters
    ----------
    text : str
        Text to process.
    contraction_mapping : dict
        Maps a contraction (e.g. "it's") to its expansion (e.g. "it is").
        Keys are matched case-insensitively.

    Returns
    -------
    str
        ``text`` with every mapped contraction replaced by its expansion and
        all remaining apostrophe characters removed.
    """
    if not contraction_mapping:
        # An empty alternation would match the empty string at every
        # position, so skip the expansion step and only strip apostrophes.
        return re.sub("'", "", text)

    # Lower-cased view of the mapping so that a match whose casing differs
    # from the key (e.g. "I'D" vs. "I'd") still resolves to an expansion.
    lowered_mapping = {key.lower(): value
                       for key, value in contraction_mapping.items()}

    # Regex alternation takes the first alternative that matches, so longer
    # keys go first ("can't've" before "can't"). Keys are escaped so they
    # are always treated as literal text.
    alternatives = sorted(contraction_mapping.keys(), key=len, reverse=True)
    contractions_pattern = re.compile(
        '({})'.format('|'.join(re.escape(key) for key in alternatives)),
        flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        expansion = contraction_mapping.get(match)
        if expansion is None:
            expansion = lowered_mapping.get(match.lower())
        if expansion is None:
            # No usable expansion: leave the matched text as it is.
            return match

        first_char = match[0]
        head, tail = expansion[:1], expansion[1:]
        if head.lower() == first_char.lower():
            # Same leading letter: keep the casing used in the text.
            return first_char + tail
        if first_char.isupper():
            # Different leading letter (e.g. "Ain't" -> "Is not"): carry the
            # capitalization over instead of splicing the wrong letter in.
            return head.upper() + tail
        return expansion

    expanded_text = contractions_pattern.sub(expand_match, text)
    # Remove every apostrophe that is still present after expansion.
    return re.sub("'", "", expanded_text)

In [22]:
print(expand_contractions(document))

<p>Héllo! Héllo! can you hear me! I just heard about <b>Python</b>!<br/>
 
              It is an amazing language which can be used for Scripting, Web development,


              Information Retrieval, Natural Language Processing, Machine Learning & Artificial Intelligence!

              What are you waiting for? Go and get started.<br/> He is learning, she is learning, they have already


              got a headstart!</p>
           


## Removing Special Characters

In [24]:
def remove_special_chars(text, remove_digits=False):
    """Remove every character that is not an ASCII letter, digit or whitespace.

    Parameters
    ----------
    text : str
        Text to clean.
    remove_digits : bool, optional
        When True, digits are removed as well. Defaults to False.

    Returns
    -------
    str
        The cleaned text; whitespace (including newlines) is left untouched.
    """
    # The digit range is 0-9: a 1-9 range would silently delete every zero
    # (e.g. "100" -> "1"). Raw strings keep `\s` a regex escape rather than
    # an invalid Python string escape.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

In [26]:
remove_special_chars(document)

'pHllo Hllo can you hear me I just heard about bPythonbbr\r\n \n              Its an amazing language which can be used for Scripting Web development\r\n\r\n\n              Information Retrieval Natural Language Processing Machine Learning  Artificial Intelligence\n\n              What are you waiting for Go and get startedbr Hes learning shes learning theyve already\n\n\n              got a headstartp\n           '

## Lemmatizing text

In [41]:
def lemmatize_text(text):
    """Return ``text`` with each token replaced by its lemma.

    The text is run through the module-level ``nlp`` pipeline. A token whose
    lemma is the '-PRON-' placeholder keeps its original text; every other
    token contributes its ``lemma_``. The pieces are joined with single
    spaces.
    """
    pieces = []
    for token in nlp(text):
        lemma = token.lemma_
        pieces.append(token.text if lemma == '-PRON-' else lemma)
    return ' '.join(pieces)

In [42]:
lemmatize_text(document)

'< p > héllo ! Héllo ! can you hear me ! I just hear about < b > Python</b>!<br/ > \r\n \n               It be an amazing language which can be use for Scripting , web development , \r\n\r\n\n               Information Retrieval , Natural Language Processing , Machine Learning & Artificial Intelligence ! \n\n               what be you wait for ? go and get started.<br/ > He be learn , she be learn , they have already \n\n\n               get a headstart!</p > \n           '

## Removing Stopwords

In [43]:
def remove_stopwords(text, is_lower_case=False):
    """Return ``text`` with every token found in ``stopword_list`` removed.

    The text is split with the module-level ``tokenzier`` and each token is
    stripped of surrounding whitespace. When ``is_lower_case`` is True the
    tokens are compared to ``stopword_list`` as they are; otherwise their
    lower-cased form is compared. Surviving tokens keep their original
    casing and are joined with single spaces.
    """
    kept = []
    for raw_token in tokenzier.tokenize(text):
        token = raw_token.strip()
        candidate = token if is_lower_case else token.lower()
        if candidate not in stopword_list:
            kept.append(token)
    return ' '.join(kept)

In [44]:
remove_stopwords(document)

"<p>Héllo ! Héllo ! hear ! heard <b>Python</b> ! <br/> ' amazing language used Scripting , Web development , Information Retrieval , Natural Language Processing , Machine Learning &amp; Artificial Intelligence ! waiting ? Go get started.<br/> ' learning , ' learning , ' already got headstart ! </p>"

## Normalize text corpus - tying it all together

In [45]:
def normalize_corpus(corpus, html_stripping = True, contraction_expension = True, accented_char_removal=True, text_lower_case = True
                     , text_lemmatization=True, special_char_removal = True, stopword_removal=True):
    """Apply the cleaning steps to every document in ``corpus``.

    Steps run in this order, each optional one guarded by its flag:
    HTML stripping, accent removal, contraction expansion, lower-casing,
    newline collapsing, punctuation isolation, lemmatization, special
    character removal, whitespace collapsing, stopword removal.

    Parameters
    ----------
    corpus : iterable of str
        Documents to normalize.
    html_stripping, contraction_expension, accented_char_removal,
    text_lower_case, text_lemmatization, special_char_removal,
    stopword_removal : bool
        Switches for the individual steps; all default to True. The
        spelling ``contraction_expension`` is kept so existing keyword
        callers continue to work.

    Returns
    -------
    list of str
        One normalized string per input document, in input order.
    """
    # Compiled once, outside the per-document loop.
    newline_pattern = re.compile(r'[\r\n]+')
    # Matches { } . ( ) and ! -- the same set the former pattern
    # '([{.(-)!}])' matched, where '(-)' was a character range covering only
    # the two parentheses (hyphens were never included).
    special_chars_pattern = re.compile(r'([{}.()!])')

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions
        if contraction_expension:
            doc = expand_contractions(doc)
        # lower-case the text
        if text_lower_case:
            doc = doc.lower()
        # Replace runs of line breaks with a single space so that words on
        # adjacent lines are not glued together.
        doc = newline_pattern.sub(' ', doc)
        # Insert spaces around the special characters to isolate them.
        doc = special_chars_pattern.sub(' \\1 ', doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters
        if special_char_removal:
            doc = remove_special_chars(doc)
        # collapse repeated spaces and trim the ends
        doc = re.sub(' +', ' ', doc).strip()
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
        normalized_corpus.append(doc)
    return normalized_corpus

In [49]:
document

"<p>Héllo! Héllo! can you hear me! I just heard about <b>Python</b>!<br/>\r\n \n              It's an amazing language which can be used for Scripting, Web development,\r\n\r\n\n              Information Retrieval, Natural Language Processing, Machine Learning & Artificial Intelligence!\n\n              What are you waiting for? Go and get started.<br/> He's learning, she's learning, they've already\n\n\n              got a headstart!</p>\n           "

In [47]:
normalize_corpus([document])

['hello hello hear hear python amazing language use scripting web development information retrieval natural language processing machine learning artificial intelligence wait go get start learn learn already get headstart']

In [48]:
normalize_corpus([document], text_lemmatization=False)

['hello hello hear heard python amazing language used scripting web development information retrieval natural language processing machine learning artificial intelligence waiting go get started learning learning already got headstart']

In [50]:
normalize_corpus([document], text_lemmatization=False, stopword_removal=False)

['hello hello can you hear me i just heard about python it is an amazing language which can be used for scripting web development information retrieval natural language processing machine learning artificial intelligence what are you waiting for go and get started he is learning she is learning they have already got a headstart']