In [42]:
##### import re, string, unicodedata
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
import unicodedata
import re

In [43]:
document = """<h1>Title Goes Here</h1>
<b>Bolded Text</b>
<i>Italicized Text</i>
<img src="this should all be gone"/>
<a href="this will be gone, too">But this will still be here!</a>
I run. He ran. She is running. Will they stop running?
I talked. She was talking. They talked to them about running. Who ran to the talking runner?
[Some text we don't want to keep is in here]
¡Sebastián, Nicolás, Alejandro and Jéronimo are going to the store tomorrow morning!
something... is! wrong() with.,; this :: sentence.
I can't do this anymore. I didn't know them. Why couldn't you have dinner at the restaurant?
My favorite movie franchises, in order: Indiana Jones; Marvel Cinematic Universe; Star Wars; Back to the Future; Harry Potter.
Don't do it.... Just don't. Billy! I know what you're doing. This is a great little house you've got here.
[This is some other unwanted text]
John: "Well, well, well."
James: "There, there. There, there."
&nbsp;&nbsp;
There are a lot of reasons not to do this. There are 101 reasons not to do it. 1000000 reasons, actually.
I have to go get 2 tutus from 2 different stores, too.
22    45   1067   445
{{Here is some stuff inside of double curly braces.}}
{Here is more stuff in single curly braces.}
[DELETE]
</body>
</html>"""

## NOISE REMOVAL/HTML-MARKUP TAGS

In [44]:
def strip_html(text):
    """Return the text content of ``text`` with the HTML markup removed.

    The input is parsed with BeautifulSoup's "html.parser" backend and the
    result of ``get_text()`` is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

def remove_between_square_brackets(text):
    """Delete every square-bracketed span, brackets included, from ``text``.

    A span runs from "[" to the nearest following "]". Whatever surrounds the
    span (including adjacent whitespace) is left untouched.

    Args:
        text: the string to clean.

    Returns:
        ``text`` with all bracketed spans removed.
    """
    # Raw string: the plain literal relied on Python passing the unrecognised
    # escapes "\[" and "\]" through unchanged, which newer interpreters warn about.
    return re.sub(r'\[[^]]*\]', '', text)

def denoise_text(text):
    """Apply the noise-removal steps to ``text``: strip HTML, then drop bracketed spans."""
    return remove_between_square_brackets(strip_html(text))

In [45]:
# Overwrite `document` with its denoised form (HTML stripped, [bracketed] spans removed);
# the bare name on the last line displays the resulting string.
document = denoise_text(document)
document

'Title Goes Here\nBolded Text\nItalicized Text\n\nBut this will still be here!\nI run. He ran. She is running. Will they stop running?\nI talked. She was talking. They talked to them about running. Who ran to the talking runner?\n\n¡Sebastián, Nicolás, Alejandro and Jéronimo are going to the store tomorrow morning!\nsomething... is! wrong() with.,; this :: sentence.\nI can\'t do this anymore. I didn\'t know them. Why couldn\'t you have dinner at the restaurant?\nMy favorite movie franchises, in order: Indiana Jones; Marvel Cinematic Universe; Star Wars; Back to the Future; Harry Potter.\nDon\'t do it.... Just don\'t. Billy! I know what you\'re doing. This is a great little house you\'ve got here.\n\nJohn: "Well, well, well."\nJames: "There, there. There, there."\n\xa0\xa0\nThere are a lot of reasons not to do this. There are 101 reasons not to do it. 1000000 reasons, actually.\nI have to go get 2 tutus from 2 different stores, too.\n22    45   1067   445\n{{Here is some stuff inside of

## CONTRACTION CHECK

In [46]:
import contractions

In [47]:
def replace_contractions(text):
    """Return ``text`` after expanding contractions with ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded

In [48]:
# Overwrite `document` with the contraction-expanded text and print it.
document = replace_contractions(document)
print(document)

Title Goes Here
Bolded Text
Italicized Text

But this will still be here!
I run. He ran. She is running. Will they stop running?
I talked. She was talking. They talked to them about running. Who ran to the talking runner?

¡Sebastián, Nicolás, Alejandro and Jéronimo are going to the store tomorrow morning!
something... is! wrong() with.,; this :: sentence.
I cannot do this anymore. I did not know them. Why could not you have dinner at the restaurant?
My favorite movie franchises, in order: Indiana Jones; Marvel Cinematic Universe; Star Wars; Back to the Future; Harry Potter.
do not do it.... Just do not. Billy! I know what you are doing. This is a great little house you have got here.

John: "Well, well, well."
James: "There, there. There, there."
  
There are a lot of reasons not to do this. There are 101 reasons not to do it. 1000000 reasons, actually.
I have to go get 2 tutus from 2 different stores, too.
22    45   1067   445
{{Here is some stuff inside of double curly braces.}}
{H

## REMOVING ACCENTED CHARS

In [49]:
def remove_accented_chars(text):
    """Return ``text`` reduced to ASCII, with accented letters mapped to their base letter.

    The string is NFKD-normalised so that accents become separate combining
    marks, then every character that cannot be encoded as ASCII is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

remove_accented_chars('Sómě Áccěntěd těxt')

'Some Accented text'

## TOKENIZATION

In [50]:
# From here on `document` holds the output of nltk.word_tokenize, not the raw string.
document = nltk.word_tokenize(document)
print (document)

['Title', 'Goes', 'Here', 'Bolded', 'Text', 'Italicized', 'Text', 'But', 'this', 'will', 'still', 'be', 'here', '!', 'I', 'run', '.', 'He', 'ran', '.', 'She', 'is', 'running', '.', 'Will', 'they', 'stop', 'running', '?', 'I', 'talked', '.', 'She', 'was', 'talking', '.', 'They', 'talked', 'to', 'them', 'about', 'running', '.', 'Who', 'ran', 'to', 'the', 'talking', 'runner', '?', '¡Sebastián', ',', 'Nicolás', ',', 'Alejandro', 'and', 'Jéronimo', 'are', 'going', 'to', 'the', 'store', 'tomorrow', 'morning', '!', 'something', '...', 'is', '!', 'wrong', '(', ')', 'with.', ',', ';', 'this', ':', ':', 'sentence', '.', 'I', 'can', 'not', 'do', 'this', 'anymore', '.', 'I', 'did', 'not', 'know', 'them', '.', 'Why', 'could', 'not', 'you', 'have', 'dinner', 'at', 'the', 'restaurant', '?', 'My', 'favorite', 'movie', 'franchises', ',', 'in', 'order', ':', 'Indiana', 'Jones', ';', 'Marvel', 'Cinematic', 'Universe', ';', 'Star', 'Wars', ';', 'Back', 'to', 'the', 'Future', ';', 'Harry', 'Potter', '.', '

## NORMALIZATION
    ### Removing Non-ASCII Chars
    ### To Lowercase
    ### Removing Punctuation
    ### Replacing Integers with Words
    ### Removing Stopwords
    ### Stemming
    ### Lemmatizing

In [51]:
def remove_non_ascii(words):
    """Return a new list with the non-ASCII characters removed from every token.

    Each token is NFKD-normalised first, so accented letters keep their base
    letter. A token made only of non-ASCII characters becomes an empty string;
    it is not dropped from the list.
    """
    return [
        unicodedata.normalize('NFKD', token).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        for token in words
    ]

def to_lowercase(words):
    """Return a new list containing the lowercase form of every token in ``words``."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Strip punctuation from every token and drop tokens that end up empty.

    Any character that is neither a word character nor whitespace is removed.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

def replace_numbers(words):
    """Replace every all-digit token in the list with its textual representation.

    Tokens for which ``str.isdigit()`` is false are passed through unchanged.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words, keeping 'no' and 'not'.

    The stop list is NLTK's English list with 'no' and 'not' taken out (negations
    may carry information) and 'the' added.

    Args:
        words: list of token strings; comparison is exact, so lowercase them first.

    Returns:
        A new list containing only the tokens that are not in the stop list.
    """
    # Build the customised lookup once. Previously the loop tested against a fresh
    # stopwords.words('english') for every token, which ignored the customisation
    # (so 'no'/'not' were still removed) and re-read the corpus on each iteration.
    stopword_set = set(stopwords.words('english'))
    stopword_set -= {'no', 'not'}
    stopword_set.add('the')
    return [word for word in words if word not in stopword_set]

def stem_words(words):
    """Stem every token in the list with the Lancaster stemmer.

    Args:
        words: list of token strings.

    Returns:
        A new list with the stem of each token, in the same order.
    """
    # Lancaster is the most aggressive of the stemmers considered here and can
    # leave stems barely readable. The Porter and Snowball stemmers are the
    # gentler alternatives; the unused SnowballStemmer import was dropped.
    stemmer = LancasterStemmer()
    return [stemmer.stem(word) for word in words]

def lemmatize_words(words, pos='n'):
    """Lemmatize every token in the list with the WordNet lemmatizer.

    Args:
        words: list of token strings.
        pos: WordNet part-of-speech tag handed to ``lemmatize``. The default
            'n' (noun) reproduces the earlier single-argument call; pass 'v'
            to lemmatize the tokens as verbs.

    Returns:
        A new list with the lemma of each token, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word, pos=pos) for word in words]

def normalize(words):
    """Run the token-level normalization steps over ``words`` and return the result.

    Steps, in order: non-ASCII removal, lowercasing, punctuation removal,
    number-to-word replacement, stop-word removal.
    """
    pipeline = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
    )
    for step in pipeline:
        words = step(words)
    return words

In [56]:
document2 = nltk.word_tokenize("""#Go @United!! States'""")

In [57]:
normalize(document2)

['go', 'united', 'states']

### Removing a Regex Pattern

In [58]:
# Remove a regex pattern 

import re 

def remove_regex(input_text, regex_pattern):
    """Remove from ``input_text`` every piece of text matched by ``regex_pattern``.

    Each match is stripped of surrounding whitespace and then every literal
    occurrence of that text is deleted.

    Args:
        input_text: the string to clean.
        regex_pattern: regular expression selecting the text to remove.

    Returns:
        ``input_text`` with the matched text removed.
    """
    # finditer scans the string as it was passed in; rebinding input_text
    # inside the loop does not affect the matches still to come.
    for match in re.finditer(regex_pattern, input_text):
        literal = match.group().strip()
        if literal:
            # Delete the matched text literally. Feeding it back to re.sub as a
            # pattern broke on matches containing regex metacharacters
            # (e.g. "?" or "(" in a URL).
            input_text = input_text.replace(literal, '')
    return input_text

In [62]:
# Raw string so that \w reaches the regex engine as written instead of relying
# on Python passing an unrecognised string escape through.
regex_pattern = r"#[\w]*"
v1 = remove_regex("remove this #hashtag from the text", regex_pattern)
v1

'remove this  from the text'

## CLEANING TEXT

In [63]:
tweet = "I luv my &lt;3 iphone &amp; you’re awsm apple. DisplayIsAwesome, sooo happppppy 🙂 http://www.apple.com"

In [64]:
tweet2 = nltk.word_tokenize(tweet)

In [66]:
print(normalize(tweet2))

['luv', 'lt', 'three', 'iphone', 'amp', 'awsm', 'apple', 'displayisawesome', 'sooo', 'happppppy', 'http', 'wwwapplecom']


### Escaping html chars

HTML entities such as `&lt;` and `&amp;` should be converted back to the characters they stand for.

In [67]:
from html.parser import HTMLParser

In [68]:
html_parser = HTMLParser()

In [69]:
# HTMLParser.unescape() was deprecated and has been removed from recent Python
# versions (3.9+); html.unescape() is the supported way to decode HTML entities.
import html
tweet = html.unescape(tweet)

  """Entry point for launching an IPython kernel.


In [70]:
tweet

'I luv my <3 iphone & you’re awsm apple. DisplayIsAwesome, sooo happppppy 🙂 http://www.apple.com'

In [72]:
print(remove_non_ascii(tweet2))

['I', 'luv', 'my', '&', 'lt', ';', '3', 'iphone', '&', 'amp', ';', 'you', '', 're', 'awsm', 'apple', '.', 'DisplayIsAwesome', ',', 'sooo', 'happppppy', '', 'http', ':', '//www.apple.com']


### Decoding chars

In [73]:
# Drop every character that cannot be encoded as ASCII (here the curly
# apostrophe and the emoji), then turn the bytes back into a str.
tweet = tweet.encode('ascii','ignore').decode('utf8')
tweet

'I luv my <3 iphone & youre awsm apple. DisplayIsAwesome, sooo happppppy  http://www.apple.com'

### Splitting Attached Words

In [74]:
# Split at capital letters: each match starts at an uppercase A-Z and runs up to
# the next one, and the pieces are re-joined with spaces. Text before the first
# capital letter is not matched, so it would not be kept.
cleaned = " ".join(re.findall('[A-Z][^A-Z]*', tweet))
cleaned

'I luv my <3 iphone & youre awsm apple.  Display Is Awesome, sooo happppppy  http://www.apple.com'

### Standardizing Words

Sometimes words are not in proper formats. For example: “I looooveee you” should be “I love you”. Simple rules and regular expressions can help solve these cases.


In [75]:
import itertools

In [76]:
# NOTE(review): the star import is not needed by the line below, which uses the
# qualified name itertools.groupby.
from itertools import *
# Collapse every run of a repeated character to at most two characters
# ("happppppy" -> "happy"); this also shortens "www" in the URL to "ww".
tweet = ''.join(''.join(s)[:2] for _, s in itertools.groupby(tweet))

In [77]:
tweet

'I luv my <3 iphone & youre awsm apple. DisplayIsAwesome, soo happy  http://ww.apple.com'

### Tokenizing

In [78]:
tweet = nltk.word_tokenize(tweet)

In [79]:
# NOTE(review): `tweet` was rebound to the output of nltk.word_tokenize in the
# previous cell, so groupby iterates over tokens here, not characters: each run
# of equal tokens is joined and cut to its first two characters. The displayed
# result looks unintended -- confirm whether this cell should be removed.
tweet = ''.join(''.join(s)[:2] for _, s in itertools.groupby(tweet))

In [80]:
tweet

'Ilumy<3ip&yoawap.Di,sohaht://'