# Exercises

In [31]:
# imports
import numpy as np
import pandas as pd

import acquire

import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer

from time import strftime

import unicodedata

import json

## 1. `basic_clean` function
This function should take in a string and apply some basic text cleaning to it:
1. Lowercase everything,
2. Normalize unicode characters, and
3. Replace anything that is not a letter, number, whitespace or a single quote

In [24]:
# 1. lowercase
string = '''
Carrot cake brownie carrot cake ice cream croissant powder bear claw. Icing tiramisu soufflé fruitcake carrot 
cake macaroon liquorice. Dragée sweet icing lollipop chocolate bar jelly beans.
Cake sugar plum cookie tiramisu dessert cupcake sweet lollipop liquorice. Dragée ice cream pastry shortbread 
halvah chupa chups sweet ice cream. Cheesecake pastry powder donut cake.
'''.lower()#.strip().lower()

# 2. normalize unicode
string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8', 'ignore')

# 3. replace non-alphanumeric characters
string = re.sub(r"[^a-z0-9'\s]", '', string)#.replace('\n', ' ')

string

'\ncarrot cake brownie carrot cake ice cream croissant powder bear claw icing tiramisu souffle fruitcake carrot \ncake macaroon liquorice dragee sweet icing lollipop chocolate bar jelly beans\ncake sugar plum cookie tiramisu dessert cupcake sweet lollipop liquorice dragee ice cream pastry shortbread \nhalvah chupa chups sweet ice cream cheesecake pastry powder donut cake\n'

In [27]:
def basic_clean(string):
    '''
    This function takes in a string as a paramenter and performs the following basic cleaning functions:
        1. lowercase,
        2. normalize unicode, and
        3. remove non-alphanumeric characters
    
    The function returns the cleaned string.
    '''
    # 1. lowercase
    string = string.strip().lower()
    
    # 2. normalize unicode
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    
    # 3. replace non-alphanumeric characters
    string = re.sub(r"[^a-z0-9'\s]", '', string)
    
    return string

In [28]:
# tesitng the function
basic_clean("""
-9p]o.6ykrq` \14v3tu
7ig'kots64s[ai.o'[k/
jmnr/;ilyetqams6/;3f
h821\e83qyw69eme.;r,
rsvb,l72qzfmv33bm,u=
7p 7ditj\dzjpr9s=icm
qva,d],p;sz7vu[wglvx
n1trtu;weit5pq8j0q7o
rls9dozsc`ruop8m.-\0
nz8lr=cfvksu/bhd-2d'
""")

"9po6ykrq \x0cv3tu\n7ig'kots64saio'k\njmnrilyetqams63f\nh821e83qyw69emer\nrsvbl72qzfmv33bmu\n7p 7ditjdzjpr9sicm\nqvadpsz7vuwglvx\nn1trtuweit5pq8j0q7o\nrls9dozscruop8m\nnz8lrcfvksubhd2d'"

## 2. `tokenize` function
This function should take in a string and tokenize all the words in the string.

In [29]:
test_string = """
-9p]o.6ykrq` \14v3tu
7ig'kots64s[ai.o'[k/
jmnr/;ilyetqams6/;3f
h821\e83qyw69eme.;r,
rsvb,l72qzfmv33bm,u=
7p 7ditj\dzjpr9s=icm
qva,d],p;sz7vu[wglvx
n1trtu;weit5pq8j0q7o
rls9dozsc`ruop8m.-\0
nz8lr=cfvksu/bhd-2d'
"""

In [38]:
tokenizer = nltk.tokenize.ToktokTokenizer()

tokenizer.tokenize(test_string, return_str=True)

"-9p ] o.6ykrq ` \x0cv3tu\n7ig ' kots64s[ ai.o ' [ k/\njmnr/ ; ilyetqams6/ ; 3f\nh821\\e83qyw69eme. ; r , \nrsvb , l72qzfmv33bm , u=\n7p 7ditj\\dzjpr9s=icm\nqva , d ] , p ; sz7vu[ wglvx\nn1trtu ; weit5pq8j0q7o\nrls9dozsc ` ruop8m.-\x00\nnz8lr=cfvksu/bhd-2d '"

In [39]:
def tokenize(string):
    '''
    This function takes in a string and returns the tokenized version of that string.
    '''
    
    # creating tokenizer object
    tokenizer = nltk.tokenize.ToktokTokenizer()
    
    # using tokenizer object on string
    string = tokenizer.tokenize(string, return_str=True)
    
    return string
    

In [42]:
tokenize('''
This is a string. Let's see what happens when we tokenize it!
''')

"This is a string. Let ' s see what happens when we tokenize it !"

## 3. `stem` function
This function should accept some text and return the text after applying stemming to all the words.

In [45]:
def stem(string):
    '''
    This function takes in a string, stems each individual word, and then joins
    the stem words back together in the returned string.
    '''
    
    # creating the stem object
    ps = nltk.porter.PorterStemmer()
    
    # creating the stems for each individual word in the string
    stems = [ps.stem(word) for word in string.split()]
    
    # putting the stemmed words back together into string
    string = ' '.join(stems)
    
    return string

In [46]:
stem(string)

'carrot cake browni carrot cake ice cream croissant powder bear claw ice tiramisu souffl fruitcak carrot cake macaroon liquoric drage sweet ice lollipop chocol bar jelli bean cake sugar plum cooki tiramisu dessert cupcak sweet lollipop liquoric drage ice cream pastri shortbread halvah chupa chup sweet ice cream cheesecak pastri powder donut cake'

In [51]:
pd.Series(string.split()).value_counts().head(
)

cake      5
carrot    3
sweet     3
ice       3
cream     3
dtype: int64

## 4. `lemmatize` function
This function should accept some text and return the text after applying lemmatization to each word.

In [52]:
def lemmatize(string):
    '''
    
    '''
    
    # create the lemmatization object
    wnl = nltk.stem.WordNetLemmatizer()
    
    # creating a list of string of each word in the article and applying the lemmatize object to each word
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    
    # joining the individual list of lemma string words to a single string of words
    string = ' '.join(lemmas)
    
    return string

In [53]:
lemmatize(string)

'carrot cake brownie carrot cake ice cream croissant powder bear claw icing tiramisu souffle fruitcake carrot cake macaroon liquorice dragee sweet icing lollipop chocolate bar jelly bean cake sugar plum cookie tiramisu dessert cupcake sweet lollipop liquorice dragee ice cream pastry shortbread halvah chupa chups sweet ice cream cheesecake pastry powder donut cake'

In [54]:
string

'\ncarrot cake brownie carrot cake ice cream croissant powder bear claw icing tiramisu souffle fruitcake carrot \ncake macaroon liquorice dragee sweet icing lollipop chocolate bar jelly beans\ncake sugar plum cookie tiramisu dessert cupcake sweet lollipop liquorice dragee ice cream pastry shortbread \nhalvah chupa chups sweet ice cream cheesecake pastry powder donut cake\n'

## 5. `remove_stopwords`
This function should accept some text and return the text after removing all the stopwords.
<br>
It should also define two parameters:
- `extra-words` any additional stop words (words we want removed that are not already listed as stop words)
- `exclude-words` any words you want excluded from stop words search (stop words we don't want removed)

In [57]:
# calls a list of common english stopwords
stopwords.words('english')[:5]

['i', 'me', 'my', 'myself', 'we']

In [63]:
# saving the list of english stopwords
stopword_list = stopwords.words('english')

# savings the words in the string as a list of strings for each indidvidual word
words = 'Mary had a little lamb. Little lamb.'.split()

# creating a for loop that will loop through each of the individual words in the string
#     and return a list of only the words that are not in the list of stopwords
[word for word in words \
    if word not in stopword_list]

['Mary', 'little', 'lamb.', 'Little', 'lamb.']

In [68]:
def remove_stopwords(string):
    '''
    This function takes in a string of words and splits it into a list of strings for each individual 
    word. It then loops through the list of words and returns a string of the joined string words, 
    excluding the stopwords.
    '''
    
    # splitting the string of words into a list of strings
    words = string.split()
    
    # saving the stop words
    stopword_list = stopwords.words('english')
    
    # looping through the list of words and creating a new list with the words not in the stopwords list
    filtered_words = [word for word in words if word not in stopword_list]
    
    # joining the list of words back to a string of words
    filtered_string = ' '.join(filtered_words)
    
    # printing number of words removed
    print(f'Removed {len(string) - len(filtered_string)} stop words.\n> Original string: {len(string)}\n> New string: {len(filtered_string)}')
    print()
    
    return filtered_string

In [69]:
# testing function
remove_stopwords('This little light of mine, I"m going to let it shine. Let it shine, let it shine, let it shine.')

Removed 18 stop words.
> Original string: 95
> New string: 77



'This little light mine, I"m going let shine. Let shine, let shine, let shine.'

## 6. `news_df`
Use your data from the acquire to produce a dataframe of the news articles. 

## 7. `codeup_df`
Make another dataframe from the codeup blog posts.

## 8. DataFrame columns
For each dataframe, produce the following columns:
- `title` to hold the title
- `original` to hold the original article/post content
- `clean` to hold the normalized and tokenized original with the stopwords removed.
- `stemmed` to hold the stemmed version of the cleaned data.
- `lemmatized` to hold the lemmatized version of the cleaned data.

## 9. Ask youself:
- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?