In [None]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

The end result of this exercise should be a file named prepare.py that defines the requested functions.

In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both 
- the codeup blog articles 
- and the news articles that were previously acquired.

# ACQUIRE

In [1]:
# Fetch the raw article text through the project-local `acquire` module and
# echo it for a visual check. `original` is kept untouched by later cells.
# NOTE(review): the recorded output for this cell is a NameError for
# 'acquire' -- presumably the import cell was not run first; confirm.
original = acquire.get_article_text()
print(original)

NameError: name 'acquire' is not defined

# PREPARE

## 1. PREPARE - Convert text to all lower case for normalcy.

In [None]:
# Step 1: lowercase the raw text. `article` is the working copy that the
# following cells keep overwriting; `original` itself is not modified.
article = original.lower()
print(article)

## 2. PREPARE - Remove any accented characters, non-ASCII characters.

In [None]:
# Step 2: split accented characters into base letter + combining mark (NFKD),
# drop everything that cannot be encoded as ASCII, then turn the surviving
# bytes back into a str.
decomposed = unicodedata.normalize('NFKD', article)
ascii_bytes = decomposed.encode('ascii', 'ignore')
article = ascii_bytes.decode('utf-8', 'ignore')

print(article)

## 3. PREPARE - Remove special characters.

In [None]:
# Step 3: delete every character that is not a lowercase letter, a digit,
# a single quote, or whitespace.
disallowed = re.compile(r"[^a-z0-9'\s]")
article = disallowed.sub('', article)
print(article)

### Tokenization - Remove Special Characters

In [None]:
# NOTE(review): this tokenizes `original`, not the cleaned `article` built in
# the cells above, and the result is only printed -- confirm that is intended.
tokenizer = nltk.tokenize.ToktokTokenizer()

tokenized_text = tokenizer.tokenize(original, return_str=True)
print(tokenized_text)

## 4. PREPARE - Stem or lemmatize the words.

### Stem

In [None]:
# Build one Porter stemmer and display how three inflections of "call"
# are stemmed.
ps = nltk.porter.PorterStemmer()

tuple(ps.stem(form) for form in ('call', 'called', 'calling'))

In [None]:
# Stem every whitespace-separated word of the cleaned article, then glue the
# stems back together with single spaces.
stems = list(map(ps.stem, article.split()))
article_stemmed = ' '.join(stems)
print(article_stemmed)

In [None]:
# Frequency table of the stems; show the ten most common.
stem_counts = pd.Series(stems).value_counts()
stem_counts.head(10)

### Lemmatization

In [None]:
# Build a WordNet lemmatizer and print stem vs. lemma side by side for two
# forms of the same word.
wnl = nltk.stem.WordNetLemmatizer()

for word in ('study', 'studies'):
    print('stem:', ps.stem(word), '-- lemma:', wnl.lemmatize(word))

In [None]:
# Lemmatize every whitespace-separated word of the cleaned article, then glue
# the lemmas back together with single spaces.
lemmas = list(map(wnl.lemmatize, article.split()))
article_lemmatized = ' '.join(lemmas)

print(article_lemmatized)

In [None]:
# Frequency table of the lemmas; show the first ten rows.
lemma_counts = pd.Series(lemmas).value_counts()
lemma_counts[:10]

## 5. PREPARE - Remove stopwords.

In [None]:
# Start from NLTK's English stopword list, but take 'no' and 'not' out of it
# so those two words are NOT filtered from the text later on.
stopword_list = stopwords.words('english')

for negation in ('no', 'not'):
    stopword_list.remove(negation)

stopword_list[:10]

In [None]:
# Number of stopwords left in the list built in the previous cell.
len(stopword_list)

In [None]:
# Drop every word of the cleaned article that appears in `stopword_list`,
# report how many were dropped, then print the remaining text.
words = article.split()
filtered_words = list(filter(lambda w: w not in stopword_list, words))

removed_count = len(words) - len(filtered_words)
print('Removed {} stopwords'.format(removed_count))
print('---')

article_without_stopwords = ' '.join(filtered_words)

print(article_without_stopwords)

## 6. PREPARE -  Store the clean text and the original text for use in future notebooks.

1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Remove anything that is not a letter, number, whitespace, or a single quote.

In [None]:
def basic_clean(string):
    '''
    Apply basic text cleaning to a string and return the cleaned string.

    Steps, in order:
    1. Lowercase everything.
    2. Normalize unicode (NFKD) and drop any character that has no ASCII
       equivalent, so accented letters become their plain counterparts.
    3. Remove anything that is not a lowercase letter, a digit, whitespace,
       or a single quote.
    '''
    # Lowercase FIRST: the filter in step 3 only whitelists a-z, so running it
    # on mixed-case text would delete every capital letter instead of
    # lowercasing it.
    string = string.lower()

    # NFKD splits an accented character into base letter + combining mark;
    # encoding to ASCII with 'ignore' then discards the mark (and any other
    # non-ASCII character), and decoding turns the bytes back into a str.
    string = unicodedata.normalize('NFKD', string)\
             .encode('ascii', 'ignore')\
             .decode('utf-8', 'ignore')

    # Strip every remaining special character.
    return re.sub(r"[^a-z0-9'\s]", '', string)
    

2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [None]:
def tokenize(string):
    '''
    Tokenize a string with NLTK's ToktokTokenizer.

    The tokenizer is called with return_str=True, so the result is handed
    back as a single string rather than as a list of tokens.
    '''
    return nltk.tokenize.ToktokTokenizer().tokenize(string, return_str=True)

3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [None]:
def stem(text):
    '''
    Accept some text and return it with every word replaced by its stem.

    The text is split on whitespace, each word is passed through NLTK's
    Porter stemmer, and the stems are joined back together with single
    spaces. An empty string yields an empty string.
    '''
    # create the stemmer
    ps = nltk.porter.PorterStemmer()

    # Stem the words of the ARGUMENT (not a notebook-level variable) and
    # return the stemmed result rather than the untouched input.
    return ' '.join(ps.stem(word) for word in text.split())

4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [None]:
def lemmatize(text):
    '''
    Accept some text and return it with every word replaced by its lemma.

    The text is split on whitespace, each word is passed through NLTK's
    WordNetLemmatizer, and the lemmas are joined back together with single
    spaces. An empty string yields an empty string.

    The lemmatizer needs the WordNet corpus to be available to NLTK.
    '''
    # create the lemmatizer
    wnl = nltk.stem.WordNetLemmatizer()

    # Lemmatize each word of the argument and rebuild the string.
    return ' '.join(wnl.lemmatize(word) for word in text.split())

5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [None]:
def remove_stopwords(text, extra_words=None, exclude_words=None):
    '''
    Accept some text and return it with all stopwords removed.

    The base list is NLTK's English stopword list with 'no' and 'not' taken
    out, so those two words are kept in the text by default.

    Parameters:
    - text: the string to filter; it is split on whitespace.
    - extra_words: optional iterable of additional words to treat as
      stopwords (pass a list/set of words, not a bare string).
    - exclude_words: optional iterable of words that must NOT be removed,
      even if they are in the stopword list.

    Returns the remaining words joined with single spaces.
    '''
    # A set makes the per-word membership test below constant time.
    stopword_set = set(stopwords.words('english'))

    # Keep the negations in the text by default.
    stopword_set -= {'no', 'not'}

    # Apply the caller's adjustments: additions first, then exclusions, so
    # an explicit exclusion always wins.
    stopword_set |= set(extra_words or ())
    stopword_set -= set(exclude_words or ())

    # Filter the ARGUMENT (not a notebook-level variable) and return the
    # filtered text rather than the untouched input.
    return ' '.join(word for word in text.split() if word not in stopword_set)

6. Use your data from the acquire step to produce a dataframe of the news articles. Name the dataframe news_df.

7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

8. For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

9. Ask Yourself

- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
     - Lemmatized text: the dataset is small, and lemmatizing identifies the meaning of each word more accurately by reducing it to its lexicographically correct root word.
- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
    - It depends on how much time I have. The dataset is larger, and although lemmatizing identifies the meaning of each word more accurately, it is considerably slower on larger datasets, so stemmed text could be the better choice if I am short on time. If time is not a constraint, I would want the most accurate results and would use lemmatized text.
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?
    - Stemmed text: the dataset is very large, and although lemmatizing identifies the meaning of each word more accurately, it is considerably slower on larger datasets, which matters when computation is billed by the megabyte.
