# NLP Data Prep Exercises

In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer

from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('all')
import pandas as pd
import prepare
import acquire

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ScottBarnett/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ScottBarnett/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/ScottBarnett/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     /Users/ScottBarnett/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/ScottBarnett/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/ScottBarnett/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[n

The end result of this exercise should be a file named prepare.py that defines the requested functions.

In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.

## 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:
* Lowercase everything
* Normalize unicode characters
* Replace anything that is not a letter, number, whitespace or a single quote.

In [2]:
def basic_clean(text):
    '''
    take in a string and apply some basic text cleaning to it:
    * Lowercase everything
    * Normalize unicode characters
    * Replace anything that is not a letter, number, whitespace or a single quote.
    '''
    text = text.lower()  # Lowercase everything
    tedfxt = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')  # Normalize unicode characters
    text = re.sub(r"[^a-z0-9\s']", '', text)  # Replace anything that is not a letter, number, whitespace, or single quote
    return text

## 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [3]:
def tokenize(text):
    '''
    take in a string and tokenize all the words in the string
    '''
    tokenizer = ToktokTokenizer()
    return tokenizer.tokenize(text)

## 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [4]:
def stem(text):
    '''
    accept some text and return the text after applying stemming to all the words
    '''
    stemmer = PorterStemmer()
    stemmer = nltk.stem.PorterStemmer()
    return ' '.join([stemmer.stem(word) for word in text.split()])

## 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [5]:
def lemmatize(text):
    '''
    accept some text and return the text after applying lemmatization to each word
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])


## 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.
This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [6]:
def remove_stopwords(text, extra_words=[], exclude_words=[]):
    '''
    accept some text and return the text after removing all the stopwords.
    This function defines two optional parameters, extra_words and exclude_words. These parameters define any additional stop words to include,
    and any words that we don't want to remove.
    '''
    stopword_list = stopwords.words('english')
    for word in extra_words:
        stopword_list.append(word)
    for word in exclude_words:
        stopword_list.remove(word)
    return ' '.join([word for word in text.split() if word not in stopword_list])


## 6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [7]:
news_df = acquire.get_news_articles()

In [8]:
news_df = pd.DataFrame(news_df)

In [9]:
news_df.head()

Unnamed: 0,title,content,category
0,"Sensex, Nifty end at fresh closing highs",Benchmark indices Sensex and Nifty ended at re...,business
1,Amazon tricked millions of customers into enro...,US Federal Trade Commission (FTC) has sued Ama...,business
2,TIME releases list of the world's 100 most inf...,TIME magazine has released its annual list of ...,business
3,Which are the world's top 10 airlines accordin...,Singapore Airlines is the world's best airline...,business
4,"Loves India, is a fan of PM: Paytm Founder on ...",Paytm Founder Vijay Shekhar Sharma shared a vi...,business


## 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

urls = ['https://codeup.com/featured/apida-heritage-month/','https://codeup.com/featured/women-in-tech-panelist-spotlight/','https://codeup.com/featured/women-in-tech-rachel-robbins-mayhill/','https://codeup.com/codeup-news/women-in-tech-panelist-spotlight-sarah-mellor/','https://codeup.com/events/women-in-tech-madeleine/']

In [10]:
urls = ['https://codeup.com/featured/apida-heritage-month/','https://codeup.com/featured/women-in-tech-panelist-spotlight/','https://codeup.com/featured/women-in-tech-rachel-robbins-mayhill/','https://codeup.com/codeup-news/women-in-tech-panelist-spotlight-sarah-mellor/','https://codeup.com/events/women-in-tech-madeleine/']

In [11]:
codeup_df = acquire.get_articles_texts(urls)

In [12]:
codeup_df = pd.DataFrame(codeup_df)

In [13]:
codeup_df.head()

Unnamed: 0,title,content
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American a...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena ...
2,Women in tech: Panelist Spotlight – Rachel Rob...,Women in tech: Panelist Spotlight – Rachel Rob...
3,Women in Tech: Panelist Spotlight – Sarah Mellor,Women in tech: Panelist Spotlight – Sarah Mell...
4,Women in Tech: Panelist Spotlight – Madeleine ...,Women in tech: Panelist Spotlight – Madeleine ...


## 8. For each dataframe, produce the following columns:
* title to hold the title
* original to hold the original article/post content
* clean to hold the normalized and tokenized original with the stopwords removed.
* stemmed to hold the stemmed version of the cleaned data.
* lemmatized to hold the lemmatized version of the cleaned data.

In [14]:
# For news_df
news_df = news_df.rename(columns={ 'content':'original'})

In [15]:
news_df.head(2)

Unnamed: 0,title,original,category
0,"Sensex, Nifty end at fresh closing highs",Benchmark indices Sensex and Nifty ended at re...,business
1,Amazon tricked millions of customers into enro...,US Federal Trade Commission (FTC) has sued Ama...,business


In [16]:
news_df['clean'] = news_df['original'].apply(basic_clean).apply(tokenize).apply(lambda x: ' '.join(x))

In [17]:
news_df.head(2)

Unnamed: 0,title,original,category,clean
0,"Sensex, Nifty end at fresh closing highs",Benchmark indices Sensex and Nifty ended at re...,business,benchmark indices sensex and nifty ended at re...
1,Amazon tricked millions of customers into enro...,US Federal Trade Commission (FTC) has sued Ama...,business,us federal trade commission ftc has sued amazo...


In [18]:
stemmer = PorterStemmer()
news_df['stemmed'] = news_df['clean'].apply(tokenize).apply(lambda x: [stemmer.stem(word) for word in x]).apply(lambda x: ' '.join(x))

In [19]:
news_df.head(2)

Unnamed: 0,title,original,category,clean,stemmed
0,"Sensex, Nifty end at fresh closing highs",Benchmark indices Sensex and Nifty ended at re...,business,benchmark indices sensex and nifty ended at re...,benchmark indic sensex and nifti end at record...
1,Amazon tricked millions of customers into enro...,US Federal Trade Commission (FTC) has sued Ama...,business,us federal trade commission ftc has sued amazo...,us feder trade commiss ftc ha su amazon accus ...


In [20]:
lemmatizer = WordNetLemmatizer()
news_df['lemmatized'] = news_df['clean'].apply(tokenize).apply(lambda x: [lemmatizer.lemmatize(word) for word in x]).apply(lambda x: ' '.join(x))

In [21]:
news_df.head(2)

Unnamed: 0,title,original,category,clean,stemmed,lemmatized
0,"Sensex, Nifty end at fresh closing highs",Benchmark indices Sensex and Nifty ended at re...,business,benchmark indices sensex and nifty ended at re...,benchmark indic sensex and nifti end at record...,benchmark index sensex and nifty ended at reco...
1,Amazon tricked millions of customers into enro...,US Federal Trade Commission (FTC) has sued Ama...,business,us federal trade commission ftc has sued amazo...,us feder trade commiss ftc ha su amazon accus ...,u federal trade commission ftc ha sued amazon ...


In [22]:
# For codeup_df
codeup_df.head(2)

Unnamed: 0,title,content
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American a...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena ...


In [23]:
codeup_df = codeup_df.rename(columns={ 'content':'original'})
codeup_df['clean'] = codeup_df['original'].apply(basic_clean).apply(tokenize).apply(lambda x: ' '.join(x))
stemmer = PorterStemmer()
codeup_df['stemmed'] = codeup_df['clean'].apply(tokenize).apply(lambda x: [stemmer.stem(word) for word in x]).apply(lambda x: ' '.join(x))
lemmatizer = WordNetLemmatizer()
codeup_df['lemmatized'] = codeup_df['clean'].apply(tokenize).apply(lambda x: [lemmatizer.lemmatize(word) for word in x]).apply(lambda x: ' '.join(x))

In [24]:
codeup_df.head(2)

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American a...,may is traditionally known as asian american a...,may is tradit known as asian american and paci...,may is traditionally known a asian american an...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena ...,women in tech panelist spotlight magdalena rah...,women in tech panelist spotlight magdalena rah...,woman in tech panelist spotlight magdalena rah...


## 9. Ask yourself:
* If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
* If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
* If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?

# If the corpus is 493KB, using lemmatized text would be more preferable as it retains the base form of words and provides better interpretability.

# If the corpus is 25MB, using lemmatized text would still be preferable as it can handle larger text sizes without significant increase in computational resources.

# If the corpus is 200TB and computational resources are charged by the megabyte, using stemmed text would be more preferable. Stemming is a simpler process compared to lemmatization and results in smaller text sizes, reducing the computational cost significantly.

## Testing prepare functions

In [25]:
news_df = acquire.get_news_articles()

In [26]:
news_df = pd.DataFrame(news_df)

In [27]:
news_df = news_df.rename(columns={'content': 'original'})
news_df['clean'] = news_df['original'].apply(prepare.basic_clean).apply(prepare.tokenize).apply(lambda x: ' '.join(x))
stemmer = PorterStemmer()
news_df['stemmed'] = news_df['clean'].apply(prepare.tokenize).apply(lambda x: [stemmer.stem(word) for word in x]).apply(lambda x: ' '.join(x))
lemmatizer = WordNetLemmatizer()
news_df['lemmatized'] = news_df['clean'].apply(prepare.tokenize).apply(lambda x: [lemmatizer.lemmatize(word) for word in x]).apply(lambda x: ' '.join(x))


In [28]:
news_df.head(3)

Unnamed: 0,title,original,category,clean,stemmed,lemmatized
0,"Sensex, Nifty end at fresh closing highs",Benchmark indices Sensex and Nifty ended at re...,business,benchmark indices sensex and nifty ended at re...,benchmark indic sensex and nifti end at record...,benchmark index sensex and nifty ended at reco...
1,Amazon tricked millions of customers into enro...,US Federal Trade Commission (FTC) has sued Ama...,business,us federal trade commission ftc has sued amazo...,us feder trade commiss ftc ha su amazon accus ...,u federal trade commission ftc ha sued amazon ...
2,TIME releases list of the world's 100 most inf...,TIME magazine has released its annual list of ...,business,time magazine has released its annual list of ...,time magazin ha releas it annual list of the w...,time magazine ha released it annual list of th...


In [29]:
codeup_df = acquire.get_articles_texts(urls)

In [30]:
codeup_df = pd.DataFrame(codeup_df)

In [31]:
codeup_df = codeup_df.rename(columns={'content': 'original'})
codeup_df['clean'] = codeup_df['original'].apply(prepare.basic_clean).apply(prepare.tokenize).apply(lambda x: ' '.join(x))
stemmer = PorterStemmer()
codeup_df['stemmed'] = codeup_df['clean'].apply(prepare.tokenize).apply(lambda x: [stemmer.stem(word) for word in x]).apply(lambda x: ' '.join(x))
lemmatizer = WordNetLemmatizer()
codeup_df['lemmatized'] = codeup_df['clean'].apply(prepare.tokenize).apply(lambda x: [lemmatizer.lemmatize(word) for word in x]).apply(lambda x: ' '.join(x))

In [32]:
codeup_df.head(3)

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American a...,may is traditionally known as asian american a...,may is tradit known as asian american and paci...,may is traditionally known a asian american an...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena ...,women in tech panelist spotlight magdalena rah...,women in tech panelist spotlight magdalena rah...,woman in tech panelist spotlight magdalena rah...
2,Women in tech: Panelist Spotlight – Rachel Rob...,Women in tech: Panelist Spotlight – Rachel Rob...,women in tech panelist spotlight rachel robbin...,women in tech panelist spotlight rachel robbin...,woman in tech panelist spotlight rachel robbin...


In [33]:
urls =  ['https://codeup.com/featured/apida-heritage-month/','https://codeup.com/featured/women-in-tech-panelist-spotlight/','https://codeup.com/featured/women-in-tech-rachel-robbins-mayhill/','https://codeup.com/codeup-news/women-in-tech-panelist-spotlight-sarah-mellor/','https://codeup.com/events/women-in-tech-madeleine/']
prepare.wrangle_codeup(urls)

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American a...,may is traditionally known as asian american a...,may is tradit known as asian american and paci...,may is traditionally known a asian american an...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena ...,women in tech panelist spotlight magdalena rah...,women in tech panelist spotlight magdalena rah...,woman in tech panelist spotlight magdalena rah...
2,Women in tech: Panelist Spotlight – Rachel Rob...,Women in tech: Panelist Spotlight – Rachel Rob...,women in tech panelist spotlight rachel robbin...,women in tech panelist spotlight rachel robbin...,woman in tech panelist spotlight rachel robbin...
3,Women in Tech: Panelist Spotlight – Sarah Mellor,Women in tech: Panelist Spotlight – Sarah Mell...,women in tech panelist spotlight sarah mellor ...,women in tech panelist spotlight sarah mellor ...,woman in tech panelist spotlight sarah mellor ...
4,Women in Tech: Panelist Spotlight – Madeleine ...,Women in tech: Panelist Spotlight – Madeleine ...,women in tech panelist spotlight madeleine cap...,women in tech panelist spotlight madelein capp...,woman in tech panelist spotlight madeleine cap...
