In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd
from time import strftime

import acquire

import warnings
warnings.filterwarnings('ignore')



## Exercises

#### The end result of this exercise should be a file named prepare.py that defines the requested functions.

#### In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.

1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

    - Lowercase everything
    - Normalize unicode characters
    - Replace anything that is not a letter, number, whitespace or a single quote.

2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

    - This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

8. For each dataframe, produce the following columns:

    - title to hold the title
    - original to hold the original article/post content
    - clean to hold the normalized and tokenized original with the stopwords removed.
    - stemmed to hold the stemmed version of the cleaned data.
    - lemmatized to hold the lemmatized version of the cleaned data.

9. Ask yourself:

    - If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
    - If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
    - If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?

In [2]:
# we will define a basic_clean function for a single document (one string)
def basic_clean(string):
    '''
    This function takes in a string and
    returns the string normalized.
    '''
    # we will normalize our data into standard NFKD unicode, feed it into an ascii encoding
    # decode it back into UTF-8
    string = unicodedata.normalize('NFKD', string)\
             .encode('ascii', 'ignore')\
             .decode('utf-8', 'ignore')
    # utilize our regex substitution to remove our undesirable characters, then lowercase
    string = re.sub(r"[^\w0-9'\s]", '', string).lower()
    return string

In [3]:
def tokenize(string):
    '''
    This function takes in a string and
    returns a tokenized string.
    '''
    # make our tokenizer, taken from nltk's ToktokTokenizer
    tokenizer = nltk.tokenize.ToktokTokenizer()
    # apply our tokenizer's tokenization to the string being input, ensure it returns a string
    string = tokenizer.tokenize(string, return_str = True)
    
    return string

In [4]:
def stem(string):
    '''
    This function takes in a string and
    returns a string with words stemmed.
    '''
    # create our stemming object
    ps = nltk.porter.PorterStemmer()
    # use a list comprehension => stem each word for each word inside of the entire document,
    # split by the default, which are single spaces
    stems = [ps.stem(word) for word in string.split()]
    # glue it back together with spaces, as it was before
    string = ' '.join(stems)
    
    return string

In [5]:
def lemmatize(string):
    '''
    This function takes in string for and
    returns a string with words lemmatized.
    '''
    # create our lemmatizer object
    wnl = nltk.stem.WordNetLemmatizer()
    # use a list comprehension to lemmatize each word
    # string.split() => output a list of every token inside of the document
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    # glue the lemmas back together by the strings we split on
    string = ' '.join(lemmas)
    #return the altered document
    return string

In [6]:
list1 = [1, 2, 3, 4]
list2 = [2, 1, 3, 4]

print(set(list1)==set(list2))

True


In [7]:
mylist = ['a', 'b', 'c', 'c', 'd']

myset = set(mylist)

print(mylist, myset)

['a', 'b', 'c', 'c', 'd'] {'b', 'a', 'c', 'd'}


In [8]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    '''
    This function takes in a string, optional extra_words and exclude_words parameters
    with default empty lists and returns a string.
    '''
    # assign our stopwords from nltk into stopword_list
    stopword_list = stopwords.words('english')
    # utilizing set casting, i will remove any excluded stopwords
    stopword_list = set(stopword_list) - set(exclude_words)
    # add in any extra words to my stopwords set using a union
    stopword_list = stopword_list.union(set(extra_words))
    # split our document by spaces
    words = string.split()
    # every word in our document, as long as that word is not in our stopwords
    filtered_words = [word for word in words if word not in stopword_list]
    # glue it back together with spaces, as it was so it shall be
    string_without_stopwords = ' '.join(filtered_words)
    # return the document back
    return string_without_stopwords

In [9]:
test_list = ['Hello', 'Jemison', 'How', 'are', 'you']
test_list

['Hello', 'Jemison', 'How', 'are', 'you']

In [10]:
'/'.join(test_list)

'Hello/Jemison/How/are/you'

In [11]:
news_df = acquire.get_news_articles_data()
news_df

Unnamed: 0,title,content,category
0,Twitter wins bid to fast-track trial over Musk...,Twitter's lawsuit seeking to hold Elon Musk to...,national
1,"Lyricist Jaani injured in accident in Punjab, ...",Renowned Punjabi lyricist and music composer J...,national
2,"Rare twin giraffes born in Kenya, pics surface",Rare twin giraffes have been born at Nairobi N...,national
3,She borrowed mother's stole: Father of girl al...,The father of a 17-year-old who claimed she wa...,national
4,Delhi-bound flight rejected take-off due to do...,A flight was rejected take-off from Leh due to...,national
...,...,...,...
295,NIA raids 3 Andhra locations in Chhattisgarh N...,National Investigation Agency (NIA) conducted ...,automobile
296,Strikes on Syria will destabilise Middle East:...,Iranian Supreme Leader Ayatollah Ali Khamenei ...,automobile
297,Turkey court upholds exit from key European tr...,A Turkish court ruled on Tuesday that Presiden...,automobile
298,"France probes origins of wildfire, 1 detained",French investigators detained a 39-year-old ma...,automobile


In [12]:
codeup_df = acquire.get_blog_articles_data()
codeup_df

Unnamed: 0,title,content
0,What Jobs Can You Get After a Coding Bootcamp?...,Have you been considering a career in Cloud Ad...
1,What Jobs Can You Get After a Coding Bootcamp?...,If you are interested in embarking on a career...
2,Is Our Cloud Administration Program Right for ...,Changing careers can be scary. The first thing...
3,5 Reasons To Attend Our New Cloud Administrati...,Come Work In The Cloud\nWhen your Monday rolls...
4,What Jobs Can You Get After a Coding Bootcamp?...,Have you been considering a career in Cloud Ad...
5,What Jobs Can You Get After a Coding Bootcamp?...,If you are interested in embarking on a career...
6,In-Person Workshop: Learn to Code – JavaScript...,Join us for our live in-person JavaScript cras...
7,In-Person Workshop: Learn to Code – Python on ...,"According to LinkedIn, the “#1 Most Promising ..."
8,Free JavaScript Workshop at Codeup Dallas on 6/28,Event Info: \nLocation – Codeup Dallas\nTime –...
9,Is Our Cloud Administration Program Right for ...,Changing careers can be scary. The first thing...


In [13]:
news_df.rename(columns={'content': 'original'}, inplace=True)
codeup_df.rename(columns={'content': 'original'}, inplace=True)

In [14]:
news_df

Unnamed: 0,title,original,category
0,Twitter wins bid to fast-track trial over Musk...,Twitter's lawsuit seeking to hold Elon Musk to...,national
1,"Lyricist Jaani injured in accident in Punjab, ...",Renowned Punjabi lyricist and music composer J...,national
2,"Rare twin giraffes born in Kenya, pics surface",Rare twin giraffes have been born at Nairobi N...,national
3,She borrowed mother's stole: Father of girl al...,The father of a 17-year-old who claimed she wa...,national
4,Delhi-bound flight rejected take-off due to do...,A flight was rejected take-off from Leh due to...,national
...,...,...,...
295,NIA raids 3 Andhra locations in Chhattisgarh N...,National Investigation Agency (NIA) conducted ...,automobile
296,Strikes on Syria will destabilise Middle East:...,Iranian Supreme Leader Ayatollah Ali Khamenei ...,automobile
297,Turkey court upholds exit from key European tr...,A Turkish court ruled on Tuesday that Presiden...,automobile
298,"France probes origins of wildfire, 1 detained",French investigators detained a 39-year-old ma...,automobile


In [15]:
codeup_df

Unnamed: 0,title,original
0,What Jobs Can You Get After a Coding Bootcamp?...,Have you been considering a career in Cloud Ad...
1,What Jobs Can You Get After a Coding Bootcamp?...,If you are interested in embarking on a career...
2,Is Our Cloud Administration Program Right for ...,Changing careers can be scary. The first thing...
3,5 Reasons To Attend Our New Cloud Administrati...,Come Work In The Cloud\nWhen your Monday rolls...
4,What Jobs Can You Get After a Coding Bootcamp?...,Have you been considering a career in Cloud Ad...
5,What Jobs Can You Get After a Coding Bootcamp?...,If you are interested in embarking on a career...
6,In-Person Workshop: Learn to Code – JavaScript...,Join us for our live in-person JavaScript cras...
7,In-Person Workshop: Learn to Code – Python on ...,"According to LinkedIn, the “#1 Most Promising ..."
8,Free JavaScript Workshop at Codeup Dallas on 6/28,Event Info: \nLocation – Codeup Dallas\nTime –...
9,Is Our Cloud Administration Program Right for ...,Changing careers can be scary. The first thing...


In [16]:
def prep_article_data(df, column, extra_words=[], exclude_words=[], stem=True, lemmatize=True):
    '''
    This function take in a df and the string name for a text column with 
    option to pass lists for extra_words and exclude_words and
    returns a df with the text article title, original text, stemmed text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords,
                                  extra_words=extra_words,
                                  exclude_words=exclude_words)
    
    df['stemmed'] = df['clean'].apply(stem)
    
    df['lemmatized'] = df['clean'].apply(lemmatize)
    
    return df[['title', column,'clean', 'stemmed', 'lemmatized']]

In [20]:
def prep_article_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function take in a df and the string name for a text column with 
    option to pass lists for extra_words and exclude_words and
    returns a df with the text article title, original text, stemmed text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords,
                                  extra_words=extra_words,
                                  exclude_words=exclude_words)
    
    df['stemmed'] = df['clean'].apply(stem)
    
    df['lemmatized'] = df['clean'].apply(lemmatize)
    
    return df[['title', column,'clean', 'stemmed', 'lemmatized']]

In [21]:
prep_article_data(news_df, 'original', extra_words = ['ha'], exclude_words = ['no'])

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/Users/Ray/nltk_data'
    - '/opt/homebrew/anaconda3/nltk_data'
    - '/opt/homebrew/anaconda3/share/nltk_data'
    - '/opt/homebrew/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [22]:
prep_article_data(codeup_df, 'original', extra_words = ['ha'], exclude_words = ['no']).head()

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/Users/Ray/nltk_data'
    - '/opt/homebrew/anaconda3/nltk_data'
    - '/opt/homebrew/anaconda3/share/nltk_data'
    - '/opt/homebrew/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************
