# Exercises

In [1]:
# imports
import numpy as np
import pandas as pd

import acquire

import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer

from time import strftime

import unicodedata

import json

## 1. `basic_clean` function
This function should take in a string and apply some basic text cleaning to it:
1. Lowercase everything,
2. Normalize unicode characters, and
3. Replace anything that is not a letter, number, whitespace or a single quote

In [2]:
# 1. lowercase
string = '''
Carrot cake brownie carrot cake ice cream croissant powder bear claw. Icing tiramisu soufflé fruitcake carrot 
cake macaroon liquorice. Dragée sweet icing lollipop chocolate bar jelly beans.
Cake sugar plum cookie tiramisu dessert cupcake sweet lollipop liquorice. Dragée ice cream pastry shortbread 
halvah chupa chups sweet ice cream. Cheesecake pastry powder donut cake.
'''.lower()#.strip().lower()

# 2. normalize unicode
string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8', 'ignore')

# 3. replace non-alphanumeric characters
string = re.sub(r"[^a-z0-9'\s]", '', string)#.replace('\n', ' ')

string

'\ncarrot cake brownie carrot cake ice cream croissant powder bear claw icing tiramisu souffle fruitcake carrot \ncake macaroon liquorice dragee sweet icing lollipop chocolate bar jelly beans\ncake sugar plum cookie tiramisu dessert cupcake sweet lollipop liquorice dragee ice cream pastry shortbread \nhalvah chupa chups sweet ice cream cheesecake pastry powder donut cake\n'

In [3]:
def basic_clean(string):
    '''
    This function takes in a string as a paramenter and performs the following basic cleaning functions:
        1. lowercase,
        2. normalize unicode, and
        3. remove non-alphanumeric characters
    
    The function returns the cleaned string.
    '''
    # 1. lowercase
    string = string.strip().lower()
    
    # 2. normalize unicode
    string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    
    # 3. replace non-alphanumeric characters
    string = re.sub(r"[^a-z0-9'\s]", '', string)
    
    return string

In [4]:
# tesitng the function
basic_clean("""
-9p]o.6ykrq` \14v3tu
7ig'kots64s[ai.o'[k/
jmnr/;ilyetqams6/;3f
h821\e83qyw69eme.;r,
rsvb,l72qzfmv33bm,u=
7p 7ditj\dzjpr9s=icm
qva,d],p;sz7vu[wglvx
n1trtu;weit5pq8j0q7o
rls9dozsc`ruop8m.-\0
nz8lr=cfvksu/bhd-2d'
""")

"9po6ykrq \x0cv3tu\n7ig'kots64saio'k\njmnrilyetqams63f\nh821e83qyw69emer\nrsvbl72qzfmv33bmu\n7p 7ditjdzjpr9sicm\nqvadpsz7vuwglvx\nn1trtuweit5pq8j0q7o\nrls9dozscruop8m\nnz8lrcfvksubhd2d'"

## 2. `tokenize` function
This function should take in a string and tokenize all the words in the string.

In [5]:
test_string = """
-9p]o.6ykrq` \14v3tu
7ig'kots64s[ai.o'[k/
jmnr/;ilyetqams6/;3f
h821\e83qyw69eme.;r,
rsvb,l72qzfmv33bm,u=
7p 7ditj\dzjpr9s=icm
qva,d],p;sz7vu[wglvx
n1trtu;weit5pq8j0q7o
rls9dozsc`ruop8m.-\0
nz8lr=cfvksu/bhd-2d'
"""

In [6]:
tokenizer = nltk.tokenize.ToktokTokenizer()

tokenizer.tokenize(test_string, return_str=True)

"-9p ] o.6ykrq ` \x0cv3tu\n7ig ' kots64s[ ai.o ' [ k/\njmnr/ ; ilyetqams6/ ; 3f\nh821\\e83qyw69eme. ; r , \nrsvb , l72qzfmv33bm , u=\n7p 7ditj\\dzjpr9s=icm\nqva , d ] , p ; sz7vu[ wglvx\nn1trtu ; weit5pq8j0q7o\nrls9dozsc ` ruop8m.-\x00\nnz8lr=cfvksu/bhd-2d '"

In [7]:
def tokenize(string):
    '''
    This function takes in a string and returns the tokenized version of that string.
    '''
    
    # creating tokenizer object
    tokenizer = nltk.tokenize.ToktokTokenizer()
    
    # using tokenizer object on string
    string = tokenizer.tokenize(string, return_str=True)
    
    return string
    

In [8]:
tokenize('''
This is a string. Let's see what happens when we tokenize it!
''')

"This is a string. Let ' s see what happens when we tokenize it !"

## 3. `stem` function
This function should accept some text and return the text after applying stemming to all the words.

In [9]:
def stem(string):
    '''
    This function takes in a string, stems each individual word, and then joins
    the stem words back together in the returned string.
    '''
    
    # creating the stem object
    ps = nltk.porter.PorterStemmer()
    
    # creating the stems for each individual word in the string
    stems = [ps.stem(word) for word in string.split()]
    
    # putting the stemmed words back together into string
    string = ' '.join(stems)
    
    return string

In [10]:
stem(string)

'carrot cake browni carrot cake ice cream croissant powder bear claw ice tiramisu souffl fruitcak carrot cake macaroon liquoric drage sweet ice lollipop chocol bar jelli bean cake sugar plum cooki tiramisu dessert cupcak sweet lollipop liquoric drage ice cream pastri shortbread halvah chupa chup sweet ice cream cheesecak pastri powder donut cake'

In [11]:
pd.Series(string.split()).value_counts().head(
)

cake      5
cream     3
carrot    3
sweet     3
ice       3
dtype: int64

## 4. `lemmatize` function
This function should accept some text and return the text after applying lemmatization to each word.

In [12]:
def lemmatize(string):
    '''
    
    '''
    
    # create the lemmatization object
    wnl = nltk.stem.WordNetLemmatizer()
    
    # creating a list of string of each word in the article and applying the lemmatize object to each word
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    
    # joining the individual list of lemma string words to a single string of words
    string = ' '.join(lemmas)
    
    return string

In [13]:
lemmatize(string)

'carrot cake brownie carrot cake ice cream croissant powder bear claw icing tiramisu souffle fruitcake carrot cake macaroon liquorice dragee sweet icing lollipop chocolate bar jelly bean cake sugar plum cookie tiramisu dessert cupcake sweet lollipop liquorice dragee ice cream pastry shortbread halvah chupa chups sweet ice cream cheesecake pastry powder donut cake'

In [14]:
string

'\ncarrot cake brownie carrot cake ice cream croissant powder bear claw icing tiramisu souffle fruitcake carrot \ncake macaroon liquorice dragee sweet icing lollipop chocolate bar jelly beans\ncake sugar plum cookie tiramisu dessert cupcake sweet lollipop liquorice dragee ice cream pastry shortbread \nhalvah chupa chups sweet ice cream cheesecake pastry powder donut cake\n'

## 5. `remove_stopwords`
This function should accept some text and return the text after removing all the stopwords.
<br>
It should also define two parameters:
- `extra-words` any additional stop words (words we want removed that are not already listed as stop words)
- `exclude-words` any words you want excluded from stop words search (stop words we don't want removed)

In [15]:
# calls a list of common english stopwords
stopwords.words('english')[:5]

['i', 'me', 'my', 'myself', 'we']

In [16]:
# saving the list of english stopwords
stopword_list = stopwords.words('english')

# savings the words in the string as a list of strings for each indidvidual word
words = 'Mary had a little lamb. Little lamb.'.split()

# creating a for loop that will loop through each of the individual words in the string
#     and return a list of only the words that are not in the list of stopwords
[word for word in words \
    if word not in stopword_list]

['Mary', 'little', 'lamb.', 'Little', 'lamb.']

In [17]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    '''
    This function takes in a string of words and splits it into a list of strings for each individual 
    word. It then loops through the list of words and returns a string of the joined string words, 
    excluding the stopwords.
    '''
    
    # splitting the string of words into a list of strings
    words = string.split()
    
    # saving the stop words
    stopword_list = stopwords.words('english')
    
    # looping through the list of words and creating a new list with the words not in the stopwords list
    filtered_words = [word for word in words if word not in stopword_list]
    
    # joining the list of words back to a string of words
    filtered_string = ' '.join(filtered_words)
    
    # printing number of words removed
    print(f'Removed {len(string) - len(filtered_string)} stop words.\n> Original string: {len(string)}\n> New string: {len(filtered_string)}')
    print()
    
    return filtered_string

In [18]:
# testing function
remove_stopwords('This little light of mine, I"m going to let it shine. Let it shine, let it shine, let it shine.')

Removed 18 stop words.
> Original string: 95
> New string: 77



'This little light mine, I"m going let shine. Let shine, let shine, let shine.'

## 6. `news_df`
Use your data from the acquire to produce a dataframe of the news articles. 

In [19]:
inshorts = acquire.get_inshorts_articles()
inshorts.head()

Unnamed: 0,title,author,content,date,category
0,LIC files draft papers with SEBI to seek appro...,Pragya Swastik,State-run Life Insurance Corporation of India ...,"13 Feb 2022,Sunday",business
1,We have lost a visionary: Rahul Gandhi on Rahu...,Shalini Ojha,Former Congress chief Rahul Gandhi condoled th...,"12 Feb 2022,Saturday",business
2,Piyush Goyal shares meme featuring Shark Tank'...,Apaar Sharma,Union Minister Piyush Goyal has shared a Shark...,"13 Feb 2022,Sunday",business
3,"I've made multi-billion dollar businesses, it ...",Daisy Mowke,"Speaking on Figuring Out podcast, BharatPe Co-...","12 Feb 2022,Saturday",business
4,13-yr-old girl gets ₹50 lakh funding on Shark ...,Ridham Gambhir,A Class 8 girl became the youngest contestant ...,"13 Feb 2022,Sunday",business


## 7. `codeup_df`
Make another dataframe from the codeup blog posts.

In [20]:
codeup_blogs = acquire.get_blog_articles()
codeup_blogs.head()

Unnamed: 0,title,published,content
0,Codeup Dallas Open House,"Nov 30, 2021",Come join us for the re-opening of our Dallas ...
1,Codeup’s Placement Team Continues Setting Records,"Nov 19, 2021",Our Placement Team is simply defined as a grou...
2,"IT Certifications 101: Why They Matter, and Wh...","Nov 18, 2021","AWS, Google, Azure, Red Hat, CompTIA…these are..."
3,A rise in cyber attacks means opportunities fo...,"Nov 17, 2021","In the last few months, the US has experienced..."
4,Use your GI Bill® benefits to Land a Job in Tech,"Nov 4, 2021","As the end of military service gets closer, ma..."


## 8. DataFrame columns
For each dataframe, produce the following columns:
- `title` to hold the title
- `original` to hold the original article/post content
- `clean` to hold the normalized and tokenized original with the stopwords removed.
- `stemmed` to hold the stemmed version of the cleaned data.
- `lemmatized` to hold the lemmatized version of the cleaned data.

In [21]:
inshorts.head()

Unnamed: 0,title,author,content,date,category
0,LIC files draft papers with SEBI to seek appro...,Pragya Swastik,State-run Life Insurance Corporation of India ...,"13 Feb 2022,Sunday",business
1,We have lost a visionary: Rahul Gandhi on Rahu...,Shalini Ojha,Former Congress chief Rahul Gandhi condoled th...,"12 Feb 2022,Saturday",business
2,Piyush Goyal shares meme featuring Shark Tank'...,Apaar Sharma,Union Minister Piyush Goyal has shared a Shark...,"13 Feb 2022,Sunday",business
3,"I've made multi-billion dollar businesses, it ...",Daisy Mowke,"Speaking on Figuring Out podcast, BharatPe Co-...","12 Feb 2022,Saturday",business
4,13-yr-old girl gets ₹50 lakh funding on Shark ...,Ridham Gambhir,A Class 8 girl became the youngest contestant ...,"13 Feb 2022,Sunday",business


In [23]:
news = inshorts[['title', 'content']].rename(columns = {'content': 'original'})
news.head()

Unnamed: 0,title,original
0,LIC files draft papers with SEBI to seek appro...,State-run Life Insurance Corporation of India ...
1,We have lost a visionary: Rahul Gandhi on Rahu...,Former Congress chief Rahul Gandhi condoled th...
2,Piyush Goyal shares meme featuring Shark Tank'...,Union Minister Piyush Goyal has shared a Shark...
3,"I've made multi-billion dollar businesses, it ...","Speaking on Figuring Out podcast, BharatPe Co-..."
4,13-yr-old girl gets ₹50 lakh funding on Shark ...,A Class 8 girl became the youngest contestant ...


In [27]:
# adding clean column
news['clean'] = news.original.apply(basic_clean)

In [30]:
# adding stem column
news['stemmed'] = news.clean.apply(stem)

In [32]:
# adding lemmatized column (from clean column)
news['lemmatized'] = news.clean.apply(lemmatize)

In [34]:
news.head()

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,LIC files draft papers with SEBI to seek appro...,State-run Life Insurance Corporation of India ...,staterun life insurance corporation of india l...,staterun life insur corpor of india lic on sun...,staterun life insurance corporation of india l...
1,We have lost a visionary: Rahul Gandhi on Rahu...,Former Congress chief Rahul Gandhi condoled th...,former congress chief rahul gandhi condoled th...,former congress chief rahul gandhi condol the ...,former congress chief rahul gandhi condoled th...
2,Piyush Goyal shares meme featuring Shark Tank'...,Union Minister Piyush Goyal has shared a Shark...,union minister piyush goyal has shared a shark...,union minist piyush goyal ha share a shark tan...,union minister piyush goyal ha shared a shark ...
3,"I've made multi-billion dollar businesses, it ...","Speaking on Figuring Out podcast, BharatPe Co-...",speaking on figuring out podcast bharatpe cofo...,speak on figur out podcast bharatp cofound ash...,speaking on figuring out podcast bharatpe cofo...
4,13-yr-old girl gets ₹50 lakh funding on Shark ...,A Class 8 girl became the youngest contestant ...,a class 8 girl became the youngest contestant ...,a class 8 girl becam the youngest contest to p...,a class 8 girl became the youngest contestant ...


In [42]:
# function to perform above

def cleaners(df, cols_to_keep, col_to_clean):
    '''
    This function takes in:
    - df
    - cols_to_keep >> list of columns from the original df to keep
    - col_to_clean >> series column that prepare functions will be applied to
    and retuns:
    a df with the cols_to_keep added to the cleaned, stemmed, and lemmed columns.
    '''
    df = df[cols_to_keep]
    
    df['clean'] = df[col_to_clean].apply(basic_clean)
    df['stemmed'] = df.clean.apply(stem)
    df['lemmatized'] = df.clean.apply(lemmatize)
    
    return df

In [45]:
cleaners(codeup_blogs, ['title', 'content'], 'content').head()

Unnamed: 0,title,content,clean,stemmed,lemmatized
0,Codeup Dallas Open House,Come join us for the re-opening of our Dallas ...,come join us for the reopening of our dallas c...,come join us for the reopen of our dalla campu...,come join u for the reopening of our dallas ca...
1,Codeup’s Placement Team Continues Setting Records,Our Placement Team is simply defined as a grou...,our placement team is simply defined as a grou...,our placement team is simpli defin as a group ...,our placement team is simply defined a a group...
2,"IT Certifications 101: Why They Matter, and Wh...","AWS, Google, Azure, Red Hat, CompTIA…these are...",aws google azure red hat comptiathese are big ...,aw googl azur red hat comptiathes are big name...,aws google azure red hat comptiathese are big ...
3,A rise in cyber attacks means opportunities fo...,"In the last few months, the US has experienced...",in the last few months the us has experienced ...,in the last few month the us ha experienc doze...,in the last few month the u ha experienced doz...
4,Use your GI Bill® benefits to Land a Job in Tech,"As the end of military service gets closer, ma...",as the end of military service gets closer man...,as the end of militari servic get closer mani ...,a the end of military service get closer many ...
