In [1]:
import pandas as pd
import numpy as np

import re
import unicodedata
import nltk
from nltk.corpus import stopwords

import acquire

### Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace.

In [29]:
def basic_clean(some_string):
    '''
    basic_clean will take in a single string as an argument,
    apply unicode normalization and ascii encoding,
    then decode and use regex to replace anything thats not a letter, number,
    or whitespace
    
    return: a cleaned version of some_string
    '''
    some_string = unicodedata.normalize('NFKD', some_string).encode('ascii', 'ignore').\
    decode('utf-8').lower()
    return re.sub(r'[^a-z0-9\s]', '', some_string)

In [3]:
basic_clean('Hi there here is some content! wow thats what I call content!!')

'hi there here is some content wow thats what i call content'

### Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [4]:
def tokenize(some_string):
    '''
    Tokenize will take in a single argument (a string) and 
    return: a single tokenized string version of the input string
    '''
    tokenizer = nltk.tokenize.ToktokTokenizer()
    return tokenizer.tokenize(some_string, return_str=True)

In [5]:
tokenize('This is some content I want to test tokenization on!')

'This is some content I want to test tokenization on !'

### Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [6]:
def stem(some_string):
    '''
    stem will take in a single string instance and stem the contents
    it will return a single string
    '''
    # make my stemmer object
    stemmer = nltk.porter.PorterStemmer()
    # return the joined back together version of
    # the list comprehension that contains the list of every word
    #from the contents of your document stemmed
    return ' '.join([stemmer.stem(word) for word in some_string.split()])

In [7]:
stem('I am testing the stemmer on this content!')

'i am test the stemmer on thi content!'

### Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [8]:
def lemmatize(some_string):
    '''
    lemmatize will take in the contents of a single string,
    split up the contents with split()
    use the split contents as a list to apply a lemmatizer to
    each word,
    and return a single string of the lemmatized words joined
    with a single instance of whitespace (' '.join())
    '''
    lemmatizer = nltk.WordNetLemmatizer()
    return ' '.join(
        [lemmatizer.lemmatize(word,'v'
                             ) for word in some_string.split()])

In [9]:
lemmatize('testing my function on this content wowza!')

'test my function on this content wowza!'

### Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords. This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [10]:
['ham', 'sandwich'] + ['cheese']

['ham', 'sandwich', 'cheese']

In [11]:
set1 = set([1,2,3,4,5])

In [12]:
set2 = set([5,5,6,7,8])

In [13]:
set2 - set1

{6, 7, 8}

In [14]:
list(set1.union(set2))

[1, 2, 3, 4, 5, 6, 7, 8]

In [33]:
def remove_stopwords(some_string, extra_words=[], keep_words=[]):
    '''
    remove stopwords will take in a single document as a string
    and return a new string that has stopwords removed
    '''
    stopwords_custom = set(stopwords.words('english')) - \
    set(keep_words)
    stopwords_custom = list(stopwords_custom.union(extra_words))
    return ' '.join([word for word in some_string.split()
                     if word not in stopwords_custom])

In [None]:
stopwords_custom

In [34]:
remove_stopwords('I can\'t wont not do that')

"I can't wont"

### Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [19]:
news_df = acquire.get_news_articles_data()

In [20]:
news_df.head()

Unnamed: 0,title,content,category
0,RR's Yashasvi Jaiswal smashes fastest fifty in...,Rajasthan Royals (RR) opener Yashasvi Jaiswal ...,national
1,Rajasthan Royals record biggest win of IPL 202...,Rajasthan Royals (RR) on Thursday recorded the...,national
2,RR break record for scoring most runs in 1st o...,RR on Thursday broke the record for scoring mo...,national
3,Which Indians have smashed fifty off 15 or les...,RR's Yashasvi Jaiswal today slammed the fastes...,national
4,Laxman Sivaramakrishnan mocks Kamal Haasan ove...,Ex-India spinner Laxman Sivaramakrishnan took ...,national


### Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [21]:
codeup_df = acquire.get_blog_articles_data()

In [22]:
codeup_df.head()

Unnamed: 0,title,content
0,Women in tech: Panelist Spotlight – Magdalena ...,\nCodeup is hosting a Women in Tech Panel in h...
1,Women in tech: Panelist Spotlight – Rachel Rob...,\nCodeup is hosting a Women in Tech Panel in h...
2,Women in Tech: Panelist Spotlight – Sarah Mellor,\nCodeup is hosting a Women in Tech Panel in ...
3,Women in Tech: Panelist Spotlight – Madeleine ...,\nCodeup is hosting a Women in Tech Panel in h...
4,Black Excellence in Tech: Panelist Spotlight –...,\n\nCodeup is hosting a Black Excellence in Te...


### For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.

In [23]:
news_df.columns

Index(['title', 'content', 'category'], dtype='object')

In [24]:
codeup_df.columns

Index(['title', 'content'], dtype='object')

In [26]:
codeup_df.rename(columns={'content':'original'})[:1]

Unnamed: 0,title,original
0,Women in tech: Panelist Spotlight – Magdalena ...,\nCodeup is hosting a Women in Tech Panel in h...


In [37]:
codeup_df['content'].apply(basic_clean).apply(tokenize).apply(remove_stopwords)

0    codeup hosting women tech panel honor womens h...
1    codeup hosting women tech panel honor womens h...
2    codeup hosting women tech panel honor womens h...
3    codeup hosting women tech panel honor womens h...
4    codeup hosting black excellence tech panel hon...
5    codeup hosting second black excellence tech pa...
Name: content, dtype: object

In [38]:
def transform_data(df):
    df = df.rename(columns={'content':'original'})
    # df['clean'] = cleaned and tokenized version with stopwords removed
    df['clean'] = df['original'].apply(basic_clean
                                      ).apply(tokenize
                                             ).apply(remove_stopwords)
    # df['stemmed'] = stemmed version of clean data
    df['stemmed'] = df['clean'].apply(stem)
    # df['lemmatized'] = lemmatized version of clean data
    df['lematized'] = df['clean'].apply(lemmatize)
    return df

In [40]:
codeup_df = transform_data(codeup_df)

In [41]:
news_df = transform_data(news_df)

In [42]:
news_df.head()

Unnamed: 0,title,original,category,clean,stemmed,lematized
0,RR's Yashasvi Jaiswal smashes fastest fifty in...,Rajasthan Royals (RR) opener Yashasvi Jaiswal ...,national,rajasthan royals rr opener yashasvi jaiswal th...,rajasthan royal rr open yashasvi jaiswal thurs...,rajasthan royals rr opener yashasvi jaiswal th...
1,Rajasthan Royals record biggest win of IPL 202...,Rajasthan Royals (RR) on Thursday recorded the...,national,rajasthan royals rr thursday recorded biggest ...,rajasthan royal rr thursday record biggest win...,rajasthan royals rr thursday record biggest wi...
2,RR break record for scoring most runs in 1st o...,RR on Thursday broke the record for scoring mo...,national,rr thursday broke record scoring runs first ip...,rr thursday broke record score run first ipl i...,rr thursday break record score run first ipl i...
3,Which Indians have smashed fifty off 15 or les...,RR's Yashasvi Jaiswal today slammed the fastes...,national,rrs yashasvi jaiswal today slammed fastest fif...,rr yashasvi jaiswal today slam fastest fifti i...,rrs yashasvi jaiswal today slam fastest fifty ...
4,Laxman Sivaramakrishnan mocks Kamal Haasan ove...,Ex-India spinner Laxman Sivaramakrishnan took ...,national,exindia spinner laxman sivaramakrishnan took d...,exindia spinner laxman sivaramakrishnan took d...,exindia spinner laxman sivaramakrishnan take d...
