In [1]:
import acquire

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

In [2]:
# Load the Codeup blog texts through the project's acquire module.
blogs = acquire.get_codeup_blogs_text()

In [3]:
# Peek at the first entry to see the raw text the cleaning functions will receive.
blogs[0]

'Codeup’s Data Science Career Accelerator is Here!Posted on September 30, 2018In San AntonioThe rumors are true! The time has arrived. Codeup has officially opened applications to our new Data Science career accelerator, with only 25 seats available! This immersive program is one of a kind in San Antonio, and will help you land a job in Glassdoor’s #1 Best Job in America.Data Science is a method of providing actionable intelligence from data. The data revolution has hit San Antonio, resulting in an explosion in Data Scientist positions across companies like USAA, Accenture, Booz Allen Hamilton, and HEB. We’ve even seen UTSA invest $70 M for a Cybersecurity Center and School of Data Science. We built a program to specifically meet the growing demands of this industry.Our program will be 18 weeks long, full-time, hands-on, and project-based. Our curriculum development and instruction is led by Senior Data Scientist, Maggie Giust, who has worked at HEB, Capital Group, and Rackspace, along

In [4]:
# Check which container type the acquire helper handed back.
type(blogs)

list

In [5]:
def basic_clean(string):
    '''
    Apply a first pass of text normalisation and return the cleaned string.

    Steps, in order:
    - convert the text to lowercase
    - decompose unicode characters (NFKD) and drop whatever cannot be
      represented in ASCII
    - delete every character that is not a lowercase letter, a digit,
      whitespace or a single quote
    '''
    lowered = string.lower()

    # NFKD splits accented characters into base letter + combining mark, so the
    # ASCII round-trip keeps the base letter and silently drops the rest.
    ascii_bytes = unicodedata.normalize('NFKD', lowered).encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8', 'ignore')

    # Characters outside the allowed set are removed outright (not replaced by
    # a space), so words joined only by punctuation end up glued together.
    return re.sub(r"[^a-z0-9'\s]", '', ascii_text)

In [6]:
# Run the first blog through basic_clean and keep the result for the later cells.
cleaned = basic_clean(blogs[0])
# Print the cleaned text for a quick visual check.
print(cleaned)

codeups data science career accelerator is hereposted on september 30 2018in san antoniothe rumors are true the time has arrived codeup has officially opened applications to our new data science career accelerator with only 25 seats available this immersive program is one of a kind in san antonio and will help you land a job in glassdoors 1 best job in americadata science is a method of providing actionable intelligence from data the data revolution has hit san antonio resulting in an explosion in data scientist positions across companies like usaa accenture booz allen hamilton and heb weve even seen utsa invest 70 m for a cybersecurity center and school of data science we built a program to specifically meet the growing demands of this industryour program will be 18 weeks long fulltime handson and projectbased our curriculum development and instruction is led by senior data scientist maggie giust who has worked at heb capital group and rackspace along with input from dozens of practit

In [7]:
def tokenize(string):
    '''
    Tokenize the text with NLTK's ToktokTokenizer.

    The tokenizer is asked for return_str=True, so the result is a single
    string rather than a list of tokens.
    '''
    return nltk.tokenize.ToktokTokenizer().tokenize(string, return_str=True)

In [8]:
# Tokenize the cleaned blog text and display the result.
tokenize(cleaned)

'codeups data science career accelerator is hereposted on september 30 2018in san antoniothe rumors are true the time has arrived codeup has officially opened applications to our new data science career accelerator with only 25 seats available this immersive program is one of a kind in san antonio and will help you land a job in glassdoors 1 best job in americadata science is a method of providing actionable intelligence from data the data revolution has hit san antonio resulting in an explosion in data scientist positions across companies like usaa accenture booz allen hamilton and heb weve even seen utsa invest 70 m for a cybersecurity center and school of data science we built a program to specifically meet the growing demands of this industryour program will be 18 weeks long fulltime handson and projectbased our curriculum development and instruction is led by senior data scientist maggie giust who has worked at heb capital group and rackspace along with input from dozens of practi

In [9]:
def stem(string):
    '''
    Stem every whitespace-separated word with NLTK's PorterStemmer.

    Returns the stems joined back together with single spaces.
    '''
    porter = nltk.porter.PorterStemmer()
    return ' '.join(map(porter.stem, string.split()))

In [10]:
# Stem the cleaned blog text and display the result.
stem(cleaned)

'codeup data scienc career acceler is herepost on septemb 30 2018in san antonioth rumor are true the time ha arriv codeup ha offici open applic to our new data scienc career acceler with onli 25 seat avail thi immers program is one of a kind in san antonio and will help you land a job in glassdoor 1 best job in americadata scienc is a method of provid action intellig from data the data revolut ha hit san antonio result in an explos in data scientist posit across compani like usaa accentur booz allen hamilton and heb weve even seen utsa invest 70 m for a cybersecur center and school of data scienc we built a program to specif meet the grow demand of thi industryour program will be 18 week long fulltim handson and projectbas our curriculum develop and instruct is led by senior data scientist maggi giust who ha work at heb capit group and rackspac along with input from dozen of practition and hire partner student will work with real data set realist problem and the entir data scienc pipel

In [11]:
def lemmatize(string):
    '''
    Lemmatize every whitespace-separated word with NLTK's WordNetLemmatizer.

    Each word is lemmatized with the lemmatizer's default settings and the
    results are joined back together with single spaces.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in string.split())

In [12]:
# Lemmatize the cleaned blog text and display the result.
lemmatize(cleaned)

'codeups data science career accelerator is hereposted on september 30 2018in san antoniothe rumor are true the time ha arrived codeup ha officially opened application to our new data science career accelerator with only 25 seat available this immersive program is one of a kind in san antonio and will help you land a job in glassdoors 1 best job in americadata science is a method of providing actionable intelligence from data the data revolution ha hit san antonio resulting in an explosion in data scientist position across company like usaa accenture booz allen hamilton and heb weve even seen utsa invest 70 m for a cybersecurity center and school of data science we built a program to specifically meet the growing demand of this industryour program will be 18 week long fulltime handson and projectbased our curriculum development and instruction is led by senior data scientist maggie giust who ha worked at heb capital group and rackspace along with input from dozen of practitioner and hi

In [61]:
def remove_stopwords(string, extra_words=None, exlude_words=None):
    '''
    Remove English stopwords from a string.

    The text is split on whitespace, every word found in the stopword
    collection is dropped, and the survivors are joined with single spaces.
    Matching is exact and case-sensitive.

    Parameters
    ----------
    string : str
        The text to filter.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords.
    exlude_words : iterable of str, optional
        Words to take OUT of the stopword collection so they are kept in the
        text. Words that are not stopwords to begin with are ignored.
        (The parameter name is misspelled; it is kept as-is so existing
        keyword callers continue to work.)

    Returns
    -------
    str
        The text without stopwords.
    '''
    # A set gives constant-time membership checks and lets us add/drop words
    # without worrying about duplicates or missing entries.
    stopword_set = set(stopwords.words('english'))

    # Add first, then drop, matching the original order of operations.
    stopword_set.update(extra_words or [])
    stopword_set.difference_update(exlude_words or [])

    return ' '.join(word for word in string.split() if word not in stopword_set)

In [62]:
# Strip stopwords from the cleaned blog text using the default arguments.
remove_stopwords(cleaned)

'codeups data science career accelerator hereposted september 30 2018in san antoniothe rumors true time arrived codeup officially opened applications new data science career accelerator 25 seats available immersive program one kind san antonio help land job glassdoors 1 best job americadata science method providing actionable intelligence data data revolution hit san antonio resulting explosion data scientist positions across companies like usaa accenture booz allen hamilton heb weve even seen utsa invest 70 cybersecurity center school data science built program specifically meet growing demands industryour program 18 weeks long fulltime handson projectbased curriculum development instruction led senior data scientist maggie giust worked heb capital group rackspace along input dozens practitioners hiring partners students work real data sets realistic problems entire data science pipeline collection deployment receive professional development training resume writing interviewing contin

In [33]:
def prep_article(article):
    '''
    Build the prepared record for a single article.

    Reads article['title'] and article['body'] and returns a dictionary with:
    'title'      : the title, unchanged
    'original'   : the body, unchanged
    'stemmed'    : stem(body)
    'lemmatized' : lemmatize(body)
    'clean'      : remove_stopwords(body)

    NOTE(review): the body is handed to stem / lemmatize / remove_stopwords
    as-is -- basic_clean and tokenize are not applied here, so case and
    punctuation carry through to the derived fields. Confirm this is intended.
    '''
    headline, text = article['title'], article['body']

    # Key order is kept stable because it becomes the column order when the
    # records are loaded into a DataFrame.
    return {
        'title': headline,
        'original': text,
        'stemmed': stem(text),
        'lemmatized': lemmatize(text),
        'clean': remove_stopwords(text),
    }

In [34]:
# Fetch the news articles through the project's acquire module.
# presumably a list of dicts with 'title' and 'body' keys, since that is what
# prep_article reads -- verify against acquire.get_news_articles.
articles = acquire.get_news_articles()

# Try prep_article on the first article only.
# NOTE: `data` is reassigned to the full list of prepared articles in a later cell.
data = prep_article(articles[0])

In [51]:
def prep_article_data(articles):
    '''
    Run prep_article over an iterable of article dictionaries.

    Returns a list holding one prepared record per input article, in the
    same order as the input.
    '''
    return [prep_article(entry) for entry in articles]
        

In [52]:
# Prepare every article; this replaces the single-article `data` from the earlier cell.
data = prep_article_data(articles)

In [57]:
# Show just the first prepared record rather than dumping the whole list.
data[:1]

[{'title': 'Twitter CEO donates $10M to project giving $1,000 cash to COVID-19 hit families',
  'original': "Twitter's billionaire CEO Jack Dorsey has donated $10 million to Project 100 which will give $1,000 in cash to American families who have been affected by the COVID-19 pandemic. Other donors to Project 100 include Alphabet and Google CEO Sundar Pichai, Microsoft Co-founder Bill Gates and others. Dorsey also donated $10 million this month to help US prison fight COVID-19.",
  'stemmed': "twitter' billionair ceo jack dorsey ha donat $10 million to project 100 which will give $1,000 in cash to american famili who have been affect by the covid-19 pandemic. other donor to project 100 includ alphabet and googl ceo sundar pichai, microsoft co-found bill gate and others. dorsey also donat $10 million thi month to help US prison fight covid-19.",
  'lemmatized': "Twitter's billionaire CEO Jack Dorsey ha donated $10 million to Project 100 which will give $1,000 in cash to American family 

In [55]:
# Build a DataFrame from the list of records: one row per article, one column per dictionary key.
df = pd.DataFrame(data)

In [56]:
# Number of rows in the DataFrame, i.e. how many articles were prepared.
len(df)

100