# `Prepare.py` development

In [1]:
import re
import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import pandas as pd

import acquire

In [2]:
scrape_codeup_content = acquire.read_url_or_file_codeup()
scrape_inshort_content = acquire.read_url_or_file_inshort()

Found File
Found File


In [3]:
df_codeup = pd.DataFrame(scrape_codeup_content)
df_inshort = pd.DataFrame(scrape_inshort_content)
df_codeup

Unnamed: 0,title,content
0,What Jobs Can You Get After a Coding Bootcamp?...,Have you been considering a career in Cloud Ad...
1,What Jobs Can You Get After a Coding Bootcamp?...,If you are interested in embarking on a career...
2,Is Our Cloud Administration Program Right for ...,Changing careers can be scary. The first thing...
3,5 Reasons To Attend Our New Cloud Administrati...,Come Work In The Cloud\nWhen your Monday rolls...
4,What Jobs Can You Get After a Coding Bootcamp?...,Have you been considering a career in Cloud Ad...
5,What Jobs Can You Get After a Coding Bootcamp?...,If you are interested in embarking on a career...
6,In-Person Workshop: Learn to Code – JavaScript...,Join us for our live in-person JavaScript cras...
7,In-Person Workshop: Learn to Code – Python on ...,"According to LinkedIn, the “#1 Most Promising ..."
8,Free JavaScript Workshop at Codeup Dallas on 6/28,Event Info: \nLocation – Codeup Dallas\nTime –...
9,Is Our Cloud Administration Program Right for ...,Changing careers can be scary. The first thing...


1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.

In [4]:
def make_lower(content_list):
    cleaned_content = []
    for content in content_list:
        clean_content = {
            'title': content['title'].lower(),
            'content': content['content'].lower()
        }
        cleaned_content.append(clean_content)
    return cleaned_content

def make_no_special_chars(content_list):
    cleaned_content = []
    r_ex = r"[^a-zA-Z0-9\s]"
    for content in content_list:
        clean_content = {
            'title': re.sub(r_ex,'', content['title']),
            'content': re.sub(r_ex,'', content['content'])
        }
        cleaned_content.append(clean_content)
    return cleaned_content

def make_normal(content_list):
    cleaned_content = []
    for content in content_list:
        clean_content = {
            'title': unicodedata.normalize('NFKD', content['title']).encode('ascii', 'ignore').decode('utf-8', 'ignore'),
            'content': unicodedata.normalize('NFKD', content['content']).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        }
        cleaned_content.append(clean_content)
    return cleaned_content

def basic_clean(content_list):
    content_list = make_normal(content_list)
    content_list = make_no_special_chars(content_list)
    content_list = make_lower(content_list)
    return content_list

In [5]:
scrape_codeup_content = basic_clean(scrape_codeup_content)
scrape_inshort_content = basic_clean(scrape_inshort_content)

2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [6]:
def tokenize(s):
    tokenizer = ToktokTokenizer()
    return tokenizer.tokenize(s)

def mass_tokenize(content_list):
    for content in content_list:
        content['clean'] = tokenize(content['content'])
    return content_list

In [7]:
tokenize('here is a string')

['here', 'is', 'a', 'string']

In [8]:
scrape_codeup_content = mass_tokenize(scrape_codeup_content)
scrape_inshort_content = mass_tokenize(scrape_inshort_content)

3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [9]:
def stem(s):
    ps = nltk.porter.PorterStemmer()
    return [ps.stem(word) for word in s]

def mass_stem(content_list):
    for content in content_list:
        content['stemmed'] = stem(content['clean'])
    return content_list

In [10]:
scrape_codeup_content = mass_stem(scrape_codeup_content)
scrape_inshort_content = mass_stem(scrape_inshort_content)

4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [11]:
def lemmatize(s):
    wnl = nltk.stem.WordNetLemmatizer()
    return [wnl.lemmatize(word) for word in s]

def mass_lemmatize(content_list):
    for content in content_list:
        content['lemmatized'] = lemmatize(content['clean'])
    return content_list

In [12]:
scrape_codeup_content = mass_lemmatize(scrape_codeup_content)
scrape_inshort_content = mass_lemmatize(scrape_inshort_content)

5. 

Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.


In [20]:
def remove_stopwords(s, extra_words = ['codeup'], exclude_words = ['shan']):
    stopwords_list = nltk.corpus.stopwords.words('english')
    stopwords_list = stopwords_list + extra_words
    return [word for word in s if word not in stopwords_list]

def mass_remove_stopwords(content_list):
    for content in content_list:
        content['clean'] = remove_stopwords(content['clean'])
    return content_list

In [21]:
scrape_codeup_content = mass_remove_stopwords(scrape_codeup_content)
scrape_inshort_content = mass_remove_stopwords(scrape_inshort_content)

6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe `news_df`.

In [15]:
news_df = pd.DataFrame(acquire.read_url_or_file_inshort())
news_df

Found File


Unnamed: 0,title,content,category
0,Rupee drops 9 paise to close at all-time low o...,The rupee declined by 9 paise to close at a ne...,business
1,"Rupee edges closer to 80 per US dollar, opens ...",The rupee on Friday opened at a record low of ...,business
2,Will prove our position in court & we believe ...,Following a lawsuit against Tesla CEO Elon Mus...,business
3,Musk accused lawyers of causing trouble by see...,"In its lawsuit against Tesla CEO Elon Musk, Tw...",business
4,"Like all firms, we are not immune to economic ...","In an internal memo to the staff, Google CEO S...",business
...,...,...,...
95,"John, 'Special OPS' director Shivam team up fo...","Shivam Nair, who helmed 'Special OPS' and 'Naa...",entertainment
96,"'Chup' is a commercial thriller, will release ...",Director R Balki has said his upcoming film 'C...,entertainment
97,He'd keep telling Alia and me 'Go and get marr...,Discussing his late father and actor Rishi Kap...,entertainment
98,"Tara to make singing debut in 'Ek Villain...',...",Actress Tara Sutaria is all set to make her si...,entertainment


7. Make another dataframe for the Codeup blog posts. Name the dataframe `codeup_df`.

In [16]:
codeup_df = pd.DataFrame(acquire.read_url_or_file_codeup())
codeup_df

Found File


Unnamed: 0,title,content
0,What Jobs Can You Get After a Coding Bootcamp?...,Have you been considering a career in Cloud Ad...
1,What Jobs Can You Get After a Coding Bootcamp?...,If you are interested in embarking on a career...
2,Is Our Cloud Administration Program Right for ...,Changing careers can be scary. The first thing...
3,5 Reasons To Attend Our New Cloud Administrati...,Come Work In The Cloud\nWhen your Monday rolls...
4,What Jobs Can You Get After a Coding Bootcamp?...,Have you been considering a career in Cloud Ad...
5,What Jobs Can You Get After a Coding Bootcamp?...,If you are interested in embarking on a career...
6,In-Person Workshop: Learn to Code – JavaScript...,Join us for our live in-person JavaScript cras...
7,In-Person Workshop: Learn to Code – Python on ...,"According to LinkedIn, the “#1 Most Promising ..."
8,Free JavaScript Workshop at Codeup Dallas on 6/28,Event Info: \nLocation – Codeup Dallas\nTime –...
9,Is Our Cloud Administration Program Right for ...,Changing careers can be scary. The first thing...


8. 

For each dataframe, produce the following columns:

    title to hold the title
    original to hold the original article/post content
    clean to hold the normalized and tokenized original with the stopwords removed.
    stemmed to hold the stemmed version of the cleaned data.
    lemmatized to hold the lemmatized version of the cleaned data.



In [17]:
def make_dataframe(text_dict):
    text_dict = basic_clean(text_dict)
    text_dict = mass_tokenize(text_dict)
    text_dict = mass_remove_stopwords(text_dict)
    text_dict = mass_stem(text_dict)
    text_dict = mass_lemmatize(text_dict)
    return pd.DataFrame(text_dict)

In [18]:
codeup_df = make_dataframe(acquire.read_url_or_file_codeup())
codeup_df

Found File


Unnamed: 0,title,content,clean,stemmed,lemmatized
0,what jobs can you get after a coding bootcamp ...,have you been considering a career in cloud ad...,"[considering, career, cloud, administration, i...","[consid, career, cloud, administr, idea, job, ...","[considering, career, cloud, administration, i..."
1,what jobs can you get after a coding bootcamp ...,if you are interested in embarking on a career...,"[interested, embarking, career, tech, youre, p...","[interest, embark, career, tech, your, probabl...","[interested, embarking, career, tech, youre, p..."
2,is our cloud administration program right for you,changing careers can be scary the first thing ...,"[changing, careers, scary, first, thing, may, ...","[chang, career, scari, first, thing, may, ask,...","[changing, career, scary, first, thing, may, a..."
3,5 reasons to attend our new cloud administrati...,come work in the cloud\nwhen your monday rolls...,"[come, work, cloud, monday, rolls, around, sta...","[come, work, cloud, monday, roll, around, star...","[come, work, cloud, monday, roll, around, star..."
4,what jobs can you get after a coding bootcamp ...,have you been considering a career in cloud ad...,"[considering, career, cloud, administration, i...","[consid, career, cloud, administr, idea, job, ...","[considering, career, cloud, administration, i..."
5,what jobs can you get after a coding bootcamp ...,if you are interested in embarking on a career...,"[interested, embarking, career, tech, youre, p...","[interest, embark, career, tech, your, probabl...","[interested, embarking, career, tech, youre, p..."
6,inperson workshop learn to code javascript on...,join us for our live inperson javascript crash...,"[join, us, live, inperson, javascript, crash, ...","[join, us, live, inperson, javascript, crash, ...","[join, u, live, inperson, javascript, crash, c..."
7,inperson workshop learn to code python on 719,according to linkedin the 1 most promising job...,"[according, linkedin, 1, promising, job, data,...","[accord, linkedin, 1, promis, job, data, scien...","[according, linkedin, 1, promising, job, data,..."
8,free javascript workshop at codeup dallas on 628,event info \nlocation codeup dallas\ntime 6 ...,"[event, info, location, codeup, dallas, time, ...","[event, info, locat, codeup, dalla, time, 6, p...","[event, info, location, codeup, dallas, time, ..."
9,is our cloud administration program right for you,changing careers can be scary the first thing ...,"[changing, careers, scary, first, thing, may, ...","[chang, career, scari, first, thing, may, ask,...","[changing, career, scary, first, thing, may, a..."


In [19]:
news_df = make_dataframe(acquire.read_url_or_file_inshort())
news_df

Found File


Unnamed: 0,title,content,clean,stemmed,lemmatized
0,rupee drops 9 paise to close at alltime low of...,the rupee declined by 9 paise to close at a ne...,"[rupee, declined, 9, paise, close, new, record...","[rupe, declin, 9, pais, close, new, record, lo...","[rupee, declined, 9, paisa, close, new, record..."
1,rupee edges closer to 80 per us dollar opens a...,the rupee on friday opened at a record low of ...,"[rupee, friday, opened, record, low, 7994, us,...","[rupe, friday, open, record, low, 7994, us, do...","[rupee, friday, opened, record, low, 7994, u, ..."
2,will prove our position in court we believe w...,following a lawsuit against tesla ceo elon mus...,"[following, lawsuit, tesla, ceo, elon, musk, 4...","[follow, lawsuit, tesla, ceo, elon, musk, 44, ...","[following, lawsuit, tesla, ceo, elon, musk, 4..."
3,musk accused lawyers of causing trouble by see...,in its lawsuit against tesla ceo elon musk twi...,"[lawsuit, tesla, ceo, elon, musk, twitter, sha...","[lawsuit, tesla, ceo, elon, musk, twitter, sha...","[lawsuit, tesla, ceo, elon, musk, twitter, sha..."
4,like all firms we are not immune to economic h...,in an internal memo to the staff google ceo su...,"[internal, memo, staff, google, ceo, sundar, p...","[intern, memo, staff, googl, ceo, sundar, pich...","[internal, memo, staff, google, ceo, sundar, p..."
...,...,...,...,...,...
95,john special ops director shivam team up for a...,shivam nair who helmed special ops and naam sh...,"[shivam, nair, helmed, special, ops, naam, sha...","[shivam, nair, helm, special, op, naam, shaban...","[shivam, nair, helmed, special, ops, naam, sha..."
96,chup is a commercial thriller will release in ...,director r balki has said his upcoming film ch...,"[director, r, balki, said, upcoming, film, chu...","[director, r, balki, said, upcom, film, chup, ...","[director, r, balki, said, upcoming, film, chu..."
97,hed keep telling alia and me go and get marrie...,discussing his late father and actor rishi kap...,"[discussing, late, father, actor, rishi, kapoo...","[discuss, late, father, actor, rishi, kapoor, ...","[discussing, late, father, actor, rishi, kapoo..."
98,tara to make singing debut in ek villain song ...,actress tara sutaria is all set to make her si...,"[actress, tara, sutaria, set, make, singing, d...","[actress, tara, sutaria, set, make, sing, debu...","[actress, tara, sutaria, set, make, singing, d..."


9.

Ask yourself:

    If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
    If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
    If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?


*Answers:*

For these texts, which are small, either is good.  A text of 200TB would be better stemmed because if hosted services are being used; especially if those services cost extra money.