In [36]:
import pandas as pd
import numpy as np
import unicodedata
import nltk
from nltk.corpus import stopwords
import re
import warnings
warnings.filterwarnings("ignore")
import acquire

### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:
> - Lowercase everything
> - Normalize unicode characters
> - Replace anything that is not a letter, number, whitespace or a single quote.


In [37]:
original = "Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed \
a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), \
but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"
original

"Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [38]:
def basic_clean(string):
    #cast to lower case
    string = string.lower()
    #remove accented and non Ascii characters
    string = unicodedata.normalize("NFKD", string)\
            .encode("ascii", "ignore")\
            .decode("utf-8")
    # remove special characters
    string = re.sub(r'[^a-z0-9\s]',"", string)
    
    return string

In [39]:
basic_clean(original)

'paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdoss name contains the hungarian letter o o with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity'

### 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string

In [40]:
def tokenize(string):
    #create tokenizer
    tokenize = nltk.tokenize.ToktokTokenizer()
    #use tokenizer
    string = tokenize.tokenize(string, return_str=True)
    
    return string
    

In [41]:
string = basic_clean(original)
tokenize(string)

'paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdoss name contains the hungarian letter o o with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity'

### 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words

In [42]:
def stem(string):
    #create porter stemmer
    ps = nltk.porter.PorterStemmer()
    #apply stemmer
    #this is going to give out a list
    stems = [ps.stem(word) for word in string.split()]
    #join the list back together
    string = " ".join(stems)
    
    return string

In [43]:
string = basic_clean(original)
string = tokenize(string)
string = stem(string)
string

'paul erdo and georg polya are influenti hungarian mathematician who contribut a lot to the field erdoss name contain the hungarian letter o o with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess'

### 4.Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [44]:
def lemmatize(string):
    #create lemmatizer
    wnl = nltk.stem.WordNetLemmatizer()
    #use lemmatizer
    #splits back a list of words
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    #join word back together
    string = " ".join(lemmas)
    
    return string
    
    

In [45]:
string = basic_clean(original)
string = tokenize(string)
string= lemmatize(string)
string

'paul erdos and george polya are influential hungarian mathematician who contributed a lot to the field erdoss name contains the hungarian letter o o with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity'

### 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

In [46]:
#save stopwords
stopwords_list = stopwords.words("english")
stopwords_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [47]:
#split the lemmatised version
words = string.split()
words[:9]

['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'are',
 'influential',
 'hungarian',
 'mathematician']

In [48]:
#word count
len(words)

45

In [49]:
#filtered words is words minus the stopwords
filtered_words = [word for word in words if word not in stopwords_list]
filtered_words[:9]

['paul',
 'erdos',
 'george',
 'polya',
 'influential',
 'hungarian',
 'mathematician',
 'contributed',
 'lot']

In [50]:
len(filtered_words)

27

In [51]:
def remove_stopwords(string, extra_words = [], exclude_words = []):
    # define stopword
    stopwords_list = stopwords.words("english")
    # exclude words you do not want to remove
    stopwords_list = set(stopwords_list)-set(exclude_words)
    # include extra words manauly in stop word list
    stopwords_list = stopwords_list.union(set(extra_words))
    #split lemmaztised paragraph 
    words = string.split() 
    #give me everything that is not stopword
    filtered_words = [word for word in words if word not in stopwords_list]
    #join filtered words
    string = " ".join(filtered_words)
    
    return string

In [52]:
string = basic_clean(original)
string = tokenize(string)
string= lemmatize(string)
string = remove_stopwords(string)
string

'paul erdos george polya influential hungarian mathematician contributed lot field erdoss name contains hungarian letter double acute accent often incorrectly written erdos erdos either mistake typographical necessity'

### 6 Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.



In [53]:
base_url = "https://inshorts.com/en/read"

In [54]:
news_df = acquire.get_all_shorts(base_url)
news_df

Unnamed: 0,title,category,body
0,Bharti Airtel rakes in 61% profit,india,"Bharti Airtel, India's top telecommunications ..."
1,Infosys Gifts Sikka Shares Worth Rs 8.2cr,india,"In a regulatory filing to the BSE on Friday, I..."
2,Zimbabwe players ask India for cricketing tips,india,After getting thrashed by India by 5-0 in the ...
3,"AAP drops Rajouri Garden candidate, a week bef...",india,"Only a week before Delhi Assembly polls, Aam A..."
4,Kashmir's famous Dal Lake freezes,india,After the recent snowfall in upper reaches of ...
...,...,...,...
280,Vintage cars on display to promote wildlife pr...,automobile,"To create awareness about wildlife week, the K..."
281,"Tesla delivered record 83,135 China-made EVs i...",automobile,A report by China Passenger Car Association ha...
282,Porsche becomes Europe's most valuable automak...,automobile,Porsche overtook parent company Volkswagen to ...
283,Passenger vehicle wholesales rise by 92% in Se...,automobile,Passenger vehicle wholesales in India surged b...


### 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [55]:
base_url = 'https://codeup.com/blog/'

In [56]:
codeup_df = acquire.get_blog_content(base_url)
codeup_df

Unnamed: 0,title,content
0,Coding Bootcamp or Computer Science Degree?,"For many people, deciding between a coding boo..."
1,Diversity Equity and Inclusion Report,Codeup is excited to launch our first Diversit...
2,Codeup Honored as SABJ Diversity and Inclusion...,Codeup has been named the 2022 Diversity and I...
3,How Can I Finance My Career Transition?,Deciding to transition into a tech career is a...
4,Tips for Women Beginning a Career in Tech,"Codeup strongly values diversity, and inclusio..."
5,What is Cloud Computing and AWS?,With many companies switching to cloud service...


### 8. For each dataframe, produce the following columns:
- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.



In [57]:
news_df.head()

Unnamed: 0,title,category,body
0,Bharti Airtel rakes in 61% profit,india,"Bharti Airtel, India's top telecommunications ..."
1,Infosys Gifts Sikka Shares Worth Rs 8.2cr,india,"In a regulatory filing to the BSE on Friday, I..."
2,Zimbabwe players ask India for cricketing tips,india,After getting thrashed by India by 5-0 in the ...
3,"AAP drops Rajouri Garden candidate, a week bef...",india,"Only a week before Delhi Assembly polls, Aam A..."
4,Kashmir's famous Dal Lake freezes,india,After the recent snowfall in upper reaches of ...


In [58]:
#change column names
news_df.columns = ["title", "category", "original"]
news_df = news_df[["title","original"]]

In [59]:
news_df.head()

Unnamed: 0,title,original
0,Bharti Airtel rakes in 61% profit,"Bharti Airtel, India's top telecommunications ..."
1,Infosys Gifts Sikka Shares Worth Rs 8.2cr,"In a regulatory filing to the BSE on Friday, I..."
2,Zimbabwe players ask India for cricketing tips,After getting thrashed by India by 5-0 in the ...
3,"AAP drops Rajouri Garden candidate, a week bef...","Only a week before Delhi Assembly polls, Aam A..."
4,Kashmir's famous Dal Lake freezes,After the recent snowfall in upper reaches of ...


In [60]:
#create new column that uses basic clean and tokenoze function
news_df["clean"] = news_df.original.apply(basic_clean).apply(tokenize)

In [61]:
news_df.head(1)

Unnamed: 0,title,original,clean
0,Bharti Airtel rakes in 61% profit,"Bharti Airtel, India's top telecommunications ...",bharti airtel indias top telecommunications co...


In [62]:
#create new column that uses clean column and uses stem funtion 
news_df["stemmed"] = news_df.clean.apply(stem)

In [63]:
news_df.head(1)

Unnamed: 0,title,original,clean,stemmed
0,Bharti Airtel rakes in 61% profit,"Bharti Airtel, India's top telecommunications ...",bharti airtel indias top telecommunications co...,bharti airtel india top telecommun compani ha ...


In [64]:
#create new column that takes in clean and applies lemmatize function
news_df["lemmatized"] = news_df.clean.apply(lemmatize)

In [65]:

news_df.head(2)

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Bharti Airtel rakes in 61% profit,"Bharti Airtel, India's top telecommunications ...",bharti airtel indias top telecommunications co...,bharti airtel india top telecommun compani ha ...,bharti airtel india top telecommunication comp...
1,Infosys Gifts Sikka Shares Worth Rs 8.2cr,"In a regulatory filing to the BSE on Friday, I...",in a regulatory filing to the bse on friday in...,in a regulatori file to the bse on friday info...,in a regulatory filing to the bse on friday in...


#### now with code up data

In [66]:
codeup_df.head(1)

Unnamed: 0,title,content
0,Coding Bootcamp or Computer Science Degree?,"For many people, deciding between a coding boo..."


In [67]:
#rename columns
codeup_df.columns = ["title","original"]
#create column that uses basic clean and tokenize function
codeup_df["clean"] = codeup_df.original.apply(basic_clean).apply(tokenize)
#create column that holds stemmed up column from clean column
codeup_df["stemmed"] = codeup_df.clean.apply(stem)
#create column that holds lemmatize values
codeup_df["lemmatized"] = codeup_df.clean.apply(lemmatize)


In [68]:
codeup_df.head(2)

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Coding Bootcamp or Computer Science Degree?,"For many people, deciding between a coding boo...",for many people deciding between a coding boot...,for mani peopl decid between a code bootcamp a...,for many people deciding between a coding boot...
1,Diversity Equity and Inclusion Report,Codeup is excited to launch our first Diversit...,codeup is excited to launch our first diversit...,codeup is excit to launch our first divers equ...,codeup is excited to launch our first diversit...


### 9 Ask yourself:

- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?

for files of 493kb and 25 mb, i would prefer lemmatize as it is quick on small files

for file of 200 TB, i would use stemmed