In [4]:
import pandas as pd
import numpy as np
import unicodedata
import nltk
from nltk.corpus import stopwords
import re
import warnings
warnings.filterwarnings("ignore")
import acquire

### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:
> - Lowercase everything
> - Normalize unicode characters
> - Remove anything that is not a letter, number, whitespace, or a single quote.


In [2]:
# Sample sentence with accented characters, used as the running example for the
# text-preparation functions defined in this notebook.
original = (
    "Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed "
    "a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), "
    "but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"
)
original

"Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [3]:
def basic_clean(original):
    """
    Apply basic text cleaning to a string.

    Steps, in order:
      1. Lowercase the text.
      2. NFKD-normalize it and drop every character that cannot be encoded as ASCII.
      3. Remove every character that is not a lowercase letter, a digit,
         a single quote, or whitespace.

    Parameters
    ----------
    original : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    lowered = original.lower()
    # NFKD splits an accented letter into its base letter plus combining marks,
    # so the ASCII round trip below keeps the base letter and discards the mark.
    decomposed = unicodedata.normalize("NFKD", lowered)
    ascii_bytes = decomposed.encode("ascii", "ignore")
    ascii_text = ascii_bytes.decode("utf-8")
    # Strip whatever is left that is not a letter, digit, single quote or whitespace.
    return re.sub(r'[^a-z0-9\'\s]', "", ascii_text)

In [4]:
# Run the cleaner on the sample sentence; the displayed result is lowercase ASCII
# with all punctuation except single quotes removed.
basic_clean(original)

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

### 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string

In [5]:
def tokenize(article):
    """
    Tokenize a string with NLTK's ToktokTokenizer.

    Parameters
    ----------
    article : str
        The text to tokenize.

    Returns
    -------
    str
        The tokenizer's output as a single string (``return_str=True``),
        not as a list of tokens.
    """
    toktok = nltk.tokenize.ToktokTokenizer()
    return toktok.tokenize(article, return_str=True)
    

In [6]:
# Clean first, then tokenize. In the displayed result the apostrophes are
# separated from the surrounding letters (e.g. "erdos ' s").
article = basic_clean(original)
tokenize(article)

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

### 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words

In [7]:
def stem(article):
    """
    Apply Porter stemming to every whitespace-separated word in a string.

    Parameters
    ----------
    article : str
        The text to stem.

    Returns
    -------
    str
        The stemmed words joined back together with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    # split() breaks on any run of whitespace, so the original spacing is not preserved.
    return " ".join(map(stemmer.stem, article.split()))

In [8]:
# Stemming pipeline: clean -> tokenize -> stem. `article` is rebound at each step.
article = basic_clean(original)
article = tokenize(article)
article_stemmed = stem(article)
# The displayed stems are not always dictionary words (e.g. "erdo", "influenti").
article_stemmed

"paul erdo and georg polya are influenti hungarian mathematician who contribut a lot to the field erdo ' s name contain the hungarian letter ' o ' ' o ' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess"

### 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [9]:
def lemmatize(article):
    """
    Lemmatize every whitespace-separated word in a string with NLTK's WordNetLemmatizer.

    Parameters
    ----------
    article : str
        The text to lemmatize.

    Returns
    -------
    str
        The lemmatized words joined back together with single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmatized_words = []
    for token in article.split():
        lemmatized_words.append(lemmatizer.lemmatize(token))
    return " ".join(lemmatized_words)
    
    

In [10]:
# Lemmatization pipeline: clean -> tokenize -> lemmatize. `article` is rebound at each step.
article = basic_clean(original)
article = tokenize(article)
article_lemmatized = lemmatize(article)
# Note in the displayed result that "as" comes out as "a" ("written a erdos").
article_lemmatized

"paul erdos and george polya are influential hungarian mathematician who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity"

### 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

In [11]:
# Load NLTK's English stopword list into a variable and display it.
stopwords_list = stopwords.words("english")
stopwords_list

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [12]:
# Split the lemmatized text on whitespace into a list of words and preview the first nine.
words = article_lemmatized.split()
words[:9]

['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'are',
 'influential',
 'hungarian',
 'mathematician']

In [13]:
# Number of whitespace-separated tokens before stopword removal.
len(words)

51

In [14]:
# Keep only the words that do not appear in stopwords_list, then preview the first nine.
filtered_words = [word for word in words if word not in stopwords_list]
filtered_words[:9]

['paul',
 'erdos',
 'george',
 'polya',
 'influential',
 'hungarian',
 'mathematician',
 'contributed',
 'lot']

In [15]:
# Number of tokens remaining after stopword removal.
len(filtered_words)

32

In [16]:
def remove_stopwords(article_lemmatized, extra_words=None, exclude_words=None):
    """
    Remove English stopwords from a string.

    The text is split on whitespace, every word found in the stopword set is
    dropped, and the remaining words are joined back with single spaces.
    Matching is exact and case-sensitive.

    Parameters
    ----------
    article_lemmatized : str
        The text to filter (typically already cleaned, tokenized and lemmatized).
    extra_words : iterable of str, optional
        Additional words to treat as stopwords.
    exclude_words : iterable of str, optional
        Words to keep even though they are in the stopword list. Exclusions are
        applied after `extra_words`, so a word given in both is kept.

    Returns
    -------
    str
        The text without stopwords.
    """
    # A set gives constant-time membership checks; a list would be rescanned
    # from the start for every word in the text.
    stopword_set = set(stopwords.words("english"))
    stopword_set.update(extra_words or [])
    stopword_set.difference_update(exclude_words or [])

    kept_words = [word for word in article_lemmatized.split() if word not in stopword_set]
    return " ".join(kept_words)

In [17]:
# Apply stopword removal to the lemmatized sample; the stand-alone apostrophe
# tokens are not stopwords, so they remain in the displayed result.
remove_stopwords(article_lemmatized)

"paul erdos george polya influential hungarian mathematician contributed lot field erdos ' name contains hungarian letter ' ' ' ' double acute accent often incorrectly written erdos erdos either mistake typographical necessity"

In [1]:
import acquire

In [3]:
# Inshorts URL that is passed to acquire.get_all_shorts below.
base_url = "https://inshorts.com/en/read"

In [4]:
# NOTE(review): acquire is a project-local module not shown here. Judging by the
# displayed output, this returns a table with title, category and body columns.
acquire.get_all_shorts(base_url)

Unnamed: 0,title,category,body
0,Zimbabwe players ask India for cricketing tips,india,After getting thrashed by India by 5-0 in the ...
1,"Nigerian weightlifter in dope net, India may gain",india,India may move up after Nigerian weightlifter ...
2,"Indian Navy gets VLF, easy communication with ...",india,The Indian navy has a new communication system...
3,India beat NZ 3-2 to enter CWG hockey finals,india,In the CWG men's hockey semi-final against New...
4,India's first Billiards Premier League,india,The Billiards and Snooker Association of Mahar...
...,...,...,...
280,Porsche becomes Europe's most valuable automak...,automobile,Porsche overtook parent company Volkswagen to ...
281,Fix for wheel issue that caused electric car r...,automobile,Toyota Motor said it has found a fix for the d...
282,Vehicle registrations during festivals doubled...,automobile,Vehicle registrations more than doubled in thi...
283,Vintage cars on display to promote wildlife pr...,automobile,"To create awareness about wildlife week, the K..."


In [2]:
# Codeup blog URL that is passed to acquire.get_blog_content below.
# This reuses the name base_url, which an earlier cell bound to the Inshorts URL.
base_url = 'https://codeup.com/blog/'

In [3]:
# NOTE(review): acquire is a project-local module not shown here. Judging by the
# displayed output, this returns a table with title and content columns.
acquire.get_blog_content(base_url)

Unnamed: 0,title,content
0,Coding Bootcamp or Computer Science Degree?,"For many people, deciding between a coding boo..."
1,Diversity Equity and Inclusion Report,Codeup is excited to launch our first Diversit...
2,Codeup Honored as SABJ Diversity and Inclusion...,Codeup has been named the 2022 Diversity and I...
3,How Can I Finance My Career Transition?,Deciding to transition into a tech career is a...
4,Tips for Women Beginning a Career in Tech,"Codeup strongly values diversity, and inclusio..."
5,What is Cloud Computing and AWS?,With many companies switching to cloud service...
