In [None]:
# Datawrangling3
# Use NLTK and BeautifulSoup to parse a webpage
# https://www.nltk.org/

import os, sys
import requests

import nltk

# Fetch the NLTK data packages used by this notebook. NLTK 3.9+ loads the
# pickle-free 'punkt_tab' and 'averaged_perceptron_tagger_eng' packages in
# place of 'punkt' and 'averaged_perceptron_tagger', so both generations are
# requested to keep tokenizing/tagging working across NLTK versions.
for nltk_package in (
    'punkt',
    'punkt_tab',
    "stopwords",
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(nltk_package)

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# A set keeps the per-token membership test in filter_words cheap.
stop_words = set(stopwords.words("english"))

from collections import Counter
from wordcloud import WordCloud, STOPWORDS

from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Webtext parsing functions
def tag_visible(element):
    """Decide whether a parsed text node counts as human-visible page text.

    A node is rejected when its parent tag is one of the non-rendered
    containers (style, script, head, title, meta, or the document root), or
    when the node itself is an HTML comment.

    Args:
        element: A text node exposing ``.parent.name`` (presumably a bs4
            string node produced by the soup search in ``text_from_html``).

    Returns:
        bool: ``True`` if the node should be kept, ``False`` otherwise.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    # The parent check runs first; the Comment check is only reached for
    # nodes that live under a renderable tag.
    is_hidden = element.parent.name in hidden_parents or isinstance(element, Comment)
    return not is_hidden

def text_from_html(body):
    """Extract the human-visible text of an HTML document as one string.

    Args:
        body: HTML markup handed to ``BeautifulSoup``; this notebook passes
            the raw bytes returned by ``urlopen(...).read()``.

    Returns:
        str: Every text node accepted by ``tag_visible``, stripped of
        surrounding whitespace and joined with single spaces. Whitespace-only
        nodes contribute an empty string, so runs of consecutive spaces can
        appear in the result.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True); it returns every text node in the document.
    text_nodes = soup.find_all(string=True)
    visible_nodes = filter(tag_visible, text_nodes)
    return " ".join(node.strip() for node in visible_nodes)

In [None]:
# Token filter: keep a word only if it is not an English stop word, is not numeric, and has at least 3 characters
def filter_words(x):
    """Return True when token ``x`` should be kept for analysis.

    A token is kept only if all three conditions hold:
      * its lowercase form is not in the module-level ``stop_words`` set,
      * ``str.isnumeric`` is false for it,
      * it is longer than two characters.

    Args:
        x: The token to test (a string).

    Returns:
        bool: ``True`` if every condition holds, ``False`` otherwise.
    """
    checks = (
        x.lower() not in stop_words,  # not a common English word
        not x.isnumeric(),            # not a number
        len(x) > 2,                   # length of at least 3
    )
    return all(checks)

In [None]:
# Select a webpage
url = 'https://www.npr.org/2021/05/09/995173022/deadly-protests-against-economic-inequality-and-police-brutality-continue-in-col'
# Fetch the raw page bytes; the context manager closes the HTTP response
# (and its underlying socket) as soon as the body has been read.
with urllib.request.urlopen(url) as response:
    html = response.read()
# Reduce the markup to its visible text only.
webtext = text_from_html(html)
print(webtext)

     Accessibility links   Skip to main content  Keyboard shortcuts for audio player         Open Navigation Menu           NPR Shop         >
                    Close Navigation Menu      Home     News  Expand/collapse submenu for News    National  World  Politics  Business  Health  Science  Climate  Race      Culture  Expand/collapse submenu for Culture    Books  Movies  Television  Pop Culture  Food  Art & Design   Performing Arts  Life Kit      Music  Expand/collapse submenu for Music    Tiny Desk   #NowPlaying   All Songs Considered   Music Features   Live Sessions      Podcasts & Shows  Expand/collapse submenu for Podcasts & Shows    Daily     Morning Edition     Weekend Edition Saturday     Weekend Edition Sunday     All Things Considered     Fresh Air     Up First    Featured    Planet Money    Life Kit    Invisibilia    NPR's Book of the Day    More Podcasts & Shows       Search       NPR Shop                 Tiny Desk   #NowPlaying   All Songs Considered   Music Features   L

In [None]:
# Tokenize the text
token_text = word_tokenize(webtext)
n_token = len(token_text)
# Show the token list, a blank separator line, then the token count.
print(token_text, "", n_token, sep="\n")

['Accessibility', 'links', 'Skip', 'to', 'main', 'content', 'Keyboard', 'shortcuts', 'for', 'audio', 'player', 'Open', 'Navigation', 'Menu', 'NPR', 'Shop', '>', 'Close', 'Navigation', 'Menu', 'Home', 'News', 'Expand/collapse', 'submenu', 'for', 'News', 'National', 'World', 'Politics', 'Business', 'Health', 'Science', 'Climate', 'Race', 'Culture', 'Expand/collapse', 'submenu', 'for', 'Culture', 'Books', 'Movies', 'Television', 'Pop', 'Culture', 'Food', 'Art', '&', 'Design', 'Performing', 'Arts', 'Life', 'Kit', 'Music', 'Expand/collapse', 'submenu', 'for', 'Music', 'Tiny', 'Desk', '#', 'NowPlaying', 'All', 'Songs', 'Considered', 'Music', 'Features', 'Live', 'Sessions', 'Podcasts', '&', 'Shows', 'Expand/collapse', 'submenu', 'for', 'Podcasts', '&', 'Shows', 'Daily', 'Morning', 'Edition', 'Weekend', 'Edition', 'Saturday', 'Weekend', 'Edition', 'Sunday', 'All', 'Things', 'Considered', 'Fresh', 'Air', 'Up', 'First', 'Featured', 'Planet', 'Money', 'Life', 'Kit', 'Invisibilia', 'NPR', "'s", 'B