# data

> Web scraping and tools for data collection and processing

In [None]:
#| default_exp data

In [None]:
#| export
import requests
from bs4 import BeautifulSoup
import re
from collections import Counter

## Web Scraper

In [None]:
#| export
class Webpage:
    """Fetch one web page and extract its links, text and word counts.

    Call `get_html` first; `get_html_anchors` and `get_html_text` read the
    parsed document stored in `self.html`, and `most_common_words` reads the
    text collected by `get_html_text`.
    """

    def __init__(self, url):
        self.url = url    # address fetched by get_html
        self.html = ""    # replaced by a BeautifulSoup document in get_html
        self.links = []   # hrefs appended by get_html_anchors
        self.text = []    # cleaned text chunks appended by get_html_text

    def get_html(self, timeout=10):
        """Download `self.url` and store the parsed document in `self.html`.

        Args:
            timeout: Seconds handed to `requests.get`. Without a timeout a
                stalled server can block the call indefinitely.

        Raises:
            requests.RequestException: propagated unchanged from `requests`.
        """
        page = requests.get(self.url, timeout=timeout)
        self.html = BeautifulSoup(page.content, "html.parser")

    def get_html_anchors(self, keyword="http"):
        """Append every `<a href>` value containing `keyword` to `self.links`.

        Anchors with a missing or empty href are skipped. Results accumulate
        across calls; the list is not cleared first.
        """
        for anchor in self.html.find_all('a'):
            link = anchor.get('href')
            if not link:
                continue
            if keyword in link:
                self.links.append(link)

    def get_html_text(self, tag="p"):
        """Append the cleaned text of every `tag` element to `self.text`.

        Cleaning deletes every character that is not an ASCII letter, digit
        or space (so "it's" becomes "its") and trims surrounding spaces.
        Elements that end up empty are skipped.
        """
        rx = "[^a-zA-Z0-9 ]+"
        for element in self.html.find_all(tag):
            cleaned = re.sub(rx, '', element.get_text()).strip()
            if cleaned:
                self.text.append(cleaned)

    def most_common_words(self, k=10, ignore=("the","to","of","and","a","in","on","is","for","by")):
        """Return the `k` most frequent words in `self.text`.

        Args:
            k: Number of (word, count) pairs to return.
            ignore: Iterable of words to exclude. Matching is
                case-insensitive because the page text is lowercased before
                counting. `None` means "ignore nothing".

        Returns:
            A list of `(word, count)` tuples, most frequent first.
        """
        # Lowercase the ignore words to match the lowercased text, and use a
        # set so each membership check is O(1).
        ignored = {word.lower() for word in (ignore or ())}
        words = ' '.join(self.text).lower().split()
        counts = Counter(word for word in words if word not in ignored)
        return counts.most_common(k)



In [None]:
# Build `english_words`, which later cells pass as the `ignore` list to
# `most_common_words`.
# NOTE(review): the URL points at a .txt file, so this is presumably plain
# text rather than HTML; Webpage still runs it through the HTML parser and
# getText() is used to get the text back out.
url = "https://gist.githubusercontent.com/deekayen/4148741/raw/98d35708fa344717d8eee15d11987de6c8e26d7d/1-1000.txt"
common_english = Webpage(url)
common_english.get_html()
# One list entry per line of the downloaded text.
english_words = common_english.html.getText().split('\n')
# Bare last expression: the notebook displays the whole list.
english_words

['the',
 'of',
 'to',
 'and',
 'a',
 'in',
 'is',
 'it',
 'you',
 'that',
 'he',
 'was',
 'for',
 'on',
 'are',
 'with',
 'as',
 'I',
 'his',
 'they',
 'be',
 'at',
 'one',
 'have',
 'this',
 'from',
 'or',
 'had',
 'by',
 'not',
 'word',
 'but',
 'what',
 'some',
 'we',
 'can',
 'out',
 'other',
 'were',
 'all',
 'there',
 'when',
 'up',
 'use',
 'your',
 'how',
 'said',
 'an',
 'each',
 'she',
 'which',
 'do',
 'their',
 'time',
 'if',
 'will',
 'way',
 'about',
 'many',
 'then',
 'them',
 'write',
 'would',
 'like',
 'so',
 'these',
 'her',
 'long',
 'make',
 'thing',
 'see',
 'him',
 'two',
 'has',
 'look',
 'more',
 'day',
 'could',
 'go',
 'come',
 'did',
 'number',
 'sound',
 'no',
 'most',
 'people',
 'my',
 'over',
 'know',
 'water',
 'than',
 'call',
 'first',
 'who',
 'may',
 'down',
 'side',
 'been',
 'now',
 'find',
 'any',
 'new',
 'work',
 'part',
 'take',
 'get',
 'place',
 'made',
 'live',
 'where',
 'after',
 'back',
 'little',
 'only',
 'round',
 'man',
 'year',
 'ca

In [None]:
# Sites to scrape; later cells select an entry from this list by index.
sources = [
    "http://www.ageofautism.com/",
    "http://www.naturalnews.com",
    "https://foodbabe.com/starthere/",
    "http://www.chopra.com",
]

In [None]:
# Scrape a single source end to end: fetch, collect links, collect paragraph
# text, then rank the words.
url = sources[3]
test_page = Webpage(url)
test_page.get_html()
test_page.get_html_anchors()
test_page.get_html_text()
# get_html_text strips punctuation and most_common_words lowercases the text,
# so "I'm" and "it's" show up as "im" and "its"; exclude those forms as well.
test_page.most_common_words(
    k=10,
    ignore=english_words + ["i", "im", "its"],
)

[('wellbeing', 3),
 ('practices', 3),
 ('chopra', 3),
 ('deepen', 2),
 ('others', 2),
 ('health', 2),
 ('meditation', 2),
 ('holiday', 2),
 ('meditations', 2),
 ('knowledge', 2)]

In [None]:
# Follow every link found on one source page and record the five most common
# words of each linked page.
url = sources[1]
test_page = Webpage(url)
test_page.get_html()
test_page.get_html_anchors()

# Built once instead of on every loop iteration.
ignore_words = english_words + ["i", "im", "its"]

# Named so it does not shadow the builtin `dict` for the rest of the session.
common_words_by_link = {}
for link in test_page.links:
    try:
        page = Webpage(link)
        page.get_html()
        page.get_html_text()
        common_words = page.most_common_words(k=5, ignore=ignore_words)
    except Exception as err:
        # Best effort: report the failing link and its cause, then move on.
        # `Exception` (not a bare except) lets Ctrl-C still interrupt the crawl.
        print("Error: link didn't work: ", link, "\n", repr(err))
        continue
    print(link, "\n", common_words)
    common_words_by_link[link] = common_words

https://www.goodgopher.com 
 [('account', 3), ('gopher', 2), ('mail', 2), ('gmail', 2), ('instructions', 1)]
https://protonmail.com 
 [('proton', 22), ('email', 11), ('mail', 11), ('data', 8), ('encrypted', 6)]
https://www.naturalnews.com/2022-11-22-who-pandemic-treaty-in-final-stages-overrule-us-constitution-medical-dictatorship.html 
 [('treaty', 33), ('boyle', 22), ('adams', 20), ('dr', 20), ('health', 17)]
https://www.brighteon.com/f6c011ae-e0a1-4693-be9b-60e0ca13ddf5 
 [('content', 4), ('brighteon', 4), ('treaty', 3), ('dr', 3), ('boyle', 3)]
Error link didn't work:  https://www.naturalnews.com/hrr/mp3/Situation-Update-HRR-2022-11-22.mp3
https://www.brighteon.com/855bd044-2f01-4ca8-8851-4d66008760da 
 [('content', 4), ('brighteon', 4), ('twitter', 3), ('depopulation', 3), ('jab', 3)]
