# data

> Web scraping and tools for data collection and processing

In [None]:
#| default_exp data

In [None]:
#| export
import requests
from bs4 import BeautifulSoup
import re
from collections import Counter

## Web Scraper

In [None]:
#| export
class Webpage:
    """A single web page that can be downloaded and mined for links and text.

    Intended call order: `get_html()` first (it populates `self.html`), then
    `get_html_anchors()` and/or `get_html_text()`, then `most_common_words()`.
    The extraction methods *append* to `self.links` / `self.text`, so calling
    them repeatedly accumulates results rather than replacing them.
    """

    def __init__(self, url):
        self.url = url    # address fetched by get_html()
        self.html = ""    # replaced by the parsed document in get_html()
        self.links = []   # href strings collected by get_html_anchors()
        self.text = []    # cleaned text snippets collected by get_html_text()

    def get_html(self, timeout=5):
        """Download `self.url` and store the parsed document in `self.html`.

        Args:
            timeout: value forwarded to `requests.get(..., timeout=...)`.

        The HTTP status code is not inspected, so an error page is parsed like
        any other response. Exceptions raised by `requests.get` propagate.
        """
        page = requests.get(self.url, timeout=timeout)
        self.html = BeautifulSoup(page.content, "html.parser")

    def get_html_anchors(self, keyword="http"):
        """Append to `self.links` every `<a href>` value containing `keyword`.

        Anchors with a missing or empty `href` are skipped. With the default
        keyword only hrefs containing "http" are kept, which drops relative
        links such as "/about".
        """
        # find_all is the current name; findAll is a deprecated alias.
        for anchor in self.html.find_all('a'):
            link = anchor.get('href')
            if link and keyword in link:
                self.links.append(link)

    def get_html_text(self, tag="p"):
        """Append the cleaned text of every `tag` element to `self.text`.

        Cleaning keeps only ASCII letters, digits and spaces. Elements whose
        cleaned text is empty are skipped.
        """
        whitespace_run = re.compile(r"\s+")
        disallowed = re.compile("[^a-zA-Z0-9 ]+")
        for element in self.html.find_all(tag):
            # Turn newlines/tabs/non-breaking spaces into a plain space first;
            # otherwise the next substitution deletes them and fuses the words
            # on either side ("evidence\nbased" -> "evidencebased").
            snippet = whitespace_run.sub(" ", element.getText())
            snippet = disallowed.sub("", snippet).strip()
            if snippet:
                self.text.append(snippet)

    def most_common_words(self, k=10, ignore=("the","to","of","and","a","in","on","is","for","by")):
        """Return the `k` most frequent words in `self.text`.

        Args:
            k: number of `(word, count)` pairs to return (passed to
                `Counter.most_common`).
            ignore: iterable of words to exclude. Words are lower-cased before
                comparison, so entries should be lower-case. `None` excludes
                nothing.

        Returns:
            A list of `(word, count)` tuples, most frequent first.
        """
        # A set gives constant-time membership tests for long ignore lists.
        ignored = frozenset(ignore) if ignore is not None else frozenset()
        words = ' '.join(self.text).lower().split()
        counts = Counter(word for word in words if word not in ignored)
        return counts.most_common(k)



In [None]:
# Download a plain-text list of common English words; it is used further down
# as the `ignore` list so that everyday words do not dominate the counts.
url = "https://gist.githubusercontent.com/deekayen/4148741/raw/98d35708fa344717d8eee15d11987de6c8e26d7d/1-1000.txt"
common_english = Webpage(url)
common_english.get_html()
# The response is run through the HTML parser like any page; getText() gives
# back its text content, which is then lower-cased.
english_words = common_english.html.getText().lower()
# Split on newlines (presumably one word per line, most frequent first — verify against the gist).
english_words = english_words.split('\n')
print(len(english_words),"most common English words")
# Uncomment to display the whole list:
#english_words

1000 most common English words


In [None]:
# Seed pages for the "pseudo" corpus; each one is crawled together with the
# pages it links to.
pseudo_sources = [
    "http://www.ageofautism.com/",
    "http://www.naturalnews.com",
    "https://foodbabe.com/starthere/",
    "http://www.chopra.com",
]

# URL -> list of (word, count) pairs, filled in by get_k_common_words_links.
d_pse = {}

In [None]:
# Try the Webpage class end to end on a single source (index 3 of pseudo_sources).
url = pseudo_sources[3]
test_page = Webpage(url)
test_page.get_html()            # must run first: fills test_page.html
test_page.get_html_anchors()    # collects links into test_page.links
test_page.get_html_text()       # collects <p> text into test_page.text
# Top 20 words, ignoring the first 100 entries of the English word list.
test_page.most_common_words(k=20,ignore=english_words[:100])

[('wellbeing', 3),
 ('practices', 3),
 ('chopra', 3),
 ('life', 2),
 ('deepen', 2),
 ('others', 2),
 ('health', 2),
 ('meditation', 2),
 ('holiday', 2),
 ('help', 2),
 ('join', 2),
 ('us', 2),
 ('meditations', 2),
 ('knowledge', 2),
 ('share', 2),
 ('wisdom', 2),
 ('teachers', 2),
 ('me', 2),
 ('everything', 1),
 ('need', 1)]

In [None]:
def get_k_common_words(url, k, ignore):
    """Fetch `url`, extract its `<p>` text and rank the words it contains.

    Args:
        url: address of the page to download.
        k: how many `(word, count)` pairs to request.
        ignore: words to leave out of the ranking.

    Returns:
        A 2-tuple `(webpage, top_words)`: the populated `Webpage` object and
        the result of its `most_common_words(k=k, ignore=ignore)` call.
    """
    webpage = Webpage(url)
    webpage.get_html()
    webpage.get_html_text()
    top_words = webpage.most_common_words(k=k, ignore=ignore)
    return webpage, top_words

def get_k_common_words_links(url, dict, k, ignore):
    """Record top-k words for `url` and for every page it links to.

    The result for `url` itself is always stored in `dict[url]`. Each link
    found on that page is printed, then fetched on a best-effort basis: a
    linked page is stored only if it could be processed and yielded exactly
    `k` words. Links that look like media files are not fetched.

    Args:
        url: seed page; errors while fetching it propagate to the caller.
        dict: mapping updated in place, URL -> list of `(word, count)` pairs.
            NOTE: this name shadows the builtin `dict` inside the function;
            it is kept so existing keyword callers continue to work.
        k: number of words requested per page.
        ignore: words excluded from the counts.

    Returns:
        None; results are written into `dict`.
    """
    # Substrings that mark a link as a media file rather than an HTML page.
    skipped_extensions = (".mp3", ".jpg", ".png", ".mp4")

    page, common_words = get_k_common_words(url, k, ignore)
    page.get_html_anchors()
    dict[url] = common_words

    for link in page.links:
        print(link)
        if any(extension in link for extension in skipped_extensions):
            continue
        try:
            _, common_words = get_k_common_words(link, k=k, ignore=ignore)
        except Exception:
            # Best effort: an unreachable or unparsable link is skipped.
            # `Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit through, so a long crawl can still be interrupted.
            continue
        else:
            if len(common_words) == k:
                dict[link] = common_words

In [None]:
#| hide
# Crawl every pseudo source and the pages it links to, collecting the top-k
# words of each page into d_pse. This performs many network requests.
k=10
ignore=english_words[:100]  # skip the first 100 entries of the English word list

for source in pseudo_sources:
    get_k_common_words_links(source, d_pse, k, ignore)

# Number of pages for which a word list was stored.
len(d_pse)


In [None]:
# Top-k words recorded for the first pseudo source itself.
d_pse[pseudo_sources[0]]

In [None]:
# Seed pages for the "science" corpus; each one is crawled together with the
# pages it links to.
science_sources = [
    "https://sciencebasedmedicine.org/",
    "https://www.hopkinsmedicine.org/gim/research/method/ebm.html",
]

# URL -> list of (word, count) pairs, filled in by get_k_common_words_links.
d_sci = {}

In [None]:
#| hide
# Crawl every science source and the pages it links to, collecting the top-k
# words of each page into d_sci. This performs many network requests and
# prints each link as it is visited.
k=10
ignore=english_words[:100]  # skip the first 100 entries of the English word list

for source in science_sources:
    get_k_common_words_links(source, d_sci, k, ignore)

# Number of pages for which a word list was stored.
len(d_sci)

https://www.hopkinsmedicine.org/coronavirus/for-johns-hopkins-patients.html#masks
https://www.hopkinsmedicine.org/coronavirus/covid-19-vaccine/
https://www.hopkinsmedicine.org/coronavirus/testing-and-care.html
https://www.hopkinsmedicine.org/coronavirus/for-johns-hopkins-patients.html
https://www.hopkinsmedicine.org/coronavirus/visitor-guidelines.html
https://www.hopkinsmedicine.org/coronavirus/index.html
https://coronavirus.maryland.gov/pages/symptoms-testing#TestSitesList
https://hopkinsmedicine.org/
https://hopkinsmedicine.org/about/index.html
https://hopkinsmedicine.org/patient_care/index.html
https://hopkinsmedicine.org/health
https://hopkinsmedicine.org/research/index.html
https://hopkinsmedicine.org/education/index.html
https://www.hopkinsmedicine.org/mychart/
https://www.hopkinsmedicine.org/patient_care/johns_hopkins_medicine_request_appointment.html
https://www.hopkinsmedicine.org/profiles
https://www.hopkinsmedicine.org/patient_care/patients-visitors/billing-insurance/pay-bil

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://www.facebook.com/Johns.Hopkins.Medicine
https://twitter.com/HopkinsMedicine
https://www.linkedin.com/company/johns-hopkins-medicine
https://www.youtube.com/user/JohnsHopkinsMedicine
https://www.instagram.com/HopkinsMedicine/
https://www.weibo.com/hopkinsmedicine


85

In [None]:
# Show the full URL -> top-k words mapping (long output).
d_sci

{'https://sciencebasedmedicine.org/': [],
 'https://www.hopkinsmedicine.org/gim/research/method/ebm.html': [('patient',
   7),
  ('care', 5),
  ('evidencebased', 5),
  ('medicine', 5),
  ('evidence', 5),
  ('testing', 4),
  ('best', 4),
  ('right', 3),
  ('practice', 3),
  ('masks', 2)],
 'https://www.hopkinsmedicine.org/coronavirus/for-johns-hopkins-patients.html#masks': [('our',
   19),
  ('care', 19),
  ('covid19', 15),
  ('masks', 9),
  ('facilities', 8),
  ('mask', 8),
  ('wear', 8),
  ('required', 6),
  ('patients', 6),
  ('testing', 5)],
 'https://www.hopkinsmedicine.org/coronavirus/covid-19-vaccine/': [('booster',
   11),
  ('covid19', 9),
  ('vaccine', 6),
  ('masks', 5),
  ('care', 5),
  ('patients', 5),
  ('doses', 5),
  ('vaccines', 5),
  ('johns', 5),
  ('hopkins', 5)],
 'https://www.hopkinsmedicine.org/coronavirus/testing-and-care.html': [('testing',
   19),
  ('covid19', 12),
  ('hours', 10),
  ('am', 10),
  ('johns', 9),
  ('hopkins', 9),
  ('hospital', 9),
  ('holiday'

In [None]:
# Pick the word lists of two crawled pages to try out merging their counts.
s1 = d_sci['https://www.hopkinsmedicine.org/coronavirus/for-johns-hopkins-patients.html#masks']
s2 = d_sci['https://www.hopkinsmedicine.org/gim/research/method/ebm.html']

# dict(...) turns the (word, count) pairs into a mapping; adding two Counters
# sums the counts of words present in both.
Counter(dict(s1))+Counter(dict(s2))

Counter({'our': 19,
         'care': 24,
         'covid19': 15,
         'masks': 11,
         'facilities': 8,
         'mask': 8,
         'wear': 8,
         'required': 6,
         'patients': 6,
         'testing': 9,
         'patient': 7,
         'evidencebased': 5,
         'medicine': 5,
         'evidence': 5,
         'best': 4,
         'right': 3,
         'practice': 3})

In [None]:
# Fold the top-k (word, count) pairs of every crawled science page into one
# running tally, then list the words from most to least frequent.
count = Counter()
for link, top_words in d_sci.items():
    count += Counter(dict(top_words))

count.most_common()

[('14106144685', 930),
 ('711', 868),
 ('tty', 713),
 ('care', 475),
 ('testing', 304),
 ('our', 222),
 ('hopkins', 194),
 ('johns', 155),
 ('medicine', 147),
 ('de', 124),
 ('ng', 124),
 ('information', 105),
 ('patients', 99),
 ('tty711', 93),
 ('si', 93),
 ('ni', 93),
 ('online', 89),
 ('masks', 81),
 ('covid19', 78),
 ('research', 76),
 ('patient', 71),
 ('required', 64),
 ('any', 54),
 ('services', 53),
 ('facilities', 49),
 ('health', 47),
 ('professor', 35),
 ('platforms', 35),
 ('internal', 34),
 ('privacy', 32),
 ('inside', 28),
 ('general', 28),
 ('gim', 27),
 ('faculty', 25),
 ('us', 23),
 ('statement', 21),
 ('hospital', 20),
 ('please', 20),
 ('guidelines', 19),
 ('dr', 18),
 ('collected', 18),
 ('terms', 18),
 ('mask', 16),
 ('wear', 16),
 ('partners', 16),
 ('eligible', 16),
 ('clinical', 16),
 ('vaccinating', 16),
 ('division', 16),
 ('collect', 16),
 ('medical', 15),
 ('school', 13),
 ('fellowship', 13),
 ('conditions', 13),
 ('partner', 12),
 ('resources', 12),
 ('md'