## Introduction to Data Science

### Introduction to Scraping

In [38]:
#import pylab
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import requests
import urllib
from bs4 import BeautifulSoup as bs

%matplotlib inline
#%matplotlib notebook

### Using [Requests](https://docs.python-requests.org/en/master/)

In [51]:
website_url = 'https://www.wienerzeitung.at/nachrichten/politik/oesterreich/2107909-Haertere-steirische-Impfanreize.html' 
content = requests.get(website_url)
print(content.status_code)

200


Check [HTTP status codes](https://developer.mozilla.org/pt-BR/docs/Web/HTTP/Status) 

#### Getting html content of the page 

In [52]:
content.text[0:1000]

'<!DOCTYPE html>\n<html lang="de">\n<head>\n<meta charset="utf-8" />\n<meta http-equiv="cache-control" content="no-cache" />\n<meta http-equiv="pragma" content="no-cache" />\n<meta name="robots" content="index, follow, noarchive" />\n<meta name="content-language" content="de" />\n<meta name="description" content="Am Donnerstag ging es zwar &quot;nur&quot; um eine Impfkampagne. Der steirische Landeshauptmann Hermann Sch&uuml;tzenh&ouml;fer plant aber die Corona-Impfung als Voraussetzung bei Neueinstellung von Kindergartenpersonal." />\n<meta name="author" content="Martina Madner" />\n<meta name="copyright" content="Wiener Zeitung Online" />\n<meta property="og:type" content="article" />\n<meta property="og:title" content="Impfpflicht - H&auml;rtere steirische Impfanreize" />\n<meta property="og:description" content="Am Donnerstag ging es zwar &quot;nur&quot; um eine Impfkampagne. Der steirische Landeshauptmann Hermann Sch&uuml;tzenh&ouml;fer plant aber die Corona-Impfung als..." />\n<me

### Using [Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)

In [53]:
soup = bs(content.text,'lxml')
print(type(soup))

<class 'bs4.BeautifulSoup'>


#### Getting formatted html content of the page 

In [55]:
print(soup.prettify()[0:1000])

<!DOCTYPE html>
<html lang="de">
 <head>
  <meta charset="utf-8"/>
  <meta content="no-cache" http-equiv="cache-control"/>
  <meta content="no-cache" http-equiv="pragma"/>
  <meta content="index, follow, noarchive" name="robots"/>
  <meta content="de" name="content-language"/>
  <meta content='Am Donnerstag ging es zwar "nur" um eine Impfkampagne. Der steirische Landeshauptmann Hermann Schützenhöfer plant aber die Corona-Impfung als Voraussetzung bei Neueinstellung von Kindergartenpersonal.' name="description"/>
  <meta content="Martina Madner" name="author"/>
  <meta content="Wiener Zeitung Online" name="copyright"/>
  <meta content="article" property="og:type"/>
  <meta content="Impfpflicht - Härtere steirische Impfanreize" property="og:title"/>
  <meta content='Am Donnerstag ging es zwar "nur" um eine Impfkampagne. Der steirische Landeshauptmann Hermann Schützenhöfer plant aber die Corona-Impfung als...' property="og:description"/>
  <meta content="Österreich Politik - Nachrichten -

#### Getting text of the page 

In [63]:
print(soup.get_text()[9000:10000])

ete Werbekampagne f&uuml;rs Impfen in der Steiermark. Es spricht von einer \"gewissen Impfm&uuml;digkeit, die eingetreten ist, da rede ich nicht um den hei&szlig;en Brei herum: Die Anmeldungen sind zur&uuml;ckgegangen.\" 560.000 von 1,1 Millionen impfbaren Steirerinnen und Steirern sind erstgeimpft, 227.000 auch ein zweites Mal. Sein Appell: \"Wir werden alles versuchen, damit sich die Menschen impfen lassen. Es ist noch nicht vorbei.\"<\/p>\n<p>In der ORF-\"Pressestunde\" sprach Sch&uuml;tzenh&ouml;fer von einem \"sch&auml;bigem Verhalten\" der Impfunwilligen. Auf die Frage, ob er sich eine Sars-CoV-2-Schutzimpfungspflicht f&uuml;r das Kindergartenpersonal vorstellen k&ouml;nne, sagte er: \"Ich w&auml;re pers&ouml;nlich daf&uuml;r, der Verfassungsdienst des Bundes sagt aber: Nein, das geht nicht\" - und lie&szlig; dann aufhorchen: \"Aber: Jedes Land kann selber etwas bei der Aufnahme tun. Das will ich in den Bereichen des Landes selbstverst&auml;ndlich so machen.\"<\/p>\n<p>Impfnachz&

In [57]:
print(soup.title)

<title>Impfpflicht - Härtere steirische Impfanreize - Wiener Zeitung Online</title>

#### Extracting anchors and links

In [58]:
soup.find_all('a')[0:20]

[<a href="/abonnement">Abo</a>,
 <a href="/epaper" target="extern">E-Paper</a>,
 <a href="/beilagen">Magazine</a>,
 <a href="/">
 <i class="fal fa-history fa-lg fa-fw"></i><span class="notification-counter meldung-counter"></span>
 </a>,
 <a class="mobile-search-button mobile-search-closed" data-overlay="overlay-search" href="javascript:void(0);" title="Suche"><i class="far fa-search"></i></a>,
 <a class="logo-wzo" href="/" itemprop="url" title="">
 <img alt="Wiener Zeitung" height="30" src="/_em_daten/wzo/assets/img/WZ-Logo.svg"/>
 <img class="hidden" height="0" itemprop="logo" src="/_em_daten/wzo/assets/img/favicons/wzo-512.png"/>
 </a>,
 <a class="mobile-menu-button" href="javascript:void(0);" title="Hauptmenü"><i class="far fa-bars"></i></a>,
 <a data-preview="" data-preview-count="4" href="https://www.wienerzeitung.at/nachrichten/politik/" target="_top" title="Politik">Politik</a>,
 <a data-preview="" data-preview-count="4" href="https://www.wienerzeitung.at/nachrichten/kultur/" t

#### Extracting only the *href* of anchors and links

In [59]:
for link in soup.find_all('a')[0:30]:
    print(link.get('href'))

/abonnement
/epaper
/beilagen
/
javascript:void(0);
/
javascript:void(0);
https://www.wienerzeitung.at/nachrichten/politik/
https://www.wienerzeitung.at/nachrichten/kultur/
https://www.wienerzeitung.at/nachrichten/wirtschaft/
https://www.wienerzeitung.at/amtsblatt/aktuelle-ausgabe/
https://www.wienerzeitung.at/dossiers/
https://www.wienerzeitung.at/meinung/
javascript:void(0);
https://www.wienerzeitung.at/nachrichten/politik/
https://www.wienerzeitung.at/nachrichten/politik/
https://www.wienerzeitung.at/nachrichten/wirtschaft/
https://www.wienerzeitung.at/nachrichten/kultur/
https://www.wienerzeitung.at/nachrichten/chronik/
https://www.wienerzeitung.at/nachrichten/wissen/
https://www.wienerzeitung.at/nachrichten/sport/
https://www.wienerzeitung.at/nachrichten/wahlen/
https://www.wienerzeitung.at/nachrichten/zeitreisen-rubriken/
https://www.wienerzeitung.at/nachrichten/reflexionen/
https://www.wienerzeitung.at/nachrichten/digital/
https://www.wienerzeitung.at/amtsblatt/aktuelle-ausgabe/

#### Extracting only *http* and *https* from *href* links

In [60]:
for link in soup.findAll('a', attrs={'href': re.compile("^http[s]?://")}):
    print(link.get('href'))

https://www.wienerzeitung.at/nachrichten/politik/
https://www.wienerzeitung.at/nachrichten/kultur/
https://www.wienerzeitung.at/nachrichten/wirtschaft/
https://www.wienerzeitung.at/amtsblatt/aktuelle-ausgabe/
https://www.wienerzeitung.at/dossiers/
https://www.wienerzeitung.at/meinung/
https://www.wienerzeitung.at/nachrichten/politik/
https://www.wienerzeitung.at/nachrichten/politik/
https://www.wienerzeitung.at/nachrichten/wirtschaft/
https://www.wienerzeitung.at/nachrichten/kultur/
https://www.wienerzeitung.at/nachrichten/chronik/
https://www.wienerzeitung.at/nachrichten/wissen/
https://www.wienerzeitung.at/nachrichten/sport/
https://www.wienerzeitung.at/nachrichten/wahlen/
https://www.wienerzeitung.at/nachrichten/zeitreisen-rubriken/
https://www.wienerzeitung.at/nachrichten/reflexionen/
https://www.wienerzeitung.at/nachrichten/digital/
https://www.wienerzeitung.at/amtsblatt/aktuelle-ausgabe/
https://www.wienerzeitung.at/amtsblatt/amtliche-veroeffentlichungen-zu-corona/
https://www.wi

### Which one to use: [urllib or requests?](https://stackoverflow.com/questions/2018026/what-are-the-differences-between-the-urllib-urllib2-urllib3-and-requests-modul)

In [61]:
def getLinks(url):
    raw_page = requests.get(url)
    html_page = raw_page.text
    #html_page = urllib.request.urlopen(url)
    soup = bs(html_page)
    links = []
    for link in soup.findAll('a', attrs={'href': re.compile("^http[s]?://")}):
        links.append(link.get('href'))
    return links

print(getLinks("https://arstechnica.com"))

['https://arstechnica.com', 'http://video.arstechnica.com/', 'http://arstechnica.com/?view=grid', 'http://arstechnica.com/?view=archive', 'http://arstechnica.com/?theme=light', 'http://arstechnica.com/?theme=dark', 'https://arstechnica.com/civis/ucp.php?mode=login&return_to=%2F', 'https://arstechnica.com/civis/ucp.php?mode=register', 'https://arstechnica.com/gadgets/2021/06/rtx-3070-ti-review-nvidia-has-flipped-its-turn-signal-out-of-the-gpu-fast-lane/', 'https://arstechnica.com/gadgets/2021/06/rtx-3070-ti-review-nvidia-has-flipped-its-turn-signal-out-of-the-gpu-fast-lane/', 'https://arstechnica.com/gadgets/2021/06/rtx-3070-ti-review-nvidia-has-flipped-its-turn-signal-out-of-the-gpu-fast-lane/', 'https://arstechnica.com/author/samred/', 'https://arstechnica.com/gadgets/2021/06/rtx-3070-ti-review-nvidia-has-flipped-its-turn-signal-out-of-the-gpu-fast-lane/?comments=1', 'https://arstechnica.com/gaming/2021/06/elden-ring-shows-off-first-gameplay-footage-confirmed-for-jan-2022/', 'https://

### Extracting specific information

In [65]:
website_url = 'https://thegreenestworkforce.ca/index.php/en/schools/' 
content = requests.get(website_url).text
soup = bs(content,'lxml')

In [68]:
soup.get_text()[0:4000]

'\n\n\nA complete list of Canadian Universities and Colleges - The Greenest Workforce : The Greenest Workforce\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nif(typeof stLight != "undefined" ){\n\tstLight.options({\n\t\tpublisher: "df7e08d0-fa61-487f-be1c-0db70716d1aa",\n\t\tsnapsets:false,\n\t\tservicePopup:true,\n\t\tonhover: false\n\t\t}); \n\t//stLight.options({snapsets:false});\n}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n{"@context":"https://schema.org","@graph":[{"@type":"WebSite","@id":"http://thegreenestworkforce.ca/index.php/en/home/#website","url":"http://thegreenestworkforce.ca/index.php/en/home/","name":"The Greenest Workforce","description":"","potentialAction":[{"@type":"SearchAction","target":"http://thegreenestworkforce.ca/index.php/en/home/?s={search_term_string}","query-input":"required name=search_term_string"}],"inLanguage":"en-CA"},{"@type":"WebPage","@id":"https://thegreenestworkforce.ca/index.php/en/schools/#webpage","url":"https://thegreenestworkforce.ca/index.php/en/schools/","nam

#### Inspect page source code

In [69]:
My_table = soup.find('article',{'id':'post-17896'})

In [70]:
My_table.contents[0:10]

['\n',
 <h2>Universities</h2>,
 '\n',
 <h3>British Columbia</h3>,
 '\n',
 <p><a href="http://www.viu.ca/" target="_blank" title="Vancouver Island University">Vancouver Island University</a><br/>
 <a href="http://www.royalroads.ca/" target="_blank" title="Royal Roads University">Royal Roads University</a><br/>
 <a href="http://www.sfu.ca/" target="_blank" title="Simon Fraser University">Simon Fraser University</a><br/>
 <a href="https://www.twu.ca/" target="_blank" title="Trinity Western University">Trinity Western University</a><br/>
 <a href="http://www.tru.ca/" target="_blank" title="Thompson River University">Thompson River University</a><br/>
 <a href="http://www.ufv.ca/" target="_blank" title="University of the Fraser Valley">University of the Fraser Valley</a><br/>
 <a href="http://www.ubc.ca/" target="_blank" title="University of British Columbia">University of British Columbia</a><br/>
 <a href="http://www.unbc.ca/" target="_blank" title="University of Northern British Columbia

In [71]:
links = My_table.findAll('a')
links[0:10]

[<a href="http://www.viu.ca/" target="_blank" title="Vancouver Island University">Vancouver Island University</a>,
 <a href="http://www.royalroads.ca/" target="_blank" title="Royal Roads University">Royal Roads University</a>,
 <a href="http://www.sfu.ca/" target="_blank" title="Simon Fraser University">Simon Fraser University</a>,
 <a href="https://www.twu.ca/" target="_blank" title="Trinity Western University">Trinity Western University</a>,
 <a href="http://www.tru.ca/" target="_blank" title="Thompson River University">Thompson River University</a>,
 <a href="http://www.ufv.ca/" target="_blank" title="University of the Fraser Valley">University of the Fraser Valley</a>,
 <a href="http://www.ubc.ca/" target="_blank" title="University of British Columbia">University of British Columbia</a>,
 <a href="http://www.unbc.ca/" target="_blank" title="University of Northern British Columbia">University of Northern British Columbia</a>,
 <a href="http://www.uvic.ca/" target="_blank" title="Uni

In [72]:
colleges = []
for link in links:
    colleges.append(link.get('href'))
    
print(colleges)

['http://www.viu.ca/', 'http://www.royalroads.ca/', 'http://www.sfu.ca/', 'https://www.twu.ca/', 'http://www.tru.ca/', 'http://www.ufv.ca/', 'http://www.ubc.ca/', 'http://www.unbc.ca/', 'http://www.uvic.ca/', 'http://www.fnuniv.ca/', 'http://www.uregina.ca/', 'http://www.usask.ca/', 'http://www.athabascau.ca/', 'http://www.augustana.ualberta.ca/', 'http://concordia.ab.ca/', 'http://www.kingsu.ca/', 'http://www.ualberta.ca/', 'http://www.ucalgary.ca/', 'http://www.uleth.ca/', 'http://www.macewan.ca/wcm/index.htm', 'http://www.mtroyal.ca/', 'http://www.brandonu.ca/', 'http://umanitoba.ca/', 'http://www.uwinnipeg.ca/', 'http://www.cmu.ca/', 'http://umanitoba.ca/stpauls/', 'http://ustboniface.ca/', 'http://www.algomau.ca/', 'http://brescia.uwo.ca/', 'http://www.brocku.ca/', 'http://carleton.ca/', 'http://www.dominicanu.ca/', 'http://www.huronuc.on.ca/Home', 'http://www.kings.uwo.ca/', 'https://www.lakeheadu.ca/', 'http://laurentian.ca/', 'http://www.mcmaster.ca/', 'http://www.nipissingu.ca

In [73]:
df = pd.DataFrame()
df['University'] = colleges
df.head()

Unnamed: 0,University
0,http://www.viu.ca/
1,http://www.royalroads.ca/
2,http://www.sfu.ca/
3,https://www.twu.ca/
4,http://www.tru.ca/


### [Example](https://towardsdatascience.com/web-scraping-101-in-python-35f8653b1c97) with parameters

In [75]:
url = 'https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating'

def get_page_contents(url):
    page = requests.get(url, headers={"Accept-Language": "en-US"})
    return bs(page.text, "html.parser")

soup = get_page_contents(url)

In [78]:
movies = soup.findAll('h3', class_='lister-item-header')
movies[0:5]

[<h3 class="lister-item-header">
 <span class="lister-item-index unbold text-primary">1.</span>
 <a href="/title/tt0111161/">The Shawshank Redemption</a>
 <span class="lister-item-year text-muted unbold">(1994)</span>
 </h3>, <h3 class="lister-item-header">
 <span class="lister-item-index unbold text-primary">2.</span>
 <a href="/title/tt0068646/">The Godfather</a>
 <span class="lister-item-year text-muted unbold">(1972)</span>
 </h3>, <h3 class="lister-item-header">
 <span class="lister-item-index unbold text-primary">3.</span>
 <a href="/title/tt10189514/">Soorarai Pottru</a>
 <span class="lister-item-year text-muted unbold">(2020)</span>
 </h3>, <h3 class="lister-item-header">
 <span class="lister-item-index unbold text-primary">4.</span>
 <a href="/title/tt0468569/">The Dark Knight</a>
 <span class="lister-item-year text-muted unbold">(2008)</span>
 </h3>, <h3 class="lister-item-header">
 <span class="lister-item-index unbold text-primary">5.</span>
 <a href="/title/tt0071562/">The

In [80]:
titles = [movie.find('a').text for movie in movies]
titles[0:10]

['The Shawshank Redemption',
 'The Godfather',
 'Soorarai Pottru',
 'The Dark Knight',
 'The Godfather: Part II',
 '12 Angry Men',
 'The Lord of the Rings: The Return of the King',
 'Pulp Fiction',
 "Schindler's List",
 'Inception']

### Third [example](https://medium.com/the-andela-way/learn-how-to-scrape-the-web-2a7cc488e017)

Getting the website topics

The Daily Nation website has a number of topics such as news, counties, sports, photos, videos and business among others. We need to extract them from the html plus their accompanying urls.

Before we start writing the code, we need to first examine the html so that we understand where the data we need is located and how we can get it. Below is an image snippet of the navbar of the website containing the topics. You can also right click anywhere on the navbar and then click inspect and the html will appear in the developer tools section on Chrome.

In [38]:
class DailyNation:
    BASE_URL = 'https://www.nation.co.ke'

    @staticmethod
    def get_html(url):
        """
        Get the Daily Nation html
        :return:
        """
        daily_nation = requests.get(url)
        html = BeautifulSoup(daily_nation.text, 'html.parser')
        return html

    @staticmethod
    def get_topics_lis():
        """
        Get all the lis in the nav bar
        :return:
        """
        html = DailyNation.get_html(DailyNation.BASE_URL)
        if html:
            nav = html.find('nav', class_='container')
            topics_ul = nav.find_all('ul')[1]
            return topics_ul.find_all('li')
        return None

    @staticmethod
    def get_topics():
        """
        Get all topics in the nav bar
        :return: list
        """
        lis = DailyNation.get_topics_lis()
        topics = []
        for li in lis:
            topics.append(li.find('a').text.lower())
        return topics
    
    @staticmethod
    def get_topic_info(topic):
        """
        Get content for a given topic
        :param topic:
        :return:
        """
        topic_url = DailyNation.BASE_URL + DailyNation.get_topics_url()[topic.lower()]
        html = DailyNation.get_html(topic_url)
        if html:
            stories = []
            div_content = html.find('div', class_='five-eight column')
            stories_div = div_content.find_all('div', class_='story-teaser')
            for div in stories_div:
                story = {
                    'title': div.find('a').text, 'summary': div.find('p').text,
                    'story_url': div.find('a').get('href'),
                    'published_at': div.find('h6').text
                }
                if div.find('img'):
                    story['image_url'] = DailyNation.BASE_URL + div.find('img').get('src')
                story['image_url'] = ''
                stories.append(story)
            return stories
        
    @staticmethod
    def get_topics_url():
        """
        Get all topics in the nav bar with their corresponding urls
        :return:
        """
        lis = DailyNation.get_topics_lis()
        topics_with_url = {}
        for li in lis:
            topics_with_url[li.find('a').text.lower()] = li.find('a').get('href')
        return topics_with_url
    
    @staticmethod
    def get_photos(topic):
        """
        Get all the images in the photos section
        :param topic:
        :return:
        """
        if topic != 'photos':
            raise ValueError('Topic should be photos')
        topic_url = DailyNation.BASE_URL + DailyNation.get_topics_url()[topic.lower()]
        html = DailyNation.get_html(topic_url)
        if html:
            images = {}
            images_list_div = html.find('div', class_='cb-content videolist')
            images_topics = images_list_div.find_all('div', class_='vh-caption')
            images_caption_headings = []
            for caption_div in images_topics:
                images_caption_headings.append(DailyNation.clean_string(caption_div.find('h3').text))
            image_rows_divs = images_list_div.find_all('div', class_='row')
            images_caption_headings_count = 0
            for images_item_div in image_rows_divs:
                trs = images_item_div.find('table').find_all('tr')
                items = []
                for tr in trs:
                    tds = tr.find_all('td')
                    for td in tds:
                        items.append({
                            'caption': td.find('div', class_='v-desc').find('a').text,
                            'image_url': DailyNation.BASE_URL + td.find('div', class_='v-img').find('img').get('src'),
                            'story_url': DailyNation.BASE_URL + td.find('div', class_='v-img').find('a').get('href')
                        })
                images[images_caption_headings[images_caption_headings_count]] = items
                images_caption_headings_count += 1
            return images
        
    @staticmethod
    def clean_string(string):
        new_string = string.replace('&nbsp', '')
        return new_string.replace(';', '')
    
    @staticmethod
    def get_videos(topic):
        """
        Get all the videos in the photos section
        :param topic:
        :return:
        """
        if topic != 'videos':
            raise ValueError('Topic should be videos')
        topic_url = DailyNation.BASE_URL + DailyNation.get_topics_url()[topic.lower()]
        html = DailyNation.get_html(topic_url)
        if html:
            videos = {}
            videos_list_div = html.find('div', class_='cb-content videolist')
            videos_topics = videos_list_div.find_all('div', class_='vh-caption')
            videos_caption_headings = []
            for caption_div in videos_topics:
                videos_caption_headings.append(DailyNation.clean_string(caption_div.find('h3').text))
            video_rows_divs = videos_list_div.find_all('div', class_='row')
            videos_caption_headings_count = 0
            for videos_item_div in video_rows_divs:
                trs = videos_item_div.find('table').find_all('tr')
                items = []
                for tr in trs:
                    tds = tr.find_all('td')
                    for td in tds:
                        items.append({
                            'caption': td.find('div', class_='v-desc').text,
                            'image_url': td.find('div', class_='v-img').find('img').get('src'),
                            'story_url': DailyNation.BASE_URL + td.find(
                                'div',
                                class_='col-lg-3 col-xs-12 col-sm-6 videoitem'
                            ).find('a').get('href')
                        })
                videos[vi
                videos_caption_headings_count += 1
            return videos
                       
    @staticmethod
    def get_data(topic):
        """
        Get topic information based on the topic provided.
        :param topic:
        :return:
        """
        _topic = topic.lower()
        if _topic not in DailyNation.get_topics():
            raise ValueError('Topic does not exist')

        if _topic == 'photos':
            return DailyNation.get_photos(_topic)

        if _topic == 'videos':
            return DailyNation.get_videos(_topic)

        return DailyNation.get_topic_info(_topic)

SyntaxError: invalid syntax (<ipython-input-38-f29742e1522c>, line 151)