## Introduction to Data Science

### Introduction to Scraping

In [21]:
#import pylab
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import requests
from bs4 import BeautifulSoup as bs

%matplotlib inline
#%matplotlib notebook

### First and easiest [example](https://towardsdatascience.com/the-easiest-tutorial-of-web-scraping-on-the-internet-2439334dc243)

In [23]:
# Download the schools listing and parse it with the lxml parser.
website_url = 'https://thegreenestworkforce.ca/index.php/en/schools/'
# A timeout keeps the cell from hanging on an unreachable host, and
# raise_for_status() stops us from parsing an HTTP error page as if it
# were the real listing.
response = requests.get(website_url, timeout=30)
response.raise_for_status()
content = response.text
soup = bs(content, 'lxml')

ConnectionError: HTTPSConnectionPool(host='thegreenestworkforce.ca', port=443): Max retries exceeded with url: /index.php/en/schools/ (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7fedc5d3b4e0>: Failed to establish a new connection: [Errno 113] No route to host',))

In [None]:
# Show only the first 1000 characters of the pretty-printed document.
pretty_html = soup.prettify()
print(pretty_html[:1000])

In [None]:
# The <article> element with id "post-17896" is the container inspected below.
My_table = soup.find('article', id='post-17896')

In [10]:
# Peek at the first ten direct children of the article.
My_table.contents[:10]

['\n',
 <h2>Universities</h2>,
 '\n',
 <h3>British Columbia</h3>,
 '\n',
 <p><a href="http://www.viu.ca/" target="_blank" title="Vancouver Island University">Vancouver Island University</a><br/>
 <a href="http://www.royalroads.ca/" target="_blank" title="Royal Roads University">Royal Roads University</a><br/>
 <a href="http://www.sfu.ca/" target="_blank" title="Simon Fraser University">Simon Fraser University</a><br/>
 <a href="https://www.twu.ca/" target="_blank" title="Trinity Western University">Trinity Western University</a><br/>
 <a href="http://www.tru.ca/" target="_blank" title="Thompson River University">Thompson River University</a><br/>
 <a href="http://www.ufv.ca/" target="_blank" title="University of the Fraser Valley">University of the Fraser Valley</a><br/>
 <a href="http://www.ubc.ca/" target="_blank" title="University of British Columbia">University of British Columbia</a><br/>
 <a href="http://www.unbc.ca/" target="_blank" title="University of Northern British Columbia

In [11]:
# Collect every anchor tag inside the article and preview the first ten.
links = My_table.find_all('a')
links[:10]

[<a href="http://www.viu.ca/" target="_blank" title="Vancouver Island University">Vancouver Island University</a>,
 <a href="http://www.royalroads.ca/" target="_blank" title="Royal Roads University">Royal Roads University</a>,
 <a href="http://www.sfu.ca/" target="_blank" title="Simon Fraser University">Simon Fraser University</a>,
 <a href="https://www.twu.ca/" target="_blank" title="Trinity Western University">Trinity Western University</a>,
 <a href="http://www.tru.ca/" target="_blank" title="Thompson River University">Thompson River University</a>,
 <a href="http://www.ufv.ca/" target="_blank" title="University of the Fraser Valley">University of the Fraser Valley</a>,
 <a href="http://www.ubc.ca/" target="_blank" title="University of British Columbia">University of British Columbia</a>,
 <a href="http://www.unbc.ca/" target="_blank" title="University of Northern British Columbia">University of Northern British Columbia</a>,
 <a href="http://www.uvic.ca/" target="_blank" title="Uni

In [12]:
# Pull the href attribute out of every anchor collected above.
colleges = [anchor.get('href') for anchor in links]

print(colleges)

['http://www.viu.ca/', 'http://www.royalroads.ca/', 'http://www.sfu.ca/', 'https://www.twu.ca/', 'http://www.tru.ca/', 'http://www.ufv.ca/', 'http://www.ubc.ca/', 'http://www.unbc.ca/', 'http://www.uvic.ca/', 'http://www.fnuniv.ca/', 'http://www.uregina.ca/', 'http://www.usask.ca/', 'http://www.athabascau.ca/', 'http://www.augustana.ualberta.ca/', 'http://concordia.ab.ca/', 'http://www.kingsu.ca/', 'http://www.ualberta.ca/', 'http://www.ucalgary.ca/', 'http://www.uleth.ca/', 'http://www.macewan.ca/wcm/index.htm', 'http://www.mtroyal.ca/', 'http://www.brandonu.ca/', 'http://umanitoba.ca/', 'http://www.uwinnipeg.ca/', 'http://www.cmu.ca/', 'http://umanitoba.ca/stpauls/', 'http://ustboniface.ca/', 'http://www.algomau.ca/', 'http://brescia.uwo.ca/', 'http://www.brocku.ca/', 'http://carleton.ca/', 'http://www.dominicanu.ca/', 'http://www.huronuc.on.ca/Home', 'http://www.kings.uwo.ca/', 'https://www.lakeheadu.ca/', 'http://laurentian.ca/', 'http://www.mcmaster.ca/', 'http://www.nipissingu.ca

In [13]:
# One-column frame holding the scraped URLs.
df = pd.DataFrame({'University': colleges})
df.head()

Unnamed: 0,University
0,http://www.viu.ca/
1,http://www.royalroads.ca/
2,http://www.sfu.ca/
3,https://www.twu.ca/
4,http://www.tru.ca/


### Second [example](https://towardsdatascience.com/web-scraping-101-in-python-35f8653b1c97)

In [24]:
url = 'https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating'

def get_page_contents(url, timeout=30):
    """
    Download ``url`` and return it parsed with the built-in ``html.parser``.

    The request asks for English content via the ``Accept-Language`` header.

    :param url: address of the page to download
    :param timeout: seconds to wait for the server before giving up
    :return: BeautifulSoup document built from the response text
    :raises requests.HTTPError: if the server answers with a 4xx/5xx status
    """
    page = requests.get(url, headers={"Accept-Language": "en-US"}, timeout=timeout)
    # Without this check an error page would be parsed silently and every
    # later extraction would simply come back empty.
    page.raise_for_status()
    return bs(page.text, "html.parser")

soup = get_page_contents(url)

ConnectionError: HTTPSConnectionPool(host='www.imdb.com', port=443): Max retries exceeded with url: /search/title/?count=100&groups=top_1000&sort=user_rating (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7fedc5d45550>: Failed to establish a new connection: [Errno 113] No route to host',))

In [None]:
# Every <h3 class="lister-item-header"> element on the page.
movies = soup.find_all('h3', class_='lister-item-header')

In [None]:
# Text of the first link found inside each header.
titles = [header.find('a').text for header in movies]

In [None]:
# Text of the year <span> found inside each header.
release = [
    header.find('span', class_='lister-item-year text-muted unbold').text
    for header in movies
]

In [None]:
# `movie` is never bound at notebook scope: in Python 3 a comprehension's loop
# variable does not leak, so this cell raised NameError on a fresh kernel.
# Use the first 'lister-item-content' block, the same element type that
# extract_attribute() iterates over.
movie = soup.find('div', class_='lister-item-content')
movie.find('div', 'inline-block ratings-imdb-rating')['data-value']

In [None]:
# `movie` is never bound at notebook scope (comprehension variables do not
# leak in Python 3), so bind it here: the first 'lister-item-content' block,
# the same element type that extract_attribute() iterates over.
movie = soup.find('div', class_='lister-item-content')
# Both values come from the <span name="nv"> elements; look them up once.
nv_spans = movie.find_all('span', {'name': 'nv'})
votes = nv_spans[0]['data-value']
earnings = nv_spans[1]['data-value']

In [None]:
# `movie` is never bound at notebook scope (comprehension variables do not
# leak in Python 3), so bind it here: the first 'lister-item-content' block,
# the same element type that extract_attribute() iterates over.
movie = soup.find('div', class_='lister-item-content')
# Text of the first link inside the first <p> of the block.
director = movie.find('p').find('a').text

In [None]:
# `movie` is never bound at notebook scope (comprehension variables do not
# leak in Python 3), so bind it here: the first 'lister-item-content' block,
# the same element type that extract_attribute() iterates over.
movie = soup.find('div', class_='lister-item-content')
# Every link after the first one inside the first <p> of the block.
actors = [actor.text for actor in movie.find('p').find_all('a')[1:]]

In [25]:
def numeric_value(movie, tag, class_=None, order=None):
    """
    Return the ``data-value`` attribute of an element inside ``movie``.

    :param movie: parsed element to search in
    :param tag: tag name to look for
    :param class_: class string or attribute dict passed on to the search
    :param order: ``None`` to use the first match, otherwise the integer
        position of the wanted match among all matches
    :return: the attribute value as found in the markup, or ``None`` when the
        element (or the attribute) is missing
    """
    if order is None:
        element = movie.find(tag, class_)
    else:
        # Compare against None explicitly: ``order=0`` is a valid position
        # and must not be confused with "no order given".
        matches = movie.find_all(tag, class_)
        try:
            element = matches[order]
        except IndexError:
            element = None

    if element is None:
        return None
    return element.get('data-value')


def text_value(movie, tag, class_=None):
    """
    Return the text of the first element in ``movie`` matching ``tag`` and
    ``class_``, or ``None`` when the search comes back empty.
    """
    element = movie.find(tag, class_)
    return element.text if element else None


def nested_text_value(movie, tag_1, class_1, tag_2, class_2, order=None):
    """
    Return text from ``tag_2`` elements nested inside the first ``tag_1``
    element of ``movie``.

    :param movie: parsed element to search in
    :param tag_1: tag name of the outer (container) element
    :param class_1: class string or attribute dict for the container search
    :param tag_2: tag name of the nested element(s)
    :param class_2: class string or attribute dict for the nested search
    :param order: ``None`` for the first nested match, an integer for the
        match at that position, or a ``slice`` for several matches
    :return: a string for ``None``/integer ``order``, a list of strings for a
        ``slice``; ``None`` when the container or the requested match is
        missing
    """
    container = movie.find(tag_1, class_1)
    if container is None:
        return None

    if order is None:
        element = container.find(tag_2, class_2)
        return element.text if element is not None else None

    matches = container.find_all(tag_2, class_2)
    if isinstance(order, slice):
        return [match.text for match in matches[order]]

    # Integer position: return that element's own text. Iterating over the
    # element (as the slice branch does over a list) would walk its children.
    try:
        return matches[order].text
    except IndexError:
        return None


def extract_attribute(soup, tag_1, class_1='', tag_2='', class_2='',
                      text_attribute=True, order=None, nested=False):
    """
    Extract one value per 'lister-item-content' block found in ``soup``.

    The helper used for each block depends on the flags:
    ``text_attribute=False`` uses ``numeric_value``; otherwise ``nested=True``
    uses ``nested_text_value`` and ``nested=False`` uses ``text_value``.

    :return: list with one entry per block, in document order
    """
    def extract(movie):
        if not text_attribute:
            return numeric_value(movie, tag_1, class_1, order)
        if nested:
            return nested_text_value(movie, tag_1, class_1, tag_2, class_2, order)
        return text_value(movie, tag_1, class_1)

    blocks = soup.findAll('div', class_='lister-item-content')
    return [extract(block) for block in blocks]


# Plain text attributes: only the tag and (optionally) its class are needed.
titles = extract_attribute(soup, 'a')
release = extract_attribute(soup, 'span', 'lister-item-year text-muted unbold')
audience_rating = extract_attribute(soup, 'span', 'certificate')
runtime = extract_attribute(soup, 'span', 'runtime')
genre = extract_attribute(soup, 'span', 'genre')

# Values stored in a `data-value` attribute. The flags are passed by keyword:
# given positionally after class_1 they would bind to tag_2/class_2 instead of
# text_attribute/order, and the numeric branch would never run.
imdb_rating = extract_attribute(soup, 'div', 'inline-block ratings-imdb-rating',
                                text_attribute=False)
votes = extract_attribute(soup, 'span', {'name': 'nv'}, text_attribute=False, order=0)
earnings = extract_attribute(soup, 'span', {'name': 'nv'}, text_attribute=False, order=1)

# Nested links: the first one is taken as the director, the next ones as actors.
directors = extract_attribute(soup, 'p', '', 'a', '',
                              text_attribute=True, order=0, nested=True)
actors = extract_attribute(soup, 'p', '', 'a', '',
                           text_attribute=True, order=slice(1, 5, None), nested=True)


# NOTE: the 'Relase' key (sic) is kept unchanged because later cells may
# reference the column by this name.
df_dict = {'Title': titles, 'Relase': release, 'Audience Rating': audience_rating,
           'Runtime': runtime, 'Genre': genre, 'IMDB Rating': imdb_rating,
           'Votes': votes, 'Box Office Earnings': earnings, 'Director': directors,
           'Actors': actors}
df = pd.DataFrame(df_dict)
# Preview the first rows instead of dumping the whole frame.
df.head()

Unnamed: 0,Title,Relase,Audience Rating,Runtime,Genre,IMDB Rating,Votes,Box Office Earnings,Director,Actors


### Third [example](https://medium.com/the-andela-way/learn-how-to-scrape-the-web-2a7cc488e017)

Getting the website topics

The Daily Nation website has a number of topics, such as news, counties, sports, photos, videos and business, among others. We need to extract them from the HTML along with their accompanying URLs.

Before we start writing the code, we need to examine the HTML so that we understand where the data we need is located and how we can get it. Below is an image snippet of the navbar of the website containing the topics. You can also right-click anywhere on the navbar and then click Inspect; the HTML will appear in the developer tools section in Chrome.

In [16]:
class DailyNation:
    """
    Scraper for the Daily Nation website, used purely as a namespace.

    Every method is a ``@staticmethod``. Topic names and their relative URLs
    are read from the navigation bar of ``BASE_URL``; the home page is
    downloaded again each time the topics are looked up.
    """
    BASE_URL = 'https://www.nation.co.ke'

    @staticmethod
    def get_html(url):
        """
        Download ``url`` and parse the response text
        :param url: address of the page to download
        :return: parsed document
        """
        daily_nation = requests.get(url)
        # The notebook imports BeautifulSoup under the alias ``bs``.
        return bs(daily_nation.text, 'html.parser')

    @staticmethod
    def get_topics_lis():
        """
        Get all the lis in the nav bar
        :return: the ``li`` elements of the second ``ul`` inside the
            ``nav.container`` element, or None when it cannot be located
        """
        html = DailyNation.get_html(DailyNation.BASE_URL)
        if not html:
            return None
        nav = html.find('nav', class_='container')
        if nav is None:
            return None
        nav_lists = nav.find_all('ul')
        # The topics are read from the second list of the nav bar.
        if len(nav_lists) < 2:
            return None
        return nav_lists[1].find_all('li')

    @staticmethod
    def get_topics():
        """
        Get all topics in the nav bar
        :return: list of lower-cased topic names (empty when the nav bar
            could not be located)
        """
        lis = DailyNation.get_topics_lis() or []
        return [li.find('a').text.lower() for li in lis]

    @staticmethod
    def get_topic_info(topic):
        """
        Get content for a given topic
        :param topic: topic name as shown in the nav bar (case-insensitive)
        :return: list of story dicts with the keys ``title``, ``summary``,
            ``story_url``, ``published_at`` and ``image_url``
        :raises KeyError: if the topic is not present in the nav bar
        """
        topic_url = DailyNation.BASE_URL + DailyNation.get_topics_url()[topic.lower()]
        html = DailyNation.get_html(topic_url)
        if not html:
            return None
        stories = []
        div_content = html.find('div', class_='five-eight column')
        for div in div_content.find_all('div', class_='story-teaser'):
            link = div.find('a')
            image = div.find('img')
            stories.append({
                'title': link.text,
                'summary': div.find('p').text,
                'story_url': link.get('href'),
                'published_at': div.find('h6').text,
                # Empty string only when the teaser has no image; the value
                # must not be overwritten once an image URL was found.
                'image_url': (DailyNation.BASE_URL + image.get('src')) if image is not None else '',
            })
        return stories

    @staticmethod
    def get_topics_url():
        """
        Get all topics in the nav bar with their corresponding urls
        :return: dict mapping lower-cased topic name to the link's href
        """
        lis = DailyNation.get_topics_lis() or []
        return {li.find('a').text.lower(): li.find('a').get('href') for li in lis}

    @staticmethod
    def _get_captioned_sections(topic, build_item):
        """
        Shared scraping logic of the photos and videos pages
        :param topic: topic whose page is downloaded
        :param build_item: callable turning one ``td`` element into a dict
        :return: dict mapping each cleaned section heading to its list of items
        """
        topic_url = DailyNation.BASE_URL + DailyNation.get_topics_url()[topic.lower()]
        html = DailyNation.get_html(topic_url)
        if not html:
            return None
        list_div = html.find('div', class_='cb-content videolist')
        headings = [
            DailyNation.clean_string(caption_div.find('h3').text)
            for caption_div in list_div.find_all('div', class_='vh-caption')
        ]
        sections = {}
        # Headings and rows are paired by position: the n-th row belongs to
        # the n-th heading.
        for index, row_div in enumerate(list_div.find_all('div', class_='row')):
            items = []
            for tr in row_div.find('table').find_all('tr'):
                for td in tr.find_all('td'):
                    items.append(build_item(td))
            sections[headings[index]] = items
        return sections

    @staticmethod
    def get_photos(topic):
        """
        Get all the images in the photos section
        :param topic: must be 'photos'
        :return: dict mapping section heading to a list of dicts with the
            keys ``caption``, ``image_url`` and ``story_url``
        :raises ValueError: if ``topic`` is not 'photos'
        """
        if topic != 'photos':
            raise ValueError('Topic should be photos')

        def build_item(td):
            image_div = td.find('div', class_='v-img')
            return {
                'caption': td.find('div', class_='v-desc').find('a').text,
                'image_url': DailyNation.BASE_URL + image_div.find('img').get('src'),
                'story_url': DailyNation.BASE_URL + image_div.find('a').get('href')
            }

        return DailyNation._get_captioned_sections(topic, build_item)

    @staticmethod
    def clean_string(string):
        """
        Strip every '&nbsp' and every ';' from ``string``
        :param string: text to clean
        :return: cleaned text
        """
        new_string = string.replace('&nbsp', '')
        return new_string.replace(';', '')

    @staticmethod
    def get_videos(topic):
        """
        Get all the videos in the videos section
        :param topic: must be 'videos'
        :return: dict mapping section heading to a list of dicts with the
            keys ``caption``, ``image_url`` and ``story_url``
        :raises ValueError: if ``topic`` is not 'videos'
        """
        if topic != 'videos':
            raise ValueError('Topic should be videos')

        def build_item(td):
            return {
                'caption': td.find('div', class_='v-desc').text,
                # Unlike the photos page, the image src is used as found.
                'image_url': td.find('div', class_='v-img').find('img').get('src'),
                'story_url': DailyNation.BASE_URL + td.find(
                    'div',
                    class_='col-lg-3 col-xs-12 col-sm-6 videoitem'
                ).find('a').get('href')
            }

        return DailyNation._get_captioned_sections(topic, build_item)

    @staticmethod
    def get_data(topic):
        """
        Get topic information based on the topic provided.
        :param topic: topic name (case-insensitive)
        :return: result of get_photos, get_videos or get_topic_info
        :raises ValueError: if the topic is not listed in the nav bar
        """
        _topic = topic.lower()
        if _topic not in DailyNation.get_topics():
            raise ValueError('Topic does not exist')

        if _topic == 'photos':
            return DailyNation.get_photos(_topic)

        if _topic == 'videos':
            return DailyNation.get_videos(_topic)

        return DailyNation.get_topic_info(_topic)