# Python Web Scraping
In this lab, you will continue to build on the Dr. Who popularity solution.
Recall that our research question is: `Who are the five most popular actors to play
the role of Dr. Who in the popular and long-running BBC series Dr. Who?`

What remains to be completed is to evaluate the popularity of each Dr. Who actor by
using the page views of the actor’s Wikipedia page as a proxy for their popularity.

**DO NOT** attempt this lab until 
you have thoroughly reviewed the *BeautifulSoup_Module_INTRO_Lab*.   

##  Using the Names + BeautifulSoup to Get the Stats
Using the exact same principles used to collect the list of Dr. Who actors,
we now need to collect the 30-day page view stat for each actor.

The logic/pseudocode for this activity is roughly as follows:

1. Explore the HTML underlying an example Wikipedia stats page:
https://en.wikipedia.org/w/index.php?title=Jodie_Whittaker&action=info
Look (**hard**) for a pattern that will allow you to capture the page views in the past 30 days.
It turns out there is a perfect pattern you should be able to exploit.
2. For each actor, combine the actor name with the Wikipedia URL string as a parameter
 - Fetch the stats web page by GET(ting) the URL just constructed
 - Parse the returned HTML using Beautiful Soup
 - Find the stats using your previously observed exploitable pattern
 - Remove any noise (e.g. thousands-separator commas) from the stats string
 - convert stats string to integer via int()
 - track the actor’s stat using a list or dictionary
3. Sort the actor stats in descending order
4. print the top 5

Have a beer – you deserve it!

In [6]:
from requests.exceptions import HTTPError
import requests
from bs4 import BeautifulSoup
import re
import typing

# Entertainment Weekly article from which the Dr. Who actor names are scraped.
EW_URL = 'http://ew.com/tv/doctor-who-actors/'

def simple_get(url, *args, **kwargs):
    """
    Make an HTTP GET request to `url` and return the response.

    Any extra positional and keyword arguments (for example `params` or
    `timeout`) are passed straight through to `requests.get`.

    Returns:
        The response object of a successful request.  Callers read the
        decoded body from `resp.text` and the headers from `resp.headers`.

    Raises:
        HTTPError: if `raise_for_status()` rejects the response status.
        Exception: any other error raised while making the request.

    Both kinds of error are reported on stdout and then re-raised unchanged.
    """
    try:
        resp = requests.get(url, *args, **kwargs)
        # If the response was successful, no Exception will be raised
        resp.raise_for_status()
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
        # A bare raise re-raises the active exception as-is.
        raise
    except Exception as err:
        print(f'Other error occurred: {err}')
        raise

    return resp

def prettify_html(soup: BeautifulSoup, o_file: str) -> None:
    """
    Save an indented, easier-to-read rendering of parsed HTML to a file.

    Handy when machine generated HTML is hard to read: the soup's
    prettify() method lays the markup out with proper indents.

    Arguments:
        soup   - BeautifulSoup object representing HTML previously parsed
                 by Beautiful Soup
        o_file - name of the output file; it is opened for write ('w')
                 with UTF-8 encoding
    """
    with open(o_file, 'w', encoding='utf-8') as out_fh:
        # The prettified markup is followed by a single newline.
        out_fh.write(f'{soup.prettify()}\n')

def make_soup(url: str) -> BeautifulSoup:
    """
    Fetch `url` and parse the returned HTML into a BeautifulSoup object.

    The page is requested through simple_get() with a 5 second timeout;
    any error raised there propagates to the caller.

    Raises:
        ValueError: if the response's Content-Type header is missing or
            does not mention HTML.
    """
    resp = simple_get(url, timeout=5)

    # Sanity check: is this HTML?  An explicit raise is used rather than
    # assert because assert statements are removed when Python runs with -O,
    # and .get() avoids a KeyError when the header is absent.
    content_type = resp.headers.get('Content-Type', '')
    if 'html' not in content_type.lower():
        raise ValueError(
            f'Expected HTML from {url} but got Content-Type {content_type!r}'
        )

    return BeautifulSoup(resp.text, 'html.parser')


def who_actors(soup: BeautifulSoup) -> set:
    """
    Extract the Dr. Who actor names from the parsed actors page.

    Each actor is identified by a <span class="heading-toc"> tag whose id
    looks like:
        id="toc-thirteenth-doctor-jodie-whittaker"
    Everything after "-doctor-" is the actor's name with dashes in place of
    spaces.  Every matching span is printed as it is processed.

    Returns a set (so no duplicate names are returned) of lower-case names
    with the dashes replaced by spaces, e.g. "jodie whittaker".
    """
    # One compiled RE does double duty: BS accepts it as an attribute filter
    # in find_all, and its capture group then yields the actor part of the id.
    toc_id_pattern = re.compile(r'^toc-[\w\-]+\-doctor\-(.*)$')

    names = set()

    # "class" is a Python reserved word, so the BS keyword argument for the
    # HTML class attribute is spelled "class_".
    headings = soup.find_all('span', class_='heading-toc', id=toc_id_pattern)
    for heading in headings:
        print(heading)

        found = toc_id_pattern.search(heading['id'])
        # find_all already filtered on this pattern, so a miss means
        # something is screwed up
        assert found is not None

        names.add(found.group(1).replace('-', ' '))

    return names

# PHASE 2:
# Collect the stats from Wikipedia
# for each who actor

def who_stats(dr_who):
    """
    Return the page-view count shown on an actor's Wikipedia info page.

    Arguments:
        dr_who - the Wikipedia page title of the actor, e.g. 'Jodie_Whittaker'

    Returns:
        The number found in the page's <div class="mw-pvi-month"> tag,
        as an int.

    Raises:
        ValueError: if the info page has no mw-pvi-month <div>, or its text
            is not a number.
    """
    url = 'https://en.wikipedia.org/w/index.php'

    # Notice that navigation to the info page is a query param.
    # The timeout (same value make_soup uses) keeps a stalled connection
    # from hanging the whole run.
    resp = simple_get(url, params={'title': dr_who, 'action': 'info'}, timeout=5)

    # Get the decoded payload.  The text property uses response metadata to
    # divine the encoding.
    html = resp.text

    # By inspection of HTTP results you will find that the
    # stat we seek is extremely easy to find:
    # <div class="mw-pvi-month">58,243</div>
    # the <div> tag has a class attr designed to display the "pvi" - page view in...months!
    soup = BeautifulSoup(html, 'html.parser')

    # Only need a find (not find_all) since there is only a single tag
    # that has a class attr = mw-pvi-month
    div = soup.find('div', class_='mw-pvi-month')
    print(div)

    # Sanity check.  An explicit raise is used rather than assert because
    # assert statements are removed when Python runs with -O.
    if div is None:
        raise ValueError(f'No mw-pvi-month <div> found on the info page for {dr_who!r}')

    # this text may have commas which need to be removed
    # prior to parsing as an int
    return int(div.text.replace(',', ''))


def _wiki_title(actor):
    """Turn a lower-case, space-separated actor name into a Wikipedia page title.

    The names from EW are separated by whitespace.  In Wikipedia URLs those
    spaces need to become underscores (_) and the names are capitalized,
    e.g. 'jodie whittaker' -> 'Jodie_Whittaker'.  Names with any number of
    parts are accepted; only the last part gets the surname special case.
    """
    parts = actor.split()
    if not parts:
        raise ValueError(f'Cannot build a Wikipedia title from {actor!r}')

    *given, last = parts
    given = [name[0].upper() + name[1:] for name in given]

    # Special case McCoy!  Yikes!
    # If in the future, other such names occur, add the name prefixes to
    # the RE group(1) using alternation (|) operator.  Ex: (mc|von)
    m = re.search(r'^(mc)(.*)$', last)
    if m:
        last = m.group(1).title() + m.group(2).title()
    else:
        last = last[0].upper() + last[1:]

    return '_'.join([*given, last])


def main():
    """
    Find and print the five most popular Dr. Who actors.

    PHASE 1 scrapes the actor names from EW_URL (also saving a prettified
    copy of that page to 'pretty_dr_who.html'), PHASE 2 looks up each
    actor's Wikipedia page-view stat, and PHASE 3 sorts the actors by that
    stat in descending order and prints the full ranking and the top 5.
    """
    # PHASE 1
    # Get the Dr.Who actors from EW_URL
    soup = make_soup(EW_URL)

    # If you want to examine a more readable version of the HTML
    # use the prettify_html() function
    prettify_html(soup, 'pretty_dr_who.html')

    actor_set = who_actors(soup)

    # PHASE 2:
    # Collect the stats from Wikipedia for each who actor, keyed by the
    # actor name exactly as it came from EW.
    actor_stats_dict = {a: who_stats(_wiki_title(a)) for a in actor_set}

    # PHASE 3:
    # Sort number of views in desc order
    sorted_actor_list = sorted(actor_stats_dict, key=actor_stats_dict.get, reverse=True)
    print(sorted_actor_list)

    print("Drum roll please...\nThe top 5 most popular Dr. Who actors are:")
    for a in sorted_actor_list[:5]:
        # print numbers with thousands commas format
        print(f'\t{a} : {actor_stats_dict[a]:,}')


# Run the scraper only when this code is executed directly, not when imported.
if __name__ == '__main__':
    main()

<span class="heading-toc" id="toc-ruth-claytonfugitive-doctor-jo-martin"></span>
<span class="heading-toc" id="toc-thirteenth-doctor-jodie-whittaker"></span>
<span class="heading-toc" id="toc-twelfth-doctor-peter-capaldi"></span>
<span class="heading-toc" id="toc-war-doctor-john-hurt"></span>
<span class="heading-toc" id="toc-eleventh-doctor-matt-smith"></span>
<span class="heading-toc" id="toc-tenth-doctor-david-tennant"></span>
<span class="heading-toc" id="toc-ninth-doctor-christopher-eccleston"></span>
<span class="heading-toc" id="toc-eighth-doctor-paul-mcgann"></span>
<span class="heading-toc" id="toc-seventh-doctor-sylvester-mccoy"></span>
<span class="heading-toc" id="toc-sixth-doctor-colin-baker"></span>
<span class="heading-toc" id="toc-fifth-doctor-peter-davison"></span>
<span class="heading-toc" id="toc-fourth-doctor-tom-baker"></span>
<span class="heading-toc" id="toc-third-doctor-jon-pertwee"></span>
<span class="heading-toc" id="toc-second-doctor-patrick-troughton"></spa