# Introduction to web scraping using Python
## QUT DMRC - 2016

###  Support for multiple pages

This notebook scrapes http://www.metacritic.com/browse/albums/artist and saves the data in a dataframe.
The script iterates over the site's listing, which is organised by the first letter of the artist's name, and over the multiple pages within each letter.

In [None]:
# initialise plotting in the notebook
%pylab inline

### Import packages

In [None]:
import bs4
import requests
import pandas as pd
from os.path import isfile

### Initialise global variables

In [None]:
# root URL of the album listing by artist; the scraping loop appends a letter
# suffix and a "?page=" query to it to address each individual page
base_url = "http://www.metacritic.com/browse/albums/artist"

In [None]:
# HTTP headers sent with every request: the User-Agent makes the bot
# pretend to be a standard Mozilla browser
hdrs = {"User-Agent": "Mozilla/5.0"}

In [None]:
# column labels of the reviews dataframe, listed in the same order in which
# get_itemlist collects the fields of each album
colnames = ["artistname", "albumname", "release_date", "mc_score", "user_score", "url"]

In [None]:
# The listing is sorted alphabetically by artist name, one URL suffix per first
# letter. The empty suffix is the first listing, which has no letter and holds
# the artists whose names start with a strange first character.
letters = [""] + ["/" + chr(code) for code in range(ord("a"), ord("z") + 1)]

# To limit the number of pages to scrape, simply shorten the list - e.g.
letters = ["/a", "/b"]

### Function definitions

Add a new function to find the number of pages of reviews available for the current letter.

In [None]:
def get_page_count(thesoup):
    """Return the page count advertised by a parsed listing page.

    thesoup -- the beautiful_soup data structure of one listing page.

    Returns the integer value of the text of the last a-tag inside the first
    div of class "pages". Returns 1 when the page has no such div, or when
    that div contains no a-tags.

    Raises ValueError when the text of that last a-tag is not an integer.
    """
    # search the soup that was passed in, not the notebook-level ``soup``
    # variable, so the function also works on its own
    page_divs = thesoup.find_all("div", class_=["pages"])
    if not page_divs:
        return 1

    a_tags = page_divs[0].find_all("a")
    if not a_tags:
        return 1

    # the last a-tag carries the highest page number
    return int(a_tags[-1].get_text())

The ```get_itemlist``` function is unchanged from the previous page.

In [None]:
def get_itemlist(thesoup):
    """Collect the album reviews found in a beautiful_soup structure.

    Every div of class "product_wrap" is read as one album. The result is a
    dataframe with the columns named in ``colnames``; albums for which any
    collected field is exactly "tbd" are left out.
    """

    def data_text(node, li_class):
        # text of the "data" span inside the li-tag of the given class
        return node.find("li", class_=li_class).find("span", class_=["data"]).get_text()

    rows = []
    for product in thesoup.find_all("div", class_=["product_wrap"]):
        artistname = data_text(product, "product_artist")

        title_div = product.find("div", class_="product_title")
        # collapse every run of whitespace in the title into a single space
        albumname = " ".join(title_div.get_text().split())

        release_date = data_text(product, "release_date")
        mc_score = product.find("div", class_="metascore_w").get_text()
        user_score = data_text(product, "product_avguserscore")
        url = "http://www.metacritic.com" + title_div.a.attrs["href"]

        row = [artistname, albumname, release_date, mc_score, user_score, url]

        # Not all albums have both expert reviews and user reviews; a missing
        # score shows up as "tbd". Only albums that have both a user_score
        # and an mc_score are kept.
        if "tbd" in row:
            continue
        rows.append(row)

    return pd.DataFrame(rows, columns=colnames)

### The script

In [None]:
# (re)initialise the dataframe: continue from the reviews saved in
# "reviews.pkl" when that file exists, otherwise start with an empty
# dataframe that only has the column labels
album_reviews = (
    pd.read_pickle("reviews.pkl")
    if isfile("reviews.pkl")
    else pd.DataFrame(columns=colnames)
)

# show how many reviews the dataframe starts with
print(len(album_reviews))

In [None]:
# Scrape every page of every letter, adding the reviews to album_reviews and
# saving the result to disk after each letter.
for lett in letters:

    # The page count is only known once the first page has been fetched, so
    # assume a single page for now; it is corrected after page 0 below.
    page_count = 1
    i = 0

    while i < page_count:

        # 1.build the url (the page counter in the query starts at 0)
        thepage = base_url+lett+"?page="+str(i)

        # 2.call the url
        stuff = requests.get(thepage, headers=hdrs)

        # 3.transform to soup using html.parser parser
        soup = bs4.BeautifulSoup(stuff.text, "html.parser")

        # 4.extract the new reviews from this page
        new_reviews = get_itemlist(soup)

        # 5.add the new reviews to the dataframe
        # (DataFrame.append was removed in pandas 2.0; pd.concat gives the
        # same result and works in old and new pandas versions alike)
        album_reviews = pd.concat([album_reviews, new_reviews])

        # 6.print something to show how the process progresses
        print("URL:",thepage,flush=True)

        # if this is the first page for this letter, then extract the page count
        if i == 0:
            page_count = get_page_count(soup)

        # increase the counter
        i += 1

    # *** Tidy up the data and save to disk after each letter has been scraped ***
    # make sure the review scores are numerical (float) types
    album_reviews["mc_score"] = album_reviews["mc_score"].map(float)
    album_reviews["user_score"] = album_reviews["user_score"].map(float)

    # remove duplicates in case the same page has been scraped more than once
    album_reviews = album_reviews.drop_duplicates()

    # save the reviews to a csv file
    album_reviews.to_csv("reviews.csv")

    # save the reviews to a pkl file
    album_reviews.to_pickle("reviews.pkl")


### Check the result

In [None]:
# how many reviews are there in the dataframe after scraping?
len(album_reviews)

In [None]:
# have a look at the first five rows of the dataframe
album_reviews[:5]

### Data processing

In [None]:
# add a column holding the reciprocal (1/x) of each user score
album_reviews["user_score_inv"] = album_reviews["user_score"].map(lambda score: 1 / score)

In [None]:
# add a column holding the log of each user score
# NOTE(review): log is not imported explicitly - presumably it is the one that
# "%pylab inline" puts in the namespace at the top of the notebook; confirm
album_reviews["user_score_log"] = album_reviews["user_score"].map(log)

### Plot the data

In [None]:
# histograms of the dataframe columns, drawn on a 12 by 7 figure
# (assigning the result to pp presumably just keeps the notebook from echoing it)
pp = album_reviews.hist(figsize = (12,7))

In [None]:
# scatter diagram with user_score on the x-axis and mc_score on the y-axis
pp = album_reviews.plot(kind="scatter",x="user_score",y="mc_score")

### Statistical analysis

In [None]:
# simple correlations between the numerical columns
# The text columns (artist, album, release date, url) are left out explicitly:
# pandas >= 2.0 no longer drops them silently and raises an error when asked
# to correlate strings, while older versions give the same result either way.
album_reviews.select_dtypes(include="number").corr()