# Introduction to web scraping using Python
## QUT DMRC - 2016

### Restructure the code for clarity

This notebook is identical to the [previous step](web-scraping-intro-step5.ipynb), but the code is structured in a different way that will be easier to extend to multiple pages.

### Import packages

In [None]:
import bs4
import requests
import pandas as pd

### Initialise global variables

In [None]:
# root of the Metacritic "albums by artist" listing; the letter path and the
# page query are appended to it when the page url is built
base_url = "http://www.metacritic.com" + "/browse/albums/artist"

In [None]:
# request headers: the scraper identifies itself as a standard Mozilla browser
hdrs = {
    "User-Agent": "Mozilla/5.0",
}

In [None]:
# column labels of the review dataframe, in the order the values are collected
colnames = [
    "artistname",
    "albumname",
    "release_date",
    "mc_score",
    "user_score",
    "url",
]

### Function definitions

In [None]:
# processes a beautiful_soup data structure and returns new album_reviews as a dataframe
def _labelled_value(an_item, li_class):
    """Return the text of the "data" span inside the <li> of class *li_class*.

    Returns None when either the <li> or its "data" span is not present, so
    the caller can skip the entry instead of failing on a None lookup.
    """
    holder = an_item.find("li", class_=li_class)
    if holder is None:
        return None
    data = holder.find("span", class_=["data"])
    if data is None:
        return None
    return data.get_text()


def get_itemlist(thesoup):
    """Extract the album reviews contained in one parsed listing page.

    Args:
        thesoup: a parsed page (BeautifulSoup object) in which every album
            sits in a <div class="product_wrap">.

    Returns:
        A pandas DataFrame with one row per album and the columns given by
        the module-level ``colnames`` (artistname, albumname, release_date,
        mc_score, user_score, url). All values are kept as the strings found
        in the page.

    Albums are left out when
      * either score is the placeholder "tbd" (only albums that have both a
        metascore and a user score are wanted), or
      * one of the expected tags is missing from the entry.
    """
    rows = []
    for an_item in thesoup.find_all("div", class_=["product_wrap"]):
        thetitle = an_item.find("div", class_="product_title")
        score_box = an_item.find("div", class_="metascore_w")
        link = None if thetitle is None else thetitle.a
        if score_box is None or link is None:
            # incomplete entry: skip it rather than abort the whole page
            continue

        href = link.attrs.get("href")
        artistname = _labelled_value(an_item, "product_artist")
        release_date = _labelled_value(an_item, "release_date")
        user_score = _labelled_value(an_item, "product_avguserscore")
        if any(value is None for value in (href, artistname, release_date, user_score)):
            continue

        mc_score = score_box.get_text()

        # Only the two score fields are checked for the placeholder; testing
        # the whole row would also drop an album or artist that happens to be
        # called "tbd".
        if "tbd" in (mc_score, user_score):
            continue

        # collapse the line breaks and runs of spaces around the title
        albumname = " ".join(thetitle.get_text().split())

        rows.append([
            artistname,
            albumname,
            release_date,
            mc_score,
            user_score,
            "http://www.metacritic.com" + href,
        ])
    return pd.DataFrame(rows, columns=colnames)

### The script

In [None]:
# start from an empty dataframe that already carries the review columns
album_reviews = pd.DataFrame(data=None, columns=colnames)

In [None]:
# url path segment for the first letter of the artist names to scrape
lett = "/" + "a"

In [None]:
# full url of the page to fetch; only the first page (page 0) is scraped
thepage = "{}{}?page=0".format(base_url, lett)

In [None]:
# call the url
# - timeout: requests waits indefinitely by default, so an unresponsive
#   server would otherwise hang this cell forever
# - raise_for_status: stop on an HTTP error (4xx/5xx) instead of parsing an
#   error page as if it were an album listing
stuff = requests.get(thepage, headers=hdrs, timeout=30)
stuff.raise_for_status()

In [None]:
# parse the downloaded html with the built-in "html.parser" parser
soup = bs4.BeautifulSoup(markup=stuff.text, features="html.parser")

In [None]:
# turn the parsed page into a dataframe of album reviews
new_reviews = get_itemlist(thesoup=soup)

In [None]:
# add the new reviews to the dataframe
# (DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
# and, like append, keeps the row labels of both frames)
album_reviews = pd.concat([album_reviews, new_reviews])

In [None]:
# report which url was processed; flush so the line shows up immediately
print("URL: {}".format(thepage), flush=True)

### Check the result and save to disk

In [None]:
# number of reviews (rows) collected in the dataframe
album_reviews.shape[0]

In [None]:
# preview the first five reviews
album_reviews.head(5)

In [None]:
# write the collected reviews to "reviews.csv" in the working directory
album_reviews.to_csv(path_or_buf="reviews.csv")

Now we are ready to move on to the seventh notebook - [Plotting, tiny stat analysis and improved I/O](web-scraping-intro-step7.ipynb)