# ODI Queensland workshop - Web Scraping 

## QUT DMRC - 2015

### Plotting, tiny stat analysis and improved I/O
 
This notebook scrapes http://www.qld.gov.au/transport/safety/signs/regulatory/ and saves the data in a dataframe.
The script iterates through the webpage structure (structured by the types of signs).

In [None]:
# initialise plotting in the notebook (this is an IPython magic, so the line
# only works when the cell is run by IPython/Jupyter, not by plain Python)
%pylab inline

### Import packages

In [None]:
import bs4
import requests
import pandas as pd
from os.path import isfile

### Initialise global variables

In [None]:
# this is the base_url: the sign type and a trailing '/' are appended to it
# further down to build the address of the page to scrape
base_url = "http://www.qld.gov.au/transport/safety/signs/"

In [None]:
# labels of the dataframe columns, one per value stored for a sign
colnames = [
    "sign_name",
    "sign_type",
    "description",
    "images",
]

# labels for the two values kept per image - not used for now, but needed
# later to clean up the display of the images column
images_cols = [
    "image_name",
    "url",
]

### Function definitions

In [None]:
def _squash_whitespace(text):
    """Collapse every run of whitespace in *text* into a single space."""
    return " ".join(text.split())


def get_itemlist(thesoup):
    """Extract the traffic signs found in a parsed page into a dataframe.

    Only tables whose first row is a header starting with the cells 'Sign'
    and 'Meaning' are treated as sign tables; every other table is skipped.
    Each remaining row of a sign table produces one dataframe row.

    Note: besides its argument, this function reads two module-level
    variables that must be defined before it is called: ``sign_type``
    (stored as-is in the sign_type column) and ``colnames`` (the column
    labels of the returned dataframe).

    Parameters
    ----------
    thesoup
        A parsed page (BeautifulSoup object) to search for sign tables.

    Returns
    -------
    pandas.DataFrame
        One row per sign with the sign name, the sign type, the description
        (one line per paragraph, each ending in a newline) and a list of
        [alt text, src] pairs, one pair per image in the table row.
    """
    thelist = []

    for table in thesoup.find_all('table'):
        rows = table.find_all('tr')

        # a table without any rows cannot be a signs table
        if not rows:
            continue

        # the signs table is recognised by its header row: it needs at least
        # two 'th' cells, reading 'Sign' and 'Meaning'
        headers = rows[0].find_all('th')
        if len(headers) < 2:
            continue
        if headers[0].get_text() != 'Sign' or headers[1].get_text() != 'Meaning':
            continue

        for an_item in rows[1:]:
            # reset the name for every row, so that a row without bold text
            # gets an empty name instead of the previous row's name
            sign_name = ''
            paragraphs = []

            for para in an_item.find_all("p"):
                bold = para.find("strong")
                if bold is not None:
                    # sign name (this assumes only the sign name is in bold)
                    sign_name = _squash_whitespace(bold.get_text())
                else:
                    # sign description (may be multiple paragraphs)
                    paragraphs.append(_squash_whitespace(para.get_text()))

            # one line per paragraph, each terminated by a newline
            description = "".join(line + '\n' for line in paragraphs)

            # there may be more than one image per sign; a missing alt or
            # src attribute is stored as an empty string rather than
            # aborting the whole scrape
            images = [
                [image.attrs.get('alt', ''), image.attrs.get('src', '')]
                for image in an_item.find_all("img")
            ]

            thelist.append([sign_name, sign_type, description, images])

    return pd.DataFrame(thelist, columns=colnames)

### The script

In [None]:
# reset the dataframe: continue from the signs saved in signs.pkl when that
# file exists, otherwise start from an empty dataframe with the same columns
signs = (
    pd.read_pickle("signs.pkl")
    if isfile("signs.pkl")
    else pd.DataFrame(columns=colnames)
)

# report how many signs the dataframe holds at this point
print(len(signs))

In [None]:
# select which page to scrape based on the type of road sign; this value is
# appended to base_url to form the page address and is also the value that
# get_itemlist stores in the sign_type column
sign_type = "regulatory"

In [None]:
# build the url: base address, then the sign type, then a closing slash
thepage = "".join([base_url, sign_type, '/'])

In [None]:
# call the url; the timeout (in seconds) stops the cell from hanging forever
# when the server never answers
stuff = requests.get(thepage, timeout=30)

# stop here on an HTTP error (4xx/5xx) rather than parsing an error page
stuff.raise_for_status()

In [None]:
# parse the downloaded page text into a soup, using the lxml parser
soup = bs4.BeautifulSoup(markup=stuff.text, features="lxml")

In [None]:
# extract the new signs from this page (get_itemlist returns them as a
# dataframe of their own)
new_signs = get_itemlist(soup)

In [None]:
# add the new signs to the dataframe (pd.concat replaces DataFrame.append,
# which no longer exists in pandas 2.0 and later; like append, it keeps the
# index labels of both frames)
signs = pd.concat([signs, new_signs])

In [None]:
# print something to show how the process develops: the address of the page
# that was just scraped (flush=True pushes the message out straight away)
print("URL:",thepage,flush=True)

In [None]:
signs

### Tidy up the data and save to disk

In [None]:
# remove duplicates in case the same page has been scraped more than once.
# signs.drop_duplicates() cannot be used here: the images column holds lists,
# which are unhashable, so it raises a TypeError. Instead, look for duplicated
# rows on a string rendering of every cell and keep the first occurrence.
signs = signs[~signs.astype(str).duplicated()]

In [None]:
# save the signs to a csv file (signs.csv, a relative path)
signs.to_csv("signs.csv")

In [None]:
# save the signs to a pkl file; signs.pkl is the file that is loaded again
# at the start of the script when it exists
signs.to_pickle("signs.pkl")

### Check the result

In [None]:
# how many signs are there in the dataframe? (shown as the cell output)
len(signs)

In [None]:
# have a look at the first five items (shown as the cell output)
signs.head(5)

### Data processing
Create a new column holding the number of images for each sign

In [None]:
# count the entries in each sign's images list
signs["number_of_images"] = signs["images"].map(len)

### Plot the data

In [None]:
# histograms for the dataframe, drawn with a figure size of (12, 7);
# pp keeps whatever signs.hist returns
pp = signs.hist(figsize = (12,7))

### Statistical analysis

In [None]:
# summary statistics for the dataframe (shown as the cell output)
signs.describe()

Now we are ready to move on to the final notebook and add [Support for multiple pages](final.ipynb)