# Scraping Data

## Getting the Soup

In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
# Use HTTPS directly: Wikipedia redirects plain-HTTP requests to HTTPS, so an
# 'http://' base only adds an extra redirect to every request.
BASE_URL = 'https://en.wikipedia.org'
# Wikipedia will reject our request unless we add a 'User-Agent' attribute to our http header.
HEADERS = {'User-Agent': 'Mozilla/5.0'}

In [3]:
def get_Nobel_soup():
    """ Return a parsed tag tree of our Nobel prize page.

    Requests the 'List_of_Nobel_laureates' article below BASE_URL, sending
    HEADERS with the request, and parses the response body with the lxml
    parser.

    Raises:
        requests.exceptions.HTTPError: if the server answers with a 4xx/5xx
            status, so an error page is never parsed as if it were the
            article.
        requests.exceptions.Timeout: if connecting to the server or waiting
            for its data stalls for longer than 30 seconds.
    """
    # Without a timeout, requests would wait indefinitely on a stalled server
    response = requests.get(BASE_URL + '/wiki/List_of_Nobel_laureates',
                            headers=HEADERS, timeout=30)
    # Fail loudly if the request was rejected (e.g. over the User-Agent header)
    response.raise_for_status()
    # Return the content of the response parsed by BeautifulSoup
    return BeautifulSoup(response.content, "lxml")

## Selecting Tags

In [4]:
# Fetch and parse the laureates page once; the cells below query this tag tree
soup = get_Nobel_soup()

In [8]:
# find() returns the first <table> whose class attribute is exactly the
# string 'wikitable sortable' (the order of the class names matters here)
soup.find('table', {'class':'wikitable sortable'})

<table class="wikitable sortable">\n<tr>\n<th>Year</th>\n<th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physics" title="List of Nobel laureates in Physics">Physics</a></th>\n<th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Chemistry" title="List of Nobel laureates in Chemistry">Chemistry</a></th>\n<th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine" title="List of Nobel laureates in Physiology or Medicine">Physiology<br/>\nor Medicine</a></th>\n<th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Literature" title="List of Nobel laureates in Literature">Literature</a></th>\n<th width="16%"><a href="/wiki/List_of_Nobel_Peace_Prize_laureates" title="List of Nobel Peace Prize laureates">Peace</a></th>\n<th width="15%"><a class="mw-redirect" href="/wiki/List_of_Nobel_laureates_in_Economics" title="List of Nobel laureates in Economics">Economics</a></th>\n</tr>\n<tr>\n<td align="center">1901</td>\n<td><span class="sortkey">R\xf6ntgen, Wi

In [9]:
# CSS selector: every <table> carrying both classes, listed in any order.
# select() returns a list of matches, even when there is only one.
soup.select('table.sortable.wikitable')

[<table class="wikitable sortable">\n<tr>\n<th>Year</th>\n<th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physics" title="List of Nobel laureates in Physics">Physics</a></th>\n<th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Chemistry" title="List of Nobel laureates in Chemistry">Chemistry</a></th>\n<th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine" title="List of Nobel laureates in Physiology or Medicine">Physiology<br/>\nor Medicine</a></th>\n<th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Literature" title="List of Nobel laureates in Literature">Literature</a></th>\n<th width="16%"><a href="/wiki/List_of_Nobel_Peace_Prize_laureates" title="List of Nobel Peace Prize laureates">Peace</a></th>\n<th width="15%"><a class="mw-redirect" href="/wiki/List_of_Nobel_laureates_in_Economics" title="List of Nobel laureates in Economics">Economics</a></th>\n</tr>\n<tr>\n<td align="center">1901</td>\n<td><span class="sortkey">R\xf6ntgen, W

In [10]:
# select_one() gives the first match of the selector itself rather than a list
table = soup.select_one('table.sortable.wikitable')

In [11]:
# List every <th> header cell found anywhere inside the table
table.select('th')
# equivalent to table('th')

[<th>Year</th>,
 <th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physics" title="List of Nobel laureates in Physics">Physics</a></th>,
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Chemistry" title="List of Nobel laureates in Chemistry">Chemistry</a></th>,
 <th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine" title="List of Nobel laureates in Physiology or Medicine">Physiology<br/>\nor Medicine</a></th>,
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Literature" title="List of Nobel laureates in Literature">Literature</a></th>,
 <th width="16%"><a href="/wiki/List_of_Nobel_Peace_Prize_laureates" title="List of Nobel Peace Prize laureates">Peace</a></th>,
 <th width="15%"><a class="mw-redirect" href="/wiki/List_of_Nobel_laureates_in_Economics" title="List of Nobel laureates in Economics">Economics</a></th>,
 <th>Year</th>,
 <th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Physics" title="List of Nobel laureates in 

## Crafting selection patterns

In [12]:
def get_column_titles(table):
    """ Return the Nobel categories listed in the table's first row.

    Every header cell of the first <tr>, except the leading one, yields a
    dict with two keys:
      'name' - the text of the cell's first link, or the cell's own text
               when it holds no link (text is kept as-is, including any
               embedded newlines)
      'href' - the link's href attribute, or None when there is no link
    """
    header_cells = table.select_one('tr').select('th')
    titles = []
    # The leading header cell is skipped; only the remaining ones are categories
    for cell in header_cells[1:]:
        anchor = cell.select_one('a')
        if not anchor:
            # No Wikipedia link in this cell: fall back to the cell text
            titles.append({'name': cell.text, 'href': None})
            continue
        titles.append({'name': anchor.text, 'href': anchor.attrs['href']})
    return titles

In [13]:
# Alias the selected table; later cells refer to it as `wikitable`
wikitable = table
# Try the header parser on it
get_column_titles(wikitable)

[{'href': '/wiki/List_of_Nobel_laureates_in_Physics', 'name': u'Physics'},
 {'href': '/wiki/List_of_Nobel_laureates_in_Chemistry', 'name': u'Chemistry'},
 {'href': '/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine',
  'name': u'Physiology\nor Medicine'},
 {'href': '/wiki/List_of_Nobel_laureates_in_Literature',
  'name': u'Literature'},
 {'href': '/wiki/List_of_Nobel_Peace_Prize_laureates', 'name': u'Peace'},
 {'href': '/wiki/List_of_Nobel_laureates_in_Economics', 'name': u'Economics'}]

In [19]:
def get_Nobel_winners(table):
    """ Return one record per linked winner found in the Nobel table.

    Each record is a dict with the keys 'year', 'category', 'name' and
    'link'. The category comes from get_column_titles(), matched to the
    cell by column position; 'link' is the href of the winner's anchor.
    Anchors whose href starts with '#endnote' are not treated as winners.
    """
    categories = get_column_titles(table)
    laureates = []
    # The slice drops the first row and the last two rows of the table
    # (presumably header/footer rows -- confirm against the page markup).
    for tr in table.select('tr')[1:-2]:
        # The first <td> of a row holds the year
        year = int(tr.select_one('td').text)
        prize_cells = tr.select('td')[1:]
        for col_idx, cell in enumerate(prize_cells):
            for anchor in cell.select('a'):
                target = anchor.attrs['href']
                if target.startswith('#endnote'):
                    continue
                laureates.append({
                    'year': year,
                    'category': categories[col_idx]['name'],
                    'name': anchor.text,
                    'link': target
                })
    return laureates

In [25]:
# Flat list of winner dicts (year, category, name, link) scraped from the table
winners = get_Nobel_winners(wikitable)

## Caching the web pages

In [21]:
import requests
import requests_cache

# Turn on caching for requests using requests_cache's default settings.
# NOTE(review): the following cell calls install_cache() again with explicit
# options -- presumably that call supersedes this one; confirm.
requests_cache.install_cache()

In [22]:
# Cache responses in a SQLite-backed store named 'nobel_pages'; cached
# entries expire after 7200 seconds (2 hours)
requests_cache.install_cache('nobel_pages', backend='sqlite', expire_after=7200)

## Scraping the winners' nationalities

In [29]:
def get_winner_nationality(w):
    """ Scrape biographic data from the winner's Wikipedia page.

    Args:
        w: a winner dict with at least a 'name' and a site-relative 'link'
           (the shape built by get_Nobel_winners).

    Returns:
        A dict holding the winner's 'name' and, when the page's infobox has
        a row headed exactly 'Nationality' with a data cell, that cell's
        text under the key 'Nationality'. If several rows match, the last
        one wins.

    Raises:
        requests.exceptions.Timeout: if connecting to the server or waiting
            for its data stalls for longer than 30 seconds.
    """
    # Build the URL from BASE_URL, as get_Nobel_soup does, instead of
    # repeating the host name here; the timeout keeps a stalled server from
    # blocking the whole scrape.
    data = requests.get(BASE_URL + w['link'], headers=HEADERS, timeout=30)
    soup = BeautifulSoup(data.content, "lxml")
    person_data = {'name': w['name']}
    for tr in soup.select('table.infobox tr'):
        header = tr.select_one('th')
        # Rows without a header cell carry no attribute name
        if header is None or header.text != 'Nationality':
            continue
        cell = tr.select_one('td')
        # Explicit checks replace a blanket `except AttributeError: pass`,
        # so unrelated attribute errors are no longer silently hidden.
        if cell is not None:
            person_data['Nationality'] = cell.text
    return person_data

In [30]:
# Try the scraper on a sample: the first 50 winners only
wdata = [get_winner_nationality(w) for w in winners[:50]]
# Keep every record whose 'Nationality' entry is absent or empty
missing_nationality = []
for w in wdata:
    if w.get('Nationality'):
        continue
    missing_nationality.append(w)
# Display the records that came back without a nationality
missing_nationality

[{'name': u'\xc9lie Ducommun'},
 {'name': u'Charles Albert Gobat'},
 {'name': u'Marie Curie'},
 {'name': u'Niels Ryberg Finsen'},
 {'name': u'Ivan Pavlov'},
 {'name': u'Institut de Droit International'},
 {'name': u'Bertha von Suttner'},
 {'name': u'Santiago Ram\xf3n y Cajal'},
 {'name': u'Theodore Roosevelt'},
 {'name': u'Ernesto Teodoro Moneta'},
 {'name': u'Louis Renault'},
 {'name': u'Paul Ehrlich'},
 {'name': u'Rudolf Christoph Eucken'},
 {'name': u'Klas Pontus Arnoldson'}]