In [1]:
import pprint

import requests
import scrapy

In [2]:
# Setup our URL as a variable for easy reuse if needed

In [3]:
url = "https://en.wikipedia.org/wiki/List_of_Nobel_laureates"

In [4]:
# Make a request to the URL with the requests library

In [5]:
# Fetch the laureate list page.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of letting the later
# cells parse an error page as if it were the laureate table.
resp = requests.get(url, timeout=30)
resp.raise_for_status()

In [6]:
# The content of the response in Python 3 is a byte sequence.
# It looks like a string with a little b in front of it
# 'foo' is a string
# b'foo' is a byte sequence

In [7]:
body_bytes = resp.content

In [8]:
# Body is in bytes... turn it in to a string by decoding it to UTF-8
body_str = body_bytes.decode('utf-8')

In [9]:
# Create a scrapy selector from the string content
# https://doc.scrapy.org/en/latest/topics/selectors.html
sel = scrapy.Selector(text=body_str)

In [10]:
# Find the table using a CSS selector for the wikitable class
table = sel.css('.wikitable')
table

[<Selector xpath="descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' wikitable ')]" data='<table class="wikitable sortable">\n<tr>\n'>]

In [11]:
# In order to get the data out of the table we must traverse its content using selectors

In [12]:
table.css('tr') # Get all the tr elements within the table
table.css('tr')[0] # Get the first tr element within the table using the pythonic index
table.css('tr')[0].css('th') # Get all the th elements from within the first tr
table.css('tr:nth-child(1) th') # Get all the th elements from the first tr element within the table using CSS selector
# !!!! Note the table on the page has a header and a footer and there will be TWO sets of tr's with th's

[<Selector xpath='descendant-or-self::tr[count(preceding-sibling::*) = 0]/descendant-or-self::*/th' data='<th>Year</th>'>,
 <Selector xpath='descendant-or-self::tr[count(preceding-sibling::*) = 0]/descendant-or-self::*/th' data='<th width="18%"><a href="/wiki/List_of_N'>,
 <Selector xpath='descendant-or-self::tr[count(preceding-sibling::*) = 0]/descendant-or-self::*/th' data='<th width="16%"><a href="/wiki/List_of_N'>,
 <Selector xpath='descendant-or-self::tr[count(preceding-sibling::*) = 0]/descendant-or-self::*/th' data='<th width="18%"><a href="/wiki/List_of_N'>,
 <Selector xpath='descendant-or-self::tr[count(preceding-sibling::*) = 0]/descendant-or-self::*/th' data='<th width="16%"><a href="/wiki/List_of_N'>,
 <Selector xpath='descendant-or-self::tr[count(preceding-sibling::*) = 0]/descendant-or-self::*/th' data='<th width="16%"><a href="/wiki/List_of_N'>,
 <Selector xpath='descendant-or-self::tr[count(preceding-sibling::*) = 0]/descendant-or-self::*/th' data='<th width="15%"><a hr

In [13]:
table.css('tr:nth-child(1) th')

[<Selector xpath='descendant-or-self::tr[count(preceding-sibling::*) = 0]/descendant-or-self::*/th' data='<th>Year</th>'>,
 <Selector xpath='descendant-or-self::tr[count(preceding-sibling::*) = 0]/descendant-or-self::*/th' data='<th width="18%"><a href="/wiki/List_of_N'>,
 <Selector xpath='descendant-or-self::tr[count(preceding-sibling::*) = 0]/descendant-or-self::*/th' data='<th width="16%"><a href="/wiki/List_of_N'>,
 <Selector xpath='descendant-or-self::tr[count(preceding-sibling::*) = 0]/descendant-or-self::*/th' data='<th width="18%"><a href="/wiki/List_of_N'>,
 <Selector xpath='descendant-or-self::tr[count(preceding-sibling::*) = 0]/descendant-or-self::*/th' data='<th width="16%"><a href="/wiki/List_of_N'>,
 <Selector xpath='descendant-or-self::tr[count(preceding-sibling::*) = 0]/descendant-or-self::*/th' data='<th width="16%"><a href="/wiki/List_of_N'>,
 <Selector xpath='descendant-or-self::tr[count(preceding-sibling::*) = 0]/descendant-or-self::*/th' data='<th width="15%"><a hr

In [14]:
table.css('tr')[0].css('th')

[<Selector xpath='descendant-or-self::th' data='<th>Year</th>'>,
 <Selector xpath='descendant-or-self::th' data='<th width="18%"><a href="/wiki/List_of_N'>,
 <Selector xpath='descendant-or-self::th' data='<th width="16%"><a href="/wiki/List_of_N'>,
 <Selector xpath='descendant-or-self::th' data='<th width="18%"><a href="/wiki/List_of_N'>,
 <Selector xpath='descendant-or-self::th' data='<th width="16%"><a href="/wiki/List_of_N'>,
 <Selector xpath='descendant-or-self::th' data='<th width="16%"><a href="/wiki/List_of_N'>,
 <Selector xpath='descendant-or-self::th' data='<th width="15%"><a href="/wiki/List_of_N'>]

In [15]:
# Get the string values of all the th's
# Note they are all Selector objects but the data is just the string content...
table.css('tr')[0].css('th').xpath('string()') 

[<Selector xpath='string()' data='Year'>,
 <Selector xpath='string()' data='Physics'>,
 <Selector xpath='string()' data='Chemistry'>,
 <Selector xpath='string()' data='Physiology\nor Medicine'>,
 <Selector xpath='string()' data='Literature'>,
 <Selector xpath='string()' data='Peace'>,
 <Selector xpath='string()' data='Economics'>]

In [16]:
# We must call extract to pull the data out of the selectors
table.css('tr')[0].css('th').xpath('string()').extract()

['Year',
 'Physics',
 'Chemistry',
 'Physiology\nor Medicine',
 'Literature',
 'Peace',
 'Economics']

In [17]:
# Again, the same example using the CSS selector form
# Get all the column names from the first row's table heading (th) elements
# Using CSS selector for nth-child
table.css('tr:nth-child(1) th').xpath('string()').extract()

['Year',
 'Physics',
 'Chemistry',
 'Physiology\nor Medicine',
 'Literature',
 'Peace',
 'Economics']

In [18]:
# Save our column names in a variable
col_names = table.css('tr:nth-child(1) th').xpath('string()').extract()

In [19]:
col_names

['Year',
 'Physics',
 'Chemistry',
 'Physiology\nor Medicine',
 'Literature',
 'Peace',
 'Economics']

In [20]:
# Replace newlines with spaces to fix the 4th item
col_names = [x.replace('\n', ' ') for x in col_names]

In [21]:
col_names

['Year',
 'Physics',
 'Chemistry',
 'Physiology or Medicine',
 'Literature',
 'Peace',
 'Economics']

In [22]:
# Now get all the href attributes from the links in the header
col_links = table.css('tr:nth-child(1) th a').xpath('@href').extract()

In [23]:
col_links

['/wiki/List_of_Nobel_laureates_in_Physics',
 '/wiki/List_of_Nobel_laureates_in_Chemistry',
 '/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine',
 '/wiki/List_of_Nobel_laureates_in_Literature',
 '/wiki/List_of_Nobel_Peace_Prize_laureates',
 '/wiki/List_of_Nobel_laureates_in_Economics']

In [24]:
# In python the built-in zip function can combine two lists
# so we can have a single list of tuples with our column names and links

In [25]:
col_names_links = list(zip(col_names, col_links))

In [26]:
col_names_links

[('Year', '/wiki/List_of_Nobel_laureates_in_Physics'),
 ('Physics', '/wiki/List_of_Nobel_laureates_in_Chemistry'),
 ('Chemistry', '/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine'),
 ('Physiology or Medicine', '/wiki/List_of_Nobel_laureates_in_Literature'),
 ('Literature', '/wiki/List_of_Nobel_Peace_Prize_laureates'),
 ('Peace', '/wiki/List_of_Nobel_laureates_in_Economics')]

In [27]:
# Why is it wrong?

In [28]:
# If the lists do not have the same number of elements they will not line up correctly.
# The first elements from each list will be paired up and then it will continue pairing everything up until
# the shorter list is finished.

# We can fix this by creating a placeholder of None for the year column (which has no link on the page)

In [29]:
# Put a placeholder of None in front of the links for the "Year" column,
# which has no link on the page, then pair names and links up again.
#
# A new list is built with [None] + col_links instead of calling
# col_links.insert(0, None): mutating col_links in place would add one more
# None every time this cell is re-run and shift every link onto the wrong
# column name.
col_names_links = list(zip(col_names, [None] + col_links))

In [30]:
col_names_links

[('Year', None),
 ('Physics', '/wiki/List_of_Nobel_laureates_in_Physics'),
 ('Chemistry', '/wiki/List_of_Nobel_laureates_in_Chemistry'),
 ('Physiology or Medicine',
  '/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine'),
 ('Literature', '/wiki/List_of_Nobel_laureates_in_Literature'),
 ('Peace', '/wiki/List_of_Nobel_Peace_Prize_laureates'),
 ('Economics', '/wiki/List_of_Nobel_laureates_in_Economics')]

In [31]:
# Now we can see all our categories and their correct links
# We can convert that data into a dictionary to make it more accessible

In [1]:
# Turn each (name, link) tuple into a {"name": ..., "href": ...} dictionary.
# Entries that are already dictionaries are passed through unchanged so that
# re-running this cell is harmless: unpacking a dict like a tuple yields its
# keys and would silently turn every entry into {"name": "name", "href": "href"}.
col_names_links = [
    entry if isinstance(entry, dict) else {"name": entry[0], "href": entry[1]}
    for entry in col_names_links
]
col_names_links

NameError: name 'col_names_links' is not defined

In [33]:
# Once again we go back to the table to get the data from the remaining rows
table
























































































































































































































































































































































[<Selector xpath="descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), ' wikitable ')]" data='<table class="wikitable sortable">\n<tr>\n'>]

In [34]:
# This time we don't want the header row since we already parsed it so we use
# the :not selector along with the :nth-child in combination to
# select all the tr elements that are not the first one

In [35]:
other_rows = table.css('tr:not(:nth-child(1))')

In [58]:
# Now we want to map all our winners by year.
# Our end goal is a structured map:
{
    '1901': [
        {
            'birthday': '1845-03-27',
            'category': 'Physics',
            'href': '/wiki/Wilhelm_R%C3%B6ntgen',
            'name': 'Wilhelm Röntgen'
        }
    ]
}

{'1901': [{'birthday': '1845-03-27',
   'category': 'Physics',
   'href': '/wiki/Wilhelm_R%C3%B6ntgen',
   'name': 'Wilhelm Röntgen'}]}

In [38]:
# Start by stubbing out our data container...
# Map categories to winners by year
by_year = {}

In [None]:
# Now build a loop that adds our next piece- an array of winners per year
# Note that 2016 has a footnote and needs to be cleaned

In [39]:
# Register an (initially empty) list of winners for every year row.
for row in other_rows:
    try:
        # Text of the first td in the row is the year
        year = row.css('td')[0].xpath('string()')[0].extract()
    except IndexError:
        # No td in this row, so there is no year to record
        continue
    # Keep only what precedes a footnote marker, e.g. "2016[1]" -> "2016"
    year = year.partition('[')[0]

    year_data = by_year[year] = []

In [40]:
by_year

{'1901': [],
 '1902': [],
 '1903': [],
 '1904': [],
 '1905': [],
 '1906': [],
 '1907': [],
 '1908': [],
 '1909': [],
 '1910': [],
 '1911': [],
 '1912': [],
 '1913': [],
 '1914': [],
 '1915': [],
 '1916': [],
 '1917': [],
 '1918': [],
 '1919': [],
 '1920': [],
 '1921': [],
 '1922': [],
 '1923': [],
 '1924': [],
 '1925': [],
 '1926': [],
 '1927': [],
 '1928': [],
 '1929': [],
 '1930': [],
 '1931': [],
 '1932': [],
 '1933': [],
 '1934': [],
 '1935': [],
 '1936': [],
 '1937': [],
 '1938': [],
 '1939': [],
 '1940': [],
 '1941': [],
 '1942': [],
 '1943': [],
 '1944': [],
 '1945': [],
 '1946': [],
 '1947': [],
 '1948': [],
 '1949': [],
 '1950': [],
 '1951': [],
 '1952': [],
 '1953': [],
 '1954': [],
 '1955': [],
 '1956': [],
 '1957': [],
 '1958': [],
 '1959': [],
 '1960': [],
 '1961': [],
 '1962': [],
 '1963': [],
 '1964': [],
 '1965': [],
 '1966': [],
 '1967': [],
 '1968': [],
 '1969': [],
 '1970': [],
 '1971': [],
 '1972': [],
 '1973': [],
 '1974': [],
 '1975': [],
 '1976': [],
 '1977': [],

In [None]:
# Now we want to update that loop to print any errors
# Remember the previous warning that the table will have the th's in the footer of the table as well...
# That last row will cause an error because it has th's not td's

In [59]:
# Collect the name and href of every link in each year row.
for row in other_rows:
    try:
        # Text of the first td in the row is the year
        year = row.css('td')[0].xpath('string()')[0].extract()
    except IndexError:
        # Rows without a td are reported and skipped
        print('error on {}'.format(row))
        continue
    # Keep only what precedes a footnote marker, e.g. "2016[1]" -> "2016"
    year = year.partition('[')[0]

    # One dictionary per anchor found in any cell after the first (year) cell
    year_data = [
        {
            "name": name_link.xpath('string()')[0].extract(),
            "href": name_link.xpath('@href')[0].extract(),
        }
        for name_link in row.css('td:not(:nth-child(1)) a')
    ]
    by_year[year] = year_data
by_year

error on <Selector xpath='descendant-or-self::tr[not(count(preceding-sibling::*) = 0)]' data='<tr>\n<th>Year</th>\n<th width="16%"><a hr'>


{'1901': [{'href': '/wiki/Wilhelm_R%C3%B6ntgen', 'name': 'Wilhelm Röntgen'},
  {'href': '/wiki/Jacobus_Henricus_van_%27t_Hoff',
   'name': "Jacobus Henricus van 't Hoff"},
  {'href': '/wiki/Emil_Adolf_von_Behring', 'name': 'Emil Adolf von Behring'},
  {'href': '/wiki/Sully_Prudhomme', 'name': 'Sully Prudhomme'},
  {'href': '/wiki/Henry_Dunant', 'name': 'Henry Dunant'},
  {'href': '/wiki/Fr%C3%A9d%C3%A9ric_Passy', 'name': 'Frédéric Passy'}],
 '1902': [{'href': '/wiki/Hendrik_Lorentz', 'name': 'Hendrik Lorentz'},
  {'href': '/wiki/Pieter_Zeeman', 'name': 'Pieter Zeeman'},
  {'href': '/wiki/Hermann_Emil_Fischer', 'name': 'Hermann Emil Fischer'},
  {'href': '/wiki/Ronald_Ross', 'name': 'Ronald Ross'},
  {'href': '/wiki/Theodor_Mommsen', 'name': 'Theodor Mommsen'},
  {'href': '/wiki/%C3%89lie_Ducommun', 'name': 'Élie Ducommun'},
  {'href': '/wiki/Charles_Albert_Gobat', 'name': 'Charles Albert Gobat'}],
 '1903': [{'href': '/wiki/Henri_Becquerel', 'name': 'Henri Becquerel'},
  {'href': '/wiki

In [62]:
# Now we have urls and names for each year but we don't have any categories...
# In order to get the categories we'll use the built in enumerate function and
# loop through each cell in our td selector

# We tell the enumerate function to start counting at 1 instead of 0 (the default)
# because in our column / category list the first column, column 0, is "Year".

# Enumerate gives us an index value as we loop over a list of items:
# starting at 0
print(list(enumerate(['a', 'b', 'c'])))

# or starting at 1
print(list(enumerate(['a', 'b', 'c'], 1)))

[(0, 'a'), (1, 'b'), (2, 'c')]
[(1, 'a'), (2, 'b'), (3, 'c')]


In [67]:
# Rebuild by_year with one entry per prize cell, tagged with its category.
by_year = {}
for row in other_rows:
    try:
        # Text of the first td in the row is the year
        year = row.css('td')[0].xpath('string()')[0].extract()
    except IndexError:
        # Rows without a td are reported and skipped
        print('error on {}'.format(row))
        continue
    # Keep only what precedes a footnote marker, e.g. "2016[1]" -> "2016"
    year = year.partition('[')[0]

    year_data = []
    prize_cells = row.css('td:not(:nth-child(1))')
    # Entry 0 of col_names_links is "Year", so the prize cells line up with
    # entries 1, 2, 3, ... of that list.
    for offset, cell in enumerate(prize_cells):
        link = cell.css('a')
        # extract() on all anchors of the cell returns lists, one item per link
        year_data.append(dict(
            name=link.xpath('string()').extract(),
            href=link.xpath('@href').extract(),
            category=col_names_links[offset + 1]["name"],
        ))
    by_year[year] = year_data
by_year['1901']

error on <Selector xpath='descendant-or-self::tr[not(count(preceding-sibling::*) = 0)]' data='<tr>\n<th>Year</th>\n<th width="16%"><a hr'>


[{'category': 'Physics',
  'href': ['/wiki/Wilhelm_R%C3%B6ntgen'],
  'name': ['Wilhelm Röntgen']},
 {'category': 'Chemistry',
  'href': ['/wiki/Jacobus_Henricus_van_%27t_Hoff'],
  'name': ["Jacobus Henricus van 't Hoff"]},
 {'category': 'Physiology or Medicine',
  'href': ['/wiki/Emil_Adolf_von_Behring'],
  'name': ['Emil Adolf von Behring']},
 {'category': 'Literature',
  'href': ['/wiki/Sully_Prudhomme'],
  'name': ['Sully Prudhomme']},
 {'category': 'Peace',
  'href': ['/wiki/Henry_Dunant', '/wiki/Fr%C3%A9d%C3%A9ric_Passy'],
  'name': ['Henry Dunant', 'Frédéric Passy']},
 {'category': 'Economics', 'href': [], 'name': []}]

In [68]:
# Look closely at the data above. Look at the Peace category... we have multiple winners.
# Now we need to handle multiple people per category

# We do this by nesting another loop to go through each individual anchor element
# and adding a distinct dictionary each time

In [70]:
# Rebuild by_year with one entry per link, so a shared prize yields
# several entries carrying the same category.
by_year = {}
for row in other_rows:
    try:
        # Text of the first td in the row is the year
        year = row.css('td')[0].xpath('string()')[0].extract()
    except IndexError:
        # Rows without a td are reported and skipped
        print('error on {}'.format(row))
        continue
    # Keep only what precedes a footnote marker, e.g. "2016[1]" -> "2016"
    year = year.partition('[')[0]

    year_data = []
    for i, cell in enumerate(row.css('td:not(:nth-child(1))'), 1):
        # i starts at 1 because entry 0 of col_names_links is "Year"
        category = col_names_links[i]["name"]
        # name and href are still lists here (extract() always returns a list)
        year_data.extend(
            {
                "name": link.xpath('string()').extract(),
                "href": link.xpath('@href').extract(),
                "category": category,
            }
            for link in cell.css('a')
        )
    by_year[year] = year_data
by_year['1902']

error on <Selector xpath='descendant-or-self::tr[not(count(preceding-sibling::*) = 0)]' data='<tr>\n<th>Year</th>\n<th width="16%"><a hr'>


[{'category': 'Physics',
  'href': ['/wiki/Hendrik_Lorentz'],
  'name': ['Hendrik Lorentz']},
 {'category': 'Physics',
  'href': ['/wiki/Pieter_Zeeman'],
  'name': ['Pieter Zeeman']},
 {'category': 'Chemistry',
  'href': ['/wiki/Hermann_Emil_Fischer'],
  'name': ['Hermann Emil Fischer']},
 {'category': 'Physiology or Medicine',
  'href': ['/wiki/Ronald_Ross'],
  'name': ['Ronald Ross']},
 {'category': 'Literature',
  'href': ['/wiki/Theodor_Mommsen'],
  'name': ['Theodor Mommsen']},
 {'category': 'Peace',
  'href': ['/wiki/%C3%89lie_Ducommun'],
  'name': ['Élie Ducommun']},
 {'category': 'Peace',
  'href': ['/wiki/Charles_Albert_Gobat'],
  'name': ['Charles Albert Gobat']}]

In [71]:
# Now we see multiple entries with the Peace category instead of just one

# But we must look closely at the data... each name and href is a list...
# We can fix that by indexing into the list when we assign the values to the dictionary

In [73]:
# Build the final mapping: year -> list of {"name", "href", "category"}
# dictionaries, one per linked winner.
by_year = {}
for row in other_rows:
    try:
        # Text of the first td in the row is the year
        year = row.css('td')[0].xpath('string()').extract()[0]
    except IndexError:
        # Rows without a td are reported and skipped
        print('error on {}'.format(row))
        continue
    # or year = row.css('td')[0].xpath('string()')[0].extract()
    # Keep only what precedes a footnote marker, e.g. "2016[1]" -> "2016"
    if '[' in year:
        year = year.split('[')[0]

    year_data = []
    # Loop through each data cell's links
    for i, cell in enumerate(row.css('td:not(:nth-child(1))'), 1):
        # i starts at 1 because entry 0 of col_names_links is "Year"
        category = col_names_links[i]["name"]
        for link in cell.css('a'):
            name = link.xpath('string()').extract()
            name_href = link.xpath('@href').extract()
            # Skip anchors that cannot point at a winner's page:
            #  - no href at all: name_href[0] would raise an uncaught
            #    IndexError and abort the whole scrape
            #  - in-page fragment links ("#..."), e.g. footnote references,
            #    which would otherwise be stored as if they were winners
            if not name_href or name_href[0].startswith('#'):
                continue
            year_data.append({
                "name": name[0],
                "href": name_href[0],
                "category": category,
            })
    by_year[year] = year_data
by_year['1902']

error on <Selector xpath='descendant-or-self::tr[not(count(preceding-sibling::*) = 0)]' data='<tr>\n<th>Year</th>\n<th width="16%"><a hr'>


[{'category': 'Physics',
  'href': '/wiki/Hendrik_Lorentz',
  'name': 'Hendrik Lorentz'},
 {'category': 'Physics',
  'href': '/wiki/Pieter_Zeeman',
  'name': 'Pieter Zeeman'},
 {'category': 'Chemistry',
  'href': '/wiki/Hermann_Emil_Fischer',
  'name': 'Hermann Emil Fischer'},
 {'category': 'Physiology or Medicine',
  'href': '/wiki/Ronald_Ross',
  'name': 'Ronald Ross'},
 {'category': 'Literature',
  'href': '/wiki/Theodor_Mommsen',
  'name': 'Theodor Mommsen'},
 {'category': 'Peace',
  'href': '/wiki/%C3%89lie_Ducommun',
  'name': 'Élie Ducommun'},
 {'category': 'Peace',
  'href': '/wiki/Charles_Albert_Gobat',
  'name': 'Charles Albert Gobat'}]

In [75]:
# Our data looks really good now
by_year

{'1901': [{'category': 'Physics',
   'href': '/wiki/Wilhelm_R%C3%B6ntgen',
   'name': 'Wilhelm Röntgen'},
  {'category': 'Chemistry',
   'href': '/wiki/Jacobus_Henricus_van_%27t_Hoff',
   'name': "Jacobus Henricus van 't Hoff"},
  {'category': 'Physiology or Medicine',
   'href': '/wiki/Emil_Adolf_von_Behring',
   'name': 'Emil Adolf von Behring'},
  {'category': 'Literature',
   'href': '/wiki/Sully_Prudhomme',
   'name': 'Sully Prudhomme'},
  {'category': 'Peace', 'href': '/wiki/Henry_Dunant', 'name': 'Henry Dunant'},
  {'category': 'Peace',
   'href': '/wiki/Fr%C3%A9d%C3%A9ric_Passy',
   'name': 'Frédéric Passy'}],
 '1902': [{'category': 'Physics',
   'href': '/wiki/Hendrik_Lorentz',
   'name': 'Hendrik Lorentz'},
  {'category': 'Physics',
   'href': '/wiki/Pieter_Zeeman',
   'name': 'Pieter Zeeman'},
  {'category': 'Chemistry',
   'href': '/wiki/Hermann_Emil_Fischer',
   'name': 'Hermann Emil Fischer'},
  {'category': 'Physiology or Medicine',
   'href': '/wiki/Ronald_Ross',
   'na

In [77]:
# In order to get the birthday for each winner we will have to build
# a new url and request the data. We'll create a list of the first
# item in the by_year dictionary values by using the slice syntax:
# <list variable>[:n] means the first n items (everything before index n)
# x[:10] would be the first 10 items
# x[-10:] would mean the last 10 items
# x[5:10] would mean the items at indexes 5 through 9 (index 10 is excluded)

# If we didn't slice by_year.values() down it would take a long time
# to run because it would call out to wikipedia for every page

In [81]:
# Request the detail page of every winner in the first list of
# by_year.values() only ([:1]); that is one HTTP request per winner in it.
for winner_list in list(by_year.values())[:1]:
    for winner in winner_list:
        # Prefix the stored href with the host to get a full URL
        new_url = 'https://en.wikipedia.org' + winner["href"]
        response = requests.get(new_url)
        content = response.content.decode('utf-8')
        # NOTE: `sel` is rebound on every iteration, so once the loop ends it
        # holds only the LAST winner's page. This also replaces the selector
        # for the laureate list page that `sel` was bound to earlier.
        sel = scrapy.Selector(text=content)

In [82]:
# We can see that our selector holds the detail page's contents

In [83]:
sel

<Selector xpath=None data='<html class="client-nojs" lang="en" dir='>

In [None]:
# We can also see that the biography table rows contain the information we need
# to discover the winner's birthday

In [84]:
sel.css('.vcard tr').xpath("string()").extract()

['\nFrédéric Passy\n',
 '\n\n',
 '\nBorn\nFrédéric Passy\n(1822-05-20)May 20, 1822\nParis, France\n',
 '\nDied\nJune 12, 1912(1912-06-12) (aged\xa090)\nNeuilly-sur-Seine, France\n',
 '\nNationality\nFrench\n',
 '\nEducation\nUniversity of Paris\n',
 '\nOccupation\nEconomist\n',
 '\nAwards\nNobel Peace Prize (1901)\n']

In [None]:
# Revisiting the page and inspecting the data with chrome
# reveals there is a hidden span with a class of "bday"

In [53]:
sel.css('.vcard span.bday').xpath("string()").extract()

['1822-05-20']

In [86]:
# We can create a function that takes a winner's dictionary
# and mutates it (edits it in place) and adds the birthday info
# This is important to note in python... dictionaries are mutable
# and editing them within a function causes them to retain the changes.

# We don't have to have our function return a new dictionary or alter the
# list we're looping over. The changes to the dictionary stick.
# You can't do this with some datatypes in Python, such as strings.

# We'll print the variable before and after we update it to demonstrate

In [87]:
def update_with_bday(winner):
    """Look up a winner's birthday on Wikipedia and store it on the dict.

    Requests 'https://en.wikipedia.org' + winner["href"], reads the text of
    the first '.vcard span.bday' element and writes it to winner["birthday"].
    The dictionary is mutated in place and nothing is returned. The winner is
    printed before and after the update.

    If the page has no such element, winner["birthday"] is set to None
    instead of raising IndexError.

    Args:
        winner: dictionary with at least the keys "name" and "href".
    """
    print("updating {}".format(winner["name"]))
    print(winner)
    new_url = 'https://en.wikipedia.org' + winner["href"]
    response = requests.get(new_url)
    content = response.content.decode('utf-8')
    sel = scrapy.Selector(text=content)
    # extract() returns a list; it is empty when no bday span was found
    bday = sel.css('.vcard span.bday').xpath("string()").extract()
    winner["birthday"] = bday[0] if bday else None
    print(winner)

In [88]:
# NOTE(review): this first print needs `winner_list` to exist already, i.e.
# a previously-run cell must have looped over by_year.values().
print(winner_list[0])
# Only the first list of by_year.values() is processed ([:1]) to keep the
# number of requests small.
for winner_list in list(by_year.values())[:1]:
    for winner in winner_list:
        # Mutates the winner dictionary in place
        update_with_bday(winner)
# Print the first winner of the processed list again to show the change
print(winner_list[0])

{'name': 'Wilhelm Röntgen', 'href': '/wiki/Wilhelm_R%C3%B6ntgen', 'category': 'Physics', 'birthday': '1845-03-27'}
updating Wilhelm Röntgen
{'name': 'Wilhelm Röntgen', 'href': '/wiki/Wilhelm_R%C3%B6ntgen', 'category': 'Physics', 'birthday': '1845-03-27'}
{'name': 'Wilhelm Röntgen', 'href': '/wiki/Wilhelm_R%C3%B6ntgen', 'category': 'Physics', 'birthday': '1845-03-27'}
updating Jacobus Henricus van 't Hoff
{'name': "Jacobus Henricus van 't Hoff", 'href': '/wiki/Jacobus_Henricus_van_%27t_Hoff', 'category': 'Chemistry', 'birthday': '1852-08-30'}
{'name': "Jacobus Henricus van 't Hoff", 'href': '/wiki/Jacobus_Henricus_van_%27t_Hoff', 'category': 'Chemistry', 'birthday': '1852-08-30'}
updating Emil Adolf von Behring
{'name': 'Emil Adolf von Behring', 'href': '/wiki/Emil_Adolf_von_Behring', 'category': 'Physiology or Medicine', 'birthday': '1854-03-15'}
{'name': 'Emil Adolf von Behring', 'href': '/wiki/Emil_Adolf_von_Behring', 'category': 'Physiology or Medicine', 'birthday': '1854-03-15'}
up

In [80]:
def update_with_bday(winner):
    """Add a "birthday" entry to the given winner dictionary, in place.

    The winner's page ('https://en.wikipedia.org' + winner["href"]) is
    downloaded and the text of the first '.vcard span.bday' element is stored
    under winner["birthday"]; None is stored when no such element is found.
    The winner is printed before and after the update, along with the URL
    being requested. Nothing is returned.

    Args:
        winner: dictionary with at least the keys "name" and "href".
    """
    print("updating {}".format(winner["name"]))
    print(winner)

    new_url = 'https://en.wikipedia.org' + winner["href"]
    print("Calling wikipedia: {}".format(new_url))

    page_text = requests.get(new_url).content.decode('utf-8')
    page = scrapy.Selector(text=page_text)

    # A list of matches; empty when the page has no bday span
    found = page.css('.vcard span.bday').xpath("string()").extract()
    winner["birthday"] = found[0] if found else None
    print(winner)

# NOTE(review): this first print needs `winner_list` to exist already, i.e.
# a previously-run cell must have looped over by_year.values().
print(winner_list[0])
# Only the first list of by_year.values() is processed ([:1]) to keep the
# number of requests small.
for winner_list in list(by_year.values())[:1]:
    for winner in winner_list:
        # Skip winners that already carry a 'birthday' key, so re-running
        # this cell does not request their pages again
        if 'birthday' not in winner:
            update_with_bday(winner)
# Print the first winner of the processed list again to show the change
print(winner_list[0])

{'name': 'Wilhelm Röntgen', 'href': '/wiki/Wilhelm_R%C3%B6ntgen', 'category': 'Physics'}
updating Wilhelm Röntgen
{'name': 'Wilhelm Röntgen', 'href': '/wiki/Wilhelm_R%C3%B6ntgen', 'category': 'Physics'}
Calling wikipedia: https://en.wikipedia.org/wiki/Wilhelm_R%C3%B6ntgen
{'name': 'Wilhelm Röntgen', 'href': '/wiki/Wilhelm_R%C3%B6ntgen', 'category': 'Physics', 'birthday': '1845-03-27'}
updating Jacobus Henricus van 't Hoff
{'name': "Jacobus Henricus van 't Hoff", 'href': '/wiki/Jacobus_Henricus_van_%27t_Hoff', 'category': 'Chemistry'}
Calling wikipedia: https://en.wikipedia.org/wiki/Jacobus_Henricus_van_%27t_Hoff
{'name': "Jacobus Henricus van 't Hoff", 'href': '/wiki/Jacobus_Henricus_van_%27t_Hoff', 'category': 'Chemistry', 'birthday': '1852-08-30'}
updating Emil Adolf von Behring
{'name': 'Emil Adolf von Behring', 'href': '/wiki/Emil_Adolf_von_Behring', 'category': 'Physiology or Medicine'}
Calling wikipedia: https://en.wikipedia.org/wiki/Emil_Adolf_von_Behring
{'name': 'Emil Adolf vo

In [89]:
# Pretty-print the first winner of the processed list
# (pprint is imported with the other imports at the top of the notebook)
pprint.pprint(winner_list[0])

{'birthday': '1845-03-27',
 'category': 'Physics',
 'href': '/wiki/Wilhelm_R%C3%B6ntgen',
 'name': 'Wilhelm Röntgen'}
