<a href="https://colab.research.google.com/github/schenzio/scraping/blob/main/full_python_scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Demo scraper code: Get data on Nobel Prize winners

* Source: [Nobel Prize Official Website](https://www.nobelprize.org/prizes/lists/all-nobel-prizes)



In [None]:
!pip install requests
!pip install requests_cache
!pip install lxml
!pip install pandas

## 1. Import Libraries

In [None]:
import hashlib
import json
import re
import time
from fractions import Fraction

import pandas as pd
import requests
import requests_cache
from lxml import html

## 2. Define functions and constants

In [None]:
# Root of the Nobel Prize website; the path of the list page is appended to it
# in the scraping cell.
BASE_URL = 'https://www.nobelprize.org'
# Name of the JSON file the scraped rows are exported to.
OUTPUT_FILE = "winners.json"
# Enable requests_cache under the cache name 'nobel_cache'.
# NOTE(review): presumably so that re-running the notebook reuses stored
# responses instead of downloading every page again — confirm backend/expiry.
requests_cache.install_cache('nobel_cache')


In [None]:
def get_root(url):
    """Download ``url`` and return the root of its parsed HTML.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    The document root produced by ``html.fromstring`` when the server answers
    with status code 200; ``None`` for any other status code (a message with
    the status code is printed in that case).
    """
    r = requests.get(url)
    if r.status_code != 200:
        # Report the failure before returning, otherwise the message is never shown.
        print("HTTPS request failed with status code %s" % (r.status_code))
        return None
    # Only successful responses are parsed; the body of an error page is not needed.
    return html.fromstring(r.text)


def _stable_prize_id(url):
    """Return a non-negative integer ID derived deterministically from ``url``."""
    # The built-in hash() of a str is salted per interpreter session, so it
    # cannot provide an ID that stays the same from one run to the next.
    digest = hashlib.sha256(url.encode('utf-8')).hexdigest()
    return int(digest[:15], 16)  # 15 hex digits = 60 bits, fits a signed 64-bit int


def _year_after_label(root, label):
    """Return the last word before the first comma that follows ``label``.

    Uses the first text node of ``root`` containing ``label`` (e.g. ``"Born:"``).
    Returns ``None`` when the page has no such text node.
    """
    nodes = root.xpath('//text()[contains(., "%s")]' % label)
    if not nodes:
        return None
    return nodes[0].strip().split(':')[1].split(',')[0].split(' ')[-1].strip()


def process_winners(url):
    """Scrape a single laureate page and return its data as a flat dict.

    Parameters
    ----------
    url : str
        Address of the laureate page. The prize year and category are read
        from its path segments: first from positions -4 / -5 of the
        '/'-split URL, then, if that segment is not a number, from -2 / -4.

    Returns
    -------
    dict
        Keys, in insertion order: ``prize_id`` (stable integer derived from
        the URL), ``source``, ``prize_year`` (int), ``prize_category``,
        ``name``, ``image_url``, ``prize_share`` (float, e.g. 0.5 for "1/2"),
        ``death_date``, ``alive_status`` ('dead' / 'alive'), ``birth_year``,
        ``entity`` ('person' / 'organization'), ``acceptance``
        ('declined' / 'accepted').
        ``death_date`` and ``birth_year`` are strings ('' when the page has no
        "Died:" / "Born:" text). A page without "Died:" is always reported as
        'alive', including organizations.

    Raises
    ------
    ValueError
        If neither candidate URL segment is a number; this happens for URLs
        such as ``.../prizes/peace/2019/summary/``.
    IndexError
        If the page lacks the name header, the image or the prize share text.
    requests.HTTPError
        If the server answers with an error status.
    """
    row = {}
    row['prize_id'] = _stable_prize_id(url)
    row['source'] = url

    parts = url.split('/')
    try:
        row['prize_year'] = int(parts[-4])
        row['prize_category'] = parts[-5]
    except ValueError:
        # Alternative URL layout: year and category sit two segments further right.
        row['prize_year'] = int(parts[-2])
        row['prize_category'] = parts[-4]

    response_w = requests.get(url)
    # Fail with a clear HTTP error instead of trying to parse an error page.
    response_w.raise_for_status()
    root_w = html.fromstring(response_w.text)

    row['name'] = root_w.xpath('//header/h1')[0].text_content().strip()
    # The last <source> of the picture is used. NOTE(review): presumably the
    # largest rendition; confirm against the page markup.
    row['image_url'] = root_w.xpath('//div[@class="image"]/picture/source')[-1].get('data-srcset')
    share_text = root_w.xpath('//text()[contains(., "Prize share:")]')[0]
    row['prize_share'] = float(Fraction(share_text.split(':')[-1].strip()))

    # Dead or alive? Decided by the presence of a "Died:" entry.
    death_year = _year_after_label(root_w, 'Died:')
    row['death_date'] = '' if death_year is None else death_year
    row['alive_status'] = 'alive' if death_year is None else 'dead'

    # Winner type: only pages with a "Born:" entry are treated as persons.
    birth_year = _year_after_label(root_w, 'Born:')
    row['birth_year'] = '' if birth_year is None else birth_year
    row['entity'] = 'organization' if birth_year is None else 'person'

    # Prize accepted? The raw page source is searched for the decline notice.
    if 'declined the Nobel' in response_w.text:
        row['acceptance'] = 'declined'
    else:
        row['acceptance'] = 'accepted'

    return row


def export_json(dictlist,output_file):
    """Write ``dictlist`` to ``output_file`` as JSON and print a confirmation.

    Parameters
    ----------
    dictlist : JSON-serializable object (here: a list of row dicts)
    output_file : path of the file to create or overwrite
    """
    with open(output_file, 'w') as handle:
        json.dump(dictlist, fp=handle)
    confirmation = 'Exported as .json to %s' % (output_file)
    print(confirmation)


# NOTE(review): identical to the install_cache call made next to the constants;
# this repeat looks redundant and can probably be dropped — confirm first.
requests_cache.install_cache('nobel_cache')

## 3. Scrape and save

In [None]:
# Page whose links (inside the "by_year" blocks) point to the individual pages.
start_url = BASE_URL+'/nobel_prizes/lists/all/'
root = get_root(start_url)
if root is None:
    # get_root() returns None for any non-200 answer; stop with a clear message
    # instead of failing on `None.xpath` below.
    raise RuntimeError('Could not download the list of prizes from %s' % start_url)

winner_urls = [link.get('href') for link in root.xpath("//div[@class='by_year']/p/a")]

# Number of links found; used as the denominator of the progress display.
# The site reports 962 laureates (https://www.nobelprize.org/prizes/). The link
# list may be slightly longer because it can contain pages that are not
# laureate pages (e.g. a prize summary); those are reported and skipped below.
all_winners = len(winner_urls)
data = []
processed = 0

for url in winner_urls:
    try:
        row = process_winners(url)
    except Exception as e:
        # Best effort: show the page that could not be processed and carry on.
        print(url)
        print(e)
    else:
        data.append(row)
        processed += 1
        print("Processed : %d/%d > %.2f%%" % (processed, all_winners, processed / all_winners * 100))

export_json(data,OUTPUT_FILE)

Processed : 1/963 > 0.10%
Processed : 2/963 > 0.21%
Processed : 3/963 > 0.31%
Processed : 4/963 > 0.42%
Processed : 5/963 > 0.52%
Processed : 6/963 > 0.62%
Processed : 7/963 > 0.73%
Processed : 8/963 > 0.83%
Processed : 9/963 > 0.93%
Processed : 10/963 > 1.04%
Processed : 11/963 > 1.14%
Processed : 12/963 > 1.25%
Processed : 13/963 > 1.35%
Processed : 14/963 > 1.45%
Processed : 15/963 > 1.56%
Processed : 16/963 > 1.66%
Processed : 17/963 > 1.77%
Processed : 18/963 > 1.87%
Processed : 19/963 > 1.97%
Processed : 20/963 > 2.08%
Processed : 21/963 > 2.18%
Processed : 22/963 > 2.28%
https://www.nobelprize.org/prizes/peace/2019/summary/
invalid literal for int() with base 10: 'summary'
Processed : 23/963 > 2.39%
Processed : 24/963 > 2.49%
Processed : 25/963 > 2.60%
Processed : 26/963 > 2.70%
Processed : 27/963 > 2.80%
Processed : 28/963 > 2.91%
Processed : 29/963 > 3.01%
Processed : 30/963 > 3.12%
Processed : 31/963 > 3.22%
Processed : 32/963 > 3.32%
Processed : 33/963 > 3.43%
Processed : 34

## 4. Check the scraped data


In [None]:
# Load the scraped rows (a list of dicts, one per processed page) into a table.
df = pd.DataFrame(data)
# Last expression of the cell: display the first rows as a quick sanity check.
df.head()

Unnamed: 0,prize_id,source,prize_year,prize_category,name,image_url,prize_share,death_date,alive_status,birth_year,entity,acceptance
0,4147989774306426835,https://www.nobelprize.org/prizes/physics/2020...,2020,physics,Roger Penrose,https://www.nobelprize.org/images/penrose-1117...,0.5,,alive,1931,person,accepted
1,8667244052562133513,https://www.nobelprize.org/prizes/physics/2020...,2020,physics,Reinhard Genzel,https://www.nobelprize.org/images/genzel-11175...,0.25,,alive,1952,person,accepted
2,710472678332515686,https://www.nobelprize.org/prizes/physics/2020...,2020,physics,Andrea Ghez,https://www.nobelprize.org/images/ghez-111760-...,0.25,,alive,1965,person,accepted
3,143844596397185220,https://www.nobelprize.org/prizes/chemistry/20...,2020,chemistry,Emmanuelle Charpentier,https://www.nobelprize.org/images/charpentier-...,0.5,,alive,1968,person,accepted
4,4835365360172269807,https://www.nobelprize.org/prizes/chemistry/20...,2020,chemistry,Jennifer A. Doudna,https://www.nobelprize.org/images/doudna-11176...,0.5,,alive,1964,person,accepted
