In [56]:
import requests
import re

import csv
from lxml import html, etree

In [62]:
# Get data set for scraping
base_url = 'https://en.wikipedia.org/'
url = base_url + 'wiki/List_of_composers_by_name'
# Request the composer list page itself. Requesting base_url would download
# the site root instead, which is not the page the cells below expect to parse.
response = requests.get(url)
# Stop on anything other than 200 so later cells never parse an error page.
assert response.status_code == 200, (
    'GET %s returned HTTP %s' % (url, response.status_code))
doc = response.text

In [65]:
# Parse and extract data
# Build an lxml element tree from the HTML text held in `doc`.
parsed = html.fromstring(doc)
# Resolve links against base_url so that the href values read later by
# extract_url() are full URLs rather than site-relative paths.
parsed.make_links_absolute(base_url)

# Every <li> in the document is a candidate entry; process() later discards
# the ones without a two-year match or without a link.
items = parsed.xpath('//li')

# Captures three groups from an entry's text: the leading name, then the first
# and second four-digit years that follow it.
#  * The name class also accepts '.', '-', and straight/curly apostrophes, so
#    names such as "Jean-Baptiste Lully" or "C. P. E. Bach" are not cut off at
#    the first punctuation mark.
#  * The gaps are non-greedy (.*?) so the *first* two years after the name are
#    taken; a greedy gap would pick the last pair when an entry carries extra
#    years after the life dates.
pattern = re.compile(r"([\w\s.'\u2019-]*).*?(\d{4}).*?(\d{4})")

def parse_text(item):
    """Pull the name and the two years out of a list item's text.

    Runs the module-level ``pattern`` over ``item.text_content()``.

    Returns a 3-tuple ``(name, first_year, second_year)`` where ``name`` is
    the first capture group with surrounding whitespace stripped and the
    years are the second and third capture groups as strings. Returns
    ``None`` when the pattern does not match.
    """
    found = pattern.search(item.text_content())
    if found is None:
        return None
    name, first_year, second_year = found.groups()
    return (name.strip(), first_year, second_year)

def extract_url(item):
    """Return the href of the first matching anchor of ``item``.

    The anchors are whatever ``item.xpath('a[@href]')`` yields. Returns
    ``None`` when that query comes back empty.
    """
    anchors = item.xpath('a[@href]')
    if not anchors:
        return None
    first_anchor = anchors[0]
    return first_anchor.attrib['href']

def process(item):
    """Turn one list item into a CSV-ready record.

    Combines the tuple from ``parse_text(item)`` with the link from
    ``extract_url(item)``.

    Returns a dict with keys ``'name'``, ``'born'``, ``'died'`` and ``'url'``,
    or ``None`` when either the text details or the link is missing.
    """
    details = parse_text(item)
    link = extract_url(item)
    if not (details and link):
        return None
    name, born, died = details
    return dict(name=name, born=born, died=died, url=link)
    
# Run every candidate <li> through process() and keep only the entries it
# accepted (process() returns None for items it rejects).
# Built as a list rather than a one-shot generator, so the CSV-writing cell can
# be re-run without silently producing a header-only file after the first pass.
data = [record for record in map(process, items) if record is not None]

In [64]:
# Write our dataset to CSV
fieldnames = ['name', 'born', 'died', 'url']
output_file = '../data/composers.csv'

# newline='' lets the csv module control line endings itself (otherwise
# Windows output gets a blank line after every row). An explicit utf-8
# encoding keeps non-ASCII composer names from failing under a different
# platform default encoding.
with open(output_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)

    writer.writeheader()
    # One row per record; each record's keys match `fieldnames`.
    writer.writerows(data)

print('Done!')

Done!
