In [1]:
import requests
import re
import csv

from lxml import html

In [2]:
# Fetch page of compositions
base_url = 'https://en.wikipedia.org/'
name = 'Johann Sebastian Bach'

In [26]:
default_pattern = r'[[Op.|Opus|No.] (\d{1-4}): (.*)|(.*)[Op.|Opus] (\d{1-4})]'

# Change default_patterns to be an ordered list

patterns = {
    'Johann Sebastian Bach': r'BWV (\d{1,4}) – (.*)',
    'Wolfgang Amadeus Mozart': r'(.*), K. (\d{1,4}) \((\d{4})\)'
}

In [30]:
# Parse and extract items
def parse_text(item, composer_name):
    pattern = patterns.get(composer_name, default_pattern)
    match = re.search(pattern, item.text_content())
    if match:
        return (match.group(1), match.group(2))

def extract_url(item):
    hrefs = item.xpath('a[@href]')
    if hrefs:
        return hrefs[0].attrib['href']

def process(item, composer_name):
    details = parse_text(item, composer_name)
    url = extract_url(item)
    if details:
        return {'number': details[0], 'name': details[1], 'url': url, 'composer_name': composer_name}
    
def get_compositions_for_composer(composer_name):
    formatted_name = composer_name.replace(' ', '_')
    url = base_url + 'wiki/List_of_compositions_by_' + formatted_name

    response = requests.get(url)
    if response.status_code != 200:
        print('Failed to fetch compositions for %s' % composer_name)
        return
    
    print('Successfully fetched compositions page for %s' % composer_name)

    doc = response.text

    parsed = html.fromstring(doc)
    parsed.make_links_absolute(base_url)

    items = parsed.xpath("//div[@class='mw-content-ltr']//li")
    items_without_lists = (item for item in items
                           if 'List of' not in item.text_content()
                           and 'toc' not in item.attrib.get('class', ''))
    
    data = (process(item, composer_name) for item in items_without_lists)
    return [d for d in data if d is not None]

In [28]:
def get_composer_names_from_file(filepath='../data/composers.csv'):
    with open(filepath, 'r') as f:
        reader = csv.DictReader(f)
        composer_names = [row['name'] for row in reader]
        
    return composer_names

In [31]:
composer_names = get_composer_names_from_file()
for composer_name in list(composer_names)[230:240]:
    compositions = get_compositions_for_composer(composer_name)
    if compositions:
        print(composer_name + ': ' + len(compositions))

Failed to fetch compositions for Blaž Arnič
Failed to fetch compositions for Ralf Arnie
Failed to fetch compositions for Ernst Arnold
Successfully fetched compositions page for Malcolm Arnold
Failed to fetch compositions for Samuel Arnold
Failed to fetch compositions for Yuri Karlovich Arnold
Failed to fetch compositions for Robert Sterling Arnold
Successfully fetched compositions page for Juan Crisóstomo Arriaga
Failed to fetch compositions for Emilio Arrieta
Failed to fetch compositions for Claude Arrieu
