In [81]:
import requests
import re
import csv
import os

from lxml import html

In [2]:
# Fetch page of compositions
base_url = 'https://en.wikipedia.org/'
name = 'Johann Sebastian Bach'

In [72]:
default_pattern = r'[Op.|Opus|L] (\d{1,4})[,|:]\s*(\W.*)'

date_removal_pattern = r'\(.*\d{1,4}.*\)'

# Change default_patterns to be an ordered list

patterns = {
    'Johann Sebastian Bach': r'BWV (\d{1,4}) – (.*)',
    'Wolfgang Amadeus Mozart': r'(.*), K. (\d{1,4}) \((\d{4})\)'
}

In [78]:
# Parse and extract items
def parse_text(item, composer_name):
    pattern = patterns.get(composer_name, default_pattern)
    match = re.search(pattern, item.text_content(), flags=re.S)
    if match:
        number = match.group(1)
        name = re.sub(date_removal_pattern, '', match.group(2))
        return (number, name)

def extract_url(item):
    hrefs = item.xpath('a[@href]')
    if hrefs:
        return hrefs[0].attrib['href']

def process(item, composer_name):
    details = parse_text(item, composer_name)
    url = extract_url(item)
    if details:
        return {'number': details[0], 'name': details[1], 'url': url, 'composer_name': composer_name}
    
def get_compositions_for_composer(composer_name):
    formatted_name = composer_name.replace(' ', '_')
    url = base_url + 'wiki/List_of_compositions_by_' + formatted_name

    response = requests.get(url)
    if response.status_code != 200:
        print('Failed to fetch compositions for %s' % composer_name)
        return
    
    print('Successfully fetched compositions page for %s' % composer_name)

    doc = response.text

    parsed = html.fromstring(doc)
    parsed.make_links_absolute(base_url)

    items = parsed.xpath("//div[@class='mw-content-ltr']//li")
    items_without_lists = (item for item in items
                           if 'List of' not in item.text_content()
                           and 'toc' not in item.attrib.get('class', ''))
    
    data = (process(item, composer_name) for item in items_without_lists)
    return [d for d in data if d is not None]

In [1]:
def get_composer_names_from_file(filepath='../data/composers.csv'):
    with open(filepath, 'r') as f:
        reader = csv.DictReader(f)
        composer_names = [row['name'] for row in reader]
        
    return composer_names

In [92]:
columns = ['composer_name', 'number', 'name', 'url']
dialect = csv.register_dialect('custom', delimiter='|')

def write_compositions_to_file(compositions, filepath='../data/compositions.csv'):
    with open(filepath, 'a') as f:
        writer = csv.DictWriter(f, fieldnames=columns, dialect='custom')
        if not os.path.getsize(filepath) > 0:
            writer.writeheader()
        count = 0
        for composition in compositions:
            writer.writerow(composition)
            count = count + 1
    print('Wrote %d compositions to file' % count)

composer_names = get_composer_names_from_file()
for composer_name in list(composer_names):
    compositions = get_compositions_for_composer(composer_name)
    if compositions:
        write_compositions_to_file(compositions)

Failed to fetch compositions for Mary Anne à Beckett
Failed to fetch compositions for Thorvald Aagaard
Failed to fetch compositions for Truid Aagesen
Failed to fetch compositions for Heikki Aaltoila
Failed to fetch compositions for Juhan Aavik
Failed to fetch compositions for Evaristo Felice Dall
Failed to fetch compositions for Joseph Abaco
Failed to fetch compositions for Antonio Maria Abbatini
Failed to fetch compositions for Gamal Abdel
Failed to fetch compositions for Rosalina Abejo
Failed to fetch compositions for Carl Friedrich Abel
Failed to fetch compositions for Clamor Heinrich Abel
Failed to fetch compositions for Ludwig Abel
Failed to fetch compositions for Peter Abelard
Failed to fetch compositions for Nicanor Abelardo
Failed to fetch compositions for John Abell
Failed to fetch compositions for Johann Joseph Abert
Failed to fetch compositions for 4th Earl of Abingdon
Failed to fetch compositions for Lora Aborn
Failed to fetch compositions for Girolamo Abos
Failed to fetch 

ConnectionError: ('Connection aborted.', BadStatusLine("''",))