In [49]:
import pprint
import re

import requests
from lxml import html

In [45]:
# Fetch the "List of compositions by <composer>" page for the chosen composer.
base_url = 'https://en.wikipedia.org/'
name = 'Johann Sebastian Bach'
# The page URL uses underscores where the composer's name has spaces.
formatted_name = name.replace(' ', '_')
url = base_url + 'wiki/List_of_compositions_by_' + formatted_name

# Always pass a timeout: without one, requests can wait indefinitely on an
# unresponsive server and hang the notebook.
response = requests.get(url, timeout=30)
# Raise requests.HTTPError on a 4xx/5xx response. Unlike `assert`, this check
# is not stripped when Python runs with -O and it reports the failing status.
response.raise_for_status()

# Raw HTML of the page, decoded to text by requests.
doc = response.text

In [92]:
# Fallback pattern for composers that have no dedicated entry in `patterns`.
# Matches an opus-style prefix ("Op.", "Opus" or "No."), a space, a 1-4 digit
# number, then ": " and the title.
#   group 1 -> number, group 2 -> title
# The prefix must be a non-capturing alternation: a bracketed character class
# would match just one character. The digit quantifier needs a comma ({1,4});
# "{1-4}" is not a quantifier and is matched literally.
default_pattern = r'(?:Op\.|Opus|No\.) (\d{1,4}): (.*)'

# Per-composer catalogue patterns, keyed by the composer's full name.
patterns = {
    # "BWV 225 – Title": group 1 -> BWV number, group 2 -> title.
    'Johann Sebastian Bach': r'BWV (\d{1,4}) – (.*)',
    # "Title, K. 525 (1787)": group 1 -> title, group 2 -> K number,
    # group 3 -> year.
    # NOTE(review): this group order differs from the other patterns (number
    # first, then title); code that reads group 1 as the number and group 2 as
    # the title will get them swapped for this composer -- confirm and align.
    'Wolfgang Amadeus Mozart': r'(.*), K. (\d{1,4}) \((\d{4})\)'
}

In [100]:
# Build an element tree from the fetched markup and rewrite relative links
# against base_url so that extracted hrefs are absolute.
parsed = html.fromstring(doc)
parsed.make_links_absolute(base_url)

# Every <li> found anywhere below a <div> whose class attribute is exactly
# 'mw-content-ltr'.
items = parsed.xpath("//div[@class='mw-content-ltr']//li")

# Lazy filter: skip entries whose text mentions 'List of' and entries whose
# class attribute contains 'toc'. This is a generator, so it can be consumed
# only once.
items_without_lists = (
    li for li in items
    if not ('List of' in li.text_content()
            or 'toc' in li.attrib.get('class', ''))
)

def parse_text(item):
    """Pull the first two regex groups out of an item's text.

    The regex is looked up in the module-level ``patterns`` mapping under the
    module-level ``name``, falling back to ``default_pattern`` when there is
    no entry. It is applied with ``re.search`` to ``item.text_content()``.

    Returns:
        A ``(group 1, group 2)`` tuple when the regex matches, else ``None``.
    """
    regex = patterns.get(name, default_pattern)
    found = re.search(regex, item.text_content())
    if found is None:
        return None
    return found.group(1, 2)

def extract_url(item):
    """Return the ``href`` of the first link that is a direct child of ``item``.

    ``item`` is queried with the XPath ``a[@href]``; the ``href`` attribute of
    the first result is returned.

    Returns:
        The href string, or ``None`` when the query yields no results.
    """
    for anchor in item.xpath('a[@href]'):
        # Only the first matching link is of interest.
        return anchor.attrib['href']
    return None

def process(item):
    """Turn one list item into a record dict.

    Combines the result of ``parse_text(item)`` with ``extract_url(item)``.

    Returns:
        ``{'number': ..., 'name': ..., 'url': ...}`` when ``parse_text``
        yields a result (``url`` may be ``None``), otherwise ``None``.
    """
    fields = parse_text(item)
    link = extract_url(item)
    if not fields:
        # The text did not match the pattern, so there is no record to build.
        return None
    return dict(number=fields[0], name=fields[1], url=link)
    
# Build one record per remaining list item and drop the items whose text did
# not match the pattern (process() returns None for those).
# The records are materialized into a list: a generator would be exhausted by
# the print below, leaving `data` empty for any cell that uses it afterwards.
data = [record
        for record in map(process, items_without_lists)
        if record is not None]

pprint.pprint(data)

[{'name': 'Singet dem Herrn ein neues Lied',
  'number': '225',
  'url': 'https://en.wikipedia.org/wiki/Singet_dem_Herrn_ein_neues_Lied'},
 {'name': 'Der Geist hilft unser Schwachheit auf',
  'number': '226',
  'url': 'https://en.wikipedia.org/wiki/Der_Geist_hilft_unser_Schwachheit_auf,_BWV_226'},
 {'name': 'Jesu, meine Freude',
  'number': '227',
  'url': 'https://en.wikipedia.org/wiki/Jesu,_meine_Freude,_BWV_227'},
 {'name': 'Fürchte dich nicht',
  'number': '228',
  'url': 'https://en.wikipedia.org/wiki/F%C3%BCrchte_dich_nicht,_BWV_228'},
 {'name': 'Komm, Jesu, komm',
  'number': '229',
  'url': 'https://en.wikipedia.org/wiki/Komm,_Jesu,_komm'},
 {'name': 'Lobet den Herrn, alle Heiden (Psalm 117)',
  'number': '230',
  'url': 'https://en.wikipedia.org/wiki/Lobet_den_Herrn,_alle_Heiden'},
 {'name': 'Sei Lob und Preis mit Ehren (2nd section from composite motet '
          'BWV Anh. 160; based on Cantata no. 28)[1]',
  'number': '231',
  'url': None},
 {'name': 'Mass in B minor, with 