In [110]:
import functools

from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [188]:
composers = pd.DataFrame.from_csv('../data/composers.csv', sep='|')
composers = composers.reset_index()
composers

  """Entry point for launching an IPython kernel.


Unnamed: 0,name,born,died,url
0,Mary Anne à Beckett,1817,1863,https://en.wikipedia.org/wiki/Mary_Anne_%C3%A0...
1,Thorvald Aagaard,1877,1937,https://en.wikipedia.org/wiki/Thorvald_Aagaard
2,Truid Aagesen,1593,1625,https://en.wikipedia.org/wiki/Truid_Aagesen
3,Heikki Aaltoila,1905,1992,https://en.wikipedia.org/wiki/Heikki_Aaltoila
4,Juhan Aavik,1884,1982,https://en.wikipedia.org/wiki/Juhan_Aavik
5,Evaristo Felice Dall,1675,1742,https://en.wikipedia.org/wiki/Evaristo_Felice_...
6,Joseph Abaco,1710,1805,https://en.wikipedia.org/wiki/Joseph_Abaco
7,Antonio Maria Abbatini,1595,1680,https://en.wikipedia.org/wiki/Antonio_Maria_Ab...
8,Gamal Abdel,1924,1988,https://en.wikipedia.org/wiki/Gamal_Abdel-Rahim
9,Rosalina Abejo,1922,1991,https://en.wikipedia.org/wiki/Rosalina_Abejo


In [50]:
eras = [
    'Medieval',
    'Renaissance',
    'Baroque',
    'Classical-era',
    'Romantic-era',
    '20th-century',
    '21st-century',
]


def era_to_url(era):
    return 'https://en.wikipedia.org/wiki/List_of_{}_composers'.format(era)

In [190]:
@functools.lru_cache()
def soup(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
    except requests.HTTPError:
        return None


@functools.lru_cache()
def wiki_links(soup):
    if soup is None:
        return []
  
    links = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and href.startswith('/wiki'):
            links.append(link.get('title'))
    return links


@functools.lru_cache()
def names_by_era(era):
    url = era_to_url(era)
    link_titles = wiki_links(soup(url))
    return pd.DataFrame.from_records(list({(link_title, True) for link_title in link_titles}), columns=['title', era], index='title')

In [191]:
pd.merge(composers, names_by_era(eras[0]), how='left', left_on='name', right_index=True)

Unnamed: 0,name,born,died,url,Medieval
0,Mary Anne à Beckett,1817,1863,https://en.wikipedia.org/wiki/Mary_Anne_%C3%A0...,
1,Thorvald Aagaard,1877,1937,https://en.wikipedia.org/wiki/Thorvald_Aagaard,
2,Truid Aagesen,1593,1625,https://en.wikipedia.org/wiki/Truid_Aagesen,
3,Heikki Aaltoila,1905,1992,https://en.wikipedia.org/wiki/Heikki_Aaltoila,
4,Juhan Aavik,1884,1982,https://en.wikipedia.org/wiki/Juhan_Aavik,
5,Evaristo Felice Dall,1675,1742,https://en.wikipedia.org/wiki/Evaristo_Felice_...,
6,Joseph Abaco,1710,1805,https://en.wikipedia.org/wiki/Joseph_Abaco,
7,Antonio Maria Abbatini,1595,1680,https://en.wikipedia.org/wiki/Antonio_Maria_Ab...,
8,Gamal Abdel,1924,1988,https://en.wikipedia.org/wiki/Gamal_Abdel-Rahim,
9,Rosalina Abejo,1922,1991,https://en.wikipedia.org/wiki/Rosalina_Abejo,


In [195]:
def join_composers_to_eras(composers, eras):
    series = [names_by_era(era) for era in eras]
    for era in eras:
        composers = pd.merge(composers, names_by_era(era), how='left', left_on='name', right_index=True)
    return composers

composers_with_eras = join_composers_to_eras(composers, eras)
composers_with_eras = composers_with_eras.fillna(False)
composers_with_eras

Unnamed: 0,name,born,died,url,Medieval,Renaissance,Baroque,Classical-era,Romantic-era,20th-century,21st-century
0,Mary Anne à Beckett,1817,1863,https://en.wikipedia.org/wiki/Mary_Anne_%C3%A0...,False,False,False,False,False,False,False
1,Thorvald Aagaard,1877,1937,https://en.wikipedia.org/wiki/Thorvald_Aagaard,False,False,False,False,False,False,False
2,Truid Aagesen,1593,1625,https://en.wikipedia.org/wiki/Truid_Aagesen,False,True,False,False,False,False,False
3,Heikki Aaltoila,1905,1992,https://en.wikipedia.org/wiki/Heikki_Aaltoila,False,False,False,False,False,False,False
4,Juhan Aavik,1884,1982,https://en.wikipedia.org/wiki/Juhan_Aavik,False,False,False,False,False,False,False
5,Evaristo Felice Dall,1675,1742,https://en.wikipedia.org/wiki/Evaristo_Felice_...,False,False,False,False,False,False,False
6,Joseph Abaco,1710,1805,https://en.wikipedia.org/wiki/Joseph_Abaco,False,False,True,True,False,False,False
7,Antonio Maria Abbatini,1595,1680,https://en.wikipedia.org/wiki/Antonio_Maria_Ab...,False,False,True,False,False,False,False
8,Gamal Abdel,1924,1988,https://en.wikipedia.org/wiki/Gamal_Abdel-Rahim,False,False,False,False,False,False,False
9,Rosalina Abejo,1922,1991,https://en.wikipedia.org/wiki/Rosalina_Abejo,False,False,False,False,False,False,False


In [196]:
sequential_eras = list(zip(composers_with_eras.columns[3:], composers_with_eras.columns[4:]))

def column_name(a, b):
    a = re.sub(r'-era', '', a)
    b = re.sub(r'-era', '', b)
    return '{}-{}'.format(a, b)
    
for sequential_era in sequential_eras:
    composers_with_eras[column_name(*sequential_era)] = composers_with_eras.loc[:, sequential_era].all(axis=1)

composers_with_eras

Unnamed: 0,name,born,died,url,Medieval,Renaissance,Baroque,Classical-era,Romantic-era,20th-century,21st-century,url-Medieval,Medieval-Renaissance,Renaissance-Baroque,Baroque-Classical,Classical-Romantic,Romantic-20th-century,20th-century-21st-century
0,Mary Anne à Beckett,1817,1863,https://en.wikipedia.org/wiki/Mary_Anne_%C3%A0...,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,Thorvald Aagaard,1877,1937,https://en.wikipedia.org/wiki/Thorvald_Aagaard,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,Truid Aagesen,1593,1625,https://en.wikipedia.org/wiki/Truid_Aagesen,False,True,False,False,False,False,False,False,False,False,False,False,False,False
3,Heikki Aaltoila,1905,1992,https://en.wikipedia.org/wiki/Heikki_Aaltoila,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,Juhan Aavik,1884,1982,https://en.wikipedia.org/wiki/Juhan_Aavik,False,False,False,False,False,False,False,False,False,False,False,False,False,False
5,Evaristo Felice Dall,1675,1742,https://en.wikipedia.org/wiki/Evaristo_Felice_...,False,False,False,False,False,False,False,False,False,False,False,False,False,False
6,Joseph Abaco,1710,1805,https://en.wikipedia.org/wiki/Joseph_Abaco,False,False,True,True,False,False,False,False,False,False,True,False,False,False
7,Antonio Maria Abbatini,1595,1680,https://en.wikipedia.org/wiki/Antonio_Maria_Ab...,False,False,True,False,False,False,False,False,False,False,False,False,False,False
8,Gamal Abdel,1924,1988,https://en.wikipedia.org/wiki/Gamal_Abdel-Rahim,False,False,False,False,False,False,False,False,False,False,False,False,False,False
9,Rosalina Abejo,1922,1991,https://en.wikipedia.org/wiki/Rosalina_Abejo,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [197]:
era_columns = [column_name(*s_e) for s_e in sequential_eras] + eras
ordered_columns = ['born', 'died', 'url'] + era_columns
composers_with_eras_ordered = composers_with_eras.loc[:, ordered_columns]

In [199]:
earliest_era = composers_with_eras_ordered.loc[:, era_columns].replace(to_replace=False, value=np.nan).idxmin(axis=1)
earliest_era = pd.DataFrame(earliest_era)
earliest_era = earliest_era.fillna('Unknown')
composers['era'] = earliest_era

In [202]:
composers.to_csv('../data/composers-annotated.csv', sep='|', index=False)