In [1]:
'''
1. Scrape alternate name forms of authors from VIAF
2. Save them in a file with the corresponding DLL Identifier in an adjacent column
'''

import csv
import random
import sys
import time

import requests
from bs4 import BeautifulSoup

In [4]:
def extract_h2_text(url):
    """Download a page and return the text of each <h2> inside its Title div.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list with one string per ``<h2>`` element found inside the element
        whose ``id`` is ``Title``. Within each heading the text fragments are
        stripped and joined with single spaces. The list is empty when the
        page has no such div or the div holds no ``<h2>``.

    Raises:
        requests.RequestException: If the request fails or does not answer
            within the timeout.
    """
    # Look like a human.
    headers = {
        # The two literals are concatenated, so the first must end in a space.
        'User-Agent': ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) '
                       'AppleWebKit 537.36 (KHTML, like Gecko) Chrome'),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # HTTP header names use hyphens; an underscore is not recognised.
        'Accept-Language': 'en-GB,en;q=0.5',
    }
    # Without a timeout a single stalled connection would block forever.
    response = requests.get(url, headers=headers, timeout=30)
    # Force UTF-8 decoding; this only takes effect through response.text.
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    title_div = soup.find('div', id='Title')
    if title_div is None:
        return []
    return [
        h2_tag.get_text(separator=' ', strip=True)
        for h2_tag in title_div.find_all('h2')
    ]

def read_csv(file_path):
    """Read (URL, identifier) pairs from the input CSV file.

    The first row is treated as a header and skipped. From every following
    row the third column (index 2) is taken as the URL and the second column
    (index 1) as the identifier.

    Args:
        file_path: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A list of ``(row[2], row[1])`` tuples in file order. Rows that are
        blank, have fewer than three columns, or whose third column is empty
        or whitespace are left out. An empty file yields an empty list.
    """
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header row; the default keeps an empty file from raising.
        next(reader, None)
        # csv.reader yields [] for blank lines, so check the length before
        # touching row[2].
        return [
            (row[2], row[1])
            for row in reader
            if len(row) > 2 and row[2].strip()
        ]

def write_csv(file_path, data):
    """Write name/identifier rows to a CSV file, replacing any existing file.

    Args:
        file_path: Path of the CSV file to create.
        data: Iterable of rows; each row is written after the header
            ``Name, DLL Identifier``.
    """
    # Encode explicitly as UTF-8 so names with non-ASCII characters are
    # written the same way regardless of the platform's default encoding.
    with open(file_path, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['Name', 'DLL Identifier'])
        writer.writerows(data)

def main(input_csv, output_csv):
    """Scrape the headings for every URL in the input CSV into the output CSV.

    Reads (URL, identifier) pairs with ``read_csv``, fetches each URL with
    ``extract_h2_text`` and writes one output row per returned heading, paired
    with the identifier of the input row it came from. Each written row is
    also echoed to stdout.

    Args:
        input_csv: Path of the CSV file listing identifiers and URLs.
        output_csv: Path of the CSV file to create (overwritten if present).
    """
    rows = read_csv(input_csv)

    # Explicit UTF-8 so non-ASCII name forms are written on every platform.
    with open(output_csv, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['H2 Text', 'Identifier'])

        for index, (url, identifier) in enumerate(rows):
            if index:
                # Random 1-5 second pause between consecutive requests; there
                # is nothing to wait for before the first or after the last.
                time.sleep(random.uniform(1, 5))

            try:
                h2_texts = extract_h2_text(url)
            except requests.RequestException as error:
                # One unreachable page should not abort the whole run; report
                # it and carry on with the next URL.
                print(f"Skipping {identifier} ({url}): {error}", file=sys.stderr)
                continue

            for text in h2_texts:
                writer.writerow([text, identifier])
                print(text, ", ", identifier)

if __name__ == "__main__":
    # Script entry point: scrape every URL listed in the input CSV and store
    # the results in the output CSV.
    # The input is handed to read_csv, which skips the header row and takes
    # column index 2 as the URL and column index 1 as the identifier.
    input_csv = 'input/authors-viaf.csv'  # Replace with your input CSV file path
    # main() opens this path in write mode, so an existing file is overwritten.
    output_csv = 'output/viaf-authors-output.csv'  # Replace with your desired output CSV file path
    main(input_csv, output_csv)

Albert, of Aachen, active 11th-12th century ,  A5558
Albert, d'Aix, 11e/12e s. ,  A5558
Albertus Aquensis ,  A5558
Marullo, Michele, 1453-1500 ,  A5552
Marullus 1453-1500 ,  A5552
Marullo Tarcaniota, Michele, -1500 ,  A5552
Marullo Tarcaniota, Michele 1453-1500 ,  A5552
Marullus, Michael Tarchaniota, 1453-1500 ,  A5552
Marullus, Michael, detto Tarchaniota, 1453-1500 ,  A5552
Michael Tarchaniota Marullus ,  A5552
Marullo Torcaniota, Michele ,  A5552
Marullo, Michele Tarchaniota‪ (1453-1500) ,  A5552
Marullo Tarcaniota, Michele d. 1500 ,  A5552
Marullo Tarcaniota, Michele, m. 1500 ,  A5552
Michael Marullus, detto Tarchaniota, 1453-1500 ,  A5552
Marullo, Michele, detto Tarchaniota, 1453-1500 ,  A5552
Marullus, Michael Tarchaniota ,  A5552
Corvinus, Laurentius, 1465-1527 ,  A5553
Korwin, Wawrzyniec ,  A5553
Korwin, Wawrzyniec (około 1465-1527). ,  A5553
Corvinus, Laurentius, ca 1465-1527 ,  A5553
Corvinus, Laurentius, approximately 1465-1527 ,  A5553
Korwin, Wawrzyniec (ok. 1465-1527) ,  A