In [91]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import json
import utils
import importlib

In [92]:
with open('royaltree.json', 'r') as file:
    data = json.load(file)
data

{'/wiki/Charlemagne': {'id': '/wiki/Charlemagne',
  'name': 'Charlemagne',
  'title': '/wiki/Holy_Roman_Emperor',
  'picture': '//upload.wikimedia.org/wikipedia/commons/thumb/f/fb/Charlemagne_denier_Mayence_812_814.jpg/225px-Charlemagne_denier_Mayence_812_814.jpg',
  'birthDate': '0747-04-02T00:00:00',
  'birthPlace': '/wiki/Li%C3%A8ge',
  'deathDate': '0814-01-28T00:00:00',
  'deathPlace': '/wiki/Francia',
  'spouseList': ['/wiki/Desiderata_of_the_Lombards',
   '/wiki/Hildegard_of_the_Vinzgau',
   '/wiki/Fastrada',
   '/wiki/Luitgard_(Frankish_queen)'],
  'issueList': ['/wiki/Pepin_the_Hunchback',
   '/wiki/Charles_the_Younger',
   '/wiki/Pepin_of_Italy',
   '/wiki/Louis_the_Pious'],
  'dynasty': '/wiki/Carolingian_dynasty',
  'father': '/wiki/Pepin_the_Short',
  'mother': '/wiki/Bertrada_of_Laon',
  'religion': '/wiki/Chalcedonian_Christianity'},
 '/wiki/Charles_the_Younger': {'id': '/wiki/Charles_the_Younger',
  'name': 'Charles the Younger',
  'title': '/wiki/King_of_the_Franks',
 

In [93]:
missingData = {}
fields = ['birthDate', 'birthPlace', 'deathDate', 'deathPlace']
for personId in data:
    for field in fields:
        if field not in data[personId] or not data[personId][field]:
            personMissing = missingData.get(personId, [])
            personMissing.append(field)
            missingData[personId] = personMissing
missingData

{'/wiki/Charles_the_Younger': ['birthPlace', 'deathPlace'],
 '/wiki/Pepin_the_Short': ['birthPlace'],
 '/wiki/Hildegard_of_the_Vinzgau': ['birthPlace'],
 '/wiki/Gisela,_Abbess_of_Chelles': ['birthDate',
  'birthPlace',
  'deathDate',
  'deathPlace'],
 '/wiki/Pepin_of_Italy': ['deathPlace'],
 '/wiki/Luitgard_(Frankish_queen)': ['birthDate', 'birthPlace', 'deathPlace'],
 '/wiki/Rotrude_of_Hesbaye': ['birthDate', 'birthPlace', 'deathPlace'],
 '/wiki/Grifo_(noble)': ['birthDate', 'birthPlace', 'deathDate', 'deathPlace'],
 '/wiki/Pepin_of_Herstal': ['birthPlace'],
 '/wiki/Begga': ['birthPlace', 'deathPlace'],
 '/wiki/Bernard,_son_of_Charles_Martel': ['birthDate',
  'birthPlace',
  'deathDate',
  'deathPlace'],
 '/wiki/Gerold_of_Vinzgau': ['birthPlace', 'deathPlace'],
 '/wiki/Remigius_of_Rouen': ['birthDate',
  'birthPlace',
  'deathDate',
  'deathPlace'],
 '/wiki/Hieronymus,_son_of_Charles_Martel': ['birthDate',
  'birthPlace',
  'deathDate',
  'deathPlace'],
 '/wiki/Lothair_I': ['birthPlac

In [94]:
def loadInfoBox(wiki_url):
    soup = utils.getWikiPage(wiki_url)
    return soup.find("table", class_="infobox")
loadInfoBox('/wiki/Charlemagne')

<table class="infobox vcard"><tbody><tr><th class="infobox-above fn" colspan="2" style="background-color: #cbe; font-size: 125%">Charlemagne</th></tr><tr><td class="infobox-subheader" colspan="2"><i><a href="/wiki/Holy_Roman_Emperor" title="Holy Roman Emperor">Emperor of the Romans</a></i></td></tr><tr><td class="infobox-image photo" colspan="2"><a class="image" href="/wiki/File:Charlemagne_denier_Mayence_812_814.jpg"><img alt="Charlemagne denier Mayence 812 814.jpg" data-file-height="587" data-file-width="559" decoding="async" height="236" src="//upload.wikimedia.org/wikipedia/commons/thumb/f/fb/Charlemagne_denier_Mayence_812_814.jpg/225px-Charlemagne_denier_Mayence_812_814.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/f/fb/Charlemagne_denier_Mayence_812_814.jpg/338px-Charlemagne_denier_Mayence_812_814.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/f/fb/Charlemagne_denier_Mayence_812_814.jpg/450px-Charlemagne_denier_Mayence_812_814.jpg 2x" width="225"/></a><div

In [95]:
missingBirthDate = []
missingBirthPlace = []
missingDeathDate = []
missingDeathPlace = []
for personId in missingData:
    infobox = loadInfoBox(personId)
    if infobox:
        if 'birthDate' in missingData[personId]:
            dateRow = infobox.find("th", text="Born").parent if infobox.find("th", text="Born") else None
            if dateRow:
                missingBirthDate.append(personId)
        if 'birthPlace' in missingData[personId]:
            dateRow = infobox.find("th", text="Born").parent if infobox.find("th", text="Born") else None
            if dateRow:
                missingBirthPlace.append(personId)
        if 'deathDate' in missingData[personId]:
            dateRow = infobox.find("th", text="Died").parent if infobox.find("th", text="Died") else None
            if dateRow:
                missingDeathDate.append(personId)
        if 'deathPlace' in missingData[personId]:
            dateRow = infobox.find("th", text="Died").parent if infobox.find("th", text="Died") else None
            if dateRow:
                missingDeathPlace.append(personId)

In [96]:
print(missingBirthDate)
print(missingBirthPlace)
print(missingDeathDate)
print(missingDeathPlace)

['/wiki/Edith_of_Polesworth', '/wiki/Eadhild', '/wiki/%C3%86thelstan_%C3%86theling', '/wiki/Tove_of_the_Obotrites', '/wiki/Euphemia_of_Kiev', '/wiki/Davyd_Sviatoslavich', '/wiki/Boris_and_Gleb', '/wiki/Cristina_(daughter_of_Edward_the_Exile)', '/wiki/Ingibiorg_Finnsdottir', '/wiki/Agnes_of_Burgundy,_Duchess_of_Aquitaine', '/wiki/Gyula_II', '/wiki/Gunnhildr_Sveinsd%C3%B3ttir', '/wiki/Louis_I,_Landgrave_of_Thuringia', '/wiki/Judith_d%27%C3%89vreux', '/wiki/Reginar_III,_Count_of_Hainaut', '/wiki/Gottschalk_(Obotrite_prince)']
['/wiki/Charles_the_Younger', '/wiki/Pepin_the_Short', '/wiki/Hildegard_of_the_Vinzgau', '/wiki/Pepin_of_Herstal', '/wiki/Begga', '/wiki/Gerold_of_Vinzgau', '/wiki/Lothair_I', '/wiki/Carloman_I', '/wiki/Alpaida', '/wiki/Himiltrude', '/wiki/Ansegisel', '/wiki/Desiderius', '/wiki/Ermengarde_of_Hesbaye', '/wiki/Louis_the_German', '/wiki/Carloman_(mayor_of_the_palace)', '/wiki/Rothilde', '/wiki/Theodrada', '/wiki/Gisela,_daughter_of_Louis_the_Pious', '/wiki/Ermentrude_of

In [112]:
importlib.reload(utils)
for id in missingBirthPlace:
    person = { 'id': id }
    utils.extractDate(loadInfoBox(id), person, birth=True)
    print(person)

<tr><th class="infobox-label" scope="row">Born</th><td class="infobox-data" style="text-align: left;"><abbr title="circa">c.</abbr><span style="white-space:nowrap;"> 1015</span></td></tr>
{'id': '/wiki/Eustace_II,_Count_of_Boulogne', 'birthDate': datetime.datetime(1015, 1, 1, 0, 0), 'birthPlace': None}
<tr><th class="infobox-label" scope="row">Born</th><td class="infobox-data" style="text-align: left;"><abbr title="circa">c.</abbr><span style="white-space:nowrap;"> 1045-1055</span></td></tr>
{'id': '/wiki/Robert_Fitzhamon', 'birthDate': datetime.datetime(1045, 1, 1, 0, 0), 'birthPlace': None}
<tr><th class="infobox-label" scope="row">Born</th><td class="infobox-data">c. 1073</td></tr>
{'id': '/wiki/Philippa,_Countess_of_Toulouse', 'birthDate': datetime.datetime(1073, 1, 1, 0, 0), 'birthPlace': None}
<tr><th class="infobox-label" scope="row">Born</th><td class="infobox-data" style="text-align: left;"><abbr title="circa">c.</abbr><span style="white-space:nowrap;"> 958</span></td></tr>
{'

In [53]:

datetime.strptime('768', '%Y')

ValueError: time data '768' does not match format '%Y'