In [68]:
# load complete.csv
import pandas as pd

# Read the laureate table from disk, then display its column index so the
# available fields are visible before any processing.
source_csv = 'complete.csv'
df = pd.read_csv(source_csv)
df.columns

Index(['awardYear', 'category', 'categoryFullName', 'sortOrder', 'portion',
       'prizeAmount', 'prizeAmountAdjusted', 'dateAwarded', 'prizeStatus',
       'motivation', 'categoryTopMotivation', 'award_link', 'id', 'name',
       'knownName', 'givenName', 'familyName', 'fullName', 'penName', 'gender',
       'laureate_link', 'birth_date', 'birth_city', 'birth_cityNow',
       'birth_continent', 'birth_country', 'birth_countryNow',
       'birth_locationString', 'death_date', 'death_city', 'death_cityNow',
       'death_continent', 'death_country', 'death_countryNow',
       'death_locationString', 'orgName', 'nativeName', 'acronym',
       'org_founded_date', 'org_founded_city', 'org_founded_cityNow',
       'org_founded_continent', 'org_founded_country',
       'org_founded_countryNow', 'org_founded_locationString', 'ind_or_org',
       'residence_1', 'residence_2', 'affiliation_1', 'affiliation_2',
       'affiliation_3', 'affiliation_4'],
      dtype='object')

# Add photo information

In [70]:
import json
import os
import urllib
import urllib.request

import lxml
import lxml.html
import requests
import tqdm

# Placeholder recorded for laureates whose Wikidata page shows no image.
DEFAULT_IMG_URL = "https://static.vecteezy.com/system/resources/thumbnails/009/292/244/small/default-avatar-icon-of-social-media-user-vector.jpg"
DEFAULT_IMG_LOCAL = "faces/default.jpg"
# Absolute XPath of the box containing the images on a Wikidata entity page.
# NOTE(review): this is tied to the page layout at scraping time; if the
# layout changes it will stop matching and every laureate gets the placeholder.
WIKIDATA_IMG_BOX_XPATH = '/html/body/div[3]/div[3]/div[5]/div[1]/div[1]/div[3]/div[1]/div[2]'
# Seconds to wait on each HTTP request so a stalled connection cannot hang the run.
REQUEST_TIMEOUT = 30

# urlretrieve does not create missing directories.
os.makedirs("faces", exist_ok=True)

# One entry per row of df, in row order; attached to df as columns afterwards.
img_online_paths = []
img_local_paths = []
for link in tqdm.tqdm(df['laureate_link'], total=len(df)):
    # The laureate endpoint answers with a JSON list; only the first record is used.
    laureate_response = requests.get(link, timeout=REQUEST_TIMEOUT)
    laureate_response.raise_for_status()
    records = laureate_response.json()
    if len(records) > 1:
        print("STRANGE", len(records))
    wikidata = records[0]["wikidata"]
    wiki_url = wikidata["url"]
    img_name = wikidata["id"]

    # Fetch the Wikidata page and look for <img> tags inside the image box.
    wiki_response = requests.get(wiki_url, timeout=REQUEST_TIMEOUT)
    wiki_response.raise_for_status()
    boxes = lxml.html.fromstring(wiki_response.text).xpath(WIKIDATA_IMG_BOX_XPATH)
    if not boxes:
        # Report instead of crashing with IndexError when the box is absent.
        tqdm.tqdm.write(f"image box not found on {wiki_url}; using placeholder")
    imgs = boxes[0].xpath('.//img') if boxes else []
    if not imgs:
        img_online_paths.append(DEFAULT_IMG_URL)
        img_local_paths.append(DEFAULT_IMG_LOCAL)
        continue

    # Only protocol-relative sources ("//host/...") need a scheme prepended;
    # an already absolute URL is used as is.
    src = imgs[0].attrib['src']
    img_path = "https:" + src if src.startswith("//") else src

    local_path = f'faces/{img_name}.jpg'
    urllib.request.urlretrieve(img_path, local_path)  # download it locally
    img_online_paths.append(img_path)
    img_local_paths.append(local_path)

100%|██████████| 950/950 [35:06<00:00,  2.22s/it]


In [71]:
# Store the collected image locations as two new columns of the laureate table.
for column_name, paths in (
    ('img_online_path', img_online_paths),
    ('img_local_path', img_local_paths),
):
    df[column_name] = paths

In [73]:
# Keep only the rows flagged "Individual" in ind_or_org, ordered by award year.
is_individual = df["ind_or_org"] == "Individual"
df = df.loc[is_individual].sort_values("awardYear")

In [74]:
# Narrow the table down to the columns used from here on.
kept_columns = [
    "awardYear",
    "category",
    "name",
    "gender",
    "birth_date",
    "affiliation_1",
    "birth_cityNow",
    "birth_countryNow",
    "motivation",
    "award_link",
    "laureate_link",
    "img_online_path",
    "img_local_path",
]
df = df[kept_columns]

In [75]:
# Age at award time, computed from years only: award year minus the birth
# year, where the birth year is the text before the first "-" of birth_date.
birth_year = df["birth_date"].str.split("-").str[0].astype(int)
df["age"] = df["awardYear"] - birth_year
df.head(2)

Unnamed: 0,awardYear,category,name,gender,birth_date,affiliation_1,birth_cityNow,birth_countryNow,motivation,award_link,laureate_link,img_online_path,img_local_path,age
211,1901,Physiology or Medicine,Emil von Behring,male,1854-03-15,"Marburg University, Marburg, Germany",Lawice,Poland,"for his work on serum therapy, especially its ...",https://masterdataapi.nobelprize.org/2/nobelPr...,http://masterdataapi.nobelprize.org/2/laureate...,https://upload.wikimedia.org/wikipedia/commons...,faces/Q76425.jpg,47
415,1901,Chemistry,Jacobus H. van 't Hoff,male,1852-08-30,"Berlin University, Berlin, Germany",Rotterdam,the Netherlands,in recognition of the extraordinary services h...,https://masterdataapi.nobelprize.org/2/nobelPr...,http://masterdataapi.nobelprize.org/2/laureate...,https://upload.wikimedia.org/wikipedia/commons...,faces/Q102822.jpg,49


In [76]:
# Shorten the "Physiology or Medicine" label to "Medicine"; every other
# category value is left as it is.
category_aliases = {"Physiology or Medicine": "Medicine"}
df["category"] = df["category"].replace(category_aliases)

In [77]:
# Country of the first affiliation: the text after the last comma of
# affiliation_1 (e.g. "Marburg University, Marburg, Germany" -> "Germany").
# Rows without an affiliation (NaN) get an empty string.
df["affiliation_country"] = (
    df["affiliation_1"]
    .str.split(",")
    .str[-1]
    .str.strip()
    .fillna("")
)

# Replace historical country labels with the present-day country name.
HISTORICAL_COUNTRY_NAMES = {
    "USSR (now Russia)": "Russia",
    "Czechoslovakia (now Czech Republic)": "Czech Republic",
    "Germany (now France)": "France",
}
for old_name, current_name in HISTORICAL_COUNTRY_NAMES.items():
    # regex=False is required: the labels contain parentheses, which a regex
    # replacement would read as groups, so the literal text would never match.
    df["affiliation_country"] = df["affiliation_country"].str.replace(
        old_name, current_name, regex=False
    )

In [78]:
# Approximate (latitude, longitude) for each country or zone label.
latlon = {
    "USA": (38.51, -100.30),
    "United Kingdom": (52.94, -1.23),
    "Germany": (51.42, 10.60),
    "France": (46.97, 2.70),
    "Switzerland": (47.00, 8.43),
    "Japan": (36.98, 139.59),
    "Sweden": (60.80, 15.21),
    "Russia": (61.10, 81.38),
    "the Netherlands": (52.52, 5.74),
    "Canada": (54.98, -100.53),
    "Denmark": (56.31, 9.69),
    "Austria": (47.35, 14.47),
    "Italy": (42.41, 12.74),
    "Belgium": (50.67, 4.86),
    "Norway": (61.19, 8.66),
    "Israel": (31.51, 34.93),
    "Australia": (-26.32, 134.63),
    "Argentina": (-37.57, -66.30),
    "Spain": (40.37, -3.14),
    "Tunisia": (33.34, 9.60),
    "India": (20.60, 78.66),
    "Hungary": (46.66, 18.98),
    "Finland": (63.55, 27.32),
    "Portugal": (39.76, -8.27),
    "Ireland": (53.39, -8.32),
    "Czech Republic": (49.90, 15.61),
    "China": (32.87, 105.22),
    "Europe": (48.55, 5.72),
    "North America": (38.51, -100.30),
}

# Countries grouped under the "Europe" zone label. This is a geographic
# grouping, not EU membership (it includes the United Kingdom, Switzerland
# and Norway).
EUROPE_COUNTRIES = frozenset([
    "United Kingdom", "Germany", "France", "Switzerland", "Sweden",
    "the Netherlands", "Denmark", "Austria", "Italy", "Belgium", "Norway",
    "Spain", "Hungary", "Finland", "Portugal", "Ireland", "Czech Republic",
])
# Countries grouped under the "North America" zone label.
NORTH_AMERICA_COUNTRIES = frozenset(["USA", "Canada"])
# Coordinates given to any label missing from `latlon` (for example "Poland",
# or the empty string used when there is no affiliation).
# NOTE(review): such rows land at latitude 0 / longitude 0 rather than being
# marked as missing — confirm that downstream consumers expect this.
UNKNOWN_LATLON = (0, 0)


def country_to_zone(country):
    """Return the zone label for a country.

    Countries in EUROPE_COUNTRIES map to "Europe", countries in
    NORTH_AMERICA_COUNTRIES map to "North America"; any other value
    (including missing values) is returned unchanged.
    """
    if country in EUROPE_COUNTRIES:
        return "Europe"
    if country in NORTH_AMERICA_COUNTRIES:
        return "North America"
    return country


def lookup_coordinate(labels, axis):
    """Map a Series of country/zone labels to one coordinate.

    axis=0 selects the latitude and axis=1 the longitude from `latlon`;
    labels that are not keys of `latlon` fall back to UNKNOWN_LATLON.
    """
    return labels.map(lambda label: latlon.get(label, UNKNOWN_LATLON)[axis])


# Zone columns are created first, then the coordinate columns, so the column
# order of the resulting table stays the same as before the refactor.
df["affiliation_zone"] = df["affiliation_country"].map(country_to_zone)
df["birth_zone"] = df["birth_countryNow"].map(country_to_zone)

for prefix, country_col in (
    ("affiliation", "affiliation_country"),
    ("birth", "birth_countryNow"),
):
    zone_col = f"{prefix}_zone"
    df[f"{prefix}_lat"] = lookup_coordinate(df[country_col], 0)
    df[f"{prefix}_lon"] = lookup_coordinate(df[country_col], 1)
    df[f"{prefix}_zone_lat"] = lookup_coordinate(df[zone_col], 0)
    df[f"{prefix}_zone_lon"] = lookup_coordinate(df[zone_col], 1)
df.head(2)

Unnamed: 0,awardYear,category,name,gender,birth_date,affiliation_1,birth_cityNow,birth_countryNow,motivation,award_link,...,affiliation_zone,birth_zone,affiliation_lat,affiliation_lon,affiliation_zone_lat,affiliation_zone_lon,birth_lat,birth_lon,birth_zone_lat,birth_zone_lon
211,1901,Medicine,Emil von Behring,male,1854-03-15,"Marburg University, Marburg, Germany",Lawice,Poland,"for his work on serum therapy, especially its ...",https://masterdataapi.nobelprize.org/2/nobelPr...,...,Europe,Poland,51.42,10.60,48.55,5.72,0.00,0.00,0.00,0.00
415,1901,Chemistry,Jacobus H. van 't Hoff,male,1852-08-30,"Berlin University, Berlin, Germany",Rotterdam,the Netherlands,in recognition of the extraordinary services h...,https://masterdataapi.nobelprize.org/2/nobelPr...,...,Europe,Europe,51.42,10.60,48.55,5.72,52.52,5.74,48.55,5.72
836,1901,Literature,Sully Prudhomme,male,1839-03-16,,Paris,France,in special recognition of his poetic compositi...,https://masterdataapi.nobelprize.org/2/nobelPr...,...,,Europe,0.00,0.00,0.00,0.00,46.97,2.70,48.55,5.72
360,1901,Peace,Henry Dunant,male,1828-05-08,,Geneva,Switzerland,for his humanitarian efforts to help wounded s...,https://masterdataapi.nobelprize.org/2/nobelPr...,...,,Europe,0.00,0.00,0.00,0.00,47.00,8.43,48.55,5.72
901,1901,Physics,Wilhelm Conrad Röntgen,male,1845-03-27,"Munich University, Munich, Germany",Remscheid,Germany,in recognition of the extraordinary services h...,https://masterdataapi.nobelprize.org/2/nobelPr...,...,Europe,Europe,51.42,10.60,48.55,5.72,51.42,10.60,48.55,5.72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
559,2019,Chemistry,M. Stanley Whittingham,male,1941-12-22,"Binghamton University, State University of New...",,United Kingdom,for the development of lithium-ion batteries,https://masterdataapi.nobelprize.org/2/nobelPr...,...,North America,Europe,38.51,-100.30,38.51,-100.30,52.94,-1.23,48.55,5.72
428,2019,Physics,James Peebles,male,1935-04-25,"Princeton University, Princeton, NJ, USA",Winnipeg,Canada,for theoretical discoveries in physical cosmology,https://masterdataapi.nobelprize.org/2/nobelPr...,...,North America,North America,38.51,-100.30,38.51,-100.30,54.98,-100.53,38.51,-100.30
918,2019,Medicine,William G. Kaelin Jr,male,1957-11-23,"Harvard Medical School, Boston, MA, USA","New York, NY",USA,for their discoveries of how cells sense and a...,https://masterdataapi.nobelprize.org/2/nobelPr...,...,North America,North America,38.51,-100.30,38.51,-100.30,38.51,-100.30,38.51,-100.30
322,2019,Medicine,Gregg L. Semenza,male,1956-07-12,"Johns Hopkins University, Baltimore, MD, USA","New York, NY",USA,for their discoveries of how cells sense and a...,https://masterdataapi.nobelprize.org/2/nobelPr...,...,North America,North America,38.51,-100.30,38.51,-100.30,38.51,-100.30,38.51,-100.30


In [79]:
# Display the rows whose affiliation zone is "Tunisia".
tunisia_mask = df["affiliation_zone"].eq("Tunisia")
df.loc[tunisia_mask]

Unnamed: 0,awardYear,category,name,gender,birth_date,affiliation_1,birth_cityNow,birth_countryNow,motivation,award_link,...,affiliation_zone,birth_zone,affiliation_lat,affiliation_lon,affiliation_zone_lat,affiliation_zone_lon,birth_lat,birth_lon,birth_zone_lat,birth_zone_lon
131,1928,Medicine,Charles Nicolle,male,1866-09-21,"Institut Pasteur, Tunis, Tunisia",Rouen,France,for his work on typhus,https://masterdataapi.nobelprize.org/2/nobelPr...,...,Tunisia,Europe,33.34,9.6,33.34,9.6,46.97,2.7,48.55,5.72


In [81]:
# Write the processed table as CSV (without the index) and as a JSON list of
# row records.
df.to_csv("complete_processed.csv", index=False)
# The serialized text gets its own name: binding it to `json` would shadow the
# `json` module imported earlier and break any cell that uses that module
# after this one has run.
processed_json = df.to_json(orient="records")
with open("complete_processed.json", "w", encoding="utf-8") as f:
    f.write(processed_json)