In [1]:
import numpy as np
import pandas as pd
from pymongo import MongoClient
# Presumably the MongoDB database / collection names meant for the helper
# functions defined in the next cell; not referenced in the visible cells.
DB_NOBEL_PRIZE = 'nobel_prize'
COLL_WINNERS = 'winners'

In [2]:
def get_mongo_database(db_name, host='localhost', port=27017, username=None, password=None):
    """Open a MongoDB connection and return the database named ``db_name``.

    Args:
        db_name: Name of the database to return.
        host: Server host name.
        port: Server port.
        username: Optional user name. Credentials are only used when both
            ``username`` and ``password`` are truthy.
        password: Optional password (see ``username``).

    Returns:
        ``conn[db_name]`` for a newly created ``MongoClient``.
    """
    if username and password:
        # Function-scope import so the cell does not depend on another cell.
        from urllib.parse import quote_plus

        # The last placeholder used to be "()", so db_name was silently
        # dropped and the URI pointed at a database literally named "()".
        # Credentials are percent-escaped so characters such as '@' or ':'
        # cannot corrupt the URI.
        mongo_uri = 'mongodb://{}:{}@{}/{}'.format(
            quote_plus(username), quote_plus(password), host, db_name)
        # Forward ``port`` as well; it was previously ignored on this path.
        conn = MongoClient(mongo_uri, port)
    else:
        conn = MongoClient(host, port)

    return conn[db_name]

def mongo_to_dataframe(db_name, collection, query=None, host='localhost', port=27017,
                       username=None, password=None, no_id=True):
    """Run ``find(query)`` on a collection and return the documents as a DataFrame.

    Args:
        db_name: Database name handed to ``get_mongo_database``.
        collection: Collection name looked up on that database.
        query: Filter passed to ``find``; ``None`` means match everything (``{}``).
        host, port, username, password: Forwarded to ``get_mongo_database``.
        no_id: When true, the ``_id`` column is removed from the result.

    Returns:
        A ``pandas.DataFrame`` built from the list of matching documents.
    """
    db = get_mongo_database(db_name, host, port, username, password)

    if query is None:
        query = {}
    df = pd.DataFrame(list(db[collection].find(query)))

    # An empty result set produces a frame without any columns, so an
    # unconditional ``del df['_id']`` would raise KeyError.
    if no_id and '_id' in df.columns:
        del df['_id']

    return df

def dataframe_to_mongo(df, db_name, collection, host='localhost', port=27017,
                       username=None, password=None):
    """Insert every row of ``df`` into a MongoDB collection, one document per row.

    Args:
        df: DataFrame whose rows are converted with ``to_dict('records')``.
        db_name: Database name handed to ``get_mongo_database``.
        collection: Target collection name.
        host, port, username, password: Forwarded to ``get_mongo_database``.

    Returns:
        None. An empty frame is a no-op.
    """
    records = df.to_dict('records')
    if not records:
        # pymongo's insert_many rejects an empty document list, so there is
        # nothing to do (and no need to connect) for an empty frame.
        return

    db = get_mongo_database(db_name, host, port, username, password)
    db[collection].insert_many(records)

In [3]:
# Read the Nobel winners JSON file into a DataFrame and summarise its
# columns (dtype and non-null count per column).
# NOTE(review): this is an absolute, machine-specific path; the cell only
# runs where that exact file exists. Consider a configurable data directory.
fname = '/Users/simonsu/git/dataviz/scrapy/nobel_winners/nobel_full.json'
df = pd.read_json(fname)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1065 entries, 0 to 1064
Data columns (total 12 columns):
born_in           1065 non-null object
catagory          1065 non-null object
country           1065 non-null object
date_of_birth     1056 non-null object
date_of_death     709 non-null object
gender            1056 non-null object
link              1065 non-null object
name              1065 non-null object
place_of_birth    1056 non-null object
place_of_death    709 non-null object
text              1065 non-null object
year              1065 non-null int64
dtypes: int64(1), object(11)
memory usage: 108.2+ KB


In [4]:
# List the column labels of the loaded frame.
df.columns

Index(['born_in', 'catagory', 'country', 'date_of_birth', 'date_of_death',
       'gender', 'link', 'name', 'place_of_birth', 'place_of_death', 'text',
       'year'],
      dtype='object')

In [5]:
# Rows whose category is the empty string.
df.loc[df['catagory'] == '']

Unnamed: 0,born_in,catagory,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
538,,,Ukraine,3 May 1845,2 July 1916,male,https://en.wikipedia.org/wiki/Ilya_Ilyich_Mech...,Ilya Ilyich Mechnikov,Ivanivka,Paris,"Ilya Ilyich Mechnikov , Physiology and Medicin...",1908
559,,,Ukraine,22 July 1888,16 August 1973,male,https://en.wikipedia.org/wiki/Selman_Waksman,Selman Waksman,Kiev Governorate,Barnstable County,"Selman Waksman , Physiology and Medicine, 1952",1952
649,Poland,,,30 November 1926,,male,https://en.wikipedia.org/wiki/Andrew_Schally,Andrew Schally *,Vilnius,,"Andrew Schally *, born in Wilno, Second Polis...",1977
766,India,,,9 January 1922,9 November 2011,male,https://en.wikipedia.org/wiki/Har_Gobind_Khorana,Har Gobind Khorana *,Raipur,Concord,"Har Gobind Khorana *, as a United States Citi...",1968
780,,,Hungary,22 April 1876,8 April 1936,male,https://en.wikipedia.org/wiki/R%C3%B3bert_B%C3...,Róbert Bárány,Vienna,Uppsala domkyrkoförsamling,"Róbert Bárány , born in Austria-Hungary, Medi...",1914
782,,,Hungary,13 September 1887,26 September 1976,male,https://en.wikipedia.org/wiki/Leopold_Ru%C5%BE...,Leopold Ružička born in Kingdom of Hungary,Vukovar,Mammern,Leopold Ružička born in Kingdom of Hungary,0
895,,,France,28 June 1873,5 November 1944,male,https://en.wikipedia.org/wiki/Alexis_Carrel,Alexis Carrel,Sainte-Foy-lès-Lyon,Paris,"Alexis Carrel , Medicine, 1912",1912


In [6]:
# count / unique / top / freq for the object-dtype columns only.
df.describe(include=['object'])

Unnamed: 0,born_in,catagory,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text
count,1065.0,1065,1065,1056,709,1056,1065,1065,1056,709,1065
unique,34.0,7,58,868,589,2,913,1003,607,316,1058
top,,Physiology or Medicine,United States,7 November 1867,4 July 1934,male,https://en.wikipedia.org/wiki/Marie_Curie,Aziz Sancar,New York City,Cambridge,"John Polanyi , born in Germany , Chemistry, 1986"
freq,934.0,254,352,4,4,999,4,2,44,36,2


In [7]:
# Peek at the first rows of the raw frame.
df.head()

Unnamed: 0,born_in,catagory,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
0,Austria,Chemistry,,3 December 1900,31 July 1967,male,https://en.wikipedia.org/wiki/Richard_Kuhn,Richard Kuhn *,Vienna,Heidelberg,"Richard Kuhn *, Chemistry, 1938",1938
1,Austria,Physiology or Medicine,,3 June 1873,25 December 1961,male,https://en.wikipedia.org/wiki/Otto_Loewi,Otto Loewi *,Frankfurt,New York City,"Otto Loewi *, Physiology or Medicine, 1936",1936
2,,Physics,Austria,24 June 1883,17 December 1964,male,https://en.wikipedia.org/wiki/Victor_Francis_Hess,Victor Francis Hess,Deutschfeistritz,Mount Vernon,"Victor Francis Hess , Physics, 1936",1936
3,,Physics,Austria,12 August 1887,4 January 1961,male,https://en.wikipedia.org/wiki/Erwin_Schr%C3%B6...,Erwin Schrödinger,Vienna,Vienna,"Erwin Schrödinger , Physics, 1933",1933
4,,Physiology or Medicine,Austria,14 June 1868,26 June 1943,male,https://en.wikipedia.org/wiki/Karl_Landsteiner,Karl Landsteiner,Vienna,New York City,"Karl Landsteiner , Physiology or Medicine, 1930",1930


In [8]:
# Boolean mask for award years strictly greater than 2000 (2000 itself is
# excluded), then the non-null count per column of the matching rows.
mask = df['year'].gt(2000)
winners_since_2000 = df.loc[mask]
winners_since_2000.count()

born_in           230
catagory          230
country           230
date_of_birth     228
date_of_death      35
gender            228
link              230
name              230
place_of_birth    228
place_of_death     35
text              230
year              230
dtype: int64

In [9]:
# First rows of the year > 2000 subset.
winners_since_2000.head()

Unnamed: 0,born_in,catagory,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
12,,Physiology or Medicine,Ireland,28 June 1930,,male,https://en.wikipedia.org/wiki/William_C._Campb...,William C. Campbell,Ramelton,,"William C. Campbell , Physiology or Medicine, ...",2015
21,,Chemistry,Switzerland,4 October 1938,,male,https://en.wikipedia.org/wiki/Kurt_W%C3%BCthrich,Kurt Wüthrich,Aarberg,,"Kurt Wüthrich , Chemistry, 2002",2002
39,,Literature,United States,24 May 1941,,male,https://en.wikipedia.org/wiki/Bob_Dylan,Bob Dylan,Duluth,,"Bob Dylan , Literature, 2016",2016
40,,Chemistry,United States,15 March 1930,,male,https://en.wikipedia.org/wiki/Martin_Karplus,Martin Karplus,Vienna,,"Martin Karplus , born in Austria , Chemistry,...",2013
44,,Physiology or Medicine,United States,29 January 1947,,female,https://en.wikipedia.org/wiki/Linda_B._Buck,Linda B. Buck,Seattle,,"Linda B. Buck , Physiology or Medicine, 2004",2004


In [10]:
# Profile the born_in column (count / unique / top / freq).
df.born_in.describe()

count     1065
unique      34
top           
freq       934
Name: born_in, dtype: object

In [11]:
# Distinct Python types present among the born_in values.
set(map(type, df['born_in']))

{str}

In [12]:
# Turn empty born_in strings into NaN so that count() ignores them.
# The result is assigned back explicitly instead of calling an in-place
# replace on a column alias, which relies on the alias sharing memory with
# the frame. np.nan is used because the np.NaN alias no longer exists in
# NumPy 2.0.
df['born_in'] = df['born_in'].replace('', np.nan)
bi_col = df.born_in
df.born_in.describe()

count         131
unique         33
top       Germany
freq           24
Name: born_in, dtype: object

In [13]:
# Replace empty strings with NaN in every column. The frame is rebound
# rather than mutated in place, and np.nan is used because the np.NaN alias
# was removed in NumPy 2.0.
df = df.replace('', np.nan)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1065 entries, 0 to 1064
Data columns (total 12 columns):
born_in           131 non-null object
catagory          1058 non-null object
country           934 non-null object
date_of_birth     1056 non-null object
date_of_death     709 non-null object
gender            1056 non-null object
link              1065 non-null object
name              1065 non-null object
place_of_birth    1056 non-null object
place_of_death    709 non-null object
text              1065 non-null object
year              1065 non-null int64
dtypes: int64(1), object(11)
memory usage: 108.2+ KB


In [14]:
# First rows again, after the empty-string replacement.
df.head()

Unnamed: 0,born_in,catagory,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
0,Austria,Chemistry,,3 December 1900,31 July 1967,male,https://en.wikipedia.org/wiki/Richard_Kuhn,Richard Kuhn *,Vienna,Heidelberg,"Richard Kuhn *, Chemistry, 1938",1938
1,Austria,Physiology or Medicine,,3 June 1873,25 December 1961,male,https://en.wikipedia.org/wiki/Otto_Loewi,Otto Loewi *,Frankfurt,New York City,"Otto Loewi *, Physiology or Medicine, 1936",1936
2,,Physics,Austria,24 June 1883,17 December 1964,male,https://en.wikipedia.org/wiki/Victor_Francis_Hess,Victor Francis Hess,Deutschfeistritz,Mount Vernon,"Victor Francis Hess , Physics, 1936",1936
3,,Physics,Austria,12 August 1887,4 January 1961,male,https://en.wikipedia.org/wiki/Erwin_Schr%C3%B6...,Erwin Schrödinger,Vienna,Vienna,"Erwin Schrödinger , Physics, 1933",1933
4,,Physiology or Medicine,Austria,14 June 1868,26 June 1943,male,https://en.wikipedia.org/wiki/Karl_Landsteiner,Karl Landsteiner,Vienna,New York City,"Karl Landsteiner , Physiology or Medicine, 1930",1930


In [15]:
# Names containing a literal asterisk. A raw string is used because '\*'
# in a normal string literal is an invalid escape sequence (the regex
# itself is unchanged).
df[df.name.str.contains(r'\*')]['name']

0                     Richard Kuhn *
1                       Otto Loewi *
17                      Lev Landau *
28                   Patrick White *
38                     Ronald Ross *
73                     T. S. Eliot *
152               Ben R. Mottelson *
186             Har Gobind Khorana *
225                 Czesław Miłosz *
232     Subrahmanyan Chandrasekhar *
276                Daniel Kahneman *
343                Rudyard Kipling *
347                  Robert Aumann *
372       Venkatraman Ramakrishnan *
391                  Arieh Warshel *
392                 Michael Levitt *
402                 Shuji Nakamura *
403                   John O'Keefe *
416                 Joseph Rotblat *
420     John James Rickard Macleod *
425                  Michael Smith *
428                Niels Kaj Jerne *
470              Christian de Duve *
505            Clive W. J. Granger *
506             Anthony J. Leggett *
519                Oliver Smithies *
533                 Michael Levitt *
5

In [16]:
# How many names contain a literal asterisk (raw string avoids the invalid
# '\*' escape sequence; the regex is unchanged).
df[df.name.str.contains(r'\*')]['name'].count()

131

In [17]:
# Remove the asterisk marker from every name.
# NOTE(review): the default of str.replace's regex handling differs between
# pandas versions; confirm '*' is treated literally on the version in use.
df['name'] = df['name'].str.replace('*', '')

In [18]:
# Trim leading/trailing whitespace from the names.
df['name'] = df['name'].str.strip()

In [19]:
# Check the cleaned names.
df.name.head()

0           Richard Kuhn
1             Otto Loewi
2    Victor Francis Hess
3      Erwin Schrödinger
4       Karl Landsteiner
Name: name, dtype: object

In [20]:
# Sanity check: no name should contain an asterisk any more (raw string
# avoids the invalid '\*' escape sequence; the regex is unchanged).
df[df.name.str.contains(r'\*')]

Unnamed: 0,born_in,catagory,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year


In [21]:
# Column dtypes and non-null counts after the name clean-up.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1065 entries, 0 to 1064
Data columns (total 12 columns):
born_in           131 non-null object
catagory          1058 non-null object
country           934 non-null object
date_of_birth     1056 non-null object
date_of_death     709 non-null object
gender            1056 non-null object
link              1065 non-null object
name              1065 non-null object
place_of_birth    1056 non-null object
place_of_death    709 non-null object
text              1065 non-null object
year              1065 non-null int64
dtypes: int64(1), object(11)
memory usage: 108.2+ KB


In [22]:
# Non-null count per column.
df.count()

born_in            131
catagory          1058
country            934
date_of_birth     1056
date_of_death      709
gender            1056
link              1065
name              1065
place_of_birth    1056
place_of_death     709
text              1065
year              1065
dtype: int64

In [23]:
# Rows whose name contains the substring 'Martin'.
df.loc[df['name'].str.contains('Martin')]

Unnamed: 0,born_in,catagory,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
40,,Chemistry,United States,15 March 1930,,male,https://en.wikipedia.org/wiki/Martin_Karplus,Martin Karplus,Vienna,,"Martin Karplus , born in Austria , Chemistry,...",2013
114,,Peace,United States,15 January 1929,4 April 1968,male,https://en.wikipedia.org/wiki/Martin_Luther_Ki...,Martin Luther King,Atlanta,Memphis,"Martin Luther King, Jr. , Peace, 1964",1964
165,,Physiology or Medicine,United States,10 December 1934,9 February 1994,male,https://en.wikipedia.org/wiki/Howard_Martin_Temin,Howard Martin Temin,Philadelphia,Madison,"Howard Martin Temin , Physiology or Medicine, ...",1975
286,,Physiology or Medicine,United States,1 December 1925,7 December 1998,male,https://en.wikipedia.org/wiki/Martin_Rodbell,Martin Rodbell,Baltimore,Chapel Hill,"Martin Rodbell , Physiology or Medicine, 1994",1994
294,,Physics,United States,24 June 1927,30 September 2014,male,https://en.wikipedia.org/wiki/Martin_L._Perl,Martin L. Perl,New York City,Palo Alto,"Martin L. Perl , Physics, 1995",1995
369,,Chemistry,United States,15 January 1947,,male,https://en.wikipedia.org/wiki/Martin_Chalfie,Martin Chalfie,Chicago,,"Martin Chalfie , Chemistry, 2008",2008
411,,Physics,United Kingdom,27 September 1918,14 October 1984,male,https://en.wikipedia.org/wiki/Martin_Ryle,Martin Ryle,Brighton,Cambridge,"Martin Ryle , Physics, 1974",1974
461,,Chemistry,United Kingdom,1 March 1910,28 July 2002,male,https://en.wikipedia.org/wiki/Archer_John_Port...,Archer John Porter Martin,London,Llangarron,"Archer John Porter Martin , Chemistry, 1952",1952
518,,Physiology or Medicine,United Kingdom,1 January 1941,,male,https://en.wikipedia.org/wiki/Martin_Evans,Sir Martin J. Evans,Stroud,,"Sir Martin J. Evans , Physiology or Medicine, ...",2007
577,,Literature,Sweden,6 May 1904,11 February 1978,male,https://en.wikipedia.org/wiki/Harry_Martinson,Harry Martinson,Blekinge,Stockholm,"Harry Martinson , Literature, 1974",1974


In [24]:
# Second and later occurrences of each repeated name.
df.loc[df.duplicated('name')]

Unnamed: 0,born_in,catagory,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
156,,Peace,United States,28 February 1901,19 August 1994,male,https://en.wikipedia.org/wiki/Linus_Pauling,Linus C. Pauling,Portland,Big Sur,"Linus C. Pauling , Peace, 1962",1962
203,,Physics,United States,23 May 1908,30 January 1991,male,https://en.wikipedia.org/wiki/John_Bardeen,John Bardeen,Madison,Boston,"John Bardeen , Physics, 1972",1972
228,,Physiology or Medicine,United States,29 October 1920,2 August 2011,male,https://en.wikipedia.org/wiki/Baruj_Benacerraf,Baruj Benacerraf,Caracas,Boston,"Baruj Benacerraf , born in Venezuela , Physio...",1980
399,,Physiology or Medicine,United States,28 June 1930,,male,https://en.wikipedia.org/wiki/William_C._Campb...,William C. Campbell,Ramelton,,"William C. Campbell , born in Ireland , Physi...",2015
426,,Economic,United Kingdom,29 December 1910,2 September 2013,male,https://en.wikipedia.org/wiki/Ronald_Coase,Ronald Coase,London,Chicago,"Ronald Coase , based in the United States , E...",1991
444,,Literature,United Kingdom,26 September 1888,4 January 1965,male,https://en.wikipedia.org/wiki/T._S._Eliot,T. S. Eliot,St. Louis,London,"T. S. Eliot , born in the United States , Lit...",1948
482,,Chemistry,United Kingdom,13 August 1918,19 November 2013,male,https://en.wikipedia.org/wiki/Frederick_Sanger,Frederick Sanger,Rendcomb,Cambridge,"Frederick Sanger , Chemistry, 1980",1980
497,,Physiology or Medicine,United Kingdom,13 January 1927,,male,https://en.wikipedia.org/wiki/Sydney_Brenner,Sydney Brenner,Germiston,,"Sydney Brenner , born in South Africa , Physi...",2002
506,United Kingdom,Physics,,26 March 1938,,male,https://en.wikipedia.org/wiki/Anthony_J._Leggett,Anthony J. Leggett,London,,"Anthony J. Leggett *, Physics, 2003",2003
519,United Kingdom,Physiology or Medicine,,23 June 1925,10 January 2017,male,https://en.wikipedia.org/wiki/Oliver_Smithies,Oliver Smithies,Halifax,Chapel Hill,"Oliver Smithies *, Physiology or Medicine, 2007",2007


In [25]:
# Non-null counts per column among rows that have no born_in value.
df.loc[df['born_in'].isnull()].count()

born_in             0
catagory          929
country           934
date_of_birth     925
date_of_death     624
gender            925
link              934
name              934
place_of_birth    925
place_of_death    624
text              934
year              934
dtype: int64

In [26]:
# Keep only the rows that have no born_in value.
df = df.loc[df['born_in'].isnull()]

In [27]:
# Non-null count per column after the born_in filter.
df.count()

born_in             0
catagory          929
country           934
date_of_birth     925
date_of_death     624
gender            925
link              934
name              934
place_of_birth    925
place_of_death    624
text              934
year              934
dtype: int64

In [28]:
# born_in is entirely null after the filter above, so drop the column.
df = df.drop(['born_in'], axis='columns')

In [29]:
# Non-null count per column with born_in removed.
df.count()

catagory          929
country           934
date_of_birth     925
date_of_death     624
gender            925
link              934
name              934
place_of_birth    925
place_of_death    624
text              934
year              934
dtype: int64

In [30]:
# Non-null counts among the repeated-name rows (first occurrences excluded).
df.loc[df.duplicated('name')].count()

catagory          50
country           51
date_of_birth     50
date_of_death     31
gender            50
link              51
name              51
place_of_birth    50
place_of_death    31
text              51
year              51
dtype: int64

In [31]:
# Show the repeated-name rows (first occurrences excluded).
df.loc[df.duplicated('name')]

Unnamed: 0,catagory,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
156,Peace,United States,28 February 1901,19 August 1994,male,https://en.wikipedia.org/wiki/Linus_Pauling,Linus C. Pauling,Portland,Big Sur,"Linus C. Pauling , Peace, 1962",1962
203,Physics,United States,23 May 1908,30 January 1991,male,https://en.wikipedia.org/wiki/John_Bardeen,John Bardeen,Madison,Boston,"John Bardeen , Physics, 1972",1972
228,Physiology or Medicine,United States,29 October 1920,2 August 2011,male,https://en.wikipedia.org/wiki/Baruj_Benacerraf,Baruj Benacerraf,Caracas,Boston,"Baruj Benacerraf , born in Venezuela , Physio...",1980
399,Physiology or Medicine,United States,28 June 1930,,male,https://en.wikipedia.org/wiki/William_C._Campb...,William C. Campbell,Ramelton,,"William C. Campbell , born in Ireland , Physi...",2015
426,Economic,United Kingdom,29 December 1910,2 September 2013,male,https://en.wikipedia.org/wiki/Ronald_Coase,Ronald Coase,London,Chicago,"Ronald Coase , based in the United States , E...",1991
482,Chemistry,United Kingdom,13 August 1918,19 November 2013,male,https://en.wikipedia.org/wiki/Frederick_Sanger,Frederick Sanger,Rendcomb,Cambridge,"Frederick Sanger , Chemistry, 1980",1980
497,Physiology or Medicine,United Kingdom,13 January 1927,,male,https://en.wikipedia.org/wiki/Sydney_Brenner,Sydney Brenner,Germiston,,"Sydney Brenner , born in South Africa , Physi...",2002
521,Physics,Switzerland,23 October 1905,10 September 1983,male,https://en.wikipedia.org/wiki/Felix_Bloch,Felix Bloch,Zürich,Zürich,"Felix Bloch , Physics, 1952",1952
526,Chemistry,United Kingdom,24 May 1942,,male,https://en.wikipedia.org/wiki/Fraser_Stoddart,Fraser Stoddart,Edinburgh,,"Fraser Stoddart , Chemistry, 2016",2016
527,Physics,United Kingdom,21 September 1934,,male,https://en.wikipedia.org/wiki/David_J._Thouless,David J. Thouless,Bearsden,,"David J. Thouless , Physics, 2016",2016


In [32]:
# Total number of rows.
len(df)

934

In [33]:
# Number of rows whose name already appeared in an earlier row.
len(df[df.duplicated('name')])

51

In [34]:
# Inspect every row for one repeated name.
df.loc[df['name'] == "Linus C. Pauling"]

Unnamed: 0,catagory,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
149,Chemistry,United States,28 February 1901,19 August 1994,male,https://en.wikipedia.org/wiki/Linus_Pauling,Linus C. Pauling,Portland,Big Sur,"Linus C. Pauling , Chemistry, 1954",1954
156,Peace,United States,28 February 1901,19 August 1994,male,https://en.wikipedia.org/wiki/Linus_Pauling,Linus C. Pauling,Portland,Big Sur,"Linus C. Pauling , Peace, 1962",1962


In [35]:
# Inspect every row for one repeated name.
df.loc[df['name'] == "John Bardeen"]

Unnamed: 0,catagory,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
87,Physics,United States,23 May 1908,30 January 1991,male,https://en.wikipedia.org/wiki/John_Bardeen,John Bardeen,Madison,Boston,"John Bardeen , Physics, 1956",1956
203,Physics,United States,23 May 1908,30 January 1991,male,https://en.wikipedia.org/wiki/John_Bardeen,John Bardeen,Madison,Boston,"John Bardeen , Physics, 1972",1972


In [36]:
# Inspect every row for one repeated name.
df.loc[df['name'] == "Baruj Benacerraf"]

Unnamed: 0,catagory,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
81,Physiology or Medicine,Venezuela,29 October 1920,2 August 2011,male,https://en.wikipedia.org/wiki/Baruj_Benacerraf,Baruj Benacerraf,Caracas,Boston,"Baruj Benacerraf , Physiology or Medicine, 1980",1980
228,Physiology or Medicine,United States,29 October 1920,2 August 2011,male,https://en.wikipedia.org/wiki/Baruj_Benacerraf,Baruj Benacerraf,Caracas,Boston,"Baruj Benacerraf , born in Venezuela , Physio...",1980


In [37]:
# Every row whose name occurs more than once; keep=False flags first, last
# and all middle occurrences alike.
all_dupes = df.loc[df.duplicated('name', keep=False)]
all_dupes

Unnamed: 0,catagory,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
6,Chemistry,Austria,1 April 1865,23 September 1929,male,https://en.wikipedia.org/wiki/Richard_Adolf_Zs...,Richard Adolf Zsigmondy,Vienna,Göttingen,"Richard Adolf Zsigmondy , Chemistry, 1925",1925
8,Physiology or Medicine,Austria,22 April 1876,8 April 1936,male,https://en.wikipedia.org/wiki/R%C3%B3bert_B%C3...,Róbert Bárány,Vienna,Uppsala domkyrkoförsamling,"Róbert Bárány , Physiology or Medicine, 1914",1914
12,Physiology or Medicine,Ireland,28 June 1930,,male,https://en.wikipedia.org/wiki/William_C._Campb...,William C. Campbell,Ramelton,,"William C. Campbell , Physiology or Medicine, ...",2015
22,Peace,Switzerland,,,,https://en.wikipedia.org/wiki/M%C3%A9decins_Sa...,Médecins Sans Frontières,,,"Médecins Sans Frontières , Peace, 1999",1999
33,Physics,United Kingdom,5 June 1900,9 February 1979,male,https://en.wikipedia.org/wiki/Dennis_Gabor,Dennis Gabor,Budapest,London,"Dennis Gabor , born in Hungary , Physics, 1971",1971
52,Economic,United States,29 December 1910,2 September 2013,male,https://en.wikipedia.org/wiki/Ronald_Coase,Ronald Coase,London,Chicago,"Ronald Coase , born in the United Kingdom , E...",1991
61,Chemistry,United States,7 May 1939,,male,https://en.wikipedia.org/wiki/Sidney_Altman,Sidney Altman,Montreal,,"Sidney Altman , born in Canada , Chemistry, 1990",1990
81,Physiology or Medicine,Venezuela,29 October 1920,2 August 2011,male,https://en.wikipedia.org/wiki/Baruj_Benacerraf,Baruj Benacerraf,Caracas,Boston,"Baruj Benacerraf , Physiology or Medicine, 1980",1980
87,Physics,United States,23 May 1908,30 January 1991,male,https://en.wikipedia.org/wiki/John_Bardeen,John Bardeen,Madison,Boston,"John Bardeen , Physics, 1956",1956
135,Physics,United States,23 October 1905,10 September 1983,male,https://en.wikipedia.org/wiki/Felix_Bloch,Felix Bloch,Zürich,Zürich,"Felix Bloch , born in Switzerland , Physics, ...",1952


In [38]:
# Repeated-name rows, this time excluding the last occurrence of each name.
df.loc[df.duplicated('name', keep='last')]

Unnamed: 0,catagory,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
6,Chemistry,Austria,1 April 1865,23 September 1929,male,https://en.wikipedia.org/wiki/Richard_Adolf_Zs...,Richard Adolf Zsigmondy,Vienna,Göttingen,"Richard Adolf Zsigmondy , Chemistry, 1925",1925
8,Physiology or Medicine,Austria,22 April 1876,8 April 1936,male,https://en.wikipedia.org/wiki/R%C3%B3bert_B%C3...,Róbert Bárány,Vienna,Uppsala domkyrkoförsamling,"Róbert Bárány , Physiology or Medicine, 1914",1914
12,Physiology or Medicine,Ireland,28 June 1930,,male,https://en.wikipedia.org/wiki/William_C._Campb...,William C. Campbell,Ramelton,,"William C. Campbell , Physiology or Medicine, ...",2015
22,Peace,Switzerland,,,,https://en.wikipedia.org/wiki/M%C3%A9decins_Sa...,Médecins Sans Frontières,,,"Médecins Sans Frontières , Peace, 1999",1999
33,Physics,United Kingdom,5 June 1900,9 February 1979,male,https://en.wikipedia.org/wiki/Dennis_Gabor,Dennis Gabor,Budapest,London,"Dennis Gabor , born in Hungary , Physics, 1971",1971
52,Economic,United States,29 December 1910,2 September 2013,male,https://en.wikipedia.org/wiki/Ronald_Coase,Ronald Coase,London,Chicago,"Ronald Coase , born in the United Kingdom , E...",1991
61,Chemistry,United States,7 May 1939,,male,https://en.wikipedia.org/wiki/Sidney_Altman,Sidney Altman,Montreal,,"Sidney Altman , born in Canada , Chemistry, 1990",1990
81,Physiology or Medicine,Venezuela,29 October 1920,2 August 2011,male,https://en.wikipedia.org/wiki/Baruj_Benacerraf,Baruj Benacerraf,Caracas,Boston,"Baruj Benacerraf , Physiology or Medicine, 1980",1980
87,Physics,United States,23 May 1908,30 January 1991,male,https://en.wikipedia.org/wiki/John_Bardeen,John Bardeen,Madison,Boston,"John Bardeen , Physics, 1956",1956
135,Physics,United States,23 October 1905,10 September 1983,male,https://en.wikipedia.org/wiki/Felix_Bloch,Felix Bloch,Zürich,Zürich,"Felix Bloch , born in Switzerland , Physics, ...",1952


In [39]:
# Inspect every row for one repeated name.
df.loc[df['name'] == 'Friedrich Hayek']

Unnamed: 0,catagory,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
404,Economic,United Kingdom,8 May 1899,23 March 1992,male,https://en.wikipedia.org/wiki/Friedrich_Hayek,Friedrich Hayek,Vienna,Freiburg im Breisgau,"Friedrich Hayek , born in Austria , Economics...",1974
1054,Economic,Austria,8 May 1899,23 March 1992,male,https://en.wikipedia.org/wiki/Friedrich_Hayek,Friedrich Hayek,Vienna,Freiburg im Breisgau,"Friedrich Hayek , Economics, 1974",1974


In [40]:
# Collect the groups of rows sharing a name (more than one row per name)
# and keep only the columns needed to compare them side by side.
dupe_groups = []
for _, grp in df.groupby('name'):
    if len(grp) > 1:
        dupe_groups.append(grp)
all_duped = pd.concat(dupe_groups)[['name', 'catagory', 'country', 'year']]
all_duped

Unnamed: 0,name,catagory,country,year
458,Aaron Klug,Chemistry,United Kingdom,1982
581,Aaron Klug,Chemistry,South Africa,1982
496,Albert Einstein,Physics,Switzerland,1921
813,Albert Einstein,Physics,Germany,1921
409,Angus Deaton,Economic,United States,2015
530,Angus Deaton,Economic,United Kingdom,2015
398,Aziz Sancar,Chemistry,United States,2015
555,Aziz Sancar,Chemistry,Turkey,2015
81,Baruj Benacerraf,Physiology or Medicine,Venezuela,1980
228,Baruj Benacerraf,Physiology or Medicine,United States,1980


In [41]:
# Shuffle the rows, keep the first row of every (name, year) pair, then
# restore index order. The shuffle decides which of several rows for the
# same (name, year) survives; a fixed seed makes that choice reproducible
# across kernel restarts instead of changing on every run.
shuffled_index = np.random.RandomState(0).permutation(df.index)
df = df.reindex(shuffled_index)
df = df.drop_duplicates(['name', 'year'])
df = df.sort_index()
df.count()

catagory          886
country           890
date_of_birth     882
date_of_death     599
gender            882
link              890
name              890
place_of_birth    882
place_of_death    599
text              890
year              890
dtype: int64

In [42]:
# Names that still occur more than once after the (name, year) de-duplication.
repeated = [grp for _, grp in df.groupby('name') if len(grp) > 1]
pd.concat(repeated)[['name', 'catagory', 'country', 'year']]

Unnamed: 0,name,catagory,country,year
465,Frederick Sanger,Chemistry,United Kingdom,1958
482,Frederick Sanger,Chemistry,United Kingdom,1980
87,John Bardeen,Physics,United States,1956
203,John Bardeen,Physics,United States,1972
149,Linus C. Pauling,Chemistry,United States,1954
156,Linus C. Pauling,Peace,United States,1962
909,Marie Curie,Chemistry,France,1911
918,Marie Curie,Physics,France,1903
651,Marie Skłodowska-Curie,Chemistry,Poland,1911
657,Marie Skłodowska-Curie,Physics,Poland,1903


In [43]:
# Drop the rows named 'Marie Curie', then list the names that still repeat.
df = df[df.name != 'Marie Curie']
repeated = [grp for _, grp in df.groupby('name') if len(grp) > 1]
pd.concat(repeated)[['name', 'catagory', 'country', 'year']]

Unnamed: 0,name,catagory,country,year
465,Frederick Sanger,Chemistry,United Kingdom,1958
482,Frederick Sanger,Chemistry,United Kingdom,1980
87,John Bardeen,Physics,United States,1956
203,John Bardeen,Physics,United States,1972
149,Linus C. Pauling,Chemistry,United States,1954
156,Linus C. Pauling,Peace,United States,1962
651,Marie Skłodowska-Curie,Chemistry,Poland,1911
657,Marie Skłodowska-Curie,Physics,Poland,1903
595,Ragnar Granit,Physiology or Medicine,Sweden,1967
954,Ragnar Granit,Physiology or Medicine,Finland,1809


In [44]:
# Drop every row with year 1809, then list the names that still repeat.
df = df[df.year != 1809]
repeated = [grp for _, grp in df.groupby('name') if len(grp) > 1]
pd.concat(repeated)[['name', 'catagory', 'country', 'year']]

Unnamed: 0,name,catagory,country,year
465,Frederick Sanger,Chemistry,United Kingdom,1958
482,Frederick Sanger,Chemistry,United Kingdom,1980
87,John Bardeen,Physics,United States,1956
203,John Bardeen,Physics,United States,1972
149,Linus C. Pauling,Chemistry,United States,1954
156,Linus C. Pauling,Peace,United States,1962
651,Marie Skłodowska-Curie,Chemistry,Poland,1911
657,Marie Skłodowska-Curie,Physics,Poland,1903
61,Sidney Altman,Chemistry,United States,1990
1032,Sidney Altman,Chemistry,Canada,1989


In [45]:
# Drop the 'Sidney Altman' row dated 1990 (a row is kept when either the
# name or the year differs), then list the names that still repeat.
df = df[(df.name != 'Sidney Altman') | (df.year != 1990)]
repeated = [grp for _, grp in df.groupby('name') if len(grp) > 1]
pd.concat(repeated)[['name', 'catagory', 'country', 'year']]

Unnamed: 0,name,catagory,country,year
465,Frederick Sanger,Chemistry,United Kingdom,1958
482,Frederick Sanger,Chemistry,United Kingdom,1980
87,John Bardeen,Physics,United States,1956
203,John Bardeen,Physics,United States,1972
149,Linus C. Pauling,Chemistry,United States,1954
156,Linus C. Pauling,Peace,United States,1962
651,Marie Skłodowska-Curie,Chemistry,Poland,1911
657,Marie Skłodowska-Curie,Physics,Poland,1903


In [46]:
# Non-null count per column after the manual duplicate removal.
df.count()

catagory          880
country           884
date_of_birth     876
date_of_death     594
gender            876
link              884
name              884
place_of_birth    876
place_of_death    594
text              884
year              884
dtype: int64

In [47]:
# Distinct category values, missing values included.
set(df.catagory)

{nan,
 'Economic',
 'Peace',
 'Literature',
 'Chemistry',
 'Physiology or Medicine',
 'Physics'}

In [48]:
# Replace any remaining empty strings with NaN. The frame is rebound
# instead of using inplace=True, because df is by now the product of
# several filtering steps and in-place mutation of such a frame is fragile.
df = df.replace('', np.nan)

In [49]:
# Non-null count per column after the replacement.
df.count()

catagory          880
country           884
date_of_birth     876
date_of_death     594
gender            876
link              884
name              884
place_of_birth    876
place_of_death    594
text              884
year              884
dtype: int64

In [50]:
# Distinct category values after the replacement.
set(df.catagory)

{nan,
 'Economic',
 'Peace',
 'Literature',
 'Chemistry',
 'Physiology or Medicine',
 'Physics'}

In [51]:
# Rows with a missing category, shown with the text the category can be
# read from.
df.loc[df['catagory'].isnull(), ['name', 'year', 'text']]

Unnamed: 0,name,year,text
538,Ilya Ilyich Mechnikov,1908,"Ilya Ilyich Mechnikov , Physiology and Medicin..."
559,Selman Waksman,1952,"Selman Waksman , Physiology and Medicine, 1952"
782,Leopold Ružička born in Kingdom of Hungary,0,Leopold Ružička born in Kingdom of Hungary
895,Alexis Carrel,1912,"Alexis Carrel , Medicine, 1912"


In [52]:
# Fill in categories by name. .loc is used because the .ix indexer was
# removed from pandas; a boolean mask plus a column label behaves the same
# way under .loc.
medicine_names = [
    'Ilya Ilyich Mechnikov',
    'Róbert Bárány',
    'Selman Waksman',
    'Alexis Carrel',
]
df.loc[df.name.isin(medicine_names), 'catagory'] = 'Physiology or Medicine'
df.loc[df.name=='Leopold Ružička', 'catagory'] = 'Chemistry'
# Drop every row whose year is 0.
df = df[~(df.year==0)]
# Should now be empty: no row is left without a category.
df[df.catagory.isnull()][['name','year','text']]

Unnamed: 0,name,year,text


In [53]:
# Names that still occur more than once after the category fixes.
repeated = [grp for _, grp in df.groupby('name') if len(grp) > 1]
pd.concat(repeated)[['name', 'catagory', 'country', 'year']]

Unnamed: 0,name,catagory,country,year
465,Frederick Sanger,Chemistry,United Kingdom,1958
482,Frederick Sanger,Chemistry,United Kingdom,1980
87,John Bardeen,Physics,United States,1956
203,John Bardeen,Physics,United States,1972
149,Linus C. Pauling,Chemistry,United States,1954
156,Linus C. Pauling,Peace,United States,1962
651,Marie Skłodowska-Curie,Chemistry,Poland,1911
657,Marie Skłodowska-Curie,Physics,Poland,1903


In [56]:
# Show the remaining row for this name (.loc replaces the removed .ix
# indexer; a boolean mask selects the same rows).
df.loc[df.name=='Leopold Ružička']

Unnamed: 0,catagory,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
547,Chemistry,Switzerland,13 September 1887,26 September 1976,male,https://en.wikipedia.org/wiki/Leopold_Ru%C5%BE...,Leopold Ružička,Vukovar,Mammern,"Leopold Ružička , born in then Austria-Hungar...",1939


In [57]:
# Sanity check: list any rows with a year below 1900.
df.loc[df['year'] < 1900, ['name', 'year', 'text']]

Unnamed: 0,name,year,text


In [58]:
# Rows without a gender value.
df.loc[df['gender'].isnull(), ['name', 'year', 'country', 'catagory']]

Unnamed: 0,name,year,country,catagory
117,American Friends Service Committee (The Quakers),1947,United States,Peace
448,Friends Service Council,1947,United Kingdom,Peace
488,Amnesty International,1977,United Kingdom,Peace
554,Tunisian National Dialogue Quartet,2015,Tunisia,Peace
945,Médecins Sans Frontières,1999,France,Peace
1024,International Atomic Energy Agency,2005,Austria,Peace
1050,Institut de Droit International,1904,Belgium,Peace
1051,Pugwash Conferences on Science and World Affairs,1995,Canada,Peace


In [59]:
# Discard the rows without a gender value, then confirm none remain.
df = df.dropna(subset=['gender'])
df.loc[df['gender'].isnull(), ['name', 'year', 'country', 'catagory']]

Unnamed: 0,name,year,country,catagory


In [60]:
# Check which row is left for this name.
df.loc[df['name'] == 'Ragnar Granit']

Unnamed: 0,catagory,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
595,Physiology or Medicine,Sweden,30 October 1900,12 March 1991,male,https://en.wikipedia.org/wiki/Ragnar_Granit,Ragnar Granit,Riihimäki,Stockholm,"Ragnar Granit , born in the Grand Duchy of Fi...",1967


In [61]:
# Non-null count per column; columns with fewer entries than the others
# contain missing values.
df.count()

catagory          875
country           875
date_of_birth     875
date_of_death     593
gender            875
link              875
name              875
place_of_birth    875
place_of_death    593
text              875
year              875
dtype: int64

In [62]:
# Preview the date/place columns before parsing the dates.
# Only the first rows are shown rather than dumping the whole frame.
df[['name', 'date_of_birth', 'date_of_death', 'place_of_birth', 'place_of_death']].head(10)

Unnamed: 0,name,date_of_birth,date_of_death,place_of_birth,place_of_death
2,Victor Francis Hess,24 June 1883,17 December 1964,Deutschfeistritz,Mount Vernon
3,Erwin Schrödinger,12 August 1887,4 January 1961,Vienna,Vienna
4,Karl Landsteiner,14 June 1868,26 June 1943,Vienna,New York City
5,Julius Wagner-Jauregg,7 March 1857,27 September 1940,Wels,Vienna
6,Richard Adolf Zsigmondy,1 April 1865,23 September 1929,Vienna,Göttingen
7,Fritz Pregl,3 September 1869,13 December 1930,Ljubljana,Graz
8,Róbert Bárány,22 April 1876,8 April 1936,Vienna,Uppsala domkyrkoförsamling
9,Alfred Hermann Fried,11 November 1864,5 May 1921,Vienna,Vienna
10,Adolfo Pérez Esquivel,26 November 1931,,Buenos Aires,
11,Luis Federico Leloir,6 September 1906,2 December 1987,Paris,Catamarca Province


In [63]:
# Parse the birth-date strings (e.g. '24 June 1883') into datetime64 values.
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'])

In [64]:
# Age at award = prize year minus birth year (calendar-year difference only;
# the month and day of birth are not taken into account).
df['award_age'] = df['year'] - pd.DatetimeIndex(df['date_of_birth']).year

In [65]:
# Winners ordered from youngest to oldest at the time of the award.
df[['name', 'award_age', 'catagory', 'year']].sort_values('award_age')

Unnamed: 0,name,award_age,catagory,year
684,Malala Yousafzai,17,Peace,2014
340,William Lawrence Bragg,25,Physics,1915
512,Georges J. F. Köhler,30,Physiology or Medicine,1976
489,Betty Williams,30,Peace,1976
85,Tsung-Dao Lee,31,Physics,1957
110,Carl Anderson,31,Physics,1936
797,Werner Karl Heisenberg,31,Physics,1932
436,Paul Dirac,31,Physics,1933
83,Tawakkol Karman,32,Peace,2011
863,Rudolf Mössbauer,32,Physics,1961


In [66]:
# Parse the death-date strings into datetime64; missing dates become NaT.
df['date_of_death'] = pd.to_datetime(df['date_of_death'])

In [67]:
# Preview the same columns after date parsing (missing deaths show as NaT).
# Only the first rows are shown rather than dumping the whole frame.
df[['name', 'date_of_birth', 'date_of_death', 'place_of_birth', 'place_of_death']].head(10)

Unnamed: 0,name,date_of_birth,date_of_death,place_of_birth,place_of_death
2,Victor Francis Hess,1883-06-24,1964-12-17,Deutschfeistritz,Mount Vernon
3,Erwin Schrödinger,1887-08-12,1961-01-04,Vienna,Vienna
4,Karl Landsteiner,1868-06-14,1943-06-26,Vienna,New York City
5,Julius Wagner-Jauregg,1857-03-07,1940-09-27,Wels,Vienna
6,Richard Adolf Zsigmondy,1865-04-01,1929-09-23,Vienna,Göttingen
7,Fritz Pregl,1869-09-03,1930-12-13,Ljubljana,Graz
8,Róbert Bárány,1876-04-22,1936-04-08,Vienna,Uppsala domkyrkoförsamling
9,Alfred Hermann Fried,1864-11-11,1921-05-05,Vienna,Vienna
10,Adolfo Pérez Esquivel,1931-11-26,NaT,Buenos Aires,
11,Luis Federico Leloir,1906-09-06,1987-12-02,Paris,Catamarca Province


In [68]:
# Non-null count per column after the date conversions and the new
# award_age column.
df.count()

catagory          875
country           875
date_of_birth     875
date_of_death     593
gender            875
link              875
name              875
place_of_birth    875
place_of_death    593
text              875
year              875
award_age         875
dtype: int64

In [69]:
# Assign NaN to the missing death dates.
# .loc replaces the deprecated .ix indexer (removed in pandas 1.0).
# Because date_of_death is a datetime64 column, the missing values are still
# shown as NaT afterwards, so this assignment does not change the data.
df.loc[df.date_of_death.isnull(), "date_of_death"] = np.nan
# Show the winners with no recorded date of death.
df[df.date_of_death.isnull()]

Unnamed: 0,catagory,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year,award_age
10,Peace,Argentina,1931-11-26,NaT,male,https://en.wikipedia.org/wiki/Adolfo_P%C3%A9re...,Adolfo Pérez Esquivel,Buenos Aires,,"Adolfo Pérez Esquivel , Peace, 1980",1980,49
15,Literature,Nigeria,1934-07-13,NaT,male,https://en.wikipedia.org/wiki/Wole_Soyinka,Wole Soyinka,Abeokuta,,"Wole Soyinka , Literature, 1986",1986,52
21,Chemistry,Switzerland,1938-10-04,NaT,male,https://en.wikipedia.org/wiki/Kurt_W%C3%BCthrich,Kurt Wüthrich,Aarberg,,"Kurt Wüthrich , Chemistry, 2002",2002,64
23,Physiology or Medicine,Switzerland,1944-01-06,NaT,male,https://en.wikipedia.org/wiki/Rolf_M._Zinkernagel,Rolf M. Zinkernagel,Riehen,,"Rolf M. Zinkernagel , Physiology or Medicine, ...",1996,52
24,Chemistry,Switzerland,1933-08-14,NaT,male,https://en.wikipedia.org/wiki/Richard_R._Ernst,Richard R. Ernst,Winterthur,,"Richard R. Ernst , Chemistry, 1991",1991,58
25,Physics,Switzerland,1927-04-20,NaT,male,https://en.wikipedia.org/wiki/Karl_Alexander_M...,Karl Alexander Müller,Basel,,"Karl Alexander Müller , Physics, 1987",1987,60
27,Physics,United Kingdom,1924-05-11,NaT,male,https://en.wikipedia.org/wiki/Antony_Hewish,Antony Hewish,Fowey,,"Antony Hewish , Physics, 1974",1974,50
30,Physics,United Kingdom,1940-01-04,NaT,male,https://en.wikipedia.org/wiki/Brian_David_Jose...,Brian David Josephson,Cardiff,,"Brian David Josephson , Physics, 1973",1973,33
39,Literature,United States,1941-05-24,NaT,male,https://en.wikipedia.org/wiki/Bob_Dylan,Bob Dylan,Duluth,,"Bob Dylan , Literature, 2016",2016,75
40,Chemistry,United States,1930-03-15,NaT,male,https://en.wikipedia.org/wiki/Martin_Karplus,Martin Karplus,Vienna,,"Martin Karplus , born in Austria , Chemistry,...",2013,83


In [70]:
# NOTE: this binds a second name to the SAME DataFrame object (no copy is
# made), so any later in-place change to df is also visible through df_clean.
df_clean = df

In [71]:
# Summarise the cleaned frame: row count, per-column non-null counts, dtypes.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 875 entries, 2 to 1064
Data columns (total 12 columns):
catagory          875 non-null object
country           875 non-null object
date_of_birth     875 non-null datetime64[ns]
date_of_death     593 non-null datetime64[ns]
gender            875 non-null object
link              875 non-null object
name              875 non-null object
place_of_birth    875 non-null object
place_of_death    593 non-null object
text              875 non-null object
year              875 non-null int64
award_age         875 non-null int64
dtypes: datetime64[ns](2), int64(2), object(8)
memory usage: 88.9+ KB


In [72]:
# Convert the death dates to plain strings; missing values become the text
# 'NaT'. df_clean refers to the same object as df, so the frame written to
# MongoDB and merged below carries this string column as well.
df['date_of_death'] = df['date_of_death'].astype(str)

In [160]:
# Write the cleaned winners to MongoDB. The helper uses insert_many, so
# running this cell again inserts the records again.
dataframe_to_mongo(df=df_clean, db_name=DB_NOBEL_PRIZE, collection=COLL_WINNERS)

In [73]:
# Load the scraped mini-biographies (one row per Wikipedia link).
# NOTE(review): hard-coded absolute path to one user's home directory; the
# notebook will not run elsewhere without editing it.
bio_file = '/Users/simonsu/git/dataviz/scrapy/nobel_winners/minibios.json'
df_bio = pd.read_json(bio_file)
# Summarise the biography frame: row count, non-null counts and dtypes.
df_bio.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 913 entries, 0 to 912
Data columns (total 4 columns):
image_urls    913 non-null object
images        913 non-null object
link          913 non-null object
mini_bio      913 non-null object
dtypes: object(4)
memory usage: 35.7+ KB


In [74]:
# Rewrite the biography links from http: to https: so they match the link
# values in the winners frame, which is the merge key used below.
df_bio['link'] = df_bio['link'].str.replace('http:', 'https:')

In [75]:
# Outer-join winners and biographies on the Wikipedia link; biographies with
# no matching winner are kept and have NaN in the winner columns.
df_all = df_clean.merge(df_bio, how='outer', on='link')

In [76]:
# Summarise the merged frame: row count, per-column non-null counts, dtypes.
df_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 918 entries, 0 to 917
Data columns (total 15 columns):
catagory          875 non-null object
country           875 non-null object
date_of_birth     875 non-null datetime64[ns]
date_of_death     875 non-null object
gender            875 non-null object
link              918 non-null object
name              875 non-null object
place_of_birth    875 non-null object
place_of_death    593 non-null object
text              875 non-null object
year              875 non-null float64
award_age         875 non-null float64
image_urls        918 non-null object
images            918 non-null object
mini_bio          918 non-null object
dtypes: datetime64[ns](1), float64(2), object(12)
memory usage: 114.8+ KB


In [77]:
# Preview the merged frame; only the first rows are shown rather than dumping
# the whole wide table (it includes HTML biographies and image metadata).
df_all.head(10)

Unnamed: 0,catagory,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year,award_age,image_urls,images,mini_bio
0,Physics,Austria,1883-06-24,1964-12-17,male,https://en.wikipedia.org/wiki/Victor_Francis_Hess,Victor Francis Hess,Deutschfeistritz,Mount Vernon,"Victor Francis Hess , Physics, 1936",1936.0,53.0,[https://upload.wikimedia.org/wikipedia/common...,[{'url': 'https://upload.wikimedia.org/wikiped...,<p><b>Victor Franz Hess</b> (24 June 1883 – 17...
1,Physics,Austria,1887-08-12,1961-01-04,male,https://en.wikipedia.org/wiki/Erwin_Schr%C3%B6...,Erwin Schrödinger,Vienna,Vienna,"Erwin Schrödinger , Physics, 1933",1933.0,46.0,[https://upload.wikimedia.org/wikipedia/common...,[{'url': 'https://upload.wikimedia.org/wikiped...,<p><b>Erwin Rudolf Josef Alexander Schrödinger...
2,Physiology or Medicine,Austria,1868-06-14,1943-06-26,male,https://en.wikipedia.org/wiki/Karl_Landsteiner,Karl Landsteiner,Vienna,New York City,"Karl Landsteiner , Physiology or Medicine, 1930",1930.0,62.0,[https://upload.wikimedia.org/wikipedia/en/thu...,[{'url': 'https://upload.wikimedia.org/wikiped...,"<p><b>Karl Landsteiner</b>, <span style=""font-..."
3,Physiology or Medicine,Austria,1857-03-07,1940-09-27,male,https://en.wikipedia.org/wiki/Julius_Wagner-Ja...,Julius Wagner-Jauregg,Wels,Vienna,"Julius Wagner-Jauregg , Physiology or Medicine...",1927.0,70.0,[https://upload.wikimedia.org/wikipedia/common...,[{'url': 'https://upload.wikimedia.org/wikiped...,<p><b>Julius Wagner-Jauregg</b> (7 March 1857 ...
4,Chemistry,Austria,1865-04-01,1929-09-23,male,https://en.wikipedia.org/wiki/Richard_Adolf_Zs...,Richard Adolf Zsigmondy,Vienna,Göttingen,"Richard Adolf Zsigmondy , Chemistry, 1925",1925.0,60.0,[https://upload.wikimedia.org/wikipedia/en/thu...,[{'url': 'https://upload.wikimedia.org/wikiped...,<p><b>Richard Adolf Zsigmondy</b> (1 April 186...
5,Chemistry,Austria,1869-09-03,1930-12-13,male,https://en.wikipedia.org/wiki/Fritz_Pregl,Fritz Pregl,Ljubljana,Graz,"Fritz Pregl , born in then Austria-Hungary, n...",1923.0,54.0,[https://upload.wikimedia.org/wikipedia/common...,[{'url': 'https://upload.wikimedia.org/wikiped...,"<p><b>Fritz Pregl</b> (in <a href=""http://en.w..."
6,Physiology or Medicine,Austria,1876-04-22,1936-04-08,male,https://en.wikipedia.org/wiki/R%C3%B3bert_B%C3...,Róbert Bárány,Vienna,Uppsala domkyrkoförsamling,"Róbert Bárány , Physiology or Medicine, 1914",1914.0,38.0,[https://upload.wikimedia.org/wikipedia/common...,[{'url': 'https://upload.wikimedia.org/wikiped...,<p><b>Róbert Bárány</b> (22 April 1876 – 8 Apr...
7,Peace,Austria,1864-11-11,1921-05-05,male,https://en.wikipedia.org/wiki/Alfred_Hermann_F...,Alfred Hermann Fried,Vienna,Vienna,"Alfred Hermann Fried , Peace, 1911",1911.0,47.0,[],[],<p><b>Alfred Hermann Fried</b> (11 November 18...
8,Peace,Argentina,1931-11-26,NaT,male,https://en.wikipedia.org/wiki/Adolfo_P%C3%A9re...,Adolfo Pérez Esquivel,Buenos Aires,,"Adolfo Pérez Esquivel , Peace, 1980",1980.0,49.0,[https://upload.wikimedia.org/wikipedia/common...,[{'url': 'https://upload.wikimedia.org/wikiped...,<p><b>Adolfo Pérez Esquivel</b> (born November...
9,Chemistry,Argentina,1906-09-06,1987-12-02,male,https://en.wikipedia.org/wiki/Luis_Federico_Le...,Luis Federico Leloir,Paris,Catamarca Province,"Luis Federico Leloir , Chemistry, 1970",1970.0,64.0,[https://upload.wikimedia.org/wikipedia/common...,[{'url': 'https://upload.wikimedia.org/wikiped...,"<p><b>Luis Federico Leloir</b> <span style=""fo..."


In [78]:
# Biographies that did not match any winner row (all winner columns are NaN).
df_all.loc[df_all['name'].isnull()]

Unnamed: 0,catagory,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year,award_age,image_urls,images,mini_bio
875,,,NaT,,,https://en.wikipedia.org/wiki/International_At...,,,,,,,[https://upload.wikimedia.org/wikipedia/common...,[{'url': 'https://upload.wikimedia.org/wikiped...,<p>The <b>International Atomic Energy Agency</...
876,,,NaT,,,https://en.wikipedia.org/wiki/Karl_von_Frisch,,,,,,,[https://upload.wikimedia.org/wikipedia/en/thu...,[{'url': 'https://upload.wikimedia.org/wikiped...,"<p><b>Karl <a href=""http://en.wikipedia.org/wi..."
877,,,NaT,,,https://en.wikipedia.org/wiki/Eric_Kandel,,,,,,,[https://upload.wikimedia.org/wikipedia/common...,[{'url': 'https://upload.wikimedia.org/wikiped...,<p><b>Eric Richard Kandel</b> (<small>German:<...
878,,,NaT,,,https://en.wikipedia.org/wiki/American_Friends...,,,,,,,[https://upload.wikimedia.org/wikipedia/en/e/e...,[{'url': 'https://upload.wikimedia.org/wikiped...,<p>The <b>American Friends Service Committee</...
879,,,NaT,,,https://en.wikipedia.org/wiki/Ben_R._Mottelson,,,,,,,[https://upload.wikimedia.org/wikipedia/common...,[{'url': 'https://upload.wikimedia.org/wikiped...,"<p><b>Ben Roy Mottelson</b> (born July 9, 1926..."
880,,,NaT,,,https://en.wikipedia.org/wiki/Friends_Service_...,,,,,,,[],[],<p><b>Quaker Peace &amp; Social Witness</b> (<...
881,,,NaT,,,https://en.wikipedia.org/wiki/John_James_Ricka...,,,,,,,[https://upload.wikimedia.org/wikipedia/common...,[{'url': 'https://upload.wikimedia.org/wikiped...,"<p>Prof <b>John James Rickard Macleod</b>, <a ..."
882,,,NaT,,,https://en.wikipedia.org/wiki/Amnesty_Internat...,,,,,,,[https://upload.wikimedia.org/wikipedia/en/thu...,[{'url': 'https://upload.wikimedia.org/wikiped...,<p><b>Amnesty International</b> (commonly know...
883,,,NaT,,,https://en.wikipedia.org/wiki/Clive_W._J._Granger,,,,,,,[https://upload.wikimedia.org/wikipedia/common...,[{'url': 'https://upload.wikimedia.org/wikiped...,<p><b>Sir Clive William John Granger</b> (<spa...
884,,,NaT,,,https://en.wikipedia.org/wiki/Tunisian_Nationa...,,,,,,,[],[],<p>The <b>Tunisian National Dialogue Quartet</...


In [79]:
# Non-null count per column of the merged frame; the gap between the winner
# columns and the biography columns is the number of unmatched biographies.
df_all.count()

catagory          875
country           875
date_of_birth     875
date_of_death     875
gender            875
link              918
name              875
place_of_birth    875
place_of_death    593
text              875
year              875
award_age         875
image_urls        918
images            918
mini_bio          918
dtype: int64

In [80]:
# Drop the biography-only rows (no winner data).
# .copy() makes the result an independent frame, so adding the bio_image
# column later does not write into a slice (avoids SettingWithCopyWarning).
df_all = df_all[df_all.name.notnull()].copy()

In [171]:
# Non-null count per column after dropping unmatched biographies.
# NOTE: the bio_image column in the saved output is created by the cell shown
# below (In [170]), which was executed before this one (In [171]).
df_all.count()

catagory          875
country           875
date_of_birth     875
date_of_death     875
gender            875
link              875
name              875
place_of_birth    875
place_of_death    593
text              875
year              875
award_age         875
image_urls        875
images            875
mini_bio          875
bio_image         875
dtype: int64

In [170]:
# Build the bio_image column: the stored path of the first image for each
# winner, or '' when the winner has no images.
#
# The previous loop iterated over index *labels* but looked rows up with the
# positional .iloc indexer, which is only correct while the index happens to
# be 0..n-1. Mapping over the column keeps values aligned by label whatever
# the index looks like, and the isinstance check also tolerates a missing
# (non-list) images value instead of raising.
bio_image = df_all['images'].apply(
    lambda imgs: imgs[0]["path"] if isinstance(imgs, (list, tuple)) and imgs else ''
)

# assign() returns a new frame, so nothing is written into a filtered slice.
df_all = df_all.assign(bio_image=bio_image)


In [175]:
# Inspect one merged record in full (the row at position 8).
df_all.iloc[8]

catagory                                                      Peace
country                                                   Argentina
date_of_birth                                   1931-11-26 00:00:00
date_of_death                                                   NaT
gender                                                         male
link              https://en.wikipedia.org/wiki/Adolfo_P%C3%A9re...
name                                          Adolfo Pérez Esquivel
place_of_birth                                         Buenos Aires
place_of_death                                                  NaN
text                            Adolfo Pérez Esquivel , Peace, 1980
year                                                           1980
award_age                                                        49
image_urls        [https://upload.wikimedia.org/wikipedia/common...
images            [{'url': 'https://upload.wikimedia.org/wikiped...
mini_bio          <p><b>Adolfo Pérez Esquivel</b

In [176]:
# Write the merged winners + biographies frame to the 'winners_all'
# collection. The helper uses insert_many, so running this cell again inserts
# the records again.
dataframe_to_mongo(df=df_all, db_name=DB_NOBEL_PRIZE, collection='winners_all')