In [1]:
import numpy as np
import pandas as pd

In [2]:
# Location of the raw (still dirty) Nobel winners dump, relative to the
# working directory
RAW_WINNERS_JSON = "nobel_winners_dirty.json"
df = pd.read_json(RAW_WINNERS_JSON)

In [3]:
# info() lists the non-null count for every column, which makes missing
# data easy to spot.
# Compare gender (1040 non-null) against name (1052 non-null) in the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1052 entries, 0 to 1051
Data columns (total 12 columns):
born_in           1052 non-null object
category          1052 non-null object
country           1052 non-null object
date_of_birth     1044 non-null object
date_of_death     1044 non-null object
gender            1040 non-null object
link              1052 non-null object
name              1052 non-null object
place_of_birth    1044 non-null object
place_of_death    1044 non-null object
text              1052 non-null object
year              1052 non-null int64
dtypes: int64(1), object(11)
memory usage: 106.8+ KB


In [4]:
# describe() is a handy shortcut for summary statistics.
# Called without arguments here it reports on the integer column (year) only.
# Per the book, look at the min value for year: 1809 is the value that gets
# dropped later in this notebook.
df.describe()

Unnamed: 0,year
count,1052.0
mean,1968.729087
std,33.155829
min,1809.0
25%,1947.0
50%,1975.0
75%,1996.0
max,2014.0


In [5]:
# describe() can summarise the object (string) columns as well when asked:
# the output rows are count, unique, top (most frequent value) and freq.
df.describe(include=['object'])

Unnamed: 0,born_in,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text
count,1052.0,1052,1052,1044,1044.0,1040,1052,1052,1044.0,1044.0,1052
unique,40.0,7,59,853,563.0,2,893,998,735.0,410.0,1043
top,,Physiology or Medicine,United States,9 May 1947,,male,http://en.wikipedia.org/wiki/Michael_Levitt,Ronald Coase,,,"Robert Aumann , born in Germany , Economics, ..."
freq,910.0,250,350,4,362.0,982,4,2,29.0,409.0,2


In [6]:
# The book expounds on data accessors... masks/filters and such.
# We'll move on to clean up data

In [7]:
# Treat empty strings as missing data: swap every '' for NaN, modifying df
# in place
df.replace(to_replace='', value=np.nan, inplace=True)

In [8]:
# Build a boolean mask of the names that contain an asterisk.
# str.contains() treats its pattern as a regular expression by default, so the
# '*' has to be escaped. A raw string passes the backslash through to the
# regex engine untouched; a plain '\*' is an invalid Python string escape
# that newer interpreters warn about.
has_asterisk = df.name.str.contains(r'\*')

In [9]:
# Spot check the mask: the first few selected names should all carry an
# asterisk
df.loc[has_asterisk, 'name'].head()

1          Ivo Andric *
2     Vladimir Prelog *
14      Simon Kuznets *
15     Menachem Begin *
16       Shimon Peres *
Name: name, dtype: object

In [10]:
# Remove the asterisks, this time pulling the column out as a standalone
# Series and working on that instead of on the DataFrame.
# regex=False makes "*" a literal character: a bare "*" is not a valid
# regular expression, and the default value of `regex` differs between
# pandas versions, so state the intent explicitly.
names = df.name
names = names.str.replace("*", "", regex=False)

In [11]:
# Removing only the "*" leaves behind the whitespace that preceded it
# (e.g. "Ivo Andric *" becomes "Ivo Andric "), so trim leading and trailing
# whitespace from every name.
names = names.str.strip()

In [12]:
# Preview the first few cleaned names
names.head()

0                     César Milstein
1                         Ivo Andric
2                    Vladimir Prelog
3    Institut de Droit International
4                  Auguste Beernaert
Name: name, dtype: object

In [13]:
# The mask is just a stored Series of True/False values; it is not
# re-evaluated when `names` changes. Reusing it therefore selects the same
# rows that originally held an asterisk, so we can confirm they are now clean.
names.loc[has_asterisk].head()

1          Ivo Andric
2     Vladimir Prelog
14      Simon Kuznets
15     Menachem Begin
16       Shimon Peres
Name: name, dtype: object

In [14]:
# Just to show how that works, peek at the mask itself.
# Rows 1 & 2 are True, so those are the rows the mask selects.
has_asterisk.head()

0    False
1     True
2     True
3    False
4    False
Name: name, dtype: bool

In [15]:
# The cleaned Series still has to be written back into the DataFrame;
# until then the column holds the old, uncleaned values
df['name'].head()

0                     César Milstein
1                       Ivo Andric *
2                  Vladimir Prelog *
3    Institut de Droit International
4                  Auguste Beernaert
Name: name, dtype: object

In [16]:
# Overwrite the name column with the cleaned Series
df['name'] = names

In [17]:
# The DataFrame column now shows the cleaned names
df['name'].head()

0                     César Milstein
1                         Ivo Andric
2                    Vladimir Prelog
3    Institut de Droit International
4                  Auguste Beernaert
Name: name, dtype: object

In [18]:
# Now let's look at our data again, this time across all columns
df.head()

Unnamed: 0,born_in,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
0,,Physiology or Medicine,Argentina,8 October 1927,24 March 2002,male,http://en.wikipedia.org/wiki/C%C3%A9sar_Milstein,César Milstein,"Bahía Blanca , Argentina","Cambridge , England","César Milstein , Physiology or Medicine, 1984",1984
1,Bosnia and Herzegovina,Literature,,9 October 1892,13 March 1975,male,http://en.wikipedia.org/wiki/Ivo_Andric,Ivo Andric,"Dolac (village near Travnik), Austria-Hungary ...","Belgrade, SR Serbia, SFR Yugoslavia (present-d...","Ivo Andric *, born in then Austria–Hungary ,...",1961
2,Bosnia and Herzegovina,Chemistry,,"July 23, 1906",1998-01-07,male,http://en.wikipedia.org/wiki/Vladimir_Prelog,Vladimir Prelog,"Sarajevo , Bosnia and Herzegovina , then part...","Zürich , Switzerland","Vladimir Prelog *, born in then Austria–Hung...",1975
3,,Peace,Belgium,,,,http://en.wikipedia.org/wiki/Institut_de_Droit...,Institut de Droit International,,,"Institut de Droit International , Peace, 1904",1904
4,,Peace,Belgium,26 July 1829,6 October 1912,male,http://en.wikipedia.org/wiki/Auguste_Marie_Fra...,Auguste Beernaert,"Ostend , Netherlands (now Belgium )","Lucerne , Switzerland","Auguste Beernaert , Peace, 1909",1909


In [19]:
# Keep only the rows that have no born_in value
born_in_missing = df['born_in'].isnull()
df = df[born_in_missing]

In [20]:
# Preview the rows that remain after the born_in filter
df.head()

Unnamed: 0,born_in,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
0,,Physiology or Medicine,Argentina,8 October 1927,24 March 2002,male,http://en.wikipedia.org/wiki/C%C3%A9sar_Milstein,César Milstein,"Bahía Blanca , Argentina","Cambridge , England","César Milstein , Physiology or Medicine, 1984",1984
3,,Peace,Belgium,,,,http://en.wikipedia.org/wiki/Institut_de_Droit...,Institut de Droit International,,,"Institut de Droit International , Peace, 1904",1904
4,,Peace,Belgium,26 July 1829,6 October 1912,male,http://en.wikipedia.org/wiki/Auguste_Marie_Fra...,Auguste Beernaert,"Ostend , Netherlands (now Belgium )","Lucerne , Switzerland","Auguste Beernaert , Peace, 1909",1909
5,,Literature,Belgium,29 August 1862,6 May 1949,male,http://en.wikipedia.org/wiki/Maurice_Maeterlinck,Maurice Maeterlinck,"Ghent , Belgium","Nice , France","Maurice Maeterlinck , Literature, 1911",1911
6,,Peace,Belgium,22 April 1854,14 May 1943,male,http://en.wikipedia.org/wiki/Henri_La_Fontaine,Henri La Fontaine,Brussels,Belgium,"Henri La Fontaine , Peace, 1913",1913


In [21]:
# Every remaining row has an empty born_in, so the column tells us nothing.
# drop() removes rows unless told otherwise (axis=0 is the default), so a
# column has to be requested explicitly via the axis argument.
df = df.drop(['born_in'], axis='columns')

In [22]:
# Confirm born_in no longer appears among the columns
df.head()

Unnamed: 0,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
0,Physiology or Medicine,Argentina,8 October 1927,24 March 2002,male,http://en.wikipedia.org/wiki/C%C3%A9sar_Milstein,César Milstein,"Bahía Blanca , Argentina","Cambridge , England","César Milstein , Physiology or Medicine, 1984",1984
3,Peace,Belgium,,,,http://en.wikipedia.org/wiki/Institut_de_Droit...,Institut de Droit International,,,"Institut de Droit International , Peace, 1904",1904
4,Peace,Belgium,26 July 1829,6 October 1912,male,http://en.wikipedia.org/wiki/Auguste_Marie_Fra...,Auguste Beernaert,"Ostend , Netherlands (now Belgium )","Lucerne , Switzerland","Auguste Beernaert , Peace, 1909",1909
5,Literature,Belgium,29 August 1862,6 May 1949,male,http://en.wikipedia.org/wiki/Maurice_Maeterlinck,Maurice Maeterlinck,"Ghent , Belgium","Nice , France","Maurice Maeterlinck , Literature, 1911",1911
6,Peace,Belgium,22 April 1854,14 May 1943,male,http://en.wikipedia.org/wiki/Henri_La_Fontaine,Henri La Fontaine,Brussels,Belgium,"Henri La Fontaine , Peace, 1913",1913


In [23]:
# Following the operations on p249, remove every row recorded with year 1809.
# Step by step:
#   1. `df.year == 1809` builds a boolean mask marking the 1809 rows
#   2. `df[mask]` is a DataFrame holding just those rows, and its `.index`
#      gives their row labels (an Index object)
#   3. drop() removes the rows carrying those labels; inplace=True alters
#      `df` itself instead of returning a new DataFrame
# After evaluation the call amounts to df.drop(<Index of labels>, inplace=True)
rows_from_1809 = df[df.year == 1809].index
df.drop(rows_from_1809, inplace=True)

In [24]:
# Confirm it's gone: sorted by year, the earliest rows should no longer
# show 1809
df.sort_values(by='year').head()

Unnamed: 0,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
608,Peace,Switzerland,8 May 1828,30 October 1910,male,http://en.wikipedia.org/wiki/Henry_Dunant,Henry Dunant,"Geneva , Switzerland","Heiden, Switzerland","Henry Dunant , Peace, 1901",1901
68,Physiology or Medicine,Germany,15 March 1854,31 March 1917,male,http://en.wikipedia.org/wiki/Emil_Adolf_von_Be...,Emil Adolf von Behring,Hansdorf,"Marburg , Hesse-Nassau","Emil Adolf von Behring , Physiology or Medicin...",1901
909,Peace,France,8 May 1828,30 October 1910,male,http://en.wikipedia.org/wiki/Henry_Dunant,Henry Dunant,"Geneva , Switzerland","Heiden, Switzerland","Henry Dunant , Peace, 1901",1901
908,Peace,France,"May 20, 1822","June 12, 1912",male,http://en.wikipedia.org/wiki/Fr%C3%A9d%C3%A9ri...,Frédéric Passy,"Paris , France","Paris , France","Frédéric Passy , Peace, 1901",1901
907,Literature,France,16 March 1839,6 September 1907,male,http://en.wikipedia.org/wiki/Sully_Prudhomme,Sully Prudhomme,"Paris, France","Châtenay-Malabry , France","Sully Prudhomme , Literature, 1901",1901


In [25]:
# Look for duplicated names.
# df.duplicated() only produces a mask (a Series of True / False values),
# so the mask is then used to pull the matching rows out of the DataFrame.
repeated_name = df.duplicated(subset='name')
dupes_by_name = df[repeated_name]

In [26]:
# Count the non-null values per column among the rows flagged as duplicates
dupes_by_name.count()

category          45
country           45
date_of_birth     44
date_of_death     23
gender            44
link              45
name              45
place_of_birth    44
place_of_death    22
text              45
year              45
dtype: int64

In [27]:
# Gather every row whose name appears among the flagged duplicates, so that
# all copies of a repeated name are included
duplicated_names = dupes_by_name['name']
all_dupes = df[df['name'].isin(duplicated_names)]

In [28]:
# Count the non-null values per column across all rows sharing a
# duplicated name
all_dupes.count()

category          90
country           90
date_of_birth     88
date_of_death     46
gender            88
link              90
name              90
place_of_birth    88
place_of_death    44
text              90
year              90
dtype: int64

In [29]:
# Order the duplicated rows by name so repeated winners sit next to each other
sorted_df = all_dupes.sort_values(by='name')
# Show only the name, country and year columns
sorted_df.loc[:, ['name', 'country', 'year']]

Unnamed: 0,name,country,year
121,Aaron Klug,South Africa,1982
131,Aaron Klug,United Kingdom,1982
844,Albert Einstein,Germany,1921
615,Albert Einstein,Switzerland,1921
176,Arieh Warshel,United States,2013
798,Arieh Warshel,Israel,2013
830,Avram Hershko,Hungary,2004
94,Avram Hershko,Israel,2004
366,Baruj Benacerraf,United States,1980
228,Baruj Benacerraf,Venezuela,1980


In [30]:
# Examine the data above and you will notice some people won
# awards for the same year but listed in different countries.

# Keep every row except those named plain 'Marie Curie'. The explicit copy
# makes the result an independent DataFrame, so the .loc assignment below
# is not a write into a filtered slice of the previous frame.
df = df[~(df.name == 'Marie Curie')].copy()

# Set the country to France on the 1911 row stored under the hyphenated
# surname
df.loc[(df.name == u'Marie Skłodowska-Curie') & (df.year == 1911), 'country'] = 'France'

In [31]:
# Show the rows stored under the hyphenated name to check the country
# assigned to the 1911 row
df.loc[df.name == u'Marie Skłodowska-Curie']

Unnamed: 0,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
706,Physics,Poland,7 November 1867,4 July 1934,female,http://en.wikipedia.org/wiki/Marie_Curie,Marie Skłodowska-Curie,"Warsaw , Poland","Sancellemoz , France","Marie Skłodowska-Curie , born in partitioned ...",1903
709,Chemistry,France,7 November 1867,4 July 1934,female,http://en.wikipedia.org/wiki/Marie_Curie,Marie Skłodowska-Curie,"Warsaw , Poland","Sancellemoz , France","Marie Skłodowska-Curie , born in partitioned ...",1911


In [32]:
# List the remaining rows whose name contains 'Marie'; the plain
# 'Marie Curie' rows should no longer be among them
df[df.name.str.contains('Marie')].head()

Unnamed: 0,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
706,Physics,Poland,7 November 1867,4 July 1934,female,http://en.wikipedia.org/wiki/Marie_Curie,Marie Skłodowska-Curie,"Warsaw , Poland","Sancellemoz , France","Marie Skłodowska-Curie , born in partitioned ...",1903
709,Chemistry,France,7 November 1867,4 July 1934,female,http://en.wikipedia.org/wiki/Marie_Curie,Marie Skłodowska-Curie,"Warsaw , Poland","Sancellemoz , France","Marie Skłodowska-Curie , born in partitioned ...",1911
942,Chemistry,France,"September 30, 1939",,male,http://en.wikipedia.org/wiki/Jean-Marie_Lehn,Jean-Marie Lehn,"Rosheim , France",,"Jean-Marie Lehn , Chemistry, 1987",1987


In [33]:
# Now following along in the book, p248 & p249 to clean Sidney Altman