In [1]:
import numpy as np
import pandas as pd
import pendulum

In [2]:
# Resume from the wiki data saved at the end of the previous step.
df = pd.read_json(path_or_buf='wiki-start-dates.json')

In [3]:
# Preview only the first ten rows instead of rendering the entire frame.
df.head(10)

Unnamed: 0,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
0,Physiology or Medicine,Argentina,8 October 1927,24 March 2002,male,http://en.wikipedia.org/wiki/C%C3%A9sar_Milstein,César Milstein,"Bahía Blanca , Argentina","Cambridge , England","César Milstein , Physiology or Medicine, 1984",1984
10,Physiology or Medicine,Belgium,24 August 1899,22 May 1983,male,http://en.wikipedia.org/wiki/Albert_Claude,Albert Claude,"Longlier, Neufchâteau, Belgium","Brussels , Belgium","Albert Claude , Physiology or Medicine, 1974",1974
100,Literature,Mexico,"March 31, 1914","April 19, 1998",male,http://en.wikipedia.org/wiki/Octavio_Paz,Octavio Paz Lozano,"Mexico City , Mexico","Mexico City , Mexico","Octavio Paz Lozano , Literature, 1990",1990
1000,Peace,Canada,,,,http://en.wikipedia.org/wiki/Pugwash_Conferenc...,Pugwash Conferences on Science and World Affairs,,,Pugwash Conferences on Science and World Affai...,1995
1003,Economics,Canada,"October 24, 1932",,male,http://en.wikipedia.org/wiki/Robert_Mundell,Robert Mundell,"Kingston, Ontario , Canada",,"Robert Mundell , Economics, 1999",1999
1004,Physiology or Medicine,Canada,"November 9, 1952",,male,http://en.wikipedia.org/wiki/Jack_W._Szostak,Jack W. Szostak,"London , United Kingdom",,"Jack W. Szostak , born in the United Kingdom ...",2009
1006,Physiology or Medicine,Canada,"January 14, 1943","September 30, 2011",male,http://en.wikipedia.org/wiki/Ralph_M._Steinman,Ralph M. Steinman,"Montreal, Quebec, Canada","New York City, New York, U.S.","Ralph M. Steinman , Physiology or Medicine, 2011",2011
1007,Literature,Canada,1931-07-10,,female,http://en.wikipedia.org/wiki/Alice_Munro,Alice Munro,"Wingham, Ontario , Canada",,"Alice Munro , Literature, 2013",2013
1011,Peace,Bangladesh,28 June 1940,,male,http://en.wikipedia.org/wiki/Muhammad_Yunus,Muhammad Yunus,"Chittagong , British India",,"Muhammad Yunus , Peace, 2006",2006
1012,Physics,Azerbaijan,"January 22, 1908","April 1, 1968",male,http://en.wikipedia.org/wiki/Lev_Landau,Lev Landau,Baku,Moscow,"Lev Landau , born in then Russian Empire, lau...",1962


In [4]:
# Birth dates are stored as plain strings, written in several different formats.
df['date_of_birth']

0           8 October 1927
10          24 August 1899
100         March 31, 1914
1000                  None
1003      October 24, 1932
1004      November 9, 1952
1006      January 14, 1943
1007            1931-07-10
1011          28 June 1940
1012      January 22, 1908
1013          June 7, 1862
1014          June 9, 1843
1015      11 November 1864
1016         22 April 1876
1017      3 September 1869
1018          1 April 1865
1019          7 March 1857
1020         June 14, 1868
1021        12 August 1887
1022          24 June 1883
1025         25 April 1900
1026           19 May 1914
1028      November 7, 1903
1029            8 May 1899
1032       20 October 1946
1033                  None
1036     24 September 1898
1037      3 September 1899
1038       27 January 1903
1040           28 May 1912
               ...        
963       25 December 1918
964       11 December 1911
965      February 26, 1946
966           17 June 1942
967       26 December 1949
968        3 February 1948
9

In [5]:
# If we weren't using pandas, we could use a library like pendulum
# to parse a string into a datetime object.
# NOTE(review): newer pendulum releases may require strict=False to parse
# non-ISO strings such as this one -- confirm against the installed version.
pdt = pendulum.parse('February 26, 1946')

In [6]:
# Display the object returned by pendulum.parse.
pdt

<Pendulum [1946-02-26T00:00:00+00:00]>

In [7]:
# These objects have built-in helpers to dump them to strings,
# here as an ISO 8601 timestamp.
pdt.to_iso8601_string()

'1946-02-26T00:00:00+00:00'

In [8]:
# It can also figure out other formats, such as an ISO-style date.
# Note that this rebinds `pdt` to the newly parsed value.
pdt = pendulum.parse('1897-04-23')

In [9]:
# The same ISO 8601 helper, applied to the newly parsed date.
pdt.to_iso8601_string()

'1897-04-23T00:00:00+00:00'

In [10]:
# The same date rendered as an RFC 2822 string.
pdt.to_rfc2822_string()

'Fri, 23 Apr 1897 00:00:00 +0000'

In [11]:
# Returning to our data: there is also a death date column.
# Its values come in several formats too, or are missing entirely.
df['date_of_death']

0            24 March 2002
10             22 May 1983
100         April 19, 1998
1000                  None
1003                  None
1004                  None
1006    September 30, 2011
1007                  None
1011                  None
1012         April 1, 1968
1013          May 20, 1947
1014         June 21, 1914
1015            5 May 1921
1016          8 April 1936
1017      13 December 1930
1018     23 September 1929
1019     27 September 1940
1020         June 26, 1943
1021        4 January 1961
1022      17 December 1964
1025      15 December 1958
1026       6 February 2002
1028     February 27, 1989
1029         23 March 1992
1032                  None
1033                  None
1036      21 February 1968
1037        31 August 1985
1038            2 May 1997
1040     30 September 1990
               ...        
963         6 October 1981
964         30 August 2006
965                   None
966                   None
967                   None
968                   None
9

In [12]:
# Parse the birth date strings with `to_datetime` and write the result
# back onto the same column.
# NOTE(review): pandas >= 2.0 infers one format from the first value and may
# reject columns with mixed formats like this one -- confirm the pandas version.
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'])

In [13]:
# The death dates get the same treatment, but that column is messier,
# so unparseable values are coerced to the datetime null value (NaT).
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.to_datetime.html
# NOTE(review): coercion also hides parse failures; on newer pandas, values
# that do not match the inferred format could silently become NaT -- verify
# the non-null count afterwards.
df['date_of_death'] = pd.to_datetime(df['date_of_death'], errors='coerce')

In [14]:
# Now we can see that the two date columns have the datetime64 dtype,
# along with the non-null count of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 905 entries, 0 to 999
Data columns (total 11 columns):
category          904 non-null object
country           905 non-null object
date_of_birth     896 non-null datetime64[ns]
date_of_death     577 non-null datetime64[ns]
gender            896 non-null object
link              905 non-null object
name              905 non-null object
place_of_birth    870 non-null object
place_of_death    541 non-null object
text              905 non-null object
year              905 non-null int64
dtypes: datetime64[ns](2), int64(1), object(8)
memory usage: 124.8+ KB


In [15]:
# isnull() yields a boolean mask; use it to list the records without a birth date.
df.loc[df['date_of_birth'].isnull()]

Unnamed: 0,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
1000,Peace,Canada,NaT,NaT,,http://en.wikipedia.org/wiki/Pugwash_Conferenc...,Pugwash Conferences on Science and World Affairs,,,Pugwash Conferences on Science and World Affai...,1995
1033,Peace,Austria,NaT,NaT,,http://en.wikipedia.org/wiki/International_Ato...,International Atomic Energy Agency,,,"International Atomic Energy Agency , Peace, 2005",2005
156,Peace,United Kingdom,NaT,NaT,,http://en.wikipedia.org/wiki/Friends_Service_C...,Friends Service Council,,,"Friends Service Council , Peace, 1947",1947
267,Peace,United States,NaT,NaT,,http://en.wikipedia.org/wiki/American_Friends_...,American Friends Service Committee (The Quakers),,,American Friends Service Committee (The Quake...,1947
3,Peace,Belgium,NaT,NaT,,http://en.wikipedia.org/wiki/Institut_de_Droit...,Institut de Droit International,,,"Institut de Droit International , Peace, 1904",1904
574,Peace,United Kingdom,NaT,NaT,,http://en.wikipedia.org/wiki/Amnesty_Internati...,Amnesty International,,,"Amnesty International , Peace, 1977",1977
632,Peace,Switzerland,NaT,NaT,,http://en.wikipedia.org/wiki/M%C3%A9decins_San...,Médecins Sans Frontières,,,"Médecins Sans Frontières , Peace, 1999",1999
782,Physics,Japan,NaT,NaT,male,http://en.wikipedia.org/wiki/Hiroshi_Amano,Hiroshi Amano,,,"Hiroshi Amano , Physics, 2014",2014
947,Peace,France,NaT,NaT,,http://en.wikipedia.org/wiki/M%C3%A9decins_San...,Médecins Sans Frontières,,,"Médecins Sans Frontières , Peace, 1999",1999


In [16]:
# Number of rows before removing anything.
df.shape[0]

905

In [17]:
# Remove the 9 rows that have no birth date (mostly organizations, which
# cannot have an award age).
# dropna(subset=...) removes exactly the rows whose birth date is null.
# Dropping by index label instead would also remove any valid row that
# happened to share a label with a null row.
# inplace=True keeps mutating the same `df` object the later cells use.
df.dropna(subset=['date_of_birth'], inplace=True)

In [18]:
# Number of rows remaining after the drop.
df.shape[0]

896

In [19]:
# Next we compute each winner's age at the time of the award.
# With single Python values such as:
# year = 1952
# date_of_birth = 1899
# we would simply subtract them:
# age = year - date_of_birth
#
# This is where the power of pandas and numpy comes through.
# We don't have single values, we have whole columns (series in pandas
# parlance), but we can operate on an entire series as if it were one value.
# The `.dt` accessor on a datetime series pulls the requested attribute out
# of every individual date,
# so df.date_of_birth == [<date>, <date>, <date>, ...]
# and df.date_of_birth.dt.year turns it into [1892, 1903, 1871, etc].
# Both operands are Series carrying df's index, so the subtraction lines up
# row by row; the result is added to the dataframe as a new column.
# Note: this is a difference of calendar years only. It ignores whether the
# birthday had already passed when the prize was awarded.
df['award_age'] = df.year - df.date_of_birth.dt.year

In [20]:
# Display the newly computed ages.
df.award_age


0       57
10      75
100     76
1003    67
1004    57
1006    68
1007    82
1011    66
1012    54
1013    43
1014    62
1015    47
1016    38
1017    54
1018    60
1019    70
1020    62
1021    46
1022    53
1025    45
1026    48
1028    70
1029    75
1032    58
1036    47
1037    61
1038    60
1040    61
1042    74
1043    56
        ..
963     60
964     77
965     53
966     63
967     47
968     48
969     43
97      52
970     71
971     60
973     47
974     55
975     31
976     35
977     54
98      46
981     55
982     57
983     56
984     67
985     37
986     32
988     60
99      71
990     67
994     57
995     50
996     61
998     61
999     76
Name: award_age, Length: 896, dtype: int64

In [21]:
# Numerical columns have a set of helpers such as mean and median

In [22]:
# Average age at the time of the award.
df['award_age'].mean()

59.261160714285715

In [23]:
# Median age at the time of the award.
df['award_age'].median()

60.0

In [32]:
# Save the cleaned frame as a JSON list of row objects.
# `orient` is passed by keyword: newer pandas releases deprecate positional
# arguments to to_json other than the output path.
# NOTE(review): with the default date_format the datetime columns are
# presumably written as epoch milliseconds -- confirm that downstream
# readers expect that representation.
df.to_json("wiki-clean.json", orient="records")