In [1]:
# It's a popular and common convention
# to import pandas into the "pd" namespace
import pandas as pd

In [2]:
# Create a dataframe, the core pandas data container from the json
df = pd.read_json("nobel_winners_dirty.json")

In [3]:
df

Unnamed: 0,born_in,category,country,date_of_birth,date_of_death,gender,link,name,place_of_birth,place_of_death,text,year
0,,Physiology or Medicine,Argentina,8 October 1927,24 March 2002,male,http://en.wikipedia.org/wiki/C%C3%A9sar_Milstein,César Milstein,"Bahía Blanca , Argentina","Cambridge , England","César Milstein , Physiology or Medicine, 1984",1984
1,Bosnia and Herzegovina,Literature,,9 October 1892,13 March 1975,male,http://en.wikipedia.org/wiki/Ivo_Andric,Ivo Andric *,"Dolac (village near Travnik), Austria-Hungary ...","Belgrade, SR Serbia, SFR Yugoslavia (present-d...","Ivo Andric *, born in then Austria–Hungary ,...",1961
2,Bosnia and Herzegovina,Chemistry,,"July 23, 1906",1998-01-07,male,http://en.wikipedia.org/wiki/Vladimir_Prelog,Vladimir Prelog *,"Sarajevo , Bosnia and Herzegovina , then part...","Zürich , Switzerland","Vladimir Prelog *, born in then Austria–Hung...",1975
3,,Peace,Belgium,,,,http://en.wikipedia.org/wiki/Institut_de_Droit...,Institut de Droit International,,,"Institut de Droit International , Peace, 1904",1904
4,,Peace,Belgium,26 July 1829,6 October 1912,male,http://en.wikipedia.org/wiki/Auguste_Marie_Fra...,Auguste Beernaert,"Ostend , Netherlands (now Belgium )","Lucerne , Switzerland","Auguste Beernaert , Peace, 1909",1909
5,,Literature,Belgium,29 August 1862,6 May 1949,male,http://en.wikipedia.org/wiki/Maurice_Maeterlinck,Maurice Maeterlinck,"Ghent , Belgium","Nice , France","Maurice Maeterlinck , Literature, 1911",1911
6,,Peace,Belgium,22 April 1854,14 May 1943,male,http://en.wikipedia.org/wiki/Henri_La_Fontaine,Henri La Fontaine,Brussels,Belgium,"Henri La Fontaine , Peace, 1913",1913
7,,Physiology or Medicine,Belgium,13 June 1870,6 April 1961,male,http://en.wikipedia.org/wiki/Jules_Bordet,Jules Bordet,"Soignies, Belgium",,"Jules Bordet , Physiology or Medicine, 1919",1919
8,,Physiology or Medicine,Belgium,28 March 1892,18 July 1968,male,http://en.wikipedia.org/wiki/Corneille_Heymans,Corneille Heymans,"Ghent , Flanders","Knokke , Flanders","Corneille Heymans , Physiology or Medicine, 1938",1938
9,,Peace,Belgium,1910-02-10,1969-01-30,male,http://en.wikipedia.org/wiki/Georges_Pire,Georges Pire,"Dinant , Belgium","Leuven , Belgium","Georges Pire , Peace, 1958",1958


In [4]:
# Examine the columns that are available
df.columns

Index(['born_in', 'category', 'country', 'date_of_birth', 'date_of_death',
       'gender', 'link', 'name', 'place_of_birth', 'place_of_death', 'text',
       'year'],
      dtype='object')

In [5]:
# And list the indicies...
df.index

Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050, 1051],
           dtype='int64', length=1052)

In [6]:
# The indicies were by number... but we can change it to another columm
# Convert to name...
df = df.set_index("name")

In [7]:
# Now we can lookup by name
df.loc["Albert Einstein"]

Unnamed: 0_level_0,born_in,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Albert Einstein,,Physics,Switzerland,1879-03-14,1955-04-18,male,http://en.wikipedia.org/wiki/Albert_Einstein,"Ulm , Baden-Württemberg , German Empire","Princeton, New Jersey , U.S.","Albert Einstein , born in Germany , Physics, ...",1921
Albert Einstein,,Physics,Germany,1879-03-14,1955-04-18,male,http://en.wikipedia.org/wiki/Albert_Einstein,"Ulm , Baden-Württemberg , German Empire","Princeton, New Jersey , U.S.","Albert Einstein , Physics, 1921",1921


In [8]:
# And reset it back to integer row ids
df = df.reset_index()

In [9]:
# Get the 3rd row (remember they are zero-based)
# iloc selects by position
# Remember we can always use
#df.iloc?
# To show the help docs...
df.iloc[2]

name                                              Vladimir Prelog *
born_in                                      Bosnia and Herzegovina
category                                                  Chemistry
country                                                            
date_of_birth                                         July 23, 1906
date_of_death                                            1998-01-07
gender                                                         male
link                   http://en.wikipedia.org/wiki/Vladimir_Prelog
place_of_birth    Sarajevo ,  Bosnia and Herzegovina , then part...
place_of_death                                Zürich ,  Switzerland
text              Vladimir Prelog *,  born in then  Austria–Hung...
year                                                           1975
Name: 2, dtype: object

In [10]:
# Get the gender column
gender = df.gender

In [11]:
gender.head()

0    male
1    male
2    male
3    None
4    male
Name: gender, dtype: object

In [12]:
# We can group the rows...
grouped = df.groupby("category")

In [13]:
# Which returns another type of dataframe
type(grouped)

pandas.core.groupby.DataFrameGroupBy

In [14]:
grouped.groups

{'': Int64Index([104, 712, 812, 815, 820, 922], dtype='int64'),
 'Chemistry': Int64Index([   2,   12,   26,   30,   33,   34,   49,   52,   59,   60,
             ...
              997,  998, 1017, 1018, 1024, 1026, 1030, 1034, 1041, 1050],
            dtype='int64', length=212),
 'Economics': Int64Index([  14,   32,   84,   93,   96,  133,  137,  141,  164,  165,  169,
              170,  177,  178,  183,  184,  190,  195,  202,  205,  223,  322,
              327,  333,  341,  347,  359,  363,  368,  373,  379,  383,  401,
              402,  403,  405,  408,  417,  424,  425,  438,  439,  445,  446,
              447,  456,  457,  461,  469,  473,  476,  480,  486,  487,  488,
              493,  503,  504,  507,  508,  511,  512,  513,  563,  570,  576,
              579,  589,  653,  656,  668,  684,  685,  695,  721,  733,  735,
              736,  753,  755,  827,  893,  940,  943,  957, 1001, 1002, 1003,
             1029, 1042],
            dtype='int64'),
 'Literature': Int64

In [15]:
# We can fetch a single group
physics_group = grouped.get_group('Physics')

In [16]:
physics_group.head()

Unnamed: 0,name,born_in,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
13,François Englert,,Physics,Belgium,6 November 1932,,male,http://en.wikipedia.org/wiki/Fran%C3%A7ois_Eng...,"Etterbeek , Brussels , Belgium",,"François Englert , Physics, 2013",2013
19,Niels Bohr,,Physics,Denmark,7 October 1885,18 November 1962,male,http://en.wikipedia.org/wiki/Niels_Bohr,"Copenhagen, Denmark","Copenhagen, Denmark","Niels Bohr , Physics, 1922",1922
23,Ben Roy Mottelson,,Physics,Denmark,"July 9, 1926",,male,http://en.wikipedia.org/wiki/Ben_Roy_Mottelson,"Chicago, Illinois",,"Ben Roy Mottelson , Physics, 1975",1975
24,Aage Bohr,,Physics,Denmark,19 June 1922,8 September 2009,male,http://en.wikipedia.org/wiki/Aage_Bohr,"Copenhagen , Denmark","Copenhagen , Denmark","Aage Bohr , Physics, 1975",1975
47,Alfred Kastler,,Physics,France,3 May 1902,7 January 1984,male,http://en.wikipedia.org/wiki/Alfred_Kastler,Guebwiller,"Bandol , France","Alfred Kastler , Physics, 1966",1966


In [17]:
# We can also filter the groups using the underlying NumPy 
# operations where we operate on the dataframe as if
# it were a single value
(df.category == 'Physics').head()

0    False
1    False
2    False
3    False
4    False
Name: category, dtype: bool

In [18]:
# Note that we have a list of boolean values from the comparison...
# We need to use that as a filter in our dataframe accessor
df[df.category == 'Physics']

Unnamed: 0,name,born_in,category,country,date_of_birth,date_of_death,gender,link,place_of_birth,place_of_death,text,year
13,François Englert,,Physics,Belgium,6 November 1932,,male,http://en.wikipedia.org/wiki/Fran%C3%A7ois_Eng...,"Etterbeek , Brussels , Belgium",,"François Englert , Physics, 2013",2013
19,Niels Bohr,,Physics,Denmark,7 October 1885,18 November 1962,male,http://en.wikipedia.org/wiki/Niels_Bohr,"Copenhagen, Denmark","Copenhagen, Denmark","Niels Bohr , Physics, 1922",1922
23,Ben Roy Mottelson,,Physics,Denmark,"July 9, 1926",,male,http://en.wikipedia.org/wiki/Ben_Roy_Mottelson,"Chicago, Illinois",,"Ben Roy Mottelson , Physics, 1975",1975
24,Aage Bohr,,Physics,Denmark,19 June 1922,8 September 2009,male,http://en.wikipedia.org/wiki/Aage_Bohr,"Copenhagen , Denmark","Copenhagen , Denmark","Aage Bohr , Physics, 1975",1975
47,Alfred Kastler,,Physics,France,3 May 1902,7 January 1984,male,http://en.wikipedia.org/wiki/Alfred_Kastler,Guebwiller,"Bandol , France","Alfred Kastler , Physics, 1966",1966
51,Arno Penzias *,Germany,Physics,,"April 26, 1933",,male,http://en.wikipedia.org/wiki/Arno_Penzias,"Munich , Germany",,"Arno Penzias *, Physics, 1978",1978
54,Klaus von Klitzing,,Physics,Germany,"June 28, 1943",,male,http://en.wikipedia.org/wiki/Klaus_von_Klitzing,Schroda (Posen),,"Klaus von Klitzing , Physics, 1985",1985
55,Gerd Binnig,,Physics,Germany,20 July 1947,,male,http://en.wikipedia.org/wiki/Gerd_Binnig,Frankfurt am Main,,"Gerd Binnig , Physics, 1986",1986
56,Ernst Ruska,,Physics,Germany,25 December 1906,27 May 1988,male,http://en.wikipedia.org/wiki/Ernst_Ruska,"Heidelberg , Germany","West Berlin , Germany","Ernst Ruska , Physics, 1986",1986
57,J. Georg Bednorz,,Physics,Germany,"May 16, 1950",,male,http://en.wikipedia.org/wiki/J._Georg_Bednorz,"Neuenkirchen , North Rhine-Westphalia , Germany",,"J. Georg Bednorz , Physics, 1987",1987


In [19]:
# And now we have only the physics results. Different method, same result.
len(df[df.category == 'Physics']) == len(physics_group)

True

In [20]:
# You can create a dataframe from datasets grouped by keys...
pd.DataFrame({
    'name': ['bob', 'sally', 'susan'],
    'age': [23, 35, 19]
})

Unnamed: 0,age,name
0,23,bob
1,35,sally
2,19,susan


In [21]:
# But you have to have the same number of values!!
try:
    pd.DataFrame({
        'name': ['bob', 'sally', 'susan'],
        'age': [23, 35, 19],
        'fav_color': ['red'],
    })
except ValueError as e:
    print(e)

arrays must all be same length


In [22]:
# Now this will work...
pd.DataFrame({
    'name': ['bob', 'sally', 'susan'],
    'age': [23, 35, 19],
    'fav_color': ['red', 'green', 'blue'],
})

Unnamed: 0,age,fav_color,name
0,23,red,bob
1,35,green,sally
2,19,blue,susan


In [23]:
# Or, more akin to JSON, use rows of dictionaries:
pd.DataFrame.from_dict([
    {"name": "bob", "age": 23},
    {"name": "sally", "age": 35},
    {"name": "susan", "age": 19},
])

Unnamed: 0,age,name
0,23,bob
1,35,sally
2,19,susan


In [24]:
# Only this time missing values are treated differently...
pd.DataFrame.from_dict([
    {"name": "bob", "age": 23, "fav_color": "red"},
    {"name": "sally", "age": 35},
    {"name": "susan", "age": 19},
])

Unnamed: 0,age,fav_color,name
0,23,red,bob
1,35,,sally
2,19,,susan


In [25]:
# Pandas has many helpers to load data, including 
# from JSON (which we used above)
# from CSV
# from databases

In [26]:
# Let's save our data back out...
physics_group.to_json('physics.json')

In [27]:
# Let's check the first 100 characters of our file...
cat physics.json | cut -c 1-100

SyntaxError: invalid syntax (<ipython-input-27-0cab5d72424e>, line 2)