# Grouping vs. pivot tables

In [1]:
import pandas as pd

# Name of the CSV file with the Titanic passenger records.
filename = 'titanic3.csv'

# Read the whole file into the data frame used by the cells below.
df = pd.read_csv(filepath_or_buffer=filename)

In [2]:
# Display the data frame; the "..." row in the rendered output marks the
# rows that the display leaves out.
df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0000,0.0,0.0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1.0,2.0,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,3.0,0.0,"Zabour, Miss. Thamine",female,,1.0,0.0,2665,14.4542,,C,,,
1306,3.0,0.0,"Zakarian, Mr. Mapriededer",male,26.5000,0.0,0.0,2656,7.2250,,C,,304.0,
1307,3.0,0.0,"Zakarian, Mr. Ortin",male,27.0000,0.0,0.0,2670,7.2250,,C,,,
1308,3.0,0.0,"Zimmerman, Mr. Leo",male,29.0000,0.0,0.0,315082,7.8750,,S,,,


In [3]:
# Overall average ticket price on the Titanic: the mean of the
# 'fare' column, as one number for the whole data set.

df.loc[:, 'fare'].mean()

np.float64(33.29547928134557)

In [4]:
# Average ticket price *per* passenger class.
#
# A groupby needs three ingredients:
# - a categorical column to split on (pclass)
# - a numeric column to summarize (fare)
# - an aggregation method (mean)

(
    df
    .groupby('pclass')['fare']
    .mean()
)

pclass
1.0    87.508992
2.0    21.179196
3.0    13.302889
Name: fare, dtype: float64

In [5]:
# Average ticket price per class *and* per sex.
#
# Grouping on two categorical columns gives a series whose index has
# two levels (pclass, sex) -- a multi-index.

(
    df
    .groupby(['pclass', 'sex'])['fare']
    .mean()
)

pclass  sex   
1.0     female    109.412385
        male       69.888385
2.0     female     23.234827
        male       19.904946
3.0     female     15.324250
        male       12.415462
Name: fare, dtype: float64

In [6]:
# The grouped result can also be laid out as a 2D table, with pclass as
# the rows (index) and sex as the columns.
#
# A pivot table does exactly that.

df.pivot_table(values='fare',
               index='pclass',
               columns='sex',
               aggfunc='mean')

sex,female,male
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,109.412385,69.888385
2.0,23.234827,19.904946
3.0,15.32425,12.415462


In [7]:
# Starting from the series with the two-level (pclass, sex) index: can it
# be turned into a pivot table? Yes -- unstack moves the 'sex' level of
# the index into the columns.

(
    df
    .groupby(['pclass', 'sex'])['fare']
    .mean()
    .unstack('sex')
)

sex,female,male
pclass,Unnamed: 1_level_1,Unnamed: 2_level_1
1.0,109.412385,69.888385
2.0,23.234827,19.904946
3.0,15.32425,12.415462


In [8]:
# Round trip: after unstacking 'sex' into the columns, stack moves those
# columns back into the index, giving a series with the two-level
# (pclass, sex) index again.
(
    df
    .groupby(['pclass', 'sex'])['fare']
    .mean()
    .unstack('sex')
    .stack()
)

pclass  sex   
1.0     female    109.412385
        male       69.888385
2.0     female     23.234827
        male       19.904946
3.0     female     15.324250
        male       12.415462
dtype: float64

In [9]:
# Unstacking the 'pclass' level instead puts the classes in the columns
# and leaves sex as the rows.
(
    df
    .groupby(['pclass', 'sex'])['fare']
    .mean()
    .unstack('pclass')
)

pclass,1.0,2.0,3.0
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,109.412385,23.234827,15.32425
male,69.888385,19.904946,12.415462
