# Agenda: Grouping

1. Simple grouping
2. More complex grouping
    - More than one grouping column
    - More than one calculating column
    - More than one aggregation method
3. Pivot tables (2D grouping)
4. Moving rows to columns and back

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# path is relative to the notebook's working directory
filename = '../data/taxi.csv'

# load the taxi rides into a data frame
df = pd.read_csv(filename)

In [3]:
# preview the first five rows to see which columns we have
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RateCodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-06-02 11:19:29,2015-06-02 11:47:52,1,1.63,-73.95443,40.764141,1,N,-73.974754,40.754093,2,17.0,0.0,0.5,0.0,0.0,0.3,17.8
1,2,2015-06-02 11:19:30,2015-06-02 11:27:56,1,0.46,-73.971443,40.758942,1,N,-73.978539,40.761909,1,6.5,0.0,0.5,1.0,0.0,0.3,8.3
2,2,2015-06-02 11:19:31,2015-06-02 11:30:30,1,0.87,-73.978111,40.738434,1,N,-73.990273,40.745438,1,8.0,0.0,0.5,2.2,0.0,0.3,11.0
3,2,2015-06-02 11:19:31,2015-06-02 11:39:02,1,2.13,-73.945892,40.773529,1,N,-73.971527,40.76033,1,13.5,0.0,0.5,2.86,0.0,0.3,17.16
4,1,2015-06-02 11:19:32,2015-06-02 11:32:49,1,1.4,-73.979088,40.776772,1,N,-73.982162,40.758999,2,9.5,0.0,0.5,0.0,0.0,0.3,10.3


In [6]:
# mean trip_distance over the rows where passenger_count is 1
#
# .loc[row_selector, column_selector]: a boolean series picks the rows,
# a column name picks the column
df.loc[df['passenger_count'] == 1, 'trip_distance'].mean()

3.0923380047176354

In [7]:
# the same calculation, now for the rows where passenger_count is 2
# (.eq(2) is the method form of == 2)
df.loc[df['passenger_count'].eq(2), 'trip_distance'].mean()

3.3843869002284848

In [8]:
# ... and once more for the rows where passenger_count is 3
df.loc[df['passenger_count'] == 3, 'trip_distance'].mean()

3.3423891625615765

In [9]:
# Three near-identical cells is a hint that we want something more general:
# for every unique value of passenger_count, take the mean of trip_distance
# over the rows that have that value.

# That is the essence of *grouping*. It needs three ingredients:
#
# - a categorical column, whose unique values define the groups
# - a numeric column to calculate on
# - an aggregation method, giving one result per group

# the syntax is:
#
# .groupby(categorical_column)[numeric_column].agg_method()
(
    df
    .groupby('passenger_count')['trip_distance']
    .mean()
)

passenger_count
0    4.600000
1    3.092338
2    3.384387
3    3.342389
4    3.628901
5    3.182712
6    3.170976
Name: trip_distance, dtype: float64

In [10]:
# what does this give me?
# a SeriesGroupBy object: the groups are defined, but nothing is aggregated
# until we call a method such as .mean() or .std() on it
df.groupby('passenger_count')['trip_distance']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7f0e8314e060>

In [11]:
# same grouping, different aggregation method
df.groupby('passenger_count')['trip_distance'].std()   # standard deviation

passenger_count
0    4.666905
1    4.020187
2    4.242826
3    3.822041
4    4.351369
5    3.969468
6    3.759807
Name: trip_distance, dtype: float64

In [13]:
# we can also aggregate other numeric columns -- including ones we compute ourselves

# tip as a share of the total amount; despite the name, this is a fraction
# (e.g. 0.09), not a percentage -- multiply by 100 for percent
df['tip_percentage'] = df['tip_amount'] / df['total_amount']   # adds a new column to df

In [16]:
# does the average tip share vary with the number of passengers?
(
    df
    .groupby('passenger_count')['tip_percentage']
    .mean()
)

passenger_count
0    0.183127
1    0.092880
2    0.088309
3    0.087368
4    0.077067
5    0.094349
6    0.086075
Name: tip_percentage, dtype: float64

In [17]:
# VendorID identifies the manufacturer of the computer in the taxi;
# does the average tip share differ between vendors?
(
    df
    .groupby('VendorID')['tip_percentage']
    .mean()
)

VendorID
1    0.091538
2    0.091680
Name: tip_percentage, dtype: float64

# Why not sort the `groupby` results?

1. If you're going to be sorting by values immediately after, why waste the time/CPU?
2. The sorting does take some time -- maybe you don't want to waste time on that

In [18]:
# what if we groupby a non-categorical column?
# (don't do this!)

# for every distinct value of trip_distance
# find the mean total_amount
#
# we get one group per distinct distance -- over a thousand of them in this
# data set -- so the result summarizes very little
df.groupby('trip_distance')['total_amount'].mean()

trip_distance
0.00      31.58194
0.01      52.80000
0.02      43.46000
0.03       3.96000
0.04      70.01000
           ...    
34.84    137.59000
35.51    135.13000
37.20    210.14000
60.30    160.05000
64.60     79.96000
Name: total_amount, Length: 1219, dtype: float64

In [21]:
# list the CSV files available in the data directory
!ls ../data/*csv*

../data/2020_sharing_data_outside.csv  ../data/olympic_athlete_events.csv
../data/CPILFESL.csv		       ../data/san+francisco,ca.csv
../data/albany,ny.csv		       ../data/sat-scores.csv
../data/boston,ma.csv		       ../data/skyscrapers.csv
../data/burrito_current.csv	       ../data/springfield,il.csv
../data/celebrity_deaths_2016.csv      ../data/springfield,ma.csv
../data/chicago,il.csv		       ../data/taxi-distance.csv
../data/eu_cpi.csv		       ../data/taxi-passenger-count.csv
../data/eu_gdp.csv		       ../data/taxi.csv
../data/ice-cream.csv		       ../data/titanic3.csv
../data/languages.csv		       ../data/us-median-cpi.csv
../data/los+angeles,ca.csv	       ../data/us-unemployment-rate.csv
../data/miles-traveled.csv	       ../data/us_gdp.csv
../data/new+york,ny.csv		       ../data/winemag-150k-reviews.csv
../data/oecd_locations.csv	       ../data/wti-daily.csv
../data/oecd_tourism.csv


In [22]:
# peek at the raw text of the Olympics file before loading it
!head ../data/olympic_athlete_events.csv

"ID","Name","Sex","Age","Height","Weight","Team","NOC","Games","Year","Season","City","Sport","Event","Medal"
"1","A Dijiang","M",24,180,80,"China","CHN","1992 Summer",1992,"Summer","Barcelona","Basketball","Basketball Men's Basketball",NA
"2","A Lamusi","M",23,170,60,"China","CHN","2012 Summer",2012,"Summer","London","Judo","Judo Men's Extra-Lightweight",NA
"3","Gunnar Nielsen Aaby","M",24,NA,NA,"Denmark","DEN","1920 Summer",1920,"Summer","Antwerpen","Football","Football Men's Football",NA
"4","Edgar Lindenau Aabye","M",34,NA,NA,"Denmark/Sweden","DEN","1900 Summer",1900,"Summer","Paris","Tug-Of-War","Tug-Of-War Men's Tug-Of-War","Gold"
"5","Christine Jacoba Aaftink","F",21,185,82,"Netherlands","NED","1988 Winter",1988,"Winter","Calgary","Speed Skating","Speed Skating Women's 500 metres",NA
"5","Christine Jacoba Aaftink","F",21,185,82,"Netherlands","NED","1988 Winter",1988,"Winter","Calgary","Speed Skating","Speed Skating Women's 1,000 metres",NA
"5","Christine Jacoba Aaftink","F",25,1

# Exercise: Olympic calculations

1. Load the file `../data/olympic_athlete_events.csv` into a data frame.
2. Find the mean age for people in each sport.
3. Find the mean height for people in each sport after 1960.

In [23]:
# NOTE: this rebinds df -- from here on it holds the Olympic data, not the taxi rides
filename = '../data/olympic_athlete_events.csv'

df = pd.read_csv(filename)

In [24]:
# exercise part 2: mean age of the athletes in each sport
df.groupby('Sport')['Age'].mean()

Sport
Aeronautics         26.000000
Alpine Skiing       23.205462
Alpinism            38.812500
Archery             27.935226
Art Competitions    45.901009
                      ...    
Tug-Of-War          29.309524
Volleyball          25.183800
Water Polo          25.659627
Weightlifting       25.502010
Wrestling           25.798289
Name: Age, Length: 66, dtype: float64

In [27]:
# which 10 sports have the oldest athletes, on average?

df.groupby('Sport')['Age'].mean().sort_values(ascending=False).head(10)

Sport
Roque               53.333333
Art Competitions    45.901009
Alpinism            38.812500
Polo                35.333333
Equestrianism       34.390831
Croquet             33.733333
Shooting            33.422226
Motorboating        33.333333
Jeu De Paume        32.454545
Curling             31.412527
Name: Age, dtype: float64

In [34]:
# Exercise part 3: mean height per sport, counting only rows after 1960.
#
# Filter first (keeping just the two columns we need), then group,
# then sort so that the tallest sports come first.
(
    df.loc[df['Year'] > 1960, ['Sport', 'Height']]
    .groupby('Sport')['Height']
    .mean()
    .sort_values(ascending=False)
)

Sport
Basketball                   191.451050
Volleyball                   186.994822
Beach Volleyball             186.144954
Water Polo                   185.213519
Rowing                       184.483305
Handball                     183.386017
Baseball                     182.599291
Bobsleigh                    181.680074
Ice Hockey                   179.066636
Tennis                       178.941767
Swimming                     178.802635
Canoeing                     178.713466
Sailing                      178.418746
Modern Pentathlon            178.234389
Fencing                      177.546094
Nordic Combined              176.763566
Taekwondo                    176.750836
Luge                         176.657742
Ski Jumping                  176.642412
Cycling                      176.253339
Skeleton                     176.098266
Athletics                    176.085087
Football                     175.564442
Rugby Sevens                 175.363636
Equestrianism                174.4

In [37]:
# the same calculation, filtering with the .query method

# NOTE(review): the output saved under this cell is a DataFrame preview of
# Sport/Height rows, which this chain cannot produce (it ends in a sorted
# Series of means) -- re-run the cell to refresh it
(
    df
    .query('Year > 1960')  # the condition is a string; bare names in it refer to df's columns
    .loc[:, ['Sport', 'Height']]
    .groupby('Sport')['Height']
    .mean()
    .sort_values(ascending=False)
)

Unnamed: 0,Sport,Height
0,Basketball,180.0
1,Judo,170.0
4,Speed Skating,185.0
5,Speed Skating,185.0
6,Speed Skating,185.0
...,...,...
271111,Luge,179.0
271112,Ski Jumping,176.0
271113,Ski Jumping,176.0
271114,Bobsleigh,185.0


In [38]:
# go the functional-programming route:
# .loc accepts a callable row selector, which is handed the frame being indexed

(
    df
    .loc[lambda frame: frame['Year'] > 1960, ['Sport', 'Height']]
    .groupby('Sport')['Height']
    .mean()
    .sort_values(ascending=False)
)

Sport
Basketball                   191.451050
Volleyball                   186.994822
Beach Volleyball             186.144954
Water Polo                   185.213519
Rowing                       184.483305
Handball                     183.386017
Baseball                     182.599291
Bobsleigh                    181.680074
Ice Hockey                   179.066636
Tennis                       178.941767
Swimming                     178.802635
Canoeing                     178.713466
Sailing                      178.418746
Modern Pentathlon            178.234389
Fencing                      177.546094
Nordic Combined              176.763566
Taekwondo                    176.750836
Luge                         176.657742
Ski Jumping                  176.642412
Cycling                      176.253339
Skeleton                     176.098266
Athletics                    176.085087
Football                     175.564442
Rugby Sevens                 175.363636
Equestrianism                174.4

# More complex!

- What if we want to group taxi info not just by passenger_count, but also by vendor ID?

In other words, I want to know, not just for each unique value of `passenger_count`, but for each unique combination of `passenger_count` and `VendorID`, what was the mean `trip_distance`.

Remember that in Pandas, just about anywhere that we can pass a string (indicating one column), we can pass a list (indicating multiple columns).

In [42]:
# df was rebound to the Olympic data earlier, so reload the taxi rides first
filename = '../data/taxi.csv'
df = pd.read_csv(filename)

# a list of column names gives one group per
# (passenger_count, VendorID) combination
results = (
    df
    .groupby(['passenger_count', 'VendorID'])['trip_distance']
    .mean()
)
results

passenger_count  VendorID
0                1           4.600000
1                1           2.956456
                 2           3.262967
2                1           3.452027
                 2           3.328849
3                1           3.588535
                 2           3.187189
4                1           3.952239
                 2           3.440522
5                1           4.933333
                 2           3.172553
6                2           3.170976
Name: trip_distance, dtype: float64

In [43]:
# we have a series with a *multi-index*
# it's a hierarchical index!

# a single label selects on the outer level (passenger_count);
# what comes back is indexed by the remaining level, VendorID
results.loc[1]


VendorID
1    2.956456
2    3.262967
Name: trip_distance, dtype: float64

In [44]:
# show me with passenger_count of either 1 or 4
# (passing a list of labels keeps both index levels in the result)
results.loc[[1, 4]]

passenger_count  VendorID
1                1           2.956456
                 2           3.262967
4                1           3.952239
                 2           3.440522
Name: trip_distance, dtype: float64

In [45]:
# one grouping column, but two columns to calculate on:
# pass a list of column names (note the double square brackets)
# and get back a data frame with one column per calculation
(
    df
    .groupby('passenger_count')[['trip_distance', 'total_amount']]
    .mean()
)

Unnamed: 0_level_0,trip_distance,total_amount
passenger_count,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4.6,25.57
1,3.092338,17.368569
2,3.384387,18.406306
3,3.342389,17.994704
4,3.628901,18.881648
5,3.182712,17.211269
6,3.170976,17.401355


In [46]:
# more than one aggregation method at once:
# call "agg" with a list of strings naming the methods we want;
# each method becomes a column in the result
(
    df
    .groupby('passenger_count')['trip_distance']
    .agg(['mean', 'median', 'std'])
)

Unnamed: 0_level_0,mean,median,std
passenger_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4.6,4.6,4.666905
1,3.092338,1.63,4.020187
2,3.384387,1.8,4.242826
3,3.342389,1.7,3.822041
4,3.628901,2.0,4.351369
5,3.182712,1.675,3.969468
6,3.170976,1.67,3.759807


In [None]:
# can I provide multiples of more than one of these?
# OF COURSE YOU CAN!

df.groupby(['passenger_count', 'VendorID'])['trip