# How to Use GroupBy with Multiple Columns in Pandas

Source: https://datascientyst.com/use-groupby-multiple-columns-pandas/

In [1]:
import pandas as pd
# Load the zipped CSV, parse `date` into datetimes and derive a monthly
# period column (`date_m`) from it in one chained expression.
df = pd.read_csv('../data/medium_data.csv.zip').assign(
    date=lambda frame: pd.to_datetime(frame['date']),
    # Evaluated after `date` above, so it sees the parsed datetimes.
    date_m=lambda frame: frame['date'].dt.to_period('M'),
)

In [2]:
# Columns shown in the preview below.
cols = ['title', 'date', 'publication', 'url']
# A fixed random_state makes the preview show the same five rows on every
# Restart & Run All instead of a fresh random draw each time.
df[cols].sample(5, random_state=42)

Unnamed: 0,title,date,publication,url
3974,How to Start a Motorcycle Ride sharing Company...,2019-05-24,Data Driven Investor,https://medium.com/datadriveninvestor/how-to-s...
4318,How To Build a Pitch Deck,2019-10-23,Data Driven Investor,https://medium.com/datadriveninvestor/how-to-b...
1679,"What is Marketing Segmentation, and why you ne...",2019-10-28,The Startup,https://medium.com/swlh/what-is-marketing-segm...
4203,How to create a meaningful relationship,2019-10-23,UX Collective,https://uxdesign.cc/how-to-create-a-meaningful...
3926,"<strong class=""markup--strong markup--h3-stron...",2019-05-24,The Startup,https://medium.com/swlh/how-to-navigate-the-st...


## Step 2: Group by multiple columns

In [3]:
# Grouping on its own only builds a DataFrameGroupBy object; no aggregation
# has been applied yet.
df.groupby(by='publication')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7ff1d16349a0>

## Step 3: SeriesGroupBy vs DataFrameGroupBy

In [4]:
# Selecting a single column label after grouping yields a SeriesGroupBy.
df.groupby(by=['publication'])['url']

<pandas.core.groupby.generic.SeriesGroupBy object at 0x7ff1d1634d00>

In [5]:
# Selecting a list of columns after grouping yields a DataFrameGroupBy.
df.groupby(by=['publication'])[['url', 'date']]

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7ff2243ef2b0>

In [6]:
# A list of keys groups by each (publication, month) combination.
df.groupby(by=['publication', 'date_m'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7ff1d15cf700>

In [7]:
# Count of non-null `url` values for every (publication, month) pair.
(
    df
    .groupby(by=['publication', 'date_m'])['url']
    .count()
)

publication    date_m 
Better Humans  2019-03     5
               2019-04     4
               2019-05     4
               2019-06     1
               2019-07     3
                          ..
UX Collective  2019-08    24
               2019-09    33
               2019-10    86
               2019-11    28
               2019-12    46
Name: url, Length: 79, dtype: int64

In [8]:
# Count of non-null `url` and `date` values for each publication.
(
    df
    .groupby(by=['publication'])[['url', 'date']]
    .count()
)

Unnamed: 0_level_0,url,date
publication,Unnamed: 1_level_1,Unnamed: 2_level_1
Better Humans,28,28
Better Marketing,242,242
Data Driven Investor,778,778
The Startup,3041,3041
The Writing Cooperative,403,403
Towards Data Science,1461,1461
UX Collective,555,555


## Step 4: Apply multiple agg functions

In [9]:
# Apply several aggregations at once: the result has one column per
# (source column, function) pair.
(
    df
    .groupby(by=['publication', 'date_m'])[['claps', 'reading_time']]
    .agg(['mean', 'count', 'sum'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,claps,claps,claps,reading_time,reading_time,reading_time
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count,sum,mean,count,sum
publication,date_m,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Better Humans,2019-03,1710.800000,5,8554,12.000000,5,60
Better Humans,2019-04,3942.250000,4,15769,7.500000,4,30
Better Humans,2019-05,1187.000000,4,4748,25.000000,4,100
Better Humans,2019-06,9700.000000,1,9700,5.000000,1,5
Better Humans,2019-07,1183.333333,3,3550,22.333333,3,67
...,...,...,...,...,...,...,...
UX Collective,2019-08,960.750000,24,23058,6.333333,24,152
UX Collective,2019-09,144.606061,33,4772,5.757576,33,190
UX Collective,2019-10,417.383721,86,35895,6.104651,86,525
UX Collective,2019-11,345.642857,28,9678,5.535714,28,155


In [10]:
# Average only the columns that have a meaningful mean. Aggregating the whole
# frame also hands text columns (e.g. `title`, `url`) to `mean`; older pandas
# silently dropped them, but pandas >= 2.0 raises a TypeError instead.
# Selecting the columns explicitly gives the same table on every version.
(
    df
    .groupby(['publication', 'date_m'])[['id', 'claps', 'reading_time', 'date']]
    .agg(['mean'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,id,claps,reading_time,date
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,mean,mean,mean
publication,date_m,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Better Humans,2019-03,5138.200000,1710.800000,12.000000,2019-03-17 04:48:00.000000000
Better Humans,2019-04,2308.250000,3942.250000,7.500000,2019-04-24 18:00:00.000000000
Better Humans,2019-05,3382.000000,1187.000000,25.000000,2019-05-14 06:00:00.000000000
Better Humans,2019-06,6326.000000,9700.000000,5.000000,2019-06-27 00:00:00.000000000
Better Humans,2019-07,1189.333333,1183.333333,22.333333,2019-07-24 00:00:00.000000000
...,...,...,...,...,...
UX Collective,2019-08,2702.583333,960.750000,6.333333,2019-08-09 06:00:00.000000000
UX Collective,2019-09,2142.909091,144.606061,5.757576,2019-09-15 13:05:27.272727296
UX Collective,2019-10,2927.406977,417.383721,6.104651,2019-10-20 05:18:08.372092928
UX Collective,2019-11,2297.750000,345.642857,5.535714,2019-11-06 04:17:08.571428608


## Step 5: Pandas groupby and named aggregations

In [11]:
# Named aggregation: each keyword becomes an output column and pairs a source
# column with the function applied to it.
df.groupby(by='publication').agg(
    id_mean=pd.NamedAgg(column='id', aggfunc='mean'),
    claps_mean=pd.NamedAgg(column='claps', aggfunc='mean'),
    claps_count=pd.NamedAgg(column='claps', aggfunc='count'),
    # Spread between the largest and smallest `claps` value in each group.
    claps_range=pd.NamedAgg(
        column='claps',
        aggfunc=lambda claps: claps.max() - claps.min(),
    ),
)

Unnamed: 0_level_0,id_mean,claps_mean,claps_count,claps_range
publication,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Better Humans,3004.285714,1827.785714,28,9660
Better Marketing,2954.578512,829.347107,242,22971
Data Driven Investor,3491.115681,95.034704,778,4100
The Startup,3238.427162,303.403815,3041,38000
The Writing Cooperative,3539.476427,372.744417,403,12600
Towards Data Science,3181.948665,283.631759,1461,8000
UX Collective,3138.336937,380.924324,555,10496
