# GroupBy and Count in Pandas

In [1]:
import pandas as pd
# Location of the zipped CSV, kept in one named constant so it is easy to change.
DATA_PATH = '../data/medium_data.csv.zip'

# Read the whole file into the DataFrame used by every cell below.
df = pd.read_csv(DATA_PATH)

## Option 1: GroupBy and Count in Pandas

In [2]:
# Parse the raw 'date' column once and reuse the parsed series for both columns.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates
# Monthly period (e.g. 2019-03) used as the second grouping key throughout.
df['date_m'] = parsed_dates.dt.to_period('M')

In [3]:
# Group by publication and month, then count the non-null 'url' entries per group.
monthly_groups = df.groupby(['publication', 'date_m'])
monthly_groups['url'].count()

publication    date_m 
Better Humans  2019-03     5
               2019-04     4
               2019-05     4
               2019-06     1
               2019-07     3
                          ..
UX Collective  2019-08    24
               2019-09    33
               2019-10    86
               2019-11    28
               2019-12    46
Name: url, Length: 79, dtype: int64

## Option 2: GroupBy and Aggregate functions in Pandas

In [4]:
# On its own, groupby only builds a DataFrameGroupBy object (see the repr below);
# nothing is aggregated until a method such as agg/count/size is called on it.
df.groupby(by=['publication', 'date_m'])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fc350575be0>

In [5]:
# Aggregate only the numeric columns shown in the output below. Selecting them
# explicitly keeps this cell working on pandas >= 2.0, where 'mean' and 'sum'
# no longer silently drop non-numeric columns and raise a TypeError instead.
numeric_cols = ['id', 'claps', 'reading_time']

# Apply several aggregations at once; the result has a two-level column index
# (column name, aggregation name). Only the first 15 groups are displayed.
df.groupby(['publication', 'date_m'])[numeric_cols].agg(['mean', 'count', 'sum']).head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,id,id,id,claps,claps,claps,reading_time,reading_time,reading_time
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count,sum,mean,count,sum,mean,count,sum
publication,date_m,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Better Humans,2019-03,5138.2,5,25691,1710.8,5,8554,12.0,5,60
Better Humans,2019-04,2308.25,4,9233,3942.25,4,15769,7.5,4,30
Better Humans,2019-05,3382.0,4,13528,1187.0,4,4748,25.0,4,100
Better Humans,2019-06,6326.0,1,6326,9700.0,1,9700,5.0,1,5
Better Humans,2019-07,1189.333333,3,3568,1183.333333,3,3550,22.333333,3,67
Better Humans,2019-09,2201.333333,3,6604,439.333333,3,1318,12.0,3,36
Better Humans,2019-10,2422.0,4,9688,204.5,4,818,9.75,4,39
Better Humans,2019-11,2441.5,2,4883,1460.5,2,2921,9.0,2,18
Better Humans,2019-12,2299.5,2,4599,1900.0,2,3800,9.5,2,19
Better Marketing,2019-03,4798.6,5,23993,1857.6,5,9288,9.6,5,48


## Option 3: GroupBy, Count and value_counts in Pandas

In [6]:
# Count rows per unique (publication, month) combination; with default
# arguments the result comes back ordered from most to least frequent.
df.value_counts(subset=['publication', 'date_m'])

publication    date_m 
The Startup    2019-05    656
               2019-10    477
               2019-07    401
               2019-12    310
               2019-06    286
                         ... 
Better Humans  2019-07      3
UX Collective  2019-01      2
Better Humans  2019-12      2
               2019-11      2
               2019-06      1
Length: 79, dtype: int64

In [7]:
# groupby + count equivalent of the value_counts call: count 'url' per group,
# then order the counts from largest to smallest.
(
    df.groupby(['publication', 'date_m'])['url']
    .count()
    .sort_values(ascending=False)
)

publication    date_m 
The Startup    2019-05    656
               2019-10    477
               2019-07    401
               2019-12    310
               2019-06    286
                         ... 
Better Humans  2019-07      3
UX Collective  2019-01      2
Better Humans  2019-12      2
               2019-11      2
               2019-06      1
Name: url, Length: 79, dtype: int64

## Option 4: GroupBy and Count + Size in Pandas

In [8]:
# size() reports the number of rows in each group; unlike count() it is not
# tied to a particular column.
df.groupby(by=['publication', 'date_m']).size()

publication    date_m 
Better Humans  2019-03     5
               2019-04     4
               2019-05     4
               2019-06     1
               2019-07     3
                          ..
UX Collective  2019-08    24
               2019-09    33
               2019-10    86
               2019-11    28
               2019-12    46
Length: 79, dtype: int64

## Option 5: GroupBy and describe in Pandas

In [9]:
# Summary statistics of 'reading_time' per (publication, month) group; the
# 'count' column of the result gives the per-group counts.
reading_time_groups = df.groupby(['publication', 'date_m'])['reading_time']
reading_time_groups.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
publication,date_m,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Better Humans,2019-03,5.0,12.000000,5.196152,3.0,13.00,13.0,15.00,16.0
Better Humans,2019-04,4.0,7.500000,5.196152,2.0,4.25,7.0,10.25,14.0
Better Humans,2019-05,4.0,25.000000,7.958224,17.0,21.50,23.5,27.00,36.0
Better Humans,2019-06,1.0,5.000000,,5.0,5.00,5.0,5.00,5.0
Better Humans,2019-07,3.0,22.333333,15.307950,13.0,13.50,14.0,27.00,40.0
...,...,...,...,...,...,...,...,...,...
UX Collective,2019-08,24.0,6.333333,3.102126,2.0,4.00,5.0,7.50,14.0
UX Collective,2019-09,33.0,5.757576,2.738959,2.0,4.00,5.0,7.00,13.0
UX Collective,2019-10,86.0,6.104651,5.788486,2.0,4.00,5.0,7.00,55.0
UX Collective,2019-11,28.0,5.535714,2.516349,2.0,3.00,5.0,7.00,12.0


## Performance

In [10]:
# Timing baseline: group by both keys and count the 'publication' column.
%timeit df.groupby(['publication', 'date_m'])['publication'].count()

1.41 ms ± 21.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [11]:
# Same counts via value_counts, with sort=False so the result is not ordered by frequency.
%timeit df.value_counts(subset=['publication', 'date_m'], sort=False)

1.06 ms ± 12.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [12]:
# value_counts with sort left at its default, which orders the result by frequency.
%timeit df.value_counts(subset=['publication', 'date_m'])

1.18 ms ± 6.17 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [13]:
# Show the result of the groupby + count expression that was timed above.
group_keys = ['publication', 'date_m']
df.groupby(group_keys)['publication'].count()

publication    date_m 
Better Humans  2019-03     5
               2019-04     4
               2019-05     4
               2019-06     1
               2019-07     3
                          ..
UX Collective  2019-08    24
               2019-09    33
               2019-10    86
               2019-11    28
               2019-12    46
Name: publication, Length: 79, dtype: int64

In [14]:
# Show the result of the unsorted value_counts expression that was timed above,
# for comparison with the groupby + count output.
group_keys = ['publication', 'date_m']
df.value_counts(subset=group_keys, sort=False)

publication    date_m 
Better Humans  2019-03     5
               2019-04     4
               2019-05     4
               2019-06     1
               2019-07     3
                          ..
UX Collective  2019-08    24
               2019-09    33
               2019-10    86
               2019-11    28
               2019-12    46
Length: 79, dtype: int64