In [1]:
import pandas as pd
import numpy as np


In [2]:
# Small demo frame: two label columns to group on and two columns of
# standard-normal draws to aggregate. No seed is set in the cells shown,
# so the numeric values differ from run to run.
df = pd.DataFrame(
    {
        "key1": list("aabba"),
        "key2": ["one", "two", "one", "two", "one"],
        "data1": np.random.randn(5),
        "data2": np.random.randn(5),
    }
)
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.756068,0.259816
1,a,two,-1.736872,-0.068225
2,b,one,0.609637,0.340782
3,b,two,-0.657067,0.60371
4,a,one,-1.525572,-0.85179


In [3]:
grouped=df["data1"].groupby(df["key1"])

In [4]:
grouped

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000013C50EB5EC8>

In [None]:
This grouped variable is now a GroupBy object. It has not actually computed anything yet. We can now apply
an operation to each of the groups.

In [5]:
grouped.mean()

key1
a   -1.339504
b   -0.023715
Name: data1, dtype: float64

In [6]:
means=df["data1"].groupby([df["key1"],df["key2"]]).mean()

Here we grouped the data using two keys, and the resulting Series now has a hierarchical index
consisting of the unique pairs of keys.

In [7]:
means

key1  key2
a     one    -1.140820
      two    -1.736872
b     one     0.609637
      two    -0.657067
Name: data1, dtype: float64

In [9]:
# Group keys do not have to be columns of df: here two NumPy arrays with one
# entry per row of df act as the keys.
states = np.array(["Ohio", "California", "California", "Ohio", "Ohio"])
years = np.array([2005, 2005, 2006, 2005, 2006])
df["data1"].groupby([states, years]).mean()

California  2005   -1.736872
            2006    0.609637
Ohio        2005   -0.706568
            2006   -1.525572
Name: data1, dtype: float64

In [10]:
df

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.756068,0.259816
1,a,two,-1.736872,-0.068225
2,b,one,0.609637,0.340782
3,b,two,-0.657067,0.60371
4,a,one,-1.525572,-0.85179


In [11]:
# numeric_only=True states explicitly that the string column key2 is left out of the mean.
# Older pandas dropped it silently; pandas 2.x raises a TypeError without this argument.
df.groupby("key1").mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-1.339504,-0.220066
b,-0.023715,0.472246


In [12]:
# numeric_only=True states explicitly that the string column key1 is left out of the mean.
# Older pandas dropped it silently; pandas 2.x raises a TypeError without this argument.
df.groupby("key2").mean(numeric_only=True)

Unnamed: 0_level_0,data1,data2
key2,Unnamed: 1_level_1,Unnamed: 2_level_1
one,-0.557334,-0.08373
two,-1.19697,0.267742


In [None]:
#Here key1 is excluded from the result because it is not numeric; such a column is called a nuisance column.

In [13]:
df.groupby(["key1","key2"]).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-1.14082,-0.295987
a,two,-1.736872,-0.068225
b,one,0.609637,0.340782
b,two,-0.657067,0.60371


In [14]:
df.groupby("key1").size()

key1
a    3
b    2
dtype: int64

Iterating over Groups: GroupBy supports iteration, generating a sequence of 2-tuples containing the group name along with
the chunk of data.

In [17]:
# Iterating a GroupBy yields (group name, sub-frame) pairs; print each pair in turn.
for name,group in df.groupby("key1"):
    print(name,"\n")
    print(group,"\n")

a 

  key1 key2     data1     data2
0    a  one -0.756068  0.259816
1    a  two -1.736872 -0.068225
4    a  one -1.525572 -0.851790 

b 

  key1 key2     data1     data2
2    b  one  0.609637  0.340782
3    b  two -0.657067  0.603710 



In [20]:
# With several keys, the group name arrives as a tuple with one entry per key,
# so it can be unpacked directly in the loop header.
for (k1, k2), group in df.groupby(["key1", "key2"]):
    print((k1, k2), "\n")
    print(group, "\n")

('a', 'one') 

  key1 key2     data1     data2
0    a  one -0.756068  0.259816
4    a  one -1.525572 -0.851790 

('a', 'two') 

  key1 key2     data1     data2
1    a  two -1.736872 -0.068225 

('b', 'one') 

  key1 key2     data1     data2
2    b  one  0.609637  0.340782 

('b', 'two') 

  key1 key2     data1    data2
3    b  two -0.657067  0.60371 



In [21]:
# Materialise the groups as a plain dict: group name -> sub-frame.
pieces = {key: chunk for key, chunk in df.groupby("key1")}

In [23]:
pieces["a"]

Unnamed: 0,key1,key2,data1,data2
0,a,one,-0.756068,0.259816
1,a,two,-1.736872,-0.068225
4,a,one,-1.525572,-0.85179


In [24]:
pieces["b"]

Unnamed: 0,key1,key2,data1,data2
2,b,one,0.609637,0.340782
3,b,two,-0.657067,0.60371


In [25]:
pieces

{'a':   key1 key2     data1     data2
 0    a  one -0.756068  0.259816
 1    a  two -1.736872 -0.068225
 4    a  one -1.525572 -0.851790, 'b':   key1 key2     data1     data2
 2    b  one  0.609637  0.340782
 3    b  two -0.657067  0.603710}

In [26]:
df.dtypes

key1      object
key2      object
data1    float64
data2    float64
dtype: object

In [27]:
grouped=df.groupby(df.dtypes,axis=1)

In [28]:
grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000013C511163C8>

In [30]:
for dtype,group in grouped:
    print(dtype,"\n")
    print(group,"\n")

float64 

      data1     data2
0 -0.756068  0.259816
1 -1.736872 -0.068225
2  0.609637  0.340782
3 -0.657067  0.603710
4 -1.525572 -0.851790 

object 

  key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one 



Selecting a Column or Subset of Columns

In [34]:
df.groupby("key1")["data1"]
df.groupby("key1")[["data2"]]

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000013C510FBC48>

In [33]:
df["data1"].groupby(df["key1"])
df[["data2"]].groupby(df["key1"])

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000013C510F7DC8>

For large data sets we may want to aggregate only a few columns.
For example, to compute the mean of just the data2 column and get the result as a DataFrame:

In [38]:
df.groupby(["key1","key2"])[["data2"]]  # indexing with a list of column names returns a grouped DataFrame (DataFrameGroupBy)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000013C51122148>

In [39]:
df.groupby(["key1","key2"])[["data2"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,-0.295987
a,two,-0.068225
b,one,0.340782
b,two,0.60371


In [36]:
df.groupby(["key1","key2"])[["data2"]].size() 

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

In [40]:
df.groupby(["key1","key2"])["data2"] #when the column name is passed as a scalar,then result is a grouped Series

<pandas.core.groupby.generic.SeriesGroupBy object at 0x0000013C51122D48>

In [41]:
df.groupby(["key1","key2"])["data2"].mean()

key1  key2
a     one    -0.295987
      two    -0.068225
b     one     0.340782
      two     0.603710
Name: data2, dtype: float64

In [None]:
#Grouping with Dicts and Series 

In [42]:
people=pd.DataFrame(np.random.randn(5,5), 
                    columns=["a","b","c","d","e"],
                    index=["Joe","Steve","Wes","Jim","Travis"])

In [43]:
people.iloc[2:3,[1,2]] =np.nan #Add a few NA values

In [44]:
people

Unnamed: 0,a,b,c,d,e
Joe,0.429642,-1.036279,0.289835,-0.67909,-1.431681
Steve,-0.92154,0.961481,0.907833,-0.386728,-0.291484
Wes,1.371678,,,0.496658,0.775398
Jim,-0.560822,-1.624451,0.971684,0.889651,1.138299
Travis,0.918496,0.077477,0.483753,1.025657,0.537539


In [45]:
mapping={"a":"red","b":"red","c":"blue","d":"blue","e":"red","f":"orange"}

In [46]:
by_column=people.groupby(mapping,axis=1)

In [47]:
by_column.sum()

Unnamed: 0,blue,red
Joe,-0.389255,-2.038319
Steve,0.521104,-0.251543
Wes,0.496658,2.147076
Jim,1.861335,-1.046974
Travis,1.50941,1.533512


In [54]:
map_series=pd.Series(mapping)

In [55]:
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

In [60]:
people.groupby(map_series, axis=1).sum()

Unnamed: 0,blue,red
Joe,-0.389255,-2.038319
Steve,0.521104,-0.251543
Wes,0.496658,2.147076
Jim,1.861335,-1.046974
Travis,1.50941,1.533512


In [None]:
# Grouping with Functions

In [61]:
people.groupby(len).sum()

Unnamed: 0,a,b,c,d,e
3,1.240498,-2.660731,1.261519,0.707219,0.482016
5,-0.92154,0.961481,0.907833,-0.386728,-0.291484
6,0.918496,0.077477,0.483753,1.025657,0.537539


In [62]:
key_list=["one","one","one","two","two"]

In [64]:
people.groupby([len,key_list]).sum()

Unnamed: 0,Unnamed: 1,a,b,c,d,e
3,one,1.80132,-1.036279,0.289835,-0.182432,-0.656283
3,two,-0.560822,-1.624451,0.971684,0.889651,1.138299
5,one,-0.92154,0.961481,0.907833,-0.386728,-0.291484
6,two,0.918496,0.077477,0.483753,1.025657,0.537539


Grouping by Index Levels

In [65]:
columns=pd.MultiIndex.from_arrays([["US","US","US","JP","JP"],[1, 3, 5, 1, 3]],
                                 names=["cty","tenor"])

In [67]:
heir_df=pd.DataFrame(np.random.randn(4,5), columns=columns)

In [68]:
heir_df

cty,US,US,US,JP,JP
tenor,1,3,5,1,3
0,0.094483,0.370994,-0.088352,1.095877,0.893464
1,0.430145,1.709407,-0.281068,0.578803,0.500706
2,1.863336,-0.463124,0.684315,0.725132,0.171422
3,0.323207,0.657091,-1.20675,-1.076879,1.374433


To group by level, pass the level number or name using the level keyword:

In [70]:
heir_df.groupby(level="cty",axis=1).count()

cty,JP,US
0,2,3
1,2,3
2,2,3
3,2,3


#Data Aggregation: an aggregation is any data transformation that produces scalar values from arrays.

In [75]:
grouped=df.groupby("key1")

In [76]:
grouped["data1"].quantile(0.9) # 0.9 quantile of data1 within each key1 group: conceptually GroupBy slices the Series
                               # into pieces and evaluates quantile on each piece.
                               # NOTE(review): the earlier remark that quantile is not explicitly implemented for
                               # GroupBy looks outdated (pandas documents a GroupBy quantile method) - confirm.

key1
a   -0.909969
b    0.482966
Name: data1, dtype: float64

In [77]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its largest value minus its smallest.

    ``arr`` only needs to provide ``max()`` and ``min()`` methods.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

In [78]:
# Restrict the aggregation to the numeric columns: key2 holds strings, for which
# max() - min() is undefined. Older pandas dropped such columns silently; newer
# releases raise instead. The resulting table is the same as the one shown below.
grouped[["data1", "data2"]].agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.980805,1.111607
b,1.266704,0.262927


In [79]:
df.groupby("key1").describe()

Unnamed: 0_level_0,data1,data1,data1,data1,data1,data1,data1,data1,data2,data2,data2,data2,data2,data2,data2,data2
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
a,3.0,-1.339504,0.516198,-1.736872,-1.631222,-1.525572,-1.14082,-0.756068,3.0,-0.220066,0.571147,-0.85179,-0.460008,-0.068225,0.095796,0.259816
b,2.0,-0.023715,0.895695,-0.657067,-0.340391,-0.023715,0.292961,0.609637,2.0,0.472246,0.185918,0.340782,0.406514,0.472246,0.537978,0.60371


In [None]:
#Columnwise and Multiple Function Application

In [80]:
import seaborn as sns
tips=sns.load_dataset("tips")

In [83]:
tips["tip_pct"]=tips["tip"]/tips["total_bill"]

In [84]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,tip_pct
0,16.99,1.01,Female,No,Sun,Dinner,2,0.059447
1,10.34,1.66,Male,No,Sun,Dinner,3,0.160542
2,21.01,3.5,Male,No,Sun,Dinner,3,0.166587
3,23.68,3.31,Male,No,Sun,Dinner,2,0.13978
4,24.59,3.61,Female,No,Sun,Dinner,4,0.146808


In [85]:
# NOTE(review): "tip" is a continuous amount, so this creates one group per distinct tip value and
# smoker flag (the result below reports Length: 246, with many NaN rows). A later cell groups by
# ["day", "smoker"]; "day" may have been intended here too - confirm before relying on these tables.
grouped=tips.groupby(["tip","smoker"])

In [86]:
grouped_pct=grouped["tip_pct"]

In [88]:
grouped_pct.mean()

tip    smoker
1.00   Yes       0.193004
       No        0.137931
1.01   Yes            NaN
       No        0.059447
1.10   Yes       0.085271
                   ...   
7.58   No        0.192288
9.00   Yes            NaN
       No        0.186220
10.00  Yes       0.196812
       No             NaN
Name: tip_pct, Length: 246, dtype: float64

In [90]:
grouped_pct.agg("mean")

tip    smoker
1.00   Yes       0.193004
       No        0.137931
1.01   Yes            NaN
       No        0.059447
1.10   Yes       0.085271
                   ...   
7.58   No        0.192288
9.00   Yes            NaN
       No        0.186220
10.00  Yes       0.196812
       No             NaN
Name: tip_pct, Length: 246, dtype: float64

In [94]:
grouped_pct.agg(["mean", "std", peak_to_peak])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,peak_to_peak
tip,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.00,Yes,0.193004,0.124288,0.246368
1.00,No,0.137931,,0.000000
1.01,Yes,,,
1.01,No,0.059447,,0.000000
1.10,Yes,0.085271,,0.000000
...,...,...,...,...
7.58,No,0.192288,,0.000000
9.00,Yes,,,
9.00,No,0.186220,,0.000000
10.00,Yes,0.196812,,0.000000


In [96]:
grouped_pct.agg([("M","mean"),("S","std")])  # a list of (name, function) tuples is passed here; the first element of each
                                             # tuple is used as the DataFrame column name

Unnamed: 0_level_0,Unnamed: 1_level_0,M,S
tip,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
1.00,Yes,0.193004,0.124288
1.00,No,0.137931,
1.01,Yes,,
1.01,No,0.059447,
1.10,Yes,0.085271,
...,...,...,...
7.58,No,0.192288,
9.00,Yes,,
9.00,No,0.186220,
10.00,Yes,0.196812,


In [98]:
functions=["count", "mean", "max"]

In [100]:
grouped=tips.groupby(["tip", "smoker"])

In [101]:
# Select several columns with a list. Indexing a GroupBy with a bare tuple of names
# (grouped["a", "b"]) is deprecated since pandas 1.0 and no longer accepted by pandas 2.x.
result = grouped[["total_bill", "tip_pct"]].agg(functions)

In [102]:
result

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,total_bill,total_bill,tip_pct,tip_pct,tip_pct
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,max,count,mean,max
tip,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1.00,No,1.0,7.25,7.25,1.0,0.137931,0.137931
1.00,Yes,3.0,7.14,12.60,3.0,0.193004,0.325733
1.01,No,1.0,16.99,16.99,1.0,0.059447,0.059447
1.01,Yes,,,,,,
1.10,No,,,,,,
...,...,...,...,...,...,...,...
7.58,Yes,,,,,,
9.00,No,1.0,48.33,48.33,1.0,0.186220,0.186220
9.00,Yes,,,,,,
10.00,No,,,,,,


In [103]:
result["tip_pct"]

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,max
tip,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.00,No,1.0,0.137931,0.137931
1.00,Yes,3.0,0.193004,0.325733
1.01,No,1.0,0.059447,0.059447
1.01,Yes,,,
1.10,No,,,
...,...,...,...,...
7.58,Yes,,,
9.00,No,1.0,0.186220,0.186220
9.00,Yes,,,
10.00,No,,,


In [104]:
# (column label, aggregation) pairs. The string alias "var" pins pandas' own group-wise
# sample variance, which is what produced the NaN for single-row groups in the table below;
# passing np.var relies on pandas silently substituting that method.
ftuples = [("Durchschnitt", "mean"), ("Abweichung", "var")]

In [None]:
# Select the two columns with a list; tuple-style grouped["a", "b"] indexing is
# deprecated since pandas 1.0 and no longer accepted by pandas 2.x.
grouped[["tip_pct", "total_bill"]].agg(ftuples)

In [105]:
# Select the two columns with a list; tuple-style ["a", "b"] indexing on a GroupBy is
# deprecated since pandas 1.0 and no longer accepted by pandas 2.x.
tips.groupby(["tip", "smoker"])[["tip_pct", "total_bill"]].agg(ftuples)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip_pct,tip_pct,total_bill,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,Durchschnitt,Abweichung,Durchschnitt,Abweichung
tip,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
1.00,Yes,0.193004,0.015448,7.14,24.1543
1.00,No,0.137931,,7.25,
1.01,Yes,,,,
1.01,No,0.059447,,16.99,
1.10,Yes,0.085271,,12.90,
...,...,...,...,...,...
7.58,No,0.192288,,39.42,
9.00,Yes,,,,
9.00,No,0.186220,,48.33,
10.00,Yes,0.196812,,50.81,


In [None]:
#If we wanted to apply different functions to one or more of the columns

In [106]:
# Per-column aggregation spec: maximum of the tip column and sum of the size column.
# The string alias "max" replaces np.max, matching the other agg calls in this notebook
# and avoiding pandas' deprecation warning for NumPy callables passed to agg.
grouped.agg({"tip": "max", "size": "sum"})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,size
tip,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
1.00,Yes,1.00,5.0
1.00,No,1.00,1.0
1.01,Yes,,
1.01,No,1.01,2.0
1.10,Yes,1.10,2.0
...,...,...,...
7.58,No,7.58,4.0
9.00,Yes,,
9.00,No,9.00,4.0
10.00,Yes,10.00,3.0


In [107]:
grouped.agg({"tip" : ["min", "max", "mean", "std"], "size" : "count"})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,tip,tip,tip,size
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,std,count
tip,smoker,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1.00,No,1.00,1.00,1.00,,1.0
1.00,Yes,1.00,1.00,1.00,0.0,3.0
1.01,No,1.01,1.01,1.01,,1.0
1.01,Yes,,,,,
1.10,No,,,,,
...,...,...,...,...,...,...
7.58,Yes,,,,,
9.00,No,9.00,9.00,9.00,,1.0
9.00,Yes,,,,,
10.00,No,,,,,


In [None]:
# A DataFrame will have hierarchical columns only if multiple functions are applied to at least one column

In [109]:
# as_index=False keeps day and smoker as ordinary columns instead of moving them into the index.
# numeric_only=True states explicitly that the non-numeric sex and time columns are left out of
# the mean; pandas 2.x raises a TypeError without it instead of dropping them silently.
tips.groupby(["day", "smoker"], as_index=False).mean(numeric_only=True)

Unnamed: 0,day,smoker,total_bill,tip,size,tip_pct
0,Thur,Yes,19.190588,3.03,2.352941,0.163863
1,Thur,No,17.113111,2.673778,2.488889,0.160298
2,Fri,Yes,16.813333,2.714,2.066667,0.174783
3,Fri,No,18.42,2.8125,2.25,0.15165
4,Sat,Yes,21.276667,2.875476,2.47619,0.147906
5,Sat,No,19.661778,3.102889,2.555556,0.158048
6,Sun,Yes,24.12,3.516842,2.578947,0.18725
7,Sun,No,20.506667,3.167895,2.929825,0.160113
