# GroupBy

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Sample data set: one entry per employee, giving the branch they work in
# and their sales figure. The three lists are aligned by position.
d = {
    'Branches': [
        'Store 1', 'Store 1',
        'Store 2', 'Store 2',
        'Store 3', 'Store 3',
        'Store 4', 'Store 4',
        'Store 5', 'Store 5', 'Store 5',
    ],
    'Employee': [
        'Vanessa', 'Alexa',
        'James', 'Charlie',
        'Billie', 'Sam',
        'Eckert', 'John',
        'Millie', 'Jonas', 'Xander',
    ],
    'Sales': [
        300, 175,
        250, 120,
        400, 500,
        300, 200,
        600, 300, 400,
    ],
}

In [3]:
# Build the DataFrame from the dictionary: each key becomes a column.
df = pd.DataFrame(data=d)

In [4]:
# Display the DataFrame built from the sample data.
df

Unnamed: 0,Branches,Employee,Sales
0,Store 1,Vanessa,300
1,Store 1,Alexa,175
2,Store 2,James,250
3,Store 2,Charlie,120
4,Store 3,Billie,400
5,Store 3,Sam,500
6,Store 4,Eckert,300
7,Store 4,John,200
8,Store 5,Millie,600
9,Store 5,Jonas,300


In [5]:
# Group the rows by the values of the "Branches" column.
# On its own this only returns a DataFrameGroupBy object; an aggregation
# method has to be called on it to get a table back.

df.groupby("Branches")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000000000822FDC8>

In [6]:
# Keep the grouped object in a variable so it can be reused below.
byBranch = df.groupby("Branches")

In [7]:
# Display the stored grouped object (its repr only; no aggregation has been applied yet).
byBranch

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000000008239588>

In [8]:
# Aggregation: mean sales per branch.
# numeric_only=True restricts the mean to numeric columns. Without it,
# pandas >= 2.0 tries to average the string column "Employee" and raises
# a TypeError; older versions silently dropped that column.

byBranch.mean(numeric_only=True)

Unnamed: 0_level_0,Sales
Branches,Unnamed: 1_level_1
Store 1,237.5
Store 2,185.0
Store 3,450.0
Store 4,250.0
Store 5,433.333333


In [9]:
# Aggregation: total sales per branch.
# numeric_only=True keeps the string column "Employee" out of the sum;
# pandas >= 2.0 would otherwise concatenate the employee names per branch.

byBranch.sum(numeric_only=True)

Unnamed: 0_level_0,Sales
Branches,Unnamed: 1_level_1
Store 1,475
Store 2,370
Store 3,900
Store 4,500
Store 5,1300


In [10]:
# The same aggregation written as a single chained expression (preferred form).
# numeric_only=True keeps the string column "Employee" out of the sum;
# pandas >= 2.0 would otherwise concatenate the employee names per branch.

df.groupby('Branches').sum(numeric_only=True)

Unnamed: 0_level_0,Sales
Branches,Unnamed: 1_level_1
Store 1,475
Store 2,370
Store 3,900
Store 4,500
Store 5,1300


In [11]:
# If you are interested in a single branch only, select its row by label.
# numeric_only=True keeps the string column "Employee" out of the sum, so the
# resulting Series holds only the numeric total (pandas >= 2.0 would otherwise
# add a concatenated-names entry and turn the dtype into object).

df.groupby('Branches').sum(numeric_only=True).loc['Store 3']

Sales    900
Name: Store 3, dtype: int64

In [12]:
# If you are interested in two specific branches, pass a list of labels to .loc.
# numeric_only=True keeps the string column "Employee" out of the sum;
# pandas >= 2.0 would otherwise concatenate the employee names per branch.

df.groupby('Branches').sum(numeric_only=True).loc[['Store 3', 'Store 5']]

Unnamed: 0_level_0,Sales
Branches,Unnamed: 1_level_1
Store 3,900
Store 5,1300


In [13]:
# Another useful aggregation function: .count() gives the number of entries
# per column in each branch.

df.groupby(by='Branches').count()

Unnamed: 0_level_0,Employee,Sales
Branches,Unnamed: 1_level_1,Unnamed: 2_level_1
Store 1,2,2
Store 2,2,2
Store 3,2,2
Store 4,2,2
Store 5,3,3


In [14]:
# Aggregate with .max(): the largest value of each column within a branch.
# Each column is reduced independently (names compare alphabetically), so the
# Employee and Sales shown in one row need not belong to the same person.

df.groupby(by='Branches').max()

Unnamed: 0_level_0,Employee,Sales
Branches,Unnamed: 1_level_1,Unnamed: 2_level_1
Store 1,Vanessa,300
Store 2,James,250
Store 3,Sam,500
Store 4,John,300
Store 5,Xander,600


In [15]:
# Aggregate with .min(): the smallest value of each column within a branch.
# Each column is reduced independently (names compare alphabetically), so the
# Employee and Sales shown in one row need not belong to the same person.

df.groupby(by='Branches').min()

Unnamed: 0_level_0,Employee,Sales
Branches,Unnamed: 1_level_1,Unnamed: 2_level_1
Store 1,Alexa,175
Store 2,Charlie,120
Store 3,Billie,400
Store 4,Eckert,200
Store 5,Jonas,300


In [16]:
# Sort the rows by sales, smallest first (ascending is the default order).

df.sort_values('Sales', ascending=True)

Unnamed: 0,Branches,Employee,Sales
3,Store 2,Charlie,120
1,Store 1,Alexa,175
7,Store 4,John,200
2,Store 2,James,250
0,Store 1,Vanessa,300
6,Store 4,Eckert,300
9,Store 5,Jonas,300
4,Store 3,Billie,400
10,Store 5,Xander,400
5,Store 3,Sam,500


In [17]:
# Sort the rows by sales, largest first.

df.sort_values('Sales', ascending=False)

Unnamed: 0,Branches,Employee,Sales
8,Store 5,Millie,600
5,Store 3,Sam,500
4,Store 3,Billie,400
10,Store 5,Xander,400
0,Store 1,Vanessa,300
6,Store 4,Eckert,300
9,Store 5,Jonas,300
2,Store 2,James,250
7,Store 4,John,200
1,Store 1,Alexa,175


In [18]:
# GroupBy + sort values: total sales per branch, best-selling branch first.
# numeric_only=True keeps the string column "Employee" out of the sum;
# pandas >= 2.0 would otherwise concatenate the employee names per branch.

(
    df.groupby('Branches')
      .sum(numeric_only=True)
      .sort_values('Sales', ascending=False)
)

Unnamed: 0_level_0,Sales
Branches,Unnamed: 1_level_1
Store 5,1300
Store 3,900
Store 4,500
Store 1,475
Store 2,370


In [19]:
# GroupBy + sort values, restricted to specific branches.
# The branches are selected first and sorted afterwards: .loc with a list of
# labels returns the rows in list order, so a sort applied before the
# selection would not decide the final order.
# numeric_only=True keeps the string column "Employee" out of the sum;
# pandas >= 2.0 would otherwise concatenate the employee names per branch.

(
    df.groupby('Branches')
      .sum(numeric_only=True)
      .loc[['Store 1', 'Store 2']]
      .sort_values('Sales', ascending=False)
)

Unnamed: 0_level_0,Sales
Branches,Unnamed: 1_level_1
Store 1,475
Store 2,370


In [20]:
# .describe() produces summary statistics (count, mean, std, min,
# quartiles, max) of the sales for every branch.

df.groupby(by='Branches').describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Branches,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Store 1,2.0,237.5,88.388348,175.0,206.25,237.5,268.75,300.0
Store 2,2.0,185.0,91.923882,120.0,152.5,185.0,217.5,250.0
Store 3,2.0,450.0,70.710678,400.0,425.0,450.0,475.0,500.0
Store 4,2.0,250.0,70.710678,200.0,225.0,250.0,275.0,300.0
Store 5,3.0,433.333333,152.752523,300.0,350.0,400.0,500.0,600.0


In [21]:
# If you prefer the statistics as rows and the branches as columns,
# transpose the result with .T

(
    df.groupby('Branches')
      .describe()
      .T
)

Unnamed: 0,Branches,Store 1,Store 2,Store 3,Store 4,Store 5
Sales,count,2.0,2.0,2.0,2.0,3.0
Sales,mean,237.5,185.0,450.0,250.0,433.333333
Sales,std,88.388348,91.923882,70.710678,70.710678,152.752523
Sales,min,175.0,120.0,400.0,200.0,300.0
Sales,25%,206.25,152.5,425.0,225.0,350.0
Sales,50%,237.5,185.0,450.0,250.0,400.0
Sales,75%,268.75,217.5,475.0,275.0,500.0
Sales,max,300.0,250.0,500.0,300.0,600.0


In [22]:
# .transpose() is the long-form spelling of .T and gives the same table.
(
    df.groupby('Branches')
      .describe()
      .transpose()
)

Unnamed: 0,Branches,Store 1,Store 2,Store 3,Store 4,Store 5
Sales,count,2.0,2.0,2.0,2.0,3.0
Sales,mean,237.5,185.0,450.0,250.0,433.333333
Sales,std,88.388348,91.923882,70.710678,70.710678,152.752523
Sales,min,175.0,120.0,400.0,200.0,300.0
Sales,25%,206.25,152.5,425.0,225.0,350.0
Sales,50%,237.5,185.0,450.0,250.0,400.0
Sales,75%,268.75,217.5,475.0,275.0,500.0
Sales,max,300.0,250.0,500.0,300.0,600.0


In [23]:
# GroupBy + describe + specific branches + transpose:
# pick the branch rows with .loc first, then flip the table.

(
    df.groupby('Branches')
      .describe()
      .loc[['Store 3', 'Store 5']]
      .transpose()
)

Unnamed: 0,Branches,Store 3,Store 5
Sales,count,2.0,3.0
Sales,mean,450.0,433.333333
Sales,std,70.710678,152.752523
Sales,min,400.0,300.0
Sales,25%,425.0,350.0
Sales,50%,450.0,400.0
Sales,75%,475.0,500.0
Sales,max,500.0,600.0


In [24]:
# Alternative: transpose first, then pick the branches as columns.

(
    df.groupby('Branches')
      .describe()
      .transpose()
      [['Store 3', 'Store 5']]
)

Unnamed: 0,Branches,Store 3,Store 5
Sales,count,2.0,3.0
Sales,mean,450.0,433.333333
Sales,std,70.710678,152.752523
Sales,min,400.0,300.0
Sales,25%,425.0,350.0
Sales,50%,450.0,400.0
Sales,75%,475.0,500.0
Sales,max,500.0,600.0


In [28]:
# Using rank: with ascending=False the branch with the highest total sales
# gets rank 1.
# numeric_only=True keeps the string column "Employee" out of the sum;
# pandas >= 2.0 would otherwise concatenate the employee names per branch
# and rank those strings as well.

(
    df.groupby('Branches')
      .sum(numeric_only=True)
      .rank(ascending=False)
)

Unnamed: 0_level_0,Sales
Branches,Unnamed: 1_level_1
Store 1,4.0
Store 2,5.0
Store 3,2.0
Store 4,3.0
Store 5,1.0


In [42]:
# Using rank + sort values: order the branches by their rank, best first.
# numeric_only=True keeps the string column "Employee" out of the sum;
# pandas >= 2.0 would otherwise concatenate the employee names per branch
# and rank those strings as well.

(
    df.groupby('Branches')
      .sum(numeric_only=True)
      .rank(ascending=False)
      .sort_values('Sales')
)

Unnamed: 0_level_0,Sales
Branches,Unnamed: 1_level_1
Store 5,1.0
Store 3,2.0
Store 4,3.0
Store 1,4.0
Store 2,5.0


In [43]:
# For comparison with the ranking: the total sales themselves, largest first.
# numeric_only=True keeps the string column "Employee" out of the sum;
# pandas >= 2.0 would otherwise concatenate the employee names per branch.
(
    df.groupby('Branches')
      .sum(numeric_only=True)
      .sort_values(by='Sales', ascending=False)
)

Unnamed: 0_level_0,Sales
Branches,Unnamed: 1_level_1
Store 5,1300
Store 3,900
Store 4,500
Store 1,475
Store 2,370
