# Pandas Aggregation, Filtering, and Transformation

In [1]:
import pandas as pd

# Cap DataFrame/Series reprs at 20 rows so the long outputs further down are
# shown in pandas' truncated view instead of being dumped in full.
pd.options.display.max_rows = 20

In [2]:
# The "tips" CSV from the seaborn-data repository, read straight from GitHub
# (this cell needs network access).
TIPS_CSV_URL = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv"
df = pd.read_csv(TIPS_CSV_URL)

In [3]:
# Peek at the first five rows to see which columns are available.
df.head(n=5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


## Aggregation

Method 1:

In [4]:
# Mean total_bill per day. Selecting a single column before aggregating
# yields a Series indexed by day.
(
    df
    .groupby(["day"])["total_bill"]
    .agg("mean")
)

day
Fri     17.151579
Sat     20.441379
Sun     21.410000
Thur    17.682742
Name: total_bill, dtype: float64

Method 2:

#### Using named aggregation, we get a DataFrame instead of a Series:

In [5]:
# Named aggregation: the keyword (day_bill_avg) becomes the output column name,
# and the result is a DataFrame indexed by day.
(
    df
    .groupby(["day"])
    .agg(day_bill_avg=pd.NamedAgg(column="total_bill", aggfunc="mean"))
)

Unnamed: 0_level_0,day_bill_avg
day,Unnamed: 1_level_1
Fri,17.151579
Sat,20.441379
Sun,21.41
Thur,17.682742


#### Since named aggregation returns a DataFrame, we can chain `query()` to filter the aggregated rows

In [6]:
# Aggregate to one row per day, then keep only the days whose mean bill is above 20.
(
    df
    .groupby(["day"])
    .agg(day_bill_avg=pd.NamedAgg(column="total_bill", aggfunc="mean"))
    .query("day_bill_avg > 20")
)

Unnamed: 0_level_0,day_bill_avg
day,Unnamed: 1_level_1
Sat,20.441379
Sun,21.41


## Filter

Which meals were eaten on days where the average bill was greater than 20? In effect, this keeps only the rows for Saturdays and Sundays.

In [7]:
# GroupBy.filter keeps or drops whole groups: all rows of a day are retained
# when that day's mean total_bill is greater than 20.
(
    df
    .groupby("day")
    .filter(lambda day_rows: day_rows["total_bill"].mean() > 20)
)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
238,35.83,4.67,Female,No,Sat,Dinner,3
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2


## Transform

Transform is useful when you want to add a new column while keeping the same number of rows as the input DataFrame.

In [8]:
# Broadcast each day's mean bill back onto its own rows (same row count as df),
# then express every bill relative to that mean.
# - transform("mean") uses pandas' built-in group mean instead of invoking a
#   Python lambda once per group.
# - The ratio is a callable so it can reuse the day_avg column created just
#   above in the same assign, rather than recomputing the groupby.
(
    df
    .assign(
        day_avg=lambda d: d.groupby("day")["total_bill"].transform("mean"),
        day_avg_ratio=lambda d: d["total_bill"] / d["day_avg"],
    )
    .head()
)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,day_avg,day_avg_ratio
0,16.99,1.01,Female,No,Sun,Dinner,2,21.41,0.793554
1,10.34,1.66,Male,No,Sun,Dinner,3,21.41,0.482952
2,21.01,3.5,Male,No,Sun,Dinner,3,21.41,0.981317
3,23.68,3.31,Male,No,Sun,Dinner,2,21.41,1.106025
4,24.59,3.61,Female,No,Sun,Dinner,4,21.41,1.148529


In [9]:
# Rows whose bill is below the mean bill of their own day.
# - transform("mean") uses pandas' built-in group mean instead of invoking a
#   Python lambda once per group.
# - The ratio is a callable so it can reuse the day_avg column created just
#   above in the same assign, rather than recomputing the groupby.
(
    df
    .assign(
        day_avg=lambda d: d.groupby("day")["total_bill"].transform("mean"),
        day_avg_ratio=lambda d: d["total_bill"] / d["day_avg"],
    )
    .query("day_avg_ratio < 1")
)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,day_avg,day_avg_ratio
0,16.99,1.01,Female,No,Sun,Dinner,2,21.410000,0.793554
1,10.34,1.66,Male,No,Sun,Dinner,3,21.410000,0.482952
2,21.01,3.50,Male,No,Sun,Dinner,3,21.410000,0.981317
6,8.77,2.00,Male,No,Sun,Dinner,2,21.410000,0.409622
8,15.04,1.96,Male,No,Sun,Dinner,2,21.410000,0.702475
...,...,...,...,...,...,...,...,...,...
233,10.77,1.47,Male,No,Sat,Dinner,2,20.441379,0.526872
234,15.53,3.00,Male,Yes,Sat,Dinner,2,20.441379,0.759733
235,10.07,1.25,Male,No,Sat,Dinner,2,20.441379,0.492628
236,12.60,1.00,Male,Yes,Sat,Dinner,2,20.441379,0.616397


In [10]:
# Rows whose bill is at or above the mean bill of their own day.
# - transform("mean") uses pandas' built-in group mean instead of invoking a
#   Python lambda once per group.
# - The ratio is a callable so it can reuse the day_avg column created just
#   above in the same assign, rather than recomputing the groupby.
(
    df
    .assign(
        day_avg=lambda d: d.groupby("day")["total_bill"].transform("mean"),
        day_avg_ratio=lambda d: d["total_bill"] / d["day_avg"],
    )
    .query("day_avg_ratio >= 1")
)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,day_avg,day_avg_ratio
3,23.68,3.31,Male,No,Sun,Dinner,2,21.410000,1.106025
4,24.59,3.61,Female,No,Sun,Dinner,4,21.410000,1.148529
5,25.29,4.71,Male,No,Sun,Dinner,4,21.410000,1.181224
7,26.88,3.12,Male,No,Sun,Dinner,4,21.410000,1.255488
11,35.26,5.00,Female,No,Sun,Dinner,4,21.410000,1.646894
...,...,...,...,...,...,...,...,...,...
238,35.83,4.67,Female,No,Sat,Dinner,3,20.441379,1.752817
239,29.03,5.92,Male,No,Sat,Dinner,3,20.441379,1.420159
240,27.18,2.00,Female,Yes,Sat,Dinner,2,20.441379,1.329656
241,22.67,2.00,Male,Yes,Sat,Dinner,2,20.441379,1.109025
