# 4.8 Grouping Data and Aggregating Variables

1. Aggregating data with agg() function
2. Aggregating data with transform()
3. Deriving columns with loc()

In [1]:
#import libraries
import pandas as pd
import numpy as np
import os

In [2]:
#create usable path
# Project root folder. It is read from the INSTACART_PROJECT_PATH environment
# variable when that is set, so the notebook can run on another machine; otherwise
# it falls back to the original local folder, so existing runs are unchanged.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'C:\Users\rutha\CareerFoundry\01-23_Instacart_Basket_Analysis',
)

In [3]:
# Load the merged dataframe from the project's prepared-data folder.
# NOTE: unpickling can execute arbitrary code, so only load pickle files you trust.
merged_pkl = os.path.join(path, '02_Data', 'Prepared_data', 'full_merged3.pkl')
df = pd.read_pickle(merged_pkl)

In [4]:
# Remove the 'busiest_day' column; drop() returns a new dataframe, which is
# bound back to df.
df = df.drop(columns=['busiest_day'])

In [5]:
# Preview the first five rows to check the loaded columns after the drop.
df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,price_range_loc,busiest_days,busiest_period_of_day
0,2539329,1,1,2,8,,196,1,0,both,Soda,77.0,7.0,9.0,Mid-range product,Regularly busy,Average orders
1,2539329,1,1,2,8,,14084,2,0,both,Organic Unsweetened Vanilla Almond Milk,91.0,16.0,12.5,Mid-range product,Regularly busy,Average orders
2,2539329,1,1,2,8,,12427,3,0,both,Original Beef Jerky,23.0,19.0,4.4,Low-range product,Regularly busy,Average orders
3,2539329,1,1,2,8,,26088,4,0,both,Aged White Cheddar Popcorn,23.0,19.0,4.7,Low-range product,Regularly busy,Average orders
4,2539329,1,1,2,8,,26405,5,0,both,XL Pick-A-Size Paper Towel Rolls,54.0,17.0,1.0,Low-range product,Regularly busy,Average orders


Create a subset of the first one million entries in the dataframe

In [6]:
# Keep only the first one million rows as a smaller working subset.
# NOTE(review): a recorded sample from this cell showed a negative product_id
# (-24639, 'Kale'); presumably IDs should be positive, so this may point to an
# integer-dtype overflow in an earlier preparation step — TODO confirm.
subset_rows = 1_000_000
df_subset = df[:subset_rows]

# Show five randomly chosen rows (unseeded, so they differ on every run).
print('Sample output of subset filter of merged dataset:')
df_subset.sample(5)

Sample output of subset filter of merged dataset:


Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,price_range_loc,busiest_days,busiest_period_of_day
784621,1804176,5168,27,3,18,2.0,25659,5,1,both,Organic Coconut Milk,91.0,16.0,4.1,Low-range product,Slowest days,Average orders
603219,2333242,4041,11,4,10,15.0,-24639,9,1,both,Kale,123.0,4.0,6.6,Mid-range product,Slowest days,Most orders
43670,1169863,290,13,6,11,7.0,16669,5,1,both,Grilled Chicken Canine Cuisine Wet Dog Food,40.0,8.0,14.0,Mid-range product,Regularly busy,Most orders
80155,2324216,523,2,5,15,16.0,21694,1,1,both,Mini Classic Ice Cream,37.0,1.0,8.9,Mid-range product,Regularly busy,Most orders
369163,3378514,2446,4,6,11,7.0,22935,13,1,both,Organic Yellow Onion,83.0,4.0,1.4,Low-range product,Regularly busy,Most orders


In [7]:
# Confirm the subset size as (rows, columns).
df_subset.shape

(1000000, 17)

# 1. Aggregating data with agg() function

In [8]:
#group your data by product name
# groupby() on its own only builds a DataFrameGroupBy object (see the repr below);
# nothing is aggregated until a method such as agg() or mean() is called on it.
df.groupby('product_name')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002500009E400>

Calculating the mean of the 'order_number' grouped by 'department_id'

In [9]:
# Group the rows by department_id, then take the mean order_number within each group.
# The dict/list form of agg() yields a two-level column header (order_number, mean).
by_department = df.groupby('department_id')
by_department.agg({'order_number': ['mean']})

Unnamed: 0_level_0,order_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
1.0,15.457838
2.0,17.27792
3.0,17.170395
4.0,17.811403
5.0,15.215751
6.0,16.439806
7.0,17.225802
8.0,15.34065
9.0,15.895474
10.0,20.197148


In [10]:
#using the mean() function
# Same per-department mean as the agg() cell, but selecting the column first and
# calling mean() directly returns a Series named order_number.
df.groupby('department_id')['order_number'].mean()

department_id
1.0     15.457838
2.0     17.277920
3.0     17.170395
4.0     17.811403
5.0     15.215751
6.0     16.439806
7.0     17.225802
8.0     15.340650
9.0     15.895474
10.0    20.197148
11.0    16.170638
12.0    15.887671
13.0    16.583536
14.0    16.773669
15.0    16.165037
16.0    17.665606
17.0    15.694469
18.0    19.310397
19.0    17.177343
20.0    16.473447
21.0    22.902379
Name: order_number, dtype: float64

In [11]:
# Several aggregations at once: each function in the list becomes its own
# output column under order_number.
order_number_stats = ['mean', 'min', 'max']

print('The mean, min, and max of order_number grouped by the department_id:')
df.groupby('department_id').agg({'order_number': order_number_stats})

The mean, min, and max of order_number grouped by the department_id:


Unnamed: 0_level_0,order_number,order_number,order_number
Unnamed: 0_level_1,mean,min,max
department_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1.0,15.457838,1,99
2.0,17.27792,1,99
3.0,17.170395,1,99
4.0,17.811403,1,99
5.0,15.215751,1,99
6.0,16.439806,1,99
7.0,17.225802,1,99
8.0,15.34065,1,99
9.0,15.895474,1,99
10.0,20.197148,1,99


# 2. Aggregating data with transform()

In [12]:
# Highest order_number per user, written back onto every row of that user
# (transform keeps the original row count, unlike agg).
# The string alias 'max' is used instead of np.max: recent pandas versions emit a
# FutureWarning when a NumPy reducer is passed to transform, because such callables
# will stop being mapped to the built-in groupby max in a future release.
df['max_order'] = df.groupby(['user_id'])['order_number'].transform('max')

In [13]:
# Display option (not a function): raise the row limit so the 100-row preview in
# the next cell is shown in full. A finite cap is used instead of None so that
# accidentally displaying the whole multi-million-row dataframe cannot flood or
# freeze the notebook.
pd.set_option('display.max_rows', 100)

In [14]:
# Inspect the first 100 rows to check that max_order repeats the same value on
# every row belonging to a given user_id.
df.head(100)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,price_range_loc,busiest_days,busiest_period_of_day,max_order
0,2539329,1,1,2,8,,196,1,0,both,Soda,77.0,7.0,9.0,Mid-range product,Regularly busy,Average orders,10
1,2539329,1,1,2,8,,14084,2,0,both,Organic Unsweetened Vanilla Almond Milk,91.0,16.0,12.5,Mid-range product,Regularly busy,Average orders,10
2,2539329,1,1,2,8,,12427,3,0,both,Original Beef Jerky,23.0,19.0,4.4,Low-range product,Regularly busy,Average orders,10
3,2539329,1,1,2,8,,26088,4,0,both,Aged White Cheddar Popcorn,23.0,19.0,4.7,Low-range product,Regularly busy,Average orders,10
4,2539329,1,1,2,8,,26405,5,0,both,XL Pick-A-Size Paper Towel Rolls,54.0,17.0,1.0,Low-range product,Regularly busy,Average orders,10
5,2398795,1,2,3,7,15.0,196,1,1,both,Soda,77.0,7.0,9.0,Mid-range product,Slowest days,Average orders,10
6,2398795,1,2,3,7,15.0,10258,2,0,both,Pistachios,117.0,19.0,3.0,Low-range product,Slowest days,Average orders,10
7,2398795,1,2,3,7,15.0,12427,3,1,both,Original Beef Jerky,23.0,19.0,4.4,Low-range product,Slowest days,Average orders,10
8,2398795,1,2,3,7,15.0,13176,4,0,both,Bag of Organic Bananas,24.0,4.0,10.3,Mid-range product,Slowest days,Average orders,10
9,2398795,1,2,3,7,15.0,26088,5,1,both,Aged White Cheddar Popcorn,23.0,19.0,4.7,Low-range product,Slowest days,Average orders,10


# 3. Deriving columns with loc()

In [15]:
#create a loyalty flag from the max_order values using .loc

In [16]:
# Rows whose max_order is above 40 are flagged as loyal customers.
is_loyal = df['max_order'] > 40
df.loc[is_loyal, 'loyalty_flag'] = 'Loyal customer'

In [17]:
# Regular customers: max_order above 10 and at most 40. The lower bound is stated
# explicitly so this cell never labels rows with max_order of 10 or less; the final
# flags are then correct regardless of the order in which the flag cells are
# run or re-run.
is_regular = (df['max_order'] > 10) & (df['max_order'] <= 40)
df.loc[is_regular, 'loyalty_flag'] = 'Regular customer'

In [18]:
# Rows whose max_order is 10 or lower are flagged as new customers.
is_new = df['max_order'] <= 10
df.loc[is_new, 'loyalty_flag'] = 'New customer'

In [19]:
# Count rows per flag; dropna = False also reports rows that were left without
# any flag, so an incomplete set of conditions would show up as NaN here.
df['loyalty_flag'].value_counts(dropna = False)

Regular customer    15891507
Loyal customer      10294027
New customer         6249525
Name: loyalty_flag, dtype: int64

In [20]:
# Spot-check 60 random rows, showing only the columns needed to verify that
# each flag matches its max_order value.
columns_to_check = ['user_id', 'loyalty_flag', 'max_order']
df[columns_to_check].sample(60)

Unnamed: 0,user_id,loyalty_flag,max_order
11885032,75249,Regular customer,11
11104556,70344,New customer,7
27881156,177171,Regular customer,23
13382167,84652,Loyal customer,85
14095958,89247,Regular customer,12
16390029,103672,Regular customer,32
20273069,128648,New customer,9
31982890,203256,Loyal customer,99
16923324,107124,Regular customer,38
26472203,168292,New customer,4


In [None]:
# Save the dataframe, now including max_order and loyalty_flag, as a new pickle
# in the prepared-data folder.
export_pkl = os.path.join(path, '02_Data', 'Prepared_data', 'full_merged4.pkl')
df.to_pickle(export_pkl)