## Table of Contents:
#### 01. Importing libraries
#### 02. Importing the dataframe
#### 03. Grouping data with Pandas
#### 04. Performing single aggregation
#### 05. Performing multiple aggregations
#### 06. Aggregating data with transform()

## 01. Importing libraries

In [1]:
# importing the libraries

import pandas as pd
import numpy as np
import os

## 02. Importing the dataframe

In [4]:
# defining the path to the project's data folder
# the INSTACART_DATA_PATH environment variable (if set) overrides the default
# location, so the notebook can run on another machine without editing this cell

path = os.environ.get(
    'INSTACART_DATA_PATH',
    r'/Users/sanju/Documents/Jul 2023 Instacart Basket Analysis/02 Data',
)

In [5]:
# load the merged orders/products dataframe (with derived columns) from its pickle file

ords_prods_merge = pd.read_pickle(
    os.path.join(path, 'Prepared Data', 'orders_products_merged_derived.pkl')
)

In [6]:
# take the first one million rows (by position) as a smaller working subset

df = ords_prods_merge.iloc[:1_000_000]

In [7]:
# confirm the dimensions of the subset as (rows, columns)

df.shape

(1000000, 18)

In [8]:
# preview the first rows of the subset
df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day
0,2539329,1,1,2,8,,196,1,0,both,Soda,77,7,9.0,Mid-range product,Regularly Busy,Regularly Busy,Average orders
1,2398795,1,2,3,7,15.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Regularly Busy,Least Busy,Average orders
2,473747,1,3,3,12,21.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Regularly Busy,Least Busy,Most orders
3,2254736,1,4,4,7,29.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Least Busy,Least Busy,Average orders
4,431534,1,5,4,15,28.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Least Busy,Least Busy,Most orders


## 03. Grouping data with Pandas

In [9]:
# step 1: split the subset into groups, one per distinct 'product_name'
# (no aggregation is applied yet, so this only returns the GroupBy object)

df.groupby(by='product_name')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x106e71350>

## 04. Performing single aggregation

In [10]:
# group the subset by 'department_id', then aggregate the 'order_number'
# column of each group to its mean

df.groupby('department_id')[['order_number']].agg(['mean'])

Unnamed: 0_level_0,order_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
4,18.82578
7,17.472355
13,17.993423
14,19.246334
16,19.463012
17,11.294069
19,19.305237
20,17.599636


## 05. Performing multiple aggregations

In [12]:
# per department: mean, minimum and maximum of the 'order_number' column

df.groupby('department_id')[['order_number']].agg(['mean', 'min', 'max'])

Unnamed: 0_level_0,order_number,order_number,order_number
Unnamed: 0_level_1,mean,min,max
department_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
4,18.82578,1,99
7,17.472355,1,99
13,17.993423,1,99
14,19.246334,1,99
16,19.463012,1,99
17,11.294069,1,98
19,19.305237,1,99
20,17.599636,1,99


## 06. Aggregating data with transform()

In [14]:
# creating a new 'max_order' column holding each user's highest 'order_number'
# transform() returns one value per original row, so the per-user maximum is
# repeated on every row belonging to that user
# the string alias 'max' is used instead of np.max: newer pandas versions warn
# when a NumPy callable is passed to groupby operations

ords_prods_merge['max_order'] = (
    ords_prods_merge.groupby('user_id')['order_number'].transform('max')
)

In [16]:
# preview the first 15 rows of the full dataframe to check the 'max_order' column

ords_prods_merge.head(15)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order
0,2539329,1,1,2,8,,196,1,0,both,Soda,77,7,9.0,Mid-range product,Regularly Busy,Regularly Busy,Average orders,10
1,2398795,1,2,3,7,15.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Regularly Busy,Least Busy,Average orders,10
2,473747,1,3,3,12,21.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Regularly Busy,Least Busy,Most orders,10
3,2254736,1,4,4,7,29.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Least Busy,Least Busy,Average orders,10
4,431534,1,5,4,15,28.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Least Busy,Least Busy,Most orders,10
5,3367565,1,6,2,7,19.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Regularly Busy,Regularly Busy,Average orders,10
6,550135,1,7,1,9,20.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Regularly Busy,Busiest Day,Most orders,10
7,3108588,1,8,1,14,14.0,196,2,1,both,Soda,77,7,9.0,Mid-range product,Regularly Busy,Busiest Day,Most orders,10
8,2295261,1,9,1,16,0.0,196,4,1,both,Soda,77,7,9.0,Mid-range product,Regularly Busy,Busiest Day,Most orders,10
9,2550362,1,10,4,8,30.0,196,1,1,both,Soda,77,7,9.0,Mid-range product,Least Busy,Least Busy,Average orders,10


In [17]:
# build 'loyalty_flag' with the .loc indexer:
# rows whose 'max_order' is above 40 are labelled 'Loyal Customer'

ords_prods_merge.loc[ords_prods_merge['max_order'].gt(40), 'loyalty_flag'] = 'Loyal Customer'

In [19]:
# rows whose 'max_order' is above 10 and at most 40 are labelled 'Regular Customer'
ords_prods_merge.loc[
    ords_prods_merge['max_order'].le(40) & ords_prods_merge['max_order'].gt(10),
    'loyalty_flag',
] = 'Regular Customer'

In [20]:
# rows whose 'max_order' is 10 or less are labelled 'New Customer'
ords_prods_merge.loc[ords_prods_merge['max_order'].le(10), 'loyalty_flag'] = 'New Customer'

In [21]:
# frequency of each 'loyalty_flag' value; dropna=False also counts rows
# that received no flag (NaN), so unflagged rows would show up here

ords_prods_merge['loyalty_flag'].value_counts(dropna=False)

Regular Customer    15876776
Loyal Customer      10284093
New Customer         6243990
Name: loyalty_flag, dtype: int64

In [23]:
# display the 'loyalty_flag' column on its own
ords_prods_merge['loyalty_flag']

0               New Customer
1               New Customer
2               New Customer
3               New Customer
4               New Customer
                  ...       
32404854    Regular Customer
32404855    Regular Customer
32404856        New Customer
32404857        New Customer
32404858    Regular Customer
Name: loyalty_flag, Length: 32404859, dtype: object

In [25]:
# show 'user_id', 'loyalty_flag' and 'order_number' side by side for the first 20 rows
ords_prods_merge[['user_id','loyalty_flag','order_number']].head(20)

Unnamed: 0,user_id,loyalty_flag,order_number
0,1,New Customer,1
1,1,New Customer,2
2,1,New Customer,3
3,1,New Customer,4
4,1,New Customer,5
5,1,New Customer,6
6,1,New Customer,7
7,1,New Customer,8
8,1,New Customer,9
9,1,New Customer,10
