**This script contains the following points:**

1. Importing libraries
2. Importing data
3. Grouping data

# 1. Importing libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 2. Importing data

In [2]:
# Turn project folder path into a string
path = r'/Users/sarahtischer/Desktop/CareerFoundry/Data Immersion/Achievement 4/01-2024_Instacart_Basket_Analysis'

In [3]:
# Import "orders_products_merged(2).pkl"
df_ords_prods_merged = pd.read_pickle(os.path.join(path, '02_Data', 'Prepared_data', 'orders_products_merged(2).pkl'))

In [4]:
df_ords_prods_merged.shape

(32404859, 17)

In [5]:
# Limit the dataframe to one million rows
df_sub = df_ords_prods_merged[:1000000]

In [6]:
# Check the output
df_sub.shape

(1000000, 17)

In [7]:
df_sub.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge,price_range_loc,busiest_days,busiest_period_of_day
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,5,0,both,Mid-range product,Regularly busy,Most orders
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,1,1,both,Mid-range product,Regularly busy,Average orders
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,20,0,both,Mid-range product,Busiest day,Average orders
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,10,0,both,Mid-range product,Least busy,Most orders
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,11,1,both,Mid-range product,Least busy,Average orders


# 3. Grouping data

In [8]:
# Group dataframe by 'product_name'
df_sub.groupby('product_name')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x127f33210>

In [9]:
# Group dataframe by 'department_id' & aggregate mean for 'order_number'
df_sub.groupby('department_id').agg({'order_number': ['mean']})

Unnamed: 0_level_0,order_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
1,15.577493
2,17.320781
3,16.084944
4,17.530458
5,14.763075
6,16.658449
7,17.03159
8,15.076662
9,15.44758
10,18.681852


In [10]:
# Use mean() function for comparison
df_sub.groupby('department_id')['order_number'].mean()

department_id
1     15.577493
2     17.320781
3     16.084944
4     17.530458
5     14.763075
6     16.658449
7     17.031590
8     15.076662
9     15.447580
10    18.681852
11    15.447411
12    14.327957
13    16.548642
14    16.960241
15    16.121948
16    17.803851
17    15.593633
18    19.674252
19    16.899756
20    16.255442
21    25.535479
Name: order_number, dtype: float64

In [11]:
# Group dataframe by 'department_id' & aggregate mean, min and max for 'order_number'
df_sub.groupby('department_id').agg({'order_number': ['mean', 'min', 'max']})

Unnamed: 0_level_0,order_number,order_number,order_number
Unnamed: 0_level_1,mean,min,max
department_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,15.577493,1,99
2,17.320781,1,96
3,16.084944,1,99
4,17.530458,1,99
5,14.763075,1,99
6,16.658449,1,99
7,17.03159,1,99
8,15.076662,1,98
9,15.44758,1,99
10,18.681852,1,99


In [12]:
# Group data by 'user_id' & generate max orders for each user
df_ords_prods_merged['max_order'] = df_ords_prods_merged.groupby(['user_id'])['order_number'].transform(np.max)

  df_ords_prods_merged['max_order'] = df_ords_prods_merged.groupby(['user_id'])['order_number'].transform(np.max)


In [13]:
# Check the output
df_ords_prods_merged.sort_values(by='user_id').head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge,price_range_loc,busiest_days,busiest_period_of_day,max_order
6002731,10326,Organic Fuji Apples,24,4,2.7,431534,1,5,4,15,28.0,5,0,both,Low-range product,Least busy,Most orders,10
92374,196,Soda,77,7,9.0,550135,1,7,1,9,20.0,1,1,both,Mid-range product,Busiest day,Most orders,10
92373,196,Soda,77,7,9.0,3367565,1,6,2,7,19.0,1,1,both,Mid-range product,Regularly busy,Average orders,10
92371,196,Soda,77,7,9.0,2254736,1,4,4,7,29.0,1,1,both,Mid-range product,Least busy,Average orders,10
92370,196,Soda,77,7,9.0,473747,1,3,3,12,21.0,1,1,both,Mid-range product,Least busy,Most orders,10


In [14]:
# Clear dislay options
pd.options.display.max_rows = None

In [15]:
# Check the output of first 100 rows
df_ords_prods_merged.head(100)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge,price_range_loc,busiest_days,busiest_period_of_day,max_order
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,5,0,both,Mid-range product,Regularly busy,Most orders,32
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,1,1,both,Mid-range product,Regularly busy,Average orders,32
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,20,0,both,Mid-range product,Busiest day,Average orders,5
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,10,0,both,Mid-range product,Least busy,Most orders,3
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,11,1,both,Mid-range product,Least busy,Average orders,3
5,1,Chocolate Sandwich Cookies,61,19,5.8,1701441,777,16,1,7,26.0,7,0,both,Mid-range product,Busiest day,Average orders,26
6,1,Chocolate Sandwich Cookies,61,19,5.8,1871483,825,3,2,14,30.0,2,0,both,Mid-range product,Regularly busy,Most orders,9
7,1,Chocolate Sandwich Cookies,61,19,5.8,1290456,910,12,3,10,30.0,1,0,both,Mid-range product,Least busy,Most orders,12
8,1,Chocolate Sandwich Cookies,61,19,5.8,369558,1052,10,1,20,19.0,1,0,both,Mid-range product,Busiest day,Average orders,20
9,1,Chocolate Sandwich Cookies,61,19,5.8,589712,1052,15,1,12,15.0,2,1,both,Mid-range product,Busiest day,Most orders,20


In [16]:
# Assign loyalty flag based on 'max_order' value
df_ords_prods_merged.loc[df_ords_prods_merged['max_order'] > 40, 'loyalty_flag'] = 'Loyal customer'
df_ords_prods_merged.loc[(df_ords_prods_merged['max_order'] <= 40) & (df_ords_prods_merged['max_order'] > 10), 'loyalty_flag'] = 'Regular customer'
df_ords_prods_merged.loc[df_ords_prods_merged['max_order'] <= 10, 'loyalty_flag'] = 'New customer'

In [17]:
# Check frequencies of new column 'loyalty_flag'
df_ords_prods_merged['loyalty_flag'].value_counts(dropna = False)

loyalty_flag
Regular customer    15876776
Loyal customer      10284093
New customer         6243990
Name: count, dtype: int64

In [18]:
# Check the output of relevant columns
df_ords_prods_merged[['user_id', 'order_number', 'loyalty_flag']].head(60)

Unnamed: 0,user_id,order_number,loyalty_flag
0,138,28,Regular customer
1,138,30,Regular customer
2,709,2,New customer
3,764,1,New customer
4,764,3,New customer
5,777,16,Regular customer
6,825,3,New customer
7,910,12,Regular customer
8,1052,10,Regular customer
9,1052,15,Regular customer
