In [1]:
import pandas as pd
import numpy as np
from os import path as pth

In [2]:
# Root of the project's data directory. The location is resolved relative to the
# current user's home directory (expanduser only touches the leading "~") instead
# of hard-coding one account's absolute /Users/<name>/... path, so the notebook
# also runs for anyone who keeps the project in the same iCloud Drive layout.
path = pth.expanduser(r'~/Library/Mobile Documents/com~apple~CloudDocs/my_DA_2024/CareerFoundry_Data_Analytics_Bootcamp/4-Python_Fundamentals_for_DA/04-2024_Instacart_Basket_Analysis/02-Data')
# Sub-folder names joined onto `path` when reading/writing files.
prepared_data_folder = r'02-Prepared_Data'
raw_data_folder = r'01-Raw_Data'

In [3]:
# Load the pickled merged dataframe from the prepared-data folder.
# NOTE: read_pickle can execute arbitrary code; only load files you produced yourself.
merged_pickle_file = pth.join(path, prepared_data_folder, 'ords_prods_merge_4.7.pkl')
ords_prods_merge = pd.read_pickle(merged_pickle_file)

In [4]:
# Preview the first rows of the freshly loaded dataframe.
ords_prods_merge.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_name,aisle_id,department_id,prices,price_label,busiest_days,busiest_period_of_day
0,2,33120,1,1,202279,3,5,9,8.0,Organic Egg Whites,86,16,11.3,Mid-Range,regular_days,average_orders
1,2,28985,2,1,202279,3,5,9,8.0,Michigan Organic Kale,83,4,13.4,Mid-Range,regular_days,average_orders
2,2,9327,3,0,202279,3,5,9,8.0,Garlic Powder,104,13,3.6,Low-Range,regular_days,average_orders
3,2,45918,4,1,202279,3,5,9,8.0,Coconut Butter,19,13,8.4,Mid-Range,regular_days,average_orders
4,2,30035,5,0,202279,3,5,9,8.0,Natural Sweetener,17,13,13.7,Mid-Range,regular_days,average_orders


#### 2a) Deriving new column: "order_max"  
The column will contain the highest `order_number` recorded for each individual user.  

We group by `user_id`, and then calculate the `max` of the `order_number` column for each group of identical `user_id`.  
The `transform` function doesn't physically aggregate the rows, leaving the original dataframe's size intact.  
It just adds the group's value to every row of that group.

In [5]:
# Broadcast each user's highest order_number back onto every row of that user.
order_numbers_by_user = ords_prods_merge.groupby('user_id')['order_number']
ords_prods_merge['order_max'] = order_numbers_by_user.transform('max')

In [6]:
# Preview to confirm the new order_max column was appended.
ords_prods_merge.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_name,aisle_id,department_id,prices,price_label,busiest_days,busiest_period_of_day,order_max
0,2,33120,1,1,202279,3,5,9,8.0,Organic Egg Whites,86,16,11.3,Mid-Range,regular_days,average_orders,8
1,2,28985,2,1,202279,3,5,9,8.0,Michigan Organic Kale,83,4,13.4,Mid-Range,regular_days,average_orders,8
2,2,9327,3,0,202279,3,5,9,8.0,Garlic Powder,104,13,3.6,Low-Range,regular_days,average_orders,8
3,2,45918,4,1,202279,3,5,9,8.0,Coconut Butter,19,13,8.4,Mid-Range,regular_days,average_orders,8
4,2,30035,5,0,202279,3,5,9,8.0,Natural Sweetener,17,13,13.7,Mid-Range,regular_days,average_orders,8


#### 2b) Deriving new column: "loyalty_flag"  

To create your flag, you’ll need some criteria. You can use the following:

- If the maximum orders the user has made is over 40, then the customer will be labeled a “Loyal customer.”
- If the maximum orders the user has made is over 10 but less than or equal to 40, then the customer will be labeled a “Regular customer.”
- If the maximum orders the user has made is less than or equal to 10, then the customer will be labeled a “New customer.”

In [7]:
# Label every row by its user's order_max:
#   <= 10 -> new_customer, (10, 40] -> regular_customer, > 40 -> loyal_customer.
order_max = ords_prods_merge['order_max']
loyalty_rules = [
    (order_max.le(10), 'new_customer'),
    (order_max.gt(10) & order_max.le(40), 'regular_customer'),
    (order_max.gt(40), 'loyal_customer'),
]
for rule_mask, flag_label in loyalty_rules:
    ords_prods_merge.loc[rule_mask, 'loyalty_flag'] = flag_label

In [8]:
# Preview to confirm the new loyalty_flag column was appended.
ords_prods_merge.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_name,aisle_id,department_id,prices,price_label,busiest_days,busiest_period_of_day,order_max,loyalty_flag
0,2,33120,1,1,202279,3,5,9,8.0,Organic Egg Whites,86,16,11.3,Mid-Range,regular_days,average_orders,8,new_customer
1,2,28985,2,1,202279,3,5,9,8.0,Michigan Organic Kale,83,4,13.4,Mid-Range,regular_days,average_orders,8,new_customer
2,2,9327,3,0,202279,3,5,9,8.0,Garlic Powder,104,13,3.6,Low-Range,regular_days,average_orders,8,new_customer
3,2,45918,4,1,202279,3,5,9,8.0,Coconut Butter,19,13,8.4,Mid-Range,regular_days,average_orders,8,new_customer
4,2,30035,5,0,202279,3,5,9,8.0,Natural Sweetener,17,13,13.7,Mid-Range,regular_days,average_orders,8,new_customer


Let's quickly verify the shape of the dataframe after deriving these two new columns, and let's check on the distribution of the loyalty_flag column.

In [9]:
# (rows, columns) of the dataframe after adding the derived columns.
ords_prods_merge.shape

(32404859, 18)

In [10]:
# dropna=False also counts NaN, so any row left without a loyalty flag would show up here.
ords_prods_merge['loyalty_flag'].value_counts(dropna = False)

loyalty_flag
regular_customer    15876776
loyal_customer      10284093
new_customer         6243990
Name: count, dtype: int64

#### 5) The marketing team at Instacart wants to know whether there’s a difference between the spending habits of the three types of customers you identified.  

Use the loyalty flag you created and check the basic statistics of the product prices for each loyalty category (Loyal Customer, Regular Customer, and New Customer). What you’re trying to determine is whether the prices of products purchased by loyal customers differ from those purchased by regular or new customers.

In [11]:
# Show only the user, product, price and loyalty columns for the first 20 rows.
ords_prods_merge[['user_id', 'product_name','prices', 'price_label', 'loyalty_flag']].head(20)

Unnamed: 0,user_id,product_name,prices,price_label,loyalty_flag
0,202279,Organic Egg Whites,11.3,Mid-Range,new_customer
1,202279,Michigan Organic Kale,13.4,Mid-Range,new_customer
2,202279,Garlic Powder,3.6,Low-Range,new_customer
3,202279,Coconut Butter,8.4,Mid-Range,new_customer
4,202279,Natural Sweetener,13.7,Mid-Range,new_customer
5,202279,Carrots,10.7,Mid-Range,new_customer
6,202279,Original Unflavored Gelatine Mix,11.5,Mid-Range,new_customer
7,202279,All Natural No Stir Creamy Almond Butter,11.5,Mid-Range,new_customer
8,202279,Classic Blend Cole Slaw,7.5,Mid-Range,new_customer
9,205970,Total 2% with Strawberry Lowfat Greek Strained...,11.8,Mid-Range,regular_customer


In [12]:
# Basic price statistics per loyalty category (displayed as the cell output).
price_summary_spec = {'prices': ['mean', 'median', 'min', 'max']}
ords_prods_merge.groupby('loyalty_flag').agg(price_summary_spec)

Unnamed: 0_level_0,prices,prices,prices,prices
Unnamed: 0_level_1,mean,median,min,max
loyalty_flag,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
loyal_customer,10.386336,7.4,1.0,99999.0
new_customer,13.29467,7.4,1.0,99999.0
regular_customer,12.495717,7.4,1.0,99999.0


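The minimum, median, and maximum prices are identical across the three groups, but the maximum of 99999.0 is not a plausible product price: it points to outliers that still need to be investigated and cleaned.  
Taken at face value, the means suggest that new customers purchase the most expensive products on average, followed by regular customers and finally loyal customers.  
The difference is small (new customers spend on average about $3 more per product than loyal customers), and because the median is 7.4 in every group, it is likely driven by those outliers rather than by a real difference in spending habits.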

#### 6) The team now wants to target different types of spenders in their marketing campaigns.  

This can be achieved by looking at the prices of the items people are buying. Create a spending flag for each user based on the average price across all their orders using the following criteria:  
- If the mean of the prices of products purchased by a user is lower than 10, then flag them as a “Low spender.”
- If the mean of the prices of products purchased by a user is higher than or equal to 10, then flag them as a “High spender.”

In [13]:
# Per-row Series (same index as ords_prods_merge) holding the mean price of all
# products bought by that row's user; it is compared against the threshold in the next cell.
# NOTE(review): the price summary above reports a max of 99999.0 — values like that
# would inflate these means; confirm whether prices should be cleaned first.
prices_by_user = ords_prods_merge.groupby('user_id')['prices']
user_mean_prices = prices_by_user.transform('mean')

In [14]:
# Flag rows by their user's mean product price: below 10 -> low, 10 or above -> high.
spending_rules = {
    'low_spender': user_mean_prices.lt(10),
    'high_spender': user_mean_prices.ge(10),
}
for flag_label, rule_mask in spending_rules.items():
    ords_prods_merge.loc[rule_mask, 'spending_flag'] = flag_label

In [15]:
# dropna=False also counts NaN, so any row left without a spending flag would show up here.
ords_prods_merge['spending_flag'].value_counts(dropna = False)

spending_flag
low_spender     31770614
high_spender      634245
Name: count, dtype: int64

#### 7) In order to send relevant notifications to users within the app (for instance, asking users if they want to buy the same item again), the Instacart team wants you to determine frequent versus non-frequent customers. Create an order frequency flag that marks the regularity of a user’s ordering behavior according to the median in the “days_since_prior_order” column (named `days_since_last_order` in this dataframe). The criteria for the flag should be as follows: 

- If the median of “days_since_prior_order” is higher than 20, then the customer should be labeled a “Non-frequent customer.”
- If the median is higher than 10 and lower than or equal to 20, then the customer should be labeled a “Regular customer.”
- If the median is lower than or equal to 10, then the customer should be labeled a “Frequent customer.”  



In [16]:
# Per-row Series holding the median of days_since_last_order for that row's user;
# the boolean masks for the frequency flag are derived from it in the next cell.
days_by_user = ords_prods_merge.groupby('user_id')['days_since_last_order']
order_median_days = days_by_user.transform('median')

In [17]:
# Label rows by their user's median days between orders:
#   > 20 -> non-frequent, (10, 20] -> regular, <= 10 -> frequent.
# NOTE(review): 'non-frequent-customer' uses hyphens while the other labels use
# underscores; kept unchanged here because later steps may match on the exact string.
frequency_rules = [
    (order_median_days.gt(20), 'non-frequent-customer'),
    (order_median_days.gt(10) & order_median_days.le(20), 'regular_customer'),
    (order_median_days.le(10), 'frequent_customer'),
]
for rule_mask, flag_label in frequency_rules:
    ords_prods_merge.loc[rule_mask, 'order_frequency_flag'] = flag_label

In [18]:
# dropna=False also counts NaN, so any row left without a frequency flag would show up here.
ords_prods_merge['order_frequency_flag'].value_counts(dropna = False)

order_frequency_flag
frequent_customer        22797115
regular_customer          6921016
non-frequent-customer     2686728
Name: count, dtype: int64

In [19]:
# Persist the dataframe, now including the derived flag columns, to the prepared-data folder.
# The write is skipped when the file already exists, so re-running the notebook does not
# rewrite the large pickle; delete the file to force a fresh export.
export_file = pth.join(path, prepared_data_folder, 'ords_prods_merge-4.8.pkl')
if not pth.exists(export_file):
    ords_prods_merge.to_pickle(export_file)