# 4.8 Grouping Data and Aggregating Variables (2)

##### This script contains the following points:

#### Step 5 - Use the loyalty flag and check basic statistics of product prices for each loyalty category

#### Step 6 - Create spending flag with 'Low spender' and 'High_spender' labels

#### Step 7 - Create frequency flag with 'Non-frequent customer', 'Regular customer', and 'Frequent customer' labels

#### Checking data frames and merging them into one

#### Step 9 - Export as .pkl

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Set path
path = r'/Users/mainframe/Documents/Instacart Basket Analysis'

In [3]:
# Import data
ords_prods_merge = pd.read_pickle(os.path.join(path,'02 Data', 'Prepared Data', 'ords_prods_merged_derived.pkl'))

In [4]:
# Specify the columns to keep
columns_to_keep = ['order_id', 'user_id', 'max_order', 'loyalty_flag', 'prices', 'days_since_prior_order']

In [5]:
# Create a new DataFrame with only the specified columns
opm_loyalty = ords_prods_merge[columns_to_keep]

In [6]:
opm_loyalty.head()

Unnamed: 0,order_id,user_id,max_order,loyalty_flag,prices,days_since_prior_order
0,2539329,1,10,New customer,9.0,
1,2539329,1,10,New customer,12.5,
2,2539329,1,10,New customer,4.4,
3,2539329,1,10,New customer,4.7,
4,2539329,1,10,New customer,1.0,


### Step 5 - Use the loyalty flag and check basic statistics of product prices for each loyalty category

In [7]:
# Perform basic statistics check of prices for loyalty categories
opm_loyalty.groupby('loyalty_flag').agg({'prices': ['mean', 'min', 'max', 'std', 'median']})

Unnamed: 0_level_0,prices,prices,prices,prices,prices
Unnamed: 0_level_1,mean,min,max,std,median
loyalty_flag,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Loyal customer,10.386336,1.0,99999.0,328.017787,7.4
New customer,13.29467,1.0,99999.0,597.560299,7.4
Regular customer,12.495717,1.0,99999.0,539.720919,7.4


### Step 6 - Create spending flag with 'Low_spender' being users who have a spending mean of under 10, and 'High_spender' those greater than or equal to 10

In [8]:
# Create 'mean_prices' column
opm_loyalty['mean_prices'] = opm_loyalty.groupby(['user_id'])['prices'].transform(np.mean)

  opm_loyalty['mean_prices'] = opm_loyalty.groupby(['user_id'])['prices'].transform(np.mean)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  opm_loyalty['mean_prices'] = opm_loyalty.groupby(['user_id'])['prices'].transform(np.mean)


In [9]:
opm_loyalty.head(15)

Unnamed: 0,order_id,user_id,max_order,loyalty_flag,prices,days_since_prior_order,mean_prices
0,2539329,1,10,New customer,9.0,,6.367797
1,2539329,1,10,New customer,12.5,,6.367797
2,2539329,1,10,New customer,4.4,,6.367797
3,2539329,1,10,New customer,4.7,,6.367797
4,2539329,1,10,New customer,1.0,,6.367797
5,2398795,1,10,New customer,9.0,15.0,6.367797
6,2398795,1,10,New customer,3.0,15.0,6.367797
7,2398795,1,10,New customer,4.4,15.0,6.367797
8,2398795,1,10,New customer,10.3,15.0,6.367797
9,2398795,1,10,New customer,4.7,15.0,6.367797


In [10]:
# Create 'spending_flag' column
opm_loyalty.loc[opm_loyalty['mean_prices'] >= 10, 'spending_flag'] = 'High spender'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  opm_loyalty.loc[opm_loyalty['mean_prices'] >= 10, 'spending_flag'] = 'High spender'


In [11]:
opm_loyalty.loc[opm_loyalty['mean_prices'] < 10, 'spending_flag'] = 'Low spender'

In [12]:
# Check counts of 'spending_flag' column's values
opm_loyalty['spending_flag'].value_counts(dropna = False)

spending_flag
Low spender     31770614
High spender      634245
Name: count, dtype: int64

### Create frequency flag where 'Non-frequent customer' median 'days_since_prior_order' is over 20 days, 'Regular customer' median 'days_since_prior_order' is less than or equal to 20 days ago, and 'Frequent customer' median 'days_since_prior_order' is less than or equal to 10 days

In [13]:
# Create 'median_days_since_prior_order' column
opm_loyalty['median_days_since_prior_order'] = opm_loyalty.groupby(['user_id'])['days_since_prior_order'].transform(np.median)

  opm_loyalty['median_days_since_prior_order'] = opm_loyalty.groupby(['user_id'])['days_since_prior_order'].transform(np.median)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  opm_loyalty['median_days_since_prior_order'] = opm_loyalty.groupby(['user_id'])['days_since_prior_order'].transform(np.median)


In [14]:
opm_loyalty.head()

Unnamed: 0,order_id,user_id,max_order,loyalty_flag,prices,days_since_prior_order,mean_prices,spending_flag,median_days_since_prior_order
0,2539329,1,10,New customer,9.0,,6.367797,Low spender,20.5
1,2539329,1,10,New customer,12.5,,6.367797,Low spender,20.5
2,2539329,1,10,New customer,4.4,,6.367797,Low spender,20.5
3,2539329,1,10,New customer,4.7,,6.367797,Low spender,20.5
4,2539329,1,10,New customer,1.0,,6.367797,Low spender,20.5


In [15]:
# Create 'frequency_flag' column
opm_loyalty.loc[opm_loyalty['median_days_since_prior_order'] > 20, 'frequency_flag'] = 'Non-frequent customer'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  opm_loyalty.loc[opm_loyalty['median_days_since_prior_order'] > 20, 'frequency_flag'] = 'Non-frequent customer'


In [16]:
opm_loyalty.loc[opm_loyalty['median_days_since_prior_order'] <= 20, 'frequency_flag'] = 'Regular customer'

In [17]:
opm_loyalty.loc[opm_loyalty['median_days_since_prior_order'] <= 10, 'frequency_flag'] = 'Frequent customer'

In [18]:
# Check counts of 'frequency_flag' column's values
opm_loyalty['frequency_flag'].value_counts(dropna = False)

frequency_flag
Frequent customer        21559853
Regular customer          7208564
Non-frequent customer     3636437
NaN                             5
Name: count, dtype: int64

#### Checking data frames and merging them into one

In [19]:
opm_loyalty.shape

(32404859, 10)

In [20]:
ords_prods_merge.shape

(32404859, 20)

In [21]:
opm_loyalty.head()

Unnamed: 0,order_id,user_id,max_order,loyalty_flag,prices,days_since_prior_order,mean_prices,spending_flag,median_days_since_prior_order,frequency_flag
0,2539329,1,10,New customer,9.0,,6.367797,Low spender,20.5,Non-frequent customer
1,2539329,1,10,New customer,12.5,,6.367797,Low spender,20.5,Non-frequent customer
2,2539329,1,10,New customer,4.4,,6.367797,Low spender,20.5,Non-frequent customer
3,2539329,1,10,New customer,4.7,,6.367797,Low spender,20.5,Non-frequent customer
4,2539329,1,10,New customer,1.0,,6.367797,Low spender,20.5,Non-frequent customer


In [22]:
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,hour_ordered,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Regularly busy days,Average orders,10,New customer
1,2539329,1,1,2,8,,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,both,Mid-range product,Regularly busy,Regularly busy days,Average orders,10,New customer
2,2539329,1,1,2,8,,12427,3,0,Original Beef Jerky,23,19,4.4,both,Low-range product,Regularly busy,Regularly busy days,Average orders,10,New customer
3,2539329,1,1,2,8,,26088,4,0,Aged White Cheddar Popcorn,23,19,4.7,both,Low-range product,Regularly busy,Regularly busy days,Average orders,10,New customer
4,2539329,1,1,2,8,,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,both,Low-range product,Regularly busy,Regularly busy days,Average orders,10,New customer


In [23]:
opm_new_flags = opm_loyalty[['mean_prices', 'spending_flag', 'median_days_since_prior_order', 'frequency_flag']]

In [24]:
ords_prods_new_merged = pd.concat([ords_prods_merge, opm_new_flags], axis = 1)

In [25]:
ords_prods_new_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,hour_ordered,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,price_range_loc,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,mean_prices,spending_flag,median_days_since_prior_order,frequency_flag
0,2539329,1,1,2,8,,196,1,0,Soda,...,Mid-range product,Regularly busy,Regularly busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
1,2539329,1,1,2,8,,14084,2,0,Organic Unsweetened Vanilla Almond Milk,...,Mid-range product,Regularly busy,Regularly busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
2,2539329,1,1,2,8,,12427,3,0,Original Beef Jerky,...,Low-range product,Regularly busy,Regularly busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
3,2539329,1,1,2,8,,26088,4,0,Aged White Cheddar Popcorn,...,Low-range product,Regularly busy,Regularly busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
4,2539329,1,1,2,8,,26405,5,0,XL Pick-A-Size Paper Towel Rolls,...,Low-range product,Regularly busy,Regularly busy days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer


In [26]:
ords_prods_new_merged.shape

(32404859, 24)

### Step 9 - Export data frame as pickle into 'Prepared Data' 

In [27]:
ords_prods_new_merged.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_new_merged.pkl'))