**This script contains the following points:**

1. Importing libraries
2. Importing data
3. Deriving new variables
    - If-statements with user-defined functions
    - If-statements with loc() functions
        - Dataframe: df_ords_prods
        - Dataframe: df_ords_prods_merge
    - If-statements with for-loops
4. Answering business questions
    - 'busiest_days'
    - 'busiest_period_of_day'
5. Exporting data

# 1. Importing libraries

In [39]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 2. Importing data

In [40]:
# Turn project folder path into a string
path = r'/Users/sarahtischer/Desktop/CareerFoundry/Data Immersion/Achievement 4/01-2024_Instacart_Basket_Analysis'

In [41]:
# Import "orders_products_merged.pkl"
df_ords_prods_merged = pd.read_pickle(os.path.join(path, '02_Data', 'Prepared_data', 'orders_products_merged.pkl'))

In [42]:
df_ords_prods_merged.shape

(32404859, 14)

In [43]:
# Limit the dataframe to one million rows
df_sub = df_ords_prods_merged[:1000000]

In [44]:
df_sub.shape

(1000000, 14)

# 3. Deriving new variables

## If-statements with user-defined functions

In [45]:
# Define function price_label
def price_label(row):
    if row['prices'] <= 5:
        return 'Low-range product'
    elif (row['prices'] > 5) and (row['prices'] <= 15):
        return 'Mid-range product'
    elif row['prices'] > 15:
        return 'High-range product'
    else:
        return np.nan

In [46]:
# Apply function price_label
df_sub['price_range'] = df_sub.apply(price_label, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['price_range'] = df_sub.apply(price_label, axis = 1)


In [47]:
# Check frequencies of new row price_range
df_sub['price_range'].value_counts(dropna = False)

price_range
Mid-range product     652638
Low-range product     338018
High-range product      9344
Name: count, dtype: int64

In [48]:
df_sub['prices'].max()

24.5

## If-statements with loc() function

### Dataframe: df_ords_prods

In [72]:
# Create conditions for price_range_loc
df_sub.loc[df_sub['prices'] <= 5, 'price_range_loc'] = 'Low-range product'
df_sub.loc[(df_sub['prices'] > 5) & (df_sub['prices'] <= 15), 'price_range_loc'] = 'Mid-range product'
df_sub.loc[df_sub['prices'] > 15, 'price_range_loc'] = 'High-range product'

In [50]:
# Check frequencies of new column 'price_range_loc'
df_sub['price_range_loc'].value_counts(dropna = False)

price_range_loc
Mid-range product     652638
Low-range product     338018
High-range product      9344
Name: count, dtype: int64

### Dataframe: df_ords_prods_merged

In [51]:
# Create conditions for price_range_loc
df_ords_prods_merged.loc[df_ords_prods_merged['prices'] <= 5, 'price_range_loc'] = 'Low-range product'
df_ords_prods_merged.loc[(df_ords_prods_merged['prices'] <= 15) & (df_ords_prods_merged['prices'] > 5), 'price_range_loc'] = 'Mid-range product'
df_ords_prods_merged.loc[df_ords_prods_merged['prices'] > 15, 'price_range_loc'] = 'High-range product'

In [52]:
# Check frequencies of new column 'price_range_loc'
df_ords_prods_merged['price_range_loc'].value_counts(dropna = False)

price_range_loc
Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: count, dtype: int64

## If-statements with for-loops

In [53]:
# Determine the busiest day
df_ords_prods_merged['orders_day_of_week'].value_counts(dropna = False)

orders_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

In [54]:
# Create result list
result = []

# Run for-loop
for value in df_ords_prods_merged['orders_day_of_week']:
  if value == 0:
    result.append('Busiest day')
  elif value == 4:
    result.append('Least busy')
  else:
    result.append('Regularly busy')

In [55]:
# Print result list
result

['Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Busiest day',
 'Least busy',
 'Busiest day',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Bus

In [56]:
# Assign result list to df_ords_prods_merged column 'busiest_day'
df_ords_prods_merged['busiest_day'] = result

In [57]:
# Check frequencies of new column 'busiest_day'
df_ords_prods_merged['busiest_day'].value_counts(dropna = False)

busiest_day
Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: count, dtype: int64

# 4. Answering business questions

## *'busiest_days'*

In [58]:
# Rename 'busiest_day' to 'busiest_days'
df_ords_prods_merged.rename(columns = {'busiest_day' : 'busiest_days'}, inplace = True)

In [59]:
# Determine the busiest and slowest days
df_ords_prods_merged['orders_day_of_week'].value_counts(dropna = False)

orders_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

In [60]:
# Create result list
result_2 = []

# Run for-loop (Question 2)
for value in df_ords_prods_merged['orders_day_of_week']:
  if value in (0, 1):
    result_2.append('Busiest day')
  elif value in (3, 4):
    result_2.append('Least busy')
  else:
    result_2.append('Regularly busy')

In [61]:
# Print result list
result_2

['Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Least busy',
 'Least busy',
 'Busiest day',
 'Regularly busy',
 'Least busy',
 'Busiest day',
 'Busiest day',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Busiest day',
 'Regularly busy',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Least busy',
 'Regularly busy',
 'Busiest day',
 'Busiest day',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Least busy',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Least busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Busiest day',
 'Busiest day',
 'Regularly busy',
 'Busiest day',
 'Least busy',
 

In [62]:
# Assign result list to df_ords_prods_merged column 'busiest_days'
df_ords_prods_merged['busiest_days'] = result_2

In [63]:
# Check values of new column for accuracy (Question 3)
df_ords_prods_merged.head(15)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge,price_range_loc,busiest_days
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,5,0,both,Mid-range product,Regularly busy
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,1,1,both,Mid-range product,Regularly busy
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,20,0,both,Mid-range product,Busiest day
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,10,0,both,Mid-range product,Least busy
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,11,1,both,Mid-range product,Least busy
5,1,Chocolate Sandwich Cookies,61,19,5.8,1701441,777,16,1,7,26.0,7,0,both,Mid-range product,Busiest day
6,1,Chocolate Sandwich Cookies,61,19,5.8,1871483,825,3,2,14,30.0,2,0,both,Mid-range product,Regularly busy
7,1,Chocolate Sandwich Cookies,61,19,5.8,1290456,910,12,3,10,30.0,1,0,both,Mid-range product,Least busy
8,1,Chocolate Sandwich Cookies,61,19,5.8,369558,1052,10,1,20,19.0,1,0,both,Mid-range product,Busiest day
9,1,Chocolate Sandwich Cookies,61,19,5.8,589712,1052,15,1,12,15.0,2,1,both,Mid-range product,Busiest day


#### *<mark>Answer:</mark> All values (0 - 6) have been assigned the correct string label in the 'busiest_days' column.*

In [64]:
# Check frequencies of new column 'busiest_days'
df_ords_prods_merged['busiest_days'].value_counts(dropna = False)

busiest_days
Regularly busy    12916111
Busiest day       11864412
Least busy         7624336
Name: count, dtype: int64

## *'busiest_period_of_day'*

In [65]:
# Determine the busiest hours of the day
df_ords_prods_merged['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: count, dtype: int64

In [66]:
# Create result list
result_3 = []

# Create *_orders tuples for easier access
most_orders = (10, 11, 14, 15, 13, 12, 16, 9)
fewest_orders = (23, 6, 0, 1, 5, 2, 4, 3)

# Run for-loop (Question 4)
for value in df_ords_prods_merged['order_hour_of_day']:
  if value in most_orders:
    result_3.append('Most orders')
  elif value in fewest_orders:
    result_3.append('Fewest orders')
  else:
    result_3.append('Average orders')

In [67]:
# Print result list
result_3

['Most orders',
 'Average orders',
 'Average orders',
 'Most orders',
 'Average orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Average orders',
 'Fewest orders',
 'Average orders',
 'Fewest orders',
 'Fewest orders',
 'Fewest orders',
 'Fewest orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Fewest orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Average orders',
 'Most orders',
 'Most ord

In [68]:
# Assign result list to df_ords_prods_merged column 'busiest_period_of_day' (Question 4)
df_ords_prods_merged['busiest_period_of_day'] = result_3

In [69]:
# Check frequencies of new column 'busiest_period_of_day' (Question 5)
df_ords_prods_merged['busiest_period_of_day'].value_counts(dropna = False)

busiest_period_of_day
Most orders       21118071
Average orders     9997651
Fewest orders      1289137
Name: count, dtype: int64

# 5. Exporting data

In [70]:
df_ords_prods_merged.shape

(32404859, 17)

In [71]:
# Export df_ords_prods_merged as "orders_products_merged(2).pkl" 
df_ords_prods_merged.to_pickle(os.path.join(path, '02_Data', 'Prepared_data', 'orders_products_merged(2).pkl'))