# Table of Contents
## 1. Data Import and Checks
## 2. Create Price Ranges
## 3. Create Busiest Day Flag
## 4. Create Flag for Busiest Hours
## 5. Export Data

# 1. Data Import and Checks

In [12]:
# import libraries
import pandas as pd
import numpy as np
import os

In [13]:
# create path 
path = r'C:\Users\18602\Documents\Data Analytics\Data Immersion\Month 4\Instacart Basket Analysis'

In [16]:
# import order_products_combined
df_ords_prods_combined = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined_2.pkl'))

In [17]:
# Check new dataframe
df_ords_prods_combined.head()

Unnamed: 0,Unnamed: 0_x,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,Unnamed: 0_y,product_name,aisle_id,department_id,prices
0,0.0,2539329.0,1.0,1.0,2.0,8.0,,196.0,1.0,0.0,both,195.0,Soda,77.0,7.0,9.0
1,1.0,2398795.0,1.0,2.0,3.0,7.0,15.0,196.0,1.0,1.0,both,195.0,Soda,77.0,7.0,9.0
2,2.0,473747.0,1.0,3.0,3.0,12.0,21.0,196.0,1.0,1.0,both,195.0,Soda,77.0,7.0,9.0
3,3.0,2254736.0,1.0,4.0,4.0,7.0,29.0,196.0,1.0,1.0,both,195.0,Soda,77.0,7.0,9.0
4,4.0,431534.0,1.0,5.0,4.0,15.0,28.0,196.0,1.0,1.0,both,195.0,Soda,77.0,7.0,9.0


In [19]:
# Create a new subset
df = df_ords_prods_combined[:1000000]

In [20]:
# Check new subset
df.shape

(1000000, 16)

# 2. Create Price Ranges

In [21]:
# create filter for high, mid, and low-range products

def price_label(row):

  if row['prices'] <= 5:
    return 'Low-range product'
  elif (row['prices'] > 5) and (row['prices'] <= 15):
    return 'Mid-range product'
  elif row['prices'] > 15:
    return 'High range'
  else: return 'Not enough data'

In [22]:
# apply filter
df['price_range'] = df.apply(price_label, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label, axis=1)


In [23]:
# check filter

df['price_range'].value_counts() 

Mid-range product    581197
Low-range product    212594
Not enough data      206209
Name: price_range, dtype: int64

In [24]:
# check prices

df['prices'].max()

14.0

Create Alternate filter using df.loc

In [27]:
df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'

In [28]:
df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

In [29]:
df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [30]:
df['price_range_loc'].value_counts() 

Mid-range product    581197
Low-range product    212594
Name: price_range_loc, dtype: int64

In [31]:
df_ords_prods_combined.loc[df_ords_prods_combined['prices'] >15, 'price_range_loc'] = 'High-range product'

In [32]:
df_ords_prods_combined.loc[(df_ords_prods_combined['prices'] <=15) & (df_ords_prods_combined['prices'] > 5), 'price_range_loc'] = 'Mid=range product'

In [33]:
df_ords_prods_combined.loc[df_ords_prods_combined['prices'] <=5, 'price_range_loc'] = 'Low-range product'

In [34]:
# check filter

df_ords_prods_combined['price_range_loc'].value_counts()

Mid=range product     21860868
Low-range product     10126324
High-range product      417678
Name: price_range_loc, dtype: int64

# 3. Create Busiest Day Flag

In [78]:
# create function for busiest days

result = []

for value in df_ords_prods_combined["order_day_of_week"]:
  if value == 0:
    result.append("Busiest days")
  elif value == 1:
    result.append("Busiest days")
  elif value == 3:
    result.append("Least busy days")
  elif value == 4:
    result.append("Least busy days")
  else:
    result.append("Regularly busy")

In [79]:
# add busiest_days column

df_ords_prods_combined['busiest_days'] = result

In [80]:
# check function values

df_ords_prods_combined['busiest_days'].value_counts(dropna = False)

Regularly busy     13010477
Busiest days       11949665
Least busy days     7681137
Name: busiest_days, dtype: int64

In [85]:
# check counts of days of week for orders

df_ords_prods_combined['order_day_of_week'].value_counts(dropna = False)

0.0    6252941
1.0    5696724
6.0    4530304
2.0    4243173
5.0    4236989
3.0    3868747
4.0    3812390
NaN         11
Name: order_day_of_week, dtype: int64

In [82]:
# df to check days of week assigned

df_week_frequency = df_ords_prods_combined.filter(['order_day_of_week','busiest_days'])

In [83]:
# check busiest days assigned

df_week_frequency.value_counts()

order_day_of_week  busiest_days   
0.0                Busiest days       6252941
1.0                Busiest days       5696724
6.0                Regularly busy     4530304
2.0                Regularly busy     4243173
5.0                Regularly busy     4236989
3.0                Least busy days    3868747
4.0                Least busy days    3812390
dtype: int64

In [84]:
# check df_ords_prods_combined output

Unnamed: 0,Unnamed: 0_x,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,Unnamed: 0_y,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days
0,0.0,2539329.0,1.0,1.0,2.0,8.0,,196.0,1.0,0.0,both,195.0,Soda,77.0,7.0,9.0,Mid=range product,Regularly busy,Regularly busy
1,1.0,2398795.0,1.0,2.0,3.0,7.0,15.0,196.0,1.0,1.0,both,195.0,Soda,77.0,7.0,9.0,Mid=range product,Regularly busy,Least busy days
2,2.0,473747.0,1.0,3.0,3.0,12.0,21.0,196.0,1.0,1.0,both,195.0,Soda,77.0,7.0,9.0,Mid=range product,Regularly busy,Least busy days
3,3.0,2254736.0,1.0,4.0,4.0,7.0,29.0,196.0,1.0,1.0,both,195.0,Soda,77.0,7.0,9.0,Mid=range product,Least busy,Least busy days
4,4.0,431534.0,1.0,5.0,4.0,15.0,28.0,196.0,1.0,1.0,both,195.0,Soda,77.0,7.0,9.0,Mid=range product,Least busy,Least busy days


Everything looks as though it was correctly assigned. I edited the result[] function to have additional elifs. 
My question is was there a way to assign multiple variables in the "if" and "elif". Technically I am assigning 0 and 1 to the same output but I created 2 statements to get that result. Could that have been one statement using an ',' or 'or'. I tried to use both but was uncertain of what syntax would be appropriate.

Determine the busiest hours of the day so the tech team can help the app function at busier times.

In [86]:
df_ords_prods_combined['order_hour_of_day'].value_counts(dropna = False)

10.0    2781009
11.0    2755369
14.0    2709084
15.0    2682090
13.0    2680171
12.0    2637533
16.0    2554110
9.0     2471067
17.0    2104225
8.0     1729871
18.0    1649870
19.0    1268365
20.0     983232
7.0      897163
21.0     801052
22.0     638671
23.0     405051
6.0      292533
0.0      220337
1.0      116589
5.0       88661
2.0       69904
4.0       53636
3.0       51675
NaN          11
Name: order_hour_of_day, dtype: int64

Since we don't have specific parameters as to what constitutes as "busy" we are going to define the most as the highest 25% of ders, average as the 26-74% and fewest as 0-25%

Most Orders = 2554110< orders (hours 10-16)
Average Orders = 220338-2554109 (6-9, 17-23)
Fewest Orders = <220337 (0-5)

# 4. Create Flag for Busiest Hours

In [89]:
# create df to analyze hours

df = df_ords_prods_combined['order_hour_of_day'].value_counts(dropna = False)

In [90]:
# get statistics for hours of day
df.describe()

count    2.500000e+01
mean     1.305651e+06
std      1.097584e+06
min      1.100000e+01
25%      2.203370e+05
50%      9.832320e+05
75%      2.554110e+06
max      2.781009e+06
Name: order_hour_of_day, dtype: float64

In [114]:
# create function to assign hours to times of day

df_ords_prods_combined.loc[(df_ords_prods_combined['order_hour_of_day'] <=16) & (df_ords_prods_combined['order_hour_of_day'] >=10),'busiest_period_of_day'] = 'Most Orders'

In [115]:
df_ords_prods_combined.loc[(df_ords_prods_combined['order_hour_of_day'] <=23) & (df_ords_prods_combined['order_hour_of_day']>=17),'busiest_period_of_day'] = 'Average Orders'

In [116]:
df_ords_prods_combined.loc[(df_ords_prods_combined['order_hour_of_day'] <=9) & (df_ords_prods_combined['order_hour_of_day']<=6),'busiest_period_of_day'] = 'Average Orders'

In [117]:
df_ords_prods_combined.loc[(df_ords_prods_combined['order_hour_of_day'] <=5),'busiest_period_of_day'] = 'Fewest Orders'

In [118]:
df_ords_prods_combined.columns

Index(['Unnamed: 0_x', 'order_id', 'user_id', 'order_number',
       'order_day_of_week', 'order_hour_of_day', 'days_since_prior_order',
       'product_id', 'add_to_cart_order', 'reordered', '_merge',
       'Unnamed: 0_y', 'product_name', 'aisle_id', 'department_id', 'prices',
       'price_range_loc', 'busiest_day', 'busiest_days',
       'busiest_period_of_day'],
      dtype='object')

In [119]:
#check values for times of day

df_ords_prods_combined['busiest_period_of_day'].value_counts(dropna = False)

Most Orders       18799366
Average Orders    13241100
Fewest Orders       600802
NaN                     11
Name: busiest_period_of_day, dtype: int64

In [120]:
# check to make sure column was added in

df_ords_prods_combined.head()

Unnamed: 0,Unnamed: 0_x,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,Unnamed: 0_y,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_days,busiest_period_of_day
0,0.0,2539329.0,1.0,1.0,2.0,8.0,,196.0,1.0,0.0,both,195.0,Soda,77.0,7.0,9.0,Mid=range product,Regularly busy,Regularly busy,Average Orders
1,1.0,2398795.0,1.0,2.0,3.0,7.0,15.0,196.0,1.0,1.0,both,195.0,Soda,77.0,7.0,9.0,Mid=range product,Regularly busy,Least busy days,Average Orders
2,2.0,473747.0,1.0,3.0,3.0,12.0,21.0,196.0,1.0,1.0,both,195.0,Soda,77.0,7.0,9.0,Mid=range product,Regularly busy,Least busy days,Most Orders
3,3.0,2254736.0,1.0,4.0,4.0,7.0,29.0,196.0,1.0,1.0,both,195.0,Soda,77.0,7.0,9.0,Mid=range product,Least busy,Least busy days,Average Orders
4,4.0,431534.0,1.0,5.0,4.0,15.0,28.0,196.0,1.0,1.0,both,195.0,Soda,77.0,7.0,9.0,Mid=range product,Least busy,Least busy days,Most Orders


In [123]:
# create df to check hour of day filter 
df2 = df_ords_prods_combined.filter(['order_hour_of_day','busiest_period_of_day'], axis=1)

In [129]:
# check value counts to make sure filters were applied correctly

df2.value_counts()

order_hour_of_day  busiest_period_of_day
10.0               Most Orders              2781009
11.0               Most Orders              2755369
14.0               Most Orders              2709084
15.0               Most Orders              2682090
13.0               Most Orders              2680171
12.0               Most Orders              2637533
16.0               Most Orders              2554110
9.0                Average Orders           2471067
17.0               Average Orders           2104225
8.0                Average Orders           1729871
18.0               Average Orders           1649870
19.0               Average Orders           1268365
20.0               Average Orders            983232
7.0                Average Orders            897163
21.0               Average Orders            801052
22.0               Average Orders            638671
23.0               Average Orders            405051
6.0                Average Orders            292533
0.0                Fewe

# 5. Export Data

In [130]:
#export to pickle

df_ords_prods_combined.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_large.pkl'))