# 4.7 Deriving New Variables.ipynb — TOC
	1.	Setup (imports + paths)
	2.	Exercise
        2.1 If statements with UDFs
        2.2 If statements with loc()
        2.3 If statements with loops
	3.	Task
        3.1 Create busiest_days column
        3.2 Validate values / frequency check
        3.3 Create busiest_period_of_day column
        3.4 Export (pickle)

# Importing

In [134]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [135]:
# Import datasets
path = r'/Users/spencer/Documents/Career Foundry/Data Immersion/4 Python Fundamentals for Data Analysts/Instacart Basket Analysis'
ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge.pkl'))

# Exercise

In [136]:
# Create subset with 1,000,000 rows
df = ords_prods_merge[:1000000]

In [137]:
# View data for reference
df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices
0,2539329,1,1,2,8,,True,196,1,0,Soda,77.0,7.0,9.0
1,2539329,1,1,2,8,,True,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91.0,16.0,12.5
2,2539329,1,1,2,8,,True,12427,3,0,Original Beef Jerky,23.0,19.0,4.4
3,2539329,1,1,2,8,,True,26088,4,0,Aged White Cheddar Popcorn,23.0,19.0,4.7
4,2539329,1,1,2,8,,True,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54.0,17.0,1.0


In [138]:
df.shape

(1000000, 14)

## If-Statements with User-Defined Functions

In [139]:
def price_label(row):

  if row['prices'] <= 5:
    return 'Low-range product'
  elif (row['prices'] > 5) and (row['prices'] <= 15):
    return 'Mid-range product'
  elif row['prices'] > 15:
    return 'High range'
  else: return 'Not enough data'

In [140]:
# Define new column 'price_range' based on condition. Axis=1 stands for "rows" (0 = apply to columns)
df['price_range'] = df.apply(price_label, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label, axis=1)


In [141]:
df['price_range'].value_counts(dropna = False)

price_range
Mid-range product    672534
Low-range product    314100
High range            12413
Not enough data         953
Name: count, dtype: int64

In [142]:
df['prices'].max()

99999.0

## If-Statements with the loc() Function

In [143]:
df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'


In [144]:
df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

In [145]:
df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [146]:
df['price_range_loc'].value_counts(dropna = False)

price_range_loc
Mid-range product     672534
Low-range product     314100
High-range product     12413
NaN                      953
Name: count, dtype: int64

### Apply to full dataframe (instead of subset)

In [147]:
ords_prods_merge.loc[ords_prods_merge['prices'] > 15, 'price_label'] = 'High-range product'

In [148]:
ords_prods_merge.loc[(ords_prods_merge['prices'] <= 15) & (ords_prods_merge['prices'] > 5), 'price_label'] = 'Mid-range product'

In [149]:
ords_prods_merge.loc[ords_prods_merge['prices'] <= 5, 'price_label'] = 'Low-range product'

In [150]:
ords_prods_merge['price_label'].value_counts(dropna = False)

price_label
Mid-range product     21860852
Low-range product     10125759
High-range product      417678
NaN                      30200
Name: count, dtype: int64

## If-Statements with For-Loops

In [151]:
# View busiest days of week. 0 = Saturday
ords_prods_merge['orders_day_of_week'].value_counts(dropna = False)

orders_day_of_week
0    6209666
1    5665856
6    4500304
2    4217798
5    4209533
3    3844117
4    3787215
Name: count, dtype: int64

In [152]:
# Create empty list
result = []

# Loop through dataframe
for value in ords_prods_merge["orders_day_of_week"]:
  if value == 0:
    result.append("Busiest day")
  elif value == 4:
    result.append("Least busy")
  else:
    result.append("Regularly busy")

In [153]:
# Make new column 'busiest day' and combine with result
ords_prods_merge['busiest_day'] = result

In [154]:
ords_prods_merge['busiest_day'].value_counts(dropna = False)

busiest_day
Regularly busy    22437608
Busiest day        6209666
Least busy         3787215
Name: count, dtype: int64

# Task

In [155]:
# Verify I have “price_label” and “busiest_day” columns.
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_label,busiest_day
0,2539329,1,1,2,8,,True,196,1,0,Soda,77.0,7.0,9.0,Mid-range product,Regularly busy
1,2539329,1,1,2,8,,True,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91.0,16.0,12.5,Mid-range product,Regularly busy
2,2539329,1,1,2,8,,True,12427,3,0,Original Beef Jerky,23.0,19.0,4.4,Low-range product,Regularly busy
3,2539329,1,1,2,8,,True,26088,4,0,Aged White Cheddar Popcorn,23.0,19.0,4.7,Low-range product,Regularly busy
4,2539329,1,1,2,8,,True,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54.0,17.0,1.0,Low-range product,Regularly busy


## Make busiest days column

In [156]:
# View busiest days of week. 0 = Saturday
ords_prods_merge['orders_day_of_week'].value_counts(dropna = False)

orders_day_of_week
0    6209666
1    5665856
6    4500304
2    4217798
5    4209533
3    3844117
4    3787215
Name: count, dtype: int64

In [157]:
# Create busiest days column

# Create empty list
result2 = []

# Loop through dataframe, using OR logic
for value in ords_prods_merge["orders_day_of_week"]:
  if value == 0 or value == 1:
    result2.append("Busiest days")
  elif value == 4 or value == 3:
    result2.append("Least busy")
  else:
    result2.append("Regularly busy")

In [158]:
# Make new column 'busiest days' and combine with result
ords_prods_merge['busiest_days'] = result2

## Check values for accuracy

In [159]:
# View distribution
ords_prods_merge['busiest_days'].value_counts(dropna = False)

busiest_days
Regularly busy    12927635
Busiest days      11875522
Least busy         7631332
Name: count, dtype: int64

In [160]:
# Expect 32,434,489 rows
ords_prods_merge.shape

(32434489, 17)

### Regulary busy is the most frequent, since it contains 3 days. Busiest days outweighs Least busy, which makes logical sense. The number of rows is maintained at 32,434,489.

## Create busiest_period_of_day column by “Most orders,” “Average orders,” and “Fewest orders.”

In [161]:
# View spread of data
ords_prods_merge['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    2764426
11    2738582
14    2691548
15    2664533
13    2663292
12    2620847
16    2537458
9     2456713
17    2089465
8     1719973
18    1637923
19    1259401
20     977038
7      891937
21     796370
22     634734
23     402620
6      290795
0      218948
1      115786
5       88062
2       69434
4       53283
3       51321
Name: count, dtype: int64

In [162]:
# Define lists of hours, split into thirds (8 hours each group)
most_orders = [10, 11, 14, 15, 13, 12, 16, 9]
average_orders = [17, 8, 18, 19, 20, 7, 21, 22]
fewest_orders = [23, 6, 0, 1, 5, 2, 4, 3]

In [163]:
# Apply labels using .loc since it is cleaner and faster
ords_prods_merge.loc[ords_prods_merge['order_hour_of_day'].isin(most_orders), 'busiest_period_of_day'] = 'Most orders'
ords_prods_merge.loc[ords_prods_merge['order_hour_of_day'].isin(average_orders), 'busiest_period_of_day'] = 'Average orders'
ords_prods_merge.loc[ords_prods_merge['order_hour_of_day'].isin(fewest_orders), 'busiest_period_of_day'] = 'Fewest orders'

In [164]:
# Print frequency
ords_prods_merge['busiest_period_of_day'].value_counts()

busiest_period_of_day
Most orders       21137399
Average orders    10006841
Fewest orders      1290249
Name: count, dtype: int64

## Export dataframe as a pickle file

In [165]:
ords_prods_merge.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'orders_products_merged_updated.pkl'))