In [1]:
import pandas as pd
from os import path as pth

In [2]:
# Base data directory of the project.
# NOTE(review): this is an absolute, machine-specific path — it must be adjusted
# before the notebook can run on another machine.
path = r'/Users/polusa/Library/Mobile Documents/com~apple~CloudDocs/my_DA_2024/CareerFoundry_Data_Analytics_Bootcamp/4-Python_Fundamentals_for_DA/04-2024_Instacart_Basket_Analysis/02-Data'
# Sub-folder of `path` that the merged pickle is read from and the result is written to.
prepared_data_folder = r'02-Prepared_Data'
# Sub-folder name for raw data; not referenced by any other visible cell.
raw_data_folder = r'01-Raw_Data'

In [3]:
# Import a subset of the ords_prods_merge dataset: only its first 1_000_000 rows
merged_pickle_path = pth.join(path, prepared_data_folder, 'ords_prods_merge.pkl')
df = pd.read_pickle(merged_pickle_path)[:1_000_000]


In [4]:
# Import the full dataset and discard its `_merge` column in one expression
ords_prods_merge = (
    pd.read_pickle(pth.join(path, prepared_data_folder, 'ords_prods_merge.pkl'))
    .drop(columns=['_merge'])
)

In [5]:
ords_prods_merge.shape

(32404859, 13)

#### Conditions:  

- 0$ < price <= 5$ then "low-range" product
- 5$ < price <= 15$ then "mid-range" product
- 15$ < price then "high-range" product  

In [6]:
df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_name,aisle_id,department_id,prices,_merge
0,2,33120,1,1,202279,3,5,9,8.0,Organic Egg Whites,86,16,11.3,both
1,2,28985,2,1,202279,3,5,9,8.0,Michigan Organic Kale,83,4,13.4,both
2,2,9327,3,0,202279,3,5,9,8.0,Garlic Powder,104,13,3.6,both
3,2,45918,4,1,202279,3,5,9,8.0,Coconut Butter,19,13,8.4,both
4,2,30035,5,0,202279,3,5,9,8.0,Natural Sweetener,17,13,13.7,both


In [7]:
# .iloc[0] selects the first row by position; as the output below shows, a single
# row comes back as a pandas Series whose labels are the dataframe's column names
df.iloc[0]

order_id                                  2
product_id                            33120
add_to_cart_order                         1
reordered                                 1
user_id                              202279
order_number                              3
orders_day_of_week                        5
order_hour_of_day                         9
days_since_last_order                   8.0
product_name             Organic Egg Whites
aisle_id                                 86
department_id                            16
prices                                 11.3
_merge                                 both
Name: 0, dtype: object

In [8]:
# function for new column derivation

def price_label(row):
    """Return the price-range label for one row.

    `row` must support `row['prices']` (e.g. a dataframe row passed by
    `DataFrame.apply(..., axis=1)`).

    Returns 'Low-Range' for prices <= 5, 'Mid-Range' for prices <= 15,
    'High-Range' for prices > 15, and 'Not Enough Data' when none of the
    comparisons holds (for example a NaN price).
    """
    price = row['prices']
    if price <= 5:
        return 'Low-Range'
    if price <= 15:
        return 'Mid-Range'
    if price > 15:
        return 'High-Range'
    # Only reached when every comparison above is False.
    return 'Not Enough Data'

#### The `apply()` function  

The `apply()` function in Pandas is used to apply a function along an axis of a DataFrame or Series. This function is very versatile and can be applied in various scenarios for data manipulation, transformation, and cleaning.  

##### Syntax:  
`DataFrame.apply(func, axis=0, raw=False, result_type=None, args=(), **kwds)`  

`Series.apply(func, convert_dtype=True, args=(), **kwds)`  

`axis=0` (or `'index'`): the function is applied to each column  
`axis=1` (or `'columns'`): the function is applied to each row


In [9]:
# Create a new column called `price_range`, filled with the placeholder value 1 as a test
# (the next cell overwrites it with the labels returned by price_label)

df['price_range'] = 1

In [10]:
# Now populate `price_range` with the value returned by price_label for each row
# (axis=1 makes apply() pass one row at a time to the function)

df['price_range'] = df.apply(price_label, axis=1)

In [11]:
df['price_range'].value_counts(dropna=False)

price_range
Mid-Range     674229
Low-Range     312859
High-Range     12912
Name: count, dtype: int64

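Why not use the `max()` function to check what the most expensive product within the subset is?  
Here the maximum is far above 15, which is consistent with the labels: the subset does contain high-range products (12,912 rows). Note that a maximum price of 99999.0 is suspiciously large and likely points to erroneous values that should be investigated.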

In [12]:
# most expensive product
df['prices'].max() # df.loc[:,'prices'].max()

99999.0

Using `.loc[]`, you can apply the conditional logic of an if-statement to a dataframe column without explicitly creating an if-else construct.

In [13]:
# High-range: price strictly above 15
is_high_range = df['prices'] > 15
df.loc[is_high_range, 'price_range_loc'] = 'High-Range'

In [14]:
# Mid-range: 5 < price <= 15 (lower bound exclusive, upper bound inclusive)
df.loc[df['prices'].between(5, 15, inclusive='right'), 'price_range_loc'] = 'Mid-Range'

In [15]:
# Low-range: price of 5 or less
is_low_range = df['prices'] <= 5
df.loc[is_low_range, 'price_range_loc'] = 'Low-Range'

In [16]:
df.loc[:,'price_range_loc'].value_counts(dropna=False)

price_range_loc
Mid-Range     674229
Low-Range     312859
High-Range     12912
Name: count, dtype: int64

### Deriving new "_price_label_" columns in the `ords_prods_merge` Dataframe

In [17]:
ords_prods_merge.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_name,aisle_id,department_id,prices
0,2,33120,1,1,202279,3,5,9,8.0,Organic Egg Whites,86,16,11.3
1,2,28985,2,1,202279,3,5,9,8.0,Michigan Organic Kale,83,4,13.4
2,2,9327,3,0,202279,3,5,9,8.0,Garlic Powder,104,13,3.6
3,2,45918,4,1,202279,3,5,9,8.0,Coconut Butter,19,13,8.4
4,2,30035,5,0,202279,3,5,9,8.0,Natural Sweetener,17,13,13.7


In [18]:
# High-range: price strictly above 15
high_price_rows = ords_prods_merge['prices'] > 15
ords_prods_merge.loc[high_price_rows, 'price_label'] = 'High-Range'

In [19]:
# Mid-range: 5 < price <= 15 (lower bound exclusive, upper bound inclusive)
mid_price_rows = ords_prods_merge['prices'].between(5, 15, inclusive='right')
ords_prods_merge.loc[mid_price_rows, 'price_label'] = 'Mid-Range'

In [20]:
# Low-range: price of 5 or less
low_price_rows = ords_prods_merge['prices'] <= 5
ords_prods_merge.loc[low_price_rows, 'price_label'] = 'Low-Range'

In [21]:
ords_prods_merge.loc[:,'price_label'].value_counts(dropna=False)

price_label
Mid-Range     21860860
Low-Range     10126321
High-Range      417678
Name: count, dtype: int64

In [22]:
ords_prods_merge.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_name,aisle_id,department_id,prices,price_label
0,2,33120,1,1,202279,3,5,9,8.0,Organic Egg Whites,86,16,11.3,Mid-Range
1,2,28985,2,1,202279,3,5,9,8.0,Michigan Organic Kale,83,4,13.4,Mid-Range
2,2,9327,3,0,202279,3,5,9,8.0,Garlic Powder,104,13,3.6,Low-Range
3,2,45918,4,1,202279,3,5,9,8.0,Coconut Butter,19,13,8.4,Mid-Range
4,2,30035,5,0,202279,3,5,9,8.0,Natural Sweetener,17,13,13.7,Mid-Range


#### Create a new column in your `ords_prods_merge` dataframe that summarizes how busy each day of the week is  

First, we need to know on which day most orders take place  

In the project brief, we can see that the value 0 means Saturday

In [23]:
ords_prods_merge.loc[:,'orders_day_of_week'].value_counts(dropna=False)

orders_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

##### Create a new column, “busiest day,” that will contain one of three different values: “Busiest day,” “Least busy,” and “Regularly busy.”

Method 1 (using user-defined function and  the `apply()` method)

In [24]:
def how_busy(day):
    """Label a day-of-week code by how busy it is.

    Returns 'Busiest' for day 0, 'Least Busy' for day 4 and
    'Regularly Busy' for any other value.
    """
    if day == 0:
        return 'Busiest'
    if day == 4:
        return 'Least Busy'
    return 'Regularly Busy'

In [25]:
# Derive the 'busiest_day' label of every row from its day-of-week code
ords_prods_merge['busiest_day'] = ords_prods_merge['orders_day_of_week'].apply(how_busy)

In [26]:
ords_prods_merge['busiest_day'].value_counts(dropna=False)

busiest_day
Regularly Busy    22416875
Busiest            6204182
Least Busy         3783802
Name: count, dtype: int64

In [27]:
ords_prods_merge.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_name,aisle_id,department_id,prices,price_label,busiest_day
0,2,33120,1,1,202279,3,5,9,8.0,Organic Egg Whites,86,16,11.3,Mid-Range,Regularly Busy
1,2,28985,2,1,202279,3,5,9,8.0,Michigan Organic Kale,83,4,13.4,Mid-Range,Regularly Busy
2,2,9327,3,0,202279,3,5,9,8.0,Garlic Powder,104,13,3.6,Low-Range,Regularly Busy
3,2,45918,4,1,202279,3,5,9,8.0,Coconut Butter,19,13,8.4,Mid-Range,Regularly Busy
4,2,30035,5,0,202279,3,5,9,8.0,Natural Sweetener,17,13,13.7,Mid-Range,Regularly Busy


Method 2 (using a for-loop)

In [28]:
# will use the subset df dataframe 

In [29]:
# Empty list that the for-loop in the next cell fills with one label per row of df
result = []

In [30]:
# Start from an empty list inside this cell so that re-running it does not append
# a second copy of every label (the list would then no longer match the length
# of df when it is assigned as a column).
result = []
for day_code in df.loc[:, 'orders_day_of_week']:
    if day_code == 0:
        result.append('Busiest')
    elif day_code == 4:
        result.append('Least Busy')
    else:
        result.append('Regularly Busy')

In [31]:
# Attach the labels collected in `result` to the subset as a new column
df['Busiest_Day'] = result

In [32]:
df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_name,aisle_id,department_id,prices,_merge,price_range,price_range_loc,Busiest_Day
0,2,33120,1,1,202279,3,5,9,8.0,Organic Egg Whites,86,16,11.3,both,Mid-Range,Mid-Range,Regularly Busy
1,2,28985,2,1,202279,3,5,9,8.0,Michigan Organic Kale,83,4,13.4,both,Mid-Range,Mid-Range,Regularly Busy
2,2,9327,3,0,202279,3,5,9,8.0,Garlic Powder,104,13,3.6,both,Low-Range,Low-Range,Regularly Busy
3,2,45918,4,1,202279,3,5,9,8.0,Coconut Butter,19,13,8.4,both,Mid-Range,Mid-Range,Regularly Busy
4,2,30035,5,0,202279,3,5,9,8.0,Natural Sweetener,17,13,13.7,both,Mid-Range,Mid-Range,Regularly Busy


# Task 4.7

1) If you haven’t done so already, complete the instructions in the Exercise for creating the “`price_label`” and “`busiest_day`” columns.
2) Suppose your clients have changed their minds about the labels you created in your “`busiest_day`” column. Now, they want “Busiest day” to become “Busiest days” (plural). This label should correspond with the two busiest days of the week as opposed to the single busiest day. At the same time, they’d also like to know the two slowest days. Create a new column for this using a suitable method.

In [33]:
ords_prods_merge['orders_day_of_week'].value_counts(dropna=False)

orders_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

In [34]:
# busiest 0 and 1
# slowest 3 and 4
# regular 2 5 6 

In [35]:
# Rename the existing 'busiest_day' column to 'busiest_days' (plural), in place;
# its values are overwritten with the new two-day labels a few cells below
ords_prods_merge.rename(columns={'busiest_day': 'busiest_days'}, inplace=True)

In [36]:
# function to fill the column 'busiest_days'
# (redefines, and from here on replaces, the single-day how_busy defined earlier)
def how_busy(day):
    """Group a day-of-week code into a busyness bucket.

    Returns 'busiest_days' for days 0 and 1, 'slowest_days' for days 3 and 4,
    'regular_days' for days 2, 5 and 6, and 'Info not available' for any
    other value.
    """
    if day in (0, 1):
        return 'busiest_days'
    if day in (3, 4):
        return 'slowest_days'
    if day in (2, 5, 6):
        return 'regular_days'
    return 'Info not available'

In [37]:
# Overwrite 'busiest_days' with the two-day labels returned by how_busy
ords_prods_merge['busiest_days'] = ords_prods_merge['orders_day_of_week'].apply(how_busy)

3) Check the values of this new column for accuracy. Note any observations in markdown format.

In [38]:
print(ords_prods_merge.loc[:,'busiest_days'].shape)
ords_prods_merge.loc[:,'busiest_days'].value_counts(dropna=False)

(32404859,)


busiest_days
regular_days    12916111
busiest_days    11864412
slowest_days     7624336
Name: count, dtype: int64

In [39]:
12916111+11864412+7624336

32404859

By doing a quick check using the shape and `value_counts()`, it appears that every row received a valid label: the three counts add up to the total number of rows (32,404,859) and no row fell into 'Info not available'.  
The regular days make up the largest share (about 40% of rows), followed by the busiest days (about 37%) and the slowest days (about 24%).

4) When too many users make Instacart orders at the same time, the app freezes. The senior technical officer at Instacart wants you to identify the busiest hours of the day.  
Rather than by hour, they want periods of time labeled “_Most orders_,” “_Average orders_,” and “_Fewest orders_.”  
Create a new column containing these labels called “busiest_period_of_day.”

In [40]:
ords_prods_merge.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_name,aisle_id,department_id,prices,price_label,busiest_days
0,2,33120,1,1,202279,3,5,9,8.0,Organic Egg Whites,86,16,11.3,Mid-Range,regular_days
1,2,28985,2,1,202279,3,5,9,8.0,Michigan Organic Kale,83,4,13.4,Mid-Range,regular_days
2,2,9327,3,0,202279,3,5,9,8.0,Garlic Powder,104,13,3.6,Low-Range,regular_days
3,2,45918,4,1,202279,3,5,9,8.0,Coconut Butter,19,13,8.4,Mid-Range,regular_days
4,2,30035,5,0,202279,3,5,9,8.0,Natural Sweetener,17,13,13.7,Mid-Range,regular_days


In [41]:
ords_prods_merge['order_hour_of_day'].value_counts(dropna=False)

order_hour_of_day
10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: count, dtype: int64

In [42]:
# hour groups, based on the number of orders per hour shown above:
# >= 2_500_000 orders : 'most_orders' [10,11,14,15,13,12,16]
# >= 100_000 and < 2_500_000 orders : 'average_orders' [9,17,8,18,19,20,7,21,22,23,6,0,1]
# < 100_000 orders : 'fewest_orders' [5,2,4,3]

In [43]:
# Hours of the day grouped by order volume; each group's label is written to
# 'busiest_period_of_day' for the rows whose order hour belongs to that group.
hour_groups = {
    'most_orders': [10, 11, 14, 15, 13, 12, 16],
    'average_orders': [9, 17, 8, 18, 19, 20, 7, 21, 22, 23, 6, 0, 1],
    'fewest_orders': [5, 2, 4, 3],
}
for period_label, group_hours in hour_groups.items():
    rows_in_group = ords_prods_merge['order_hour_of_day'].isin(group_hours)
    ords_prods_merge.loc[rows_in_group, 'busiest_period_of_day'] = period_label

In [44]:
ords_prods_merge.loc[:,'busiest_period_of_day'].value_counts(dropna=False)

busiest_period_of_day
most_orders       18663868
average_orders    13479132
fewest_orders       261859
Name: count, dtype: int64

In [45]:
ords_prods_merge.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order,product_name,aisle_id,department_id,prices,price_label,busiest_days,busiest_period_of_day
0,2,33120,1,1,202279,3,5,9,8.0,Organic Egg Whites,86,16,11.3,Mid-Range,regular_days,average_orders
1,2,28985,2,1,202279,3,5,9,8.0,Michigan Organic Kale,83,4,13.4,Mid-Range,regular_days,average_orders
2,2,9327,3,0,202279,3,5,9,8.0,Garlic Powder,104,13,3.6,Low-Range,regular_days,average_orders
3,2,45918,4,1,202279,3,5,9,8.0,Coconut Butter,19,13,8.4,Mid-Range,regular_days,average_orders
4,2,30035,5,0,202279,3,5,9,8.0,Natural Sweetener,17,13,13.7,Mid-Range,regular_days,average_orders


In [46]:
# Save the dataframe with its newly derived columns into the prepared-data folder
output_pickle_path = pth.join(path, prepared_data_folder, 'ords_prods_merge_4.7.pkl')
ords_prods_merge.to_pickle(output_pickle_path)