## Contents
1. Importing libraries
2. Reading data, setting up
3. Creating flags and flag columns

### 1. Importing libraries

In [18]:
# importing libraries
import pandas as pd
import numpy as np
import os

### 2. Reading data

In [19]:
# Read data from pickle
path = r'C:\Users\Richárd\Desktop\CareerFoundry\Data Immersion\Archievment 4\Instacart_Basket_Analysis_2023_04_03\02_Data\Prepared_Data'
merged_data = pd.read_pickle(os.path.join(path, 'ords_prods_merged_final.pkl'))
# creating subset
df = merged_data[:1000000]
df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_the_week,order_hour_of_day,days_after_previous_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both


### 3. Creating flags and flag columns

### Creating function to flag price-ranges

In [20]:
# Function to assign price_range labels
def price_label(row):
    if row['prices'] <= 5:
        return 'Low-range product'
    elif row['prices'] > 5 and row['prices'] <= 15:
        return 'Mid-range product'
    elif row['prices'] > 15:
        return 'High-range product'
    else:
        return 'Not enough data'

### Creating flag column

In [21]:
# Applying price_label() to df
df['price_range'] = df.apply(price_label, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label, axis = 1)


In [22]:
# Frequency of price range labels
df['price_range'].value_counts()

Mid-range product    756450
Low-range product    243550
Name: price_range, dtype: int64

In [23]:
# checking out the maximum
df['prices'].max()

14.8

### Using loc to assign price ranges

In [24]:
df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low-range product'


In [25]:
df.loc[(df['prices'] > 5) & (df['prices'] <= 15), 'price_range_loc'] = 'Mid-range product'

In [26]:
df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'

In [27]:
# Frequency of price ranges
df['price_range_loc'].value_counts()

Mid-range product    756450
Low-range product    243550
Name: price_range_loc, dtype: int64

### Assigning price-ranges to the whole merged dataset
#### Assigning flags using .loc

In [28]:
merged_data.loc[merged_data['prices'] > 15, 'price_range'] = 'High-range product'

In [29]:
merged_data.loc[(merged_data['prices'] > 5) & (merged_data['prices'] <= 15), 'price_range'] = 'Mid-range product'

In [30]:
merged_data.loc[merged_data['prices'] <= 5, 'price_range'] = 'Low-range product'

In [31]:
# Frequency of price ranges
merged_data['price_range'].value_counts()

Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: price_range, dtype: int64

In [32]:
merged_data['orders_day_of_the_week'].value_counts()

0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: orders_day_of_the_week, dtype: int64

In [33]:
# looping to flag days based on busyness, adding them to a list
result = []

for value in merged_data['orders_day_of_the_week']:
    if value == 0:
        result.append('Busiest day')
    elif value == 4:
        result.append('Least busy day')
    else:
        result.append('Regularly busy')


In [34]:
# Adding 'busiest_day' column
merged_data['busiest_day'] = result
# Frequency of 'busiest_day' column
merged_data['busiest_day'].value_counts()

Regularly busy    22416875
Busiest day        6204182
Least busy day     3783802
Name: busiest_day, dtype: int64

## Tasks

In [36]:
# Task 2. Flagging busiest days
days = []

for day in merged_data['orders_day_of_the_week']:
    if day == 0 or day == 1:
        days.append('Busiest days')
    elif day == 4 or day == 3:
        days.append('Least busy days')
    else:
        days.append('Regularly busy')

In [37]:
# Creating 'busiest_days' column
merged_data['busiest_days'] = days
merged_data['busiest_days'].value_counts()

Regularly busy     12916111
Busiest days       11864412
Least busy days     7624336
Name: busiest_days, dtype: int64

### Task 3.
The two busiest days make up a disproportionately large number of orders it seems, there are significantly more orderder on those days.

In [51]:
# Task 4. identifying busiest hours
merged_data['order_hour_of_day'].value_counts()


10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: order_hour_of_day, dtype: int64

In [53]:
# Setting up hour groups, flagging hours by number of orders
most_orders = [10, 11, 14, 15, 13, 12, 16, 9]
average_orders = [17, 8, 18, 19, 20, 7, 21, 22]

hours = []

for hour in merged_data['order_hour_of_day']:
    if hour in most_orders:
        hours.append('Most orders')
    elif hour in average_orders:
        hours.append('Average orders')
    else:
        hours.append('Fewest orders')

In [54]:
# Creating 'busiest_period_of_day' column
merged_data['busiest_period_of_day'] = hours
# Task 5. Displaying Frequency
merged_data['busiest_period_of_day'].value_counts()

Most orders       21118071
Average orders     9997651
Fewest orders      1289137
Name: busiest_period_of_day, dtype: int64

In [55]:
# Task 6 exporting df to pickle
merged_data.to_pickle(os.path.join(path, 'merged_flagged_data.pkl'))