**This script contains the following points:**

1. Importing libraries
2. Importing data
3. Data wrangling
4. Answering questions
    - Question 2
    - Question 3
    - Question 4
    - Question 5
    - Question 6
    - Question 7
    - Question 8
    - Question 9
    - Question 10
5. Exporting data

# 01. Importing libraries

In [34]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 02. Importing data

In [35]:
# Turn project folder path into a string
# (raw string; the import and export cells below build their file paths from this root via os.path.join)
path = r'/Users/sarahtischer/Desktop/CareerFoundry/Data Immersion/Achievement 4/01-2024_Instacart_Basket_Analysis'

In [36]:
# Load the raw orders table ("orders.csv") from the original-data folder into df_ords
df_ords = pd.read_csv(
    os.path.join(path, '02_Data', 'Original_data', 'orders.csv'),
    index_col=False,
)

In [37]:
# Check the dimensions (rows, columns) of df_ords after import
df_ords.shape

(3421083, 7)

In [38]:
# Load the raw products table ("products.csv") from the original-data folder into df_prods
df_prods = pd.read_csv(
    os.path.join(path, '02_Data', 'Original_data', 'products.csv'),
    index_col=False,
)

In [39]:
# Check the dimensions (rows, columns) of df_prods after import
df_prods.shape

(49693, 5)

In [40]:
# Load the raw departments table ("departments.csv") from the original-data folder into df_dep
df_dep = pd.read_csv(
    os.path.join(path, '02_Data', 'Original_data', 'departments.csv'),
    index_col=False,
)

In [41]:
# Check the dimensions (rows, columns) of df_dep after import
df_dep.shape

(1, 22)

# 03. Data wrangling

In [42]:
# Remove the 'eval_set' column from df_ords
df_ords = df_ords.drop('eval_set', axis=1)

In [43]:
# Give 'order_dow' the more descriptive name 'orders_day_of_week' (df_ords is modified in place)
df_ords.rename(
    columns={'order_dow': 'orders_day_of_week'},
    inplace=True,
)

In [44]:
# Store 'order_id' as strings instead of integers
df_ords['order_id'] = df_ords['order_id'].astype(str)

In [45]:
# Swap rows and columns of df_dep and keep the result as df_dep_t
df_dep_t = df_dep.transpose()

In [46]:
# Preview df_dep_t with its index moved into a regular column
# (the result is not assigned, so df_dep_t itself stays unchanged)
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [47]:
# Use the first row of df_dep_t as the column header

# The first row (position 0) holds the values that become the new column names
new_header = df_dep_t.iloc[0]

# Keep every row after the first one as the new dataframe
df_dep_t_new = df_dep_t.iloc[1:]

# Apply the saved first-row values as the column names
df_dep_t_new.columns = new_header

In [48]:
# Display df_dep_t_new to check the new header
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [49]:
# Build a lookup dictionary from df_dep_t_new: one entry per index label, mapping to that row's column values
data_dict = df_dep_t_new.to_dict(orient='index')

In [50]:
# Display the data dictionary
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [51]:
# Subset df_prods to the products in department 19 ('snacks' according to data_dict)
df_snacks = df_prods.loc[df_prods['department_id'] == 19]

# 04. Answering questions

In [52]:
# Check column names, data types and memory usage of df_ords
df_ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                object 
 1   user_id                 int64  
 2   order_number            int64  
 3   orders_day_of_week      int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 156.6+ MB


### Question 2

In [53]:
# Store 'user_id' as strings instead of integers
df_ords['user_id'] = df_ords['user_id'].astype(str)

### Question 3

In [54]:
# Preview df_ords with two columns renamed:
#   'orders_day_of_week' -> 'order_day_of_week' (consistent with 'order_hour_of_day')
#   'order_number'       -> 'order_sequence' (more intuitive)
# With inplace = False and no assignment, df_ords itself keeps its current column names
df_ords.rename(
    columns={
        'orders_day_of_week': 'order_day_of_week',
        'order_number': 'order_sequence',
    },
    inplace=False,
)

Unnamed: 0,order_id,user_id,order_sequence,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


### Question 4

In [55]:
# Count how many orders fall into each hour of the day (missing values are excluded by default)
df_ords['order_hour_of_day'].value_counts()

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

#### *<mark>Answer:</mark> The busiest hour for placing orders is 10 a.m.*

### Question 5

In [56]:
# Determine the meaning behind a value of 4 in the 'department_id' column
# (data_dict is keyed by the department ID as a string, hence '4' rather than 4)
print(data_dict.get('4'))

{'department': 'produce'}


#### *<mark>Answer:</mark> Products with a department ID of 4 are in the department 'produce'.*

### Question 6

In [57]:
# Subset df_prods to the products in department 14 ('breakfast' according to data_dict)
df_breakfast = df_prods[df_prods['department_id'] == 14]

### Question 7

In [58]:
# Subset df_prods to dinner party products:
# alcohol (5), beverages (7), meat seafood (12) and deli (20)
df_dinner_party = df_prods[df_prods['department_id'].isin([5, 7, 12, 20])]

df_dinner_party

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
...,...,...,...,...,...
49676,49672,Cafe Mocha K-Cup Packs,26,7,6.5
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5
49686,49682,California Limeade,98,7,4.3


### Question 8

In [59]:
# Check the dimensions (rows, columns) of df_dinner_party to get its row count
df_dinner_party.shape

(7650, 5)

#### *<mark>Answer:</mark> df_dinner_party has 7650 rows.*

### Question 9

In [60]:
# Extract all rows for user_id '1' into df_ords_user_1 ('user_id' was converted to string above)
df_ords_user_1 = df_ords[df_ords['user_id'] == '1']

### Question 10

In [61]:
# Show descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns of df_ords_user_1
df_ords_user_1.describe()

Unnamed: 0,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


#### *<mark>Answers</mark>*
* ***The user has placed 11 orders in total.***
* ***The user ordered earliest at 7 a.m. and latest at 4 p.m., and usually orders around 10 a.m.***
* ***The user placed orders between day 1 (Sunday) and 4 (Wednesday) of the week.***
* ***On average, the user orders every 19 days.***

# 05. Exporting data

In [62]:
# Check the dimensions (rows, columns) of df_ords before export
df_ords.shape

(3421083, 6)

In [63]:
# Write df_ords to "orders_wrangled.csv" in the prepared-data folder, without the dataframe index
df_ords.to_csv(
    os.path.join(path, '02_Data', 'Prepared_data', 'orders_wrangled.csv'),
    index=False,
)

In [64]:
# Check the dimensions (rows, columns) of df_dep_t_new before export
df_dep_t_new.shape

(21, 1)

In [65]:
# Write df_dep_t_new to "departments_wrangled.csv" in the prepared-data folder
# (index is not disabled here, so the dataframe index is written as the first column)
df_dep_t_new.to_csv(
    os.path.join(path, '02_Data', 'Prepared_data', 'departments_wrangled.csv'),
)