In [5]:
# File name: Exercise 4.4 - Data Wrangling
# Author: Sam Abrams
# Created: 10/19/24
# Description: This notebook contains basic wrangling steps for the orders and products database.

# Exercise 4.4 Practice

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import os

In [4]:
# Datasets

In [8]:
# Read the raw orders CSV into df_ord. index_col=False tells pandas not to
# use the first column of the file as the row index.
df_ord = pd.read_csv(
    r'/Users/samabrams/Data Analysis Projects/Instacart Basket Analysis/02 Data/Original Data/orders.csv',
    index_col=False,
)

In [10]:
# Read the raw products CSV into df_prod. index_col=False tells pandas not to
# use the first column of the file as the row index.
df_prod = pd.read_csv(
    r'/Users/samabrams/Data Analysis Projects/Instacart Basket Analysis/02 Data/Original Data/products.csv',
    index_col=False,
)

In [48]:
# Read the raw departments CSV into df_dep. index_col=False tells pandas not
# to use the first column of the file as the row index.
df_dep = pd.read_csv(
    r'/Users/samabrams/Data Analysis Projects/Instacart Basket Analysis/02 Data/Original Data/departments.csv',
    index_col=False,
)

## Dropping Columns

In [14]:
df_ord.columns

Index(['order_id', 'user_id', 'eval_set', 'order_number', 'order_dow',
       'order_hour_of_day', 'days_since_prior_order'],
      dtype='object')

In [16]:
df_ord.drop(columns = ['eval_set'])

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


In [18]:
# Remove the eval_set column from the orders frame and keep the result.
# errors='ignore' makes the cell safe to re-run: once the column is gone,
# running it again is a no-op instead of raising KeyError.
df_ord = df_ord.drop(columns=['eval_set'], errors='ignore')

In [42]:
## Missing Values

In [22]:
df_ord['days_since_prior_order'].value_counts(dropna = False)

days_since_prior_order
30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: count, dtype: int64

## Renaming Columns

In [30]:
# Give the day-of-week column a self-explanatory name. The renamed frame is
# assigned back to df_ord rather than mutated with inplace=True, so the step
# reads as an explicit transformation of the frame.
df_ord = df_ord.rename(columns={'order_dow': 'order_day_of_week'})

In [32]:
df_ord.columns

Index(['order_id', 'user_id', 'order_number', 'order_day_of_week',
       'order_hour_of_day', 'days_since_prior_order'],
      dtype='object')

In [34]:
df_ord.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [46]:
## Changing Data Type for a Column

In [36]:
# Store order_id as text: it is an identifier, not a quantity to compute with.
df_ord = df_ord.astype({'order_id': 'str'})

In [38]:
df_ord['order_id'].dtype

dtype('O')

In [50]:
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [52]:
df_dep.T

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [54]:
df_dep_t = df_dep.T

In [56]:
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [60]:
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


## Use first row as headers

In [62]:
new_header = df_dep_t.iloc[0]

In [64]:
new_header

0    department
Name: department_id, dtype: object

In [66]:
df_dep_t.head()

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce


In [68]:
# Keep only the rows after the first one; the first row holds the header text saved in new_header
df_dep_t_new = df_dep_t[1:]

In [70]:
df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [72]:
# Use the saved first row (new_header) as the column labels of the trimmed frame.
# NOTE: new_header is a Series named 'department_id', so that name is displayed
# as the label of the columns; the department IDs themselves stay in the row index.
df_dep_t_new.columns = new_header

In [74]:
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [76]:
# Build a lookup keyed by row label (the department ID as a string);
# each value is a {'department': <name>} dict.
data_dict = df_dep_t_new.to_dict(orient='index')

In [78]:
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [80]:
# Preview the first rows of the products frame (the variable is df_prod).
df_prod.head()

In [82]:
df_prod.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [84]:
data_dict.get('19')

{'department': 'snacks'}

In [88]:
# Products in department 19, which data_dict maps to 'snacks'.
df_snacks = df_prod.loc[df_prod['department_id'].eq(19)]

In [90]:
df_snacks

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5
...,...,...,...,...,...
49666,49662,Bacon Cheddar Pretzel Pieces,107,19,3.6
49669,49665,Super Dark Coconut Ash & Banana Chocolate Bar,45,19,6.9
49670,49666,Ginger Snaps Snacking Cookies,61,19,5.2
49675,49671,Milk Chocolate Drops,45,19,3.0


# Start of Work for Task 4.4

In [92]:
whos

Variable               Type         Data/Info
---------------------------------------------
data_dict              dict         n=21
dataframe_columns      function     <function dataframe_columns at 0x16bfe11c0>
dataframe_hash         function     <function dataframe_hash at 0x16bfe1080>
df_dep                 DataFrame      department_id       1  <...>\n\n[1 rows x 22 columns]
df_dep_t               DataFrame                             <...>                  missing
df_dep_t_new           DataFrame    department_id       depar<...>                  missing
df_ord                 DataFrame            order_id  user_id<...>3421083 rows x 6 columns]
df_prod                DataFrame           product_id        <...>n[49693 rows x 5 columns]
df_snacks              DataFrame           product_id        <...>\n[6264 rows x 5 columns]
dtypes_str             function     <function dtypes_str at 0x16bfe2520>
get_dataframes         function     <function get_dataframes at 0x16bfe0400>
getpass 

In [94]:
df_ord.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


## #2 Changing User ID in the orders dataframe to a string data type

In [96]:
# Store user_id as text: it is an identifier, not a quantity to compute with.
df_ord = df_ord.astype({'user_id': 'str'})

In [100]:
df_ord['user_id'].dtype

dtype('O')

## #3 Changing Variable Name

In [112]:
# Rename days_since_prior_order to days_since_last_order. The renamed frame is
# assigned back to df_ord rather than mutated with inplace=True, so the step
# reads as an explicit transformation of the frame.
df_ord = df_ord.rename(columns={'days_since_prior_order': 'days_since_last_order'})

In [114]:
df_ord.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_last_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


## #4 Busiest Hour

In [117]:
df_ord['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

In [119]:
# 10 AM is the hour during which the most orders are placed.

## #5 Meaning of "4" in Department ID

In [122]:
df_prod

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3
...,...,...,...,...,...
49688,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3
49689,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1
49690,49686,Artisan Baguette,112,3,7.8
49691,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7


In [126]:
print(data_dict.get('4'))

{'department': 'produce'}


In [128]:
# A value of 4 refers to produce items.

## #6 Breakfast Items Sales

In [131]:
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [133]:
# Products in department 14, which data_dict maps to 'breakfast'.
df_breakfast = df_prod.loc[df_prod['department_id'].eq(14)]

In [135]:
df_breakfast

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


In [137]:
# df_breakfast is now a subset of the products dataframe that contains only breakfast items (department_id 14).

## #7 Dinner Parties

In [140]:
# Dinner-party subset: products from departments 5 (alcohol), 7 (beverages),
# 12 (meat seafood) and 20 (deli), as listed in data_dict.
df_dinpar = df_prod[
    df_prod['department_id'].isin([5, 7, 12, 20])
]

In [144]:
df_dinpar.head(30)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
19,20,Pomegranate Cranberry & Aloe Vera Enrich Drink,98,7,6.0
22,23,Organic Turkey Burgers,49,12,8.2
34,35,Italian Herb Porcini Mushrooms Chicken Sausage,106,12,15.1
38,39,Daily Tangerine Citrus Flavored Beverage,64,7,12.5
39,40,Beef Hot Links Beef Smoked Sausage With Chile ...,106,12,22.5


In [142]:
df_dinpar.shape

(7650, 5)

In [146]:
# The df_dinpar dataframe has 7,650 rows.

## #9 User 1

In [173]:
# user_id was converted to a string earlier, so the value to match is the
# string '1', not the integer 1.
df_user1 = df_ord.loc[df_ord['user_id'].eq('1')]

df_user1

In [176]:
df_user1.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_last_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [182]:
df_user1.shape

(11, 6)

In [184]:
df_user1.head(11)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_last_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


In [186]:
## The list above shows all orders from the customer with user_id 1.

## #10 User 1 Behavior

In [189]:
df_user1.describe()

Unnamed: 0,order_number,order_day_of_week,order_hour_of_day,days_since_last_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


In [193]:

df_user1['order_hour_of_day'].value_counts()


order_hour_of_day
8     3
7     3
12    1
15    1
9     1
14    1
16    1
Name: count, dtype: int64

In [195]:
df_user1['order_day_of_week'].value_counts()

order_day_of_week
4    4
1    3
2    2
3    2
Name: count, dtype: int64

In [201]:
# The commands above retrieve basic information about user 1's ordering habits, like when they place orders (time of day and day of week) as well as some descriptive statistics, although these might be less useful.

## Exporting Dataframes

In [211]:
# Write the wrangled orders frame to the Prepared Data folder.
# index=False leaves the row index out of the file.
df_ord.to_csv(
    os.path.join(
        '/Users/samabrams/Data Analysis Projects/Instacart Basket Analysis/02 Data/Prepared Data',
        'orders_wrangled.csv',
    ),
    index=False,
)


In [208]:
# Exporting the new department dataframe.
# The department IDs are held in the row index of df_dep_t_new (its only
# column is 'department'), so the index must be written to the file; with
# index=False the CSV would contain department names with no IDs attached.
# index_label supplies the header for the ID column.
df_dep_t_new.to_csv(
    os.path.join(
        '/Users/samabrams/Data Analysis Projects/Instacart Basket Analysis/02 Data/Prepared Data',
        'departments_wrangled.csv',
    ),
    index=True,
    index_label='department_id',
)
