# Table of Contents
## 1. Data Import and Checks
## 2. Data Wrangling
## 3. Data Wrangling Questions
## 4. Export Data

# 1. Data Import and Checks

In [2]:
# import libraries
import pandas as pd
import numpy as np
import os

In [3]:
# create path
path = r'C:\Users\18602\Documents\Data Analytics\Data Immersion\Month 4\Instacart Basket Analysis'

In [4]:
# importing dataset products 
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'), index_col = False)

In [5]:
# importing dataset departments 
df_dep = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'departments.csv'), index_col = False)

In [6]:
# importing dataset orders 
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'orders.csv'), index_col = False)

# 2. Data Wrangling

In [7]:
# dropping eval_set from df_ords
df_ords = df_ords.drop(columns = ['eval_set'])

In [8]:
# Renaming order dow to order day of week
df_ords.rename(columns = {'order_dow' : 'order_day_of_week'}, inplace = True)

In [9]:
# change order_id to a string
df_ords['order_id'] = df_ords['order_id'].astype('str')

In [10]:
# Transposing df_def 
df_dep_t = df_dep.T

In [11]:
# Resetting Index
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [12]:
# Take the first row of df_dep_t and make it the header
new_header = df_dep_t.iloc[0]

In [13]:
# Create a new df starting at row 1
df_dep_t_new = df_dep_t[1:]

In [14]:
# Combine header with df_dep_t_new
df_dep_t_new.columns = new_header

In [15]:
# create new dataframe for dep
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [16]:
# create a data dictionary for product deparments
data_dict = df_dep_t_new.to_dict('index')

In [17]:
#create dataframe for department 19 'snacks'
df_snacks =  df_prods[df_prods['department_id']==19]

In [18]:
#export df_ords
df_ords.to_csv(os.path.join(path, '02 Data','Prepared Data', 'orders_wrangled.csv'))

# 3. Data Wrangling Questions

In [19]:
# Change user_id from a int to a str
df_ords['user_id'] = df_ords['user_id'].astype('str')

In [20]:
df_ords['user_id'].dtype

dtype('O')

In [21]:
# Rename days_since_prior_order to days_since_last_order w/o overwriting
df_ords.rename(columns = {'days_since_prior_order' : 'days_since_last_order'})

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_last_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


In [22]:
# order counts by time of day
df_ords['order_hour_of_day'].value_counts(dropna = False)

10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: order_hour_of_day, dtype: int64

Our busiest hour of the day for orders is 10am with a total of 288,418 orders

In [23]:
# Search section 4 
print(data_dict.get('4'))

{'department': 'produce'}


Department 4 is Produce

In [24]:
# created df_breakfast for breakfast foods
df_breakfast =  df_prods[df_prods['department_id']==14]

In [25]:
df_breakfast.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6


In [26]:
# dinner party data frame
df_dinner_party = df_prods.loc[df_prods['department_id'].isin([5,7,12,20])]

In [27]:
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [28]:
df_dinner_party.info

<bound method DataFrame.info of        product_id                                    product_name  aisle_id  \
2               3            Robust Golden Unsweetened Oolong Tea        94   
6               7                  Pure Coconut Water With Orange        98   
9              10  Sparkling Orange Juice & Prickly Pear Beverage       115   
10             11                               Peach Mango Juice        31   
16             17                               Rendered Duck Fat        35   
...           ...                                             ...       ...   
49676       49672                          Cafe Mocha K-Cup Packs        26   
49679       49675             Cinnamon Dolce Keurig Brewed K Cups        26   
49680       49676                          Ultra Red Energy Drink        64   
49686       49682                              California Limeade        98   
49688       49684       Vodka, Triple Distilled, Twist of Vanilla       124   

       department_i

The df_dinner_party has 1750 rows

In [29]:
# create user 1 data frame
df_user_id_one = df_ords.loc[df_ords['user_id'] == "1"]

In [34]:
df_user_id_one.describe()

Unnamed: 0,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


In [None]:
user_id stats:
-Total of 11 orders
-Averages 19 days between orders
-Orders happen from the 1st to 4th day of the week
-Order happen from 7am to 3pm

# 4. Export Data

In [136]:
# export departments wrangled to csv
df_dep_t_new.to_csv(os.path.join(path, '02 Data','Prepared Data', 'departments_wrangled.csv'))