# Table of contents:
## Wrangling procedures (theory)
## 4-4 Task
   #### Changing data type
   #### Changing column name
   #### Frequency of hours of the day
   #### Department_id = 4?
   #### Filter for breakfast items
   #### Dinner parties products
   #### User 1
   #### Exporting data

# Wrangling procedures (theory)

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [3]:
# Import datasets
# The project root can be overridden through the INSTACART_PROJECT_PATH
# environment variable so the notebook also runs on other machines. When the
# variable is not set, the original local folder is used, so nothing changes
# on the author's machine.
path = os.environ.get('INSTACART_PROJECT_PATH', r'C:\Users\raque\Documents\08-2023 Instacart Basket Analysis')
# All three raw files are read from the same '02 Data/Original Data' folder.
original_data_dir = os.path.join(path, '02 Data', 'Original Data')
df_ords = pd.read_csv(os.path.join(original_data_dir, 'orders.csv'), index_col = False)
df_prods = pd.read_csv(os.path.join(original_data_dir, 'products.csv'), index_col = False)
df_dep = pd.read_csv(os.path.join(original_data_dir, 'departments.csv'), index_col = False)

In [4]:
# Remove the eval_set column from the orders data
df_ords = df_ords.drop('eval_set', axis=1)

In [5]:
# Looking for missing values: dropna=False keeps NaN as its own row in the
# frequency table, so the number of missing entries shows up in the output.
df_ords['days_since_prior_order'].value_counts(dropna = False)

days_since_prior_order
30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: count, dtype: int64

In [6]:
# Give the day-of-week column a more descriptive name
df_ords = df_ords.rename(columns={'order_dow': 'orders_day_of_the_week'})

In [7]:
# order_id is an identifier, not a quantity, so it is stored as a string
# (string columns are left out of the describe() summary)
df_ords['order_id'] = df_ords['order_id'].astype(str)

In [8]:
# Confirm the conversion: the string column now reports the object dtype
df_ords['order_id'].dtype

dtype('O')

In [9]:
# Preview the raw departments data: it was read in wide form
# (a single row, with one column per department id)
df_dep.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [10]:
# Transpose so that each department becomes a row instead of a column
df_dep_t = df_dep.transpose()

In [11]:
# Inspect the transposed frame: the former header label 'department_id'
# is now the first index entry, and the single column is named 0
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [12]:
# Preview what resetting the index looks like. The result is only displayed,
# not assigned, so df_dep_t itself keeps its original index.
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [13]:
# Take the first row of df_dep_t (the 'department_id' row, which holds the
# label 'department') to use as the header
new_header = df_dep_t.iloc[0]

In [14]:
# Keep every row after the header row (position 1 to the end) in a new dataframe
df_dep_t_new = df_dep_t.iloc[1:]

In [15]:
# Use the saved header row as the column labels of the new dataframe
df_dep_t_new.columns = new_header

In [16]:
# Build a lookup dictionary from df_dep_t_new, with one entry per row keyed by the row's index label
data_dict = df_dep_t_new.to_dict(orient='index')

In [17]:
# Display the data dictionary; note that the department ids are string keys
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [18]:
# Preview the products data; department_id is the column to match against the data dictionary
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [19]:
# Look up department 19 in the data dictionary (its keys are strings, hence '19')
print(data_dict.get('19'))

{'department': 'snacks'}


In [20]:
# Subset of the products that belong to department 19 (snacks)
df_snacks = df_prods[df_prods['department_id'].eq(19)]

In [21]:
# Check the subset: every row shown should have department_id 19
df_snacks.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


In [22]:
# Same snacks filter, written with .loc and a boolean mask
df_snacks_2 = df_prods.loc[df_prods['department_id'] == 19]

In [23]:
# Same snacks filter, written with isin(), which takes a list of department ids
df_snacks_3 = df_prods.loc[df_prods['department_id'].isin([19])]

# 4-4 Task

### Changing data type

In [24]:
# Preview the orders to find another identifier column that should be changed to string
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_the_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [27]:
# user_id is an identifier as well, so it is stored as a string
df_ords['user_id'] = df_ords['user_id'].astype(str)

### Changing column name

In [31]:
# Changing column names
# 'days_since_prior_order' is renamed here as well. The outputs further down
# (the user 1 subset and its describe() table) show that column as
# 'days_since_last_order', but no remaining cell performed that rename, so a
# fresh "Restart & Run All" would reproduce neither those outputs nor the
# exported file.
# rename() returns a new frame and skips labels that are not present, so
# re-running this cell is harmless.
df_ords = df_ords.rename(columns = {
    'orders_day_of_the_week' : 'order_day_of_the_week',
    'days_since_prior_order' : 'days_since_last_order',
})

### Frequency of hours of the day

In [34]:
# Frequency of orders per hour of the day: the busiest hours run from 10:00 to 16:00
# (late morning through mid-afternoon), each with more than 270,000 orders
df_ords['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

### Department_id = 4?

In [35]:
# Look up which department has id 4 (the dictionary keys are strings, hence '4')
print(data_dict.get('4'))

{'department': 'produce'}


### Filter for breakfast items

In [40]:
# Department 14 is 'breakfast' in the data dictionary
df_breakfast = df_prods.loc[df_prods['department_id'].eq(14)]

In [41]:
# Display the breakfast subset (pandas truncates the middle rows of a long frame)
df_breakfast

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


### Dinner parties products

In [42]:
# Subset for dinner parties: products from the departments
# 5 (alcohol), 7 (beverages), 12 (meat seafood) and 20 (deli)
df_dinner_parties = df_prods.loc[df_prods['department_id'].isin([5, 7, 12, 20])]

In [43]:
# Display the dinner parties subset (author's note: it has 7650 rows)
df_dinner_parties

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
...,...,...,...,...,...
49676,49672,Cafe Mocha K-Cup Packs,26,7,6.5
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5
49686,49682,California Limeade,98,7,4.3


### User 1

In [44]:
# All orders placed by user 1; user_id was converted to string earlier,
# so the comparison uses the string '1'
df_user_id_1 = df_ords[df_ords['user_id'] == '1']

In [45]:
# Display what is stored in the user 1 subset
df_user_id_1

Unnamed: 0,order_id,user_id,order_number,order_day_of_the_week,order_hour_of_day,days_since_last_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


In [46]:
# Basic stats for user id 1; the string columns (order_id, user_id) are left out of the summary
df_user_id_1.describe()

Unnamed: 0,order_number,order_day_of_the_week,order_hour_of_day,days_since_last_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


### Exporting data

In [47]:
# Write the wrangled orders to the 'Prepared Data' folder
orders_wrangled_file = os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv')
df_ords.to_csv(orders_wrangled_file)

In [48]:
# Write the transposed departments table to the 'Prepared Data' folder
# (its index holds the department ids, so the index is written to the file)
departments_wrangled_file = os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv')
df_dep_t_new.to_csv(departments_wrangled_file)