# MERGING OF INSTACART DATASETS

## 1. Importing Datasets

In [2]:
# importing libraries
import pandas as pd
import numpy as np

### A. Importing and cleaning orders dataset

In [3]:
# importing order dataset and assigning to dataframe order_df
order_df = pd.read_csv('/Volumes/external_drive/Instacart/Data/Original Data/orders.csv')

In [5]:
order_df.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [6]:
# dropping eval_set column as it not needed for analysis
order_df = order_df.drop('eval_set', axis=1)

In [7]:
# checking for missing values
order_df.isnull().sum()

order_id                       0
user_id                        0
order_number                   0
order_dow                      0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [8]:
# checking the values under the days_since_prior_order column are caused by first time orders
order_df['user_id'].nunique()

206209

The NaN values under the days_since_prior_order column are a result of the order being the first order the customer has made and hence the number of days since the prior order cannot be calculated. The NaN value count is the same as teh count of user_id's, these values will thus not be removed and can be used to identify first time orders.

In [9]:
# checking for duplicates
order_df[order_df.duplicated()]

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order


In [10]:
# describing the data and checking for any anomalies
order_df.describe()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [11]:
# checking data type
order_df.dtypes

order_id                    int64
user_id                     int64
order_number                int64
order_dow                   int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

In [114]:
# checking shape 
order_df.shape

(3421083, 6)

### B. Importing and cleaning products dataset

In [12]:
# importing product dataset and assigning to dataframe product_df
product_df = pd.read_csv('/Volumes/external_drive/Instacart/Data/Original Data/products.csv')

In [13]:
product_df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [14]:
# checking for missing values
product_df.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [16]:
# identifying missing values for further analysis
product_df[product_df['product_name'].isnull() == True]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [17]:
# checking shape
product_df.shape

(49693, 5)

In [21]:
# removing missing values (not possible to impute values for product_name)
product_df.dropna(inplace=True)
product_df.shape

(49677, 5)

In [20]:
# checking for duplicates
product_df[product_df.duplicated()]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [22]:
# dropping duplicates
product_df.drop_duplicates(inplace=True)
product_df.shape

(49672, 5)

In [23]:
# describing the data and checking for any anomalies
product_df.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49672.0,49672.0,49672.0,49672.0
mean,24850.349775,67.762442,11.728942,9.993282
std,14340.705287,38.315784,5.850779,453.615536
min,1.0,1.0,1.0,1.0
25%,12432.75,35.0,7.0,4.1
50%,24850.5,69.0,13.0,7.1
75%,37268.25,100.0,17.0,11.1
max,49688.0,134.0,21.0,99999.0


In [25]:
# checking maximum values for prices as 99999 seems to be an error 
product_df['prices'].sort_values(ascending = False).drop_duplicates()

33666    99999.0
21554    14900.0
19392       25.0
9896        24.9
26078       24.8
          ...   
21525        1.4
29335        1.3
19224        1.2
19388        1.1
42345        1.0
Name: prices, Length: 242, dtype: float64

In [29]:
# identifying mispriced products
product_df.loc[(product_df['prices'] == 14900) | (product_df['prices'] == 99999)].head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
21554,21553,Lowfat 2% Milkfat Cottage Cheese,108,16,14900.0
33666,33664,2 % Reduced Fat Milk,84,16,99999.0


In [30]:
# replacing incorrect values with correct values
product_df['prices'] = product_df['prices'].replace({99999: 9.99, 14900: 14.9})

In [31]:
product_df['prices'].sort_values(ascending = False).drop_duplicates()

9020     25.0
14207    24.9
22193    24.8
22956    24.7
24884    24.6
         ... 
21525     1.4
29335     1.3
19224     1.2
19388     1.1
42345     1.0
Name: prices, Length: 241, dtype: float64

In [32]:
# checking data type
product_df.dtypes

product_id         int64
product_name      object
aisle_id           int64
department_id      int64
prices           float64
dtype: object

In [33]:
# checking shape
product_df.shape

(49672, 5)

### C. Importing and cleaning departments dataset

In [34]:
# importing department dataset and assigning to dataframe department_df
department_df = pd.read_csv('/Volumes/external_drive/Instacart/Data/Original Data/departments.csv')

In [35]:
department_df.head()

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol


In [36]:
# checking for missing values
department_df.isnull().sum()

department_id    0
department       0
dtype: int64

In [37]:
# checking for duplicates
department_df[department_df.duplicated()]

Unnamed: 0,department_id,department


In [38]:
# describing the data and checking for any anomalies
department_df.describe()

Unnamed: 0,department_id
count,21.0
mean,11.0
std,6.204837
min,1.0
25%,6.0
50%,11.0
75%,16.0
max,21.0


In [39]:
# checking data type
department_df.dtypes

department_id     int64
department       object
dtype: object

In [40]:
# checking shape
department_df.shape

(21, 2)

### D. Importing and cleaning orders products prior dataset

In [41]:
# importing orders products prior dataset and assigning to dataframe ords_prior_df 
ords_prior_df = pd.read_csv('/Volumes/external_drive/Instacart/Data/Original Data/order_products__prior.csv')

In [42]:
ords_prior_df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [43]:
# ddropping add_to_cart_order and reorderd columns as they are not required for analysis
ords_prior_df = ords_prior_df.drop(['add_to_cart_order', 'reordered'], axis = 1)

In [44]:
# checking for missing values
ords_prior_df.isnull().sum()

order_id      0
product_id    0
dtype: int64

In [45]:
# checking for duplicates
ords_prior_df[ords_prior_df.duplicated()]

Unnamed: 0,order_id,product_id


In [46]:
# describing the data and checking for any anomalies
ords_prior_df.describe()

Unnamed: 0,order_id,product_id
count,32434490.0,32434490.0
mean,1710749.0,25576.34
std,987300.7,14096.69
min,2.0,1.0
25%,855943.0,13530.0
50%,1711048.0,25256.0
75%,2565514.0,37935.0
max,3421083.0,49688.0


In [47]:
# checking data type
ords_prior_df.dtypes

order_id      int64
product_id    int64
dtype: object

In [48]:
# checking shape
ords_prior_df.shape

(32434489, 2)

### D. Importing and cleaning orders aisles dataset

In [49]:
# importing orders aisles dataset and assigning to dataframe aisle_df 
aisle_df = pd.read_csv('/Volumes/external_drive/Instacart/Data/Original Data/aisles.csv')

In [50]:
aisle_df.head()

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


In [51]:
# checking for missing values
aisle_df.isnull().sum()

aisle_id    0
aisle       0
dtype: int64

In [52]:
# checking for duplicates
aisle_df[aisle_df.duplicated()]

Unnamed: 0,aisle_id,aisle


In [53]:
# describing the data and checking for any anomalies
aisle_df.describe()

Unnamed: 0,aisle_id
count,134.0
mean,67.5
std,38.826537
min,1.0
25%,34.25
50%,67.5
75%,100.75
max,134.0


In [54]:
# checking data type
aisle_df.dtypes

aisle_id     int64
aisle       object
dtype: object

In [55]:
# checking shape
aisle_df.shape

(134, 2)

## 2. Merging Datasets

### A. Merging ords_prior_df and order_df

In [56]:
# merging ords_prior_df and order_df using order_id as the merge column
merged_orders_df = order_df.merge(ords_prior_df, on = 'order_id', indicator = False)

In [57]:
merged_df.head()

NameError: name 'merged_df' is not defined

In [None]:
# checking shape of merged dataset
ords_ordsprior_df.shape

### B. Adding product info to merged_df

In [None]:
# merging product_df and merged_orders_df using product_id as the merge column
merged_orders_prods_df = merged_orders_df.merge(product_df, on = 'product_id', indicator = False)

In [None]:
merged_orders_prods_df.head()

In [None]:
# checking shape of merged dataset
merged_orders_prods_df.shape

### C. Adding department to merged_orders_prods_df

In [None]:
# merging merged_orders_prods_df and department_df using department_id as the merge column
merged_orders_prods_deps_df = merged_orders_prods_df.merge(department_df, on = 'department_id', indicator = False)

In [None]:
merged_orders_prods_deps_df.head()

In [None]:
# checking shape of merged dataset
merged_orders_prods_deps_df.shape

### C. Adding aisle to merged_orders_prods_df

In [None]:
# merging merged_orders_prods_deps_df and aisle_df using department_id as the merge column
merged_df = merged_orders_prods_deps_df.merge(aisle_df, on = 'aisle_id', indicator = False)

In [None]:
merged_df.head()

In [None]:
# checking shape of merged dataset
merged_df.shape

## Exporting Data

In [None]:
# exporting merged_df to Prepared Data folder
merged_df.to_pickle('/Volumes/external_drive/Instacart/Data/Prepared Data/instacart_merged.pkl')