# Table of Contents

01. Importing Libraries
02. Importing Data (products and orders_wrangled)
03. Creating Test Data
04. Missing Values
05. Duplicates
06. Exercise Task (missing and duplicate values on df_ords)
07. Exporting Data

## 01. Importing libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

## 02. Importing Data

In [2]:
# Define the project root folder; the cells below read from and write to
# its 'Data' subfolder (machine-specific path, adjust when run elsewhere)
path = r'C:\Users\legra\Desktop\06-2022 Instacart Basket Analysis'

In [3]:
# Load the original products data set from <path>/Data/Original Data
df_prods = pd.read_csv(
    filepath_or_buffer=os.path.join(path, 'Data', 'Original Data', 'products.csv'),
    index_col=False,
)

In [4]:
# Load the wrangled orders data set from <path>/Data/Prepared Data
df_ords = pd.read_csv(
    filepath_or_buffer=os.path.join(path, 'Data', 'Prepared Data', 'orders_wrangled.csv'),
    index_col=False,
)

### 03. Creating test data

In [5]:
# Create an empty dataframe to hold the test data

df_test = pd.DataFrame()

In [6]:
# Create a mixed-type column (strings, an integer and a boolean)

df_test['mix'] = ['a', 'b', 1, True]

In [7]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [8]:
# Check for mixed types: print the name of every column whose values are
# not all of the same Python type (e.g. str mixed with int or bool)

for col in df_test.columns:
    # Series.map(type) gives the Python type of each value; more than one
    # distinct type means the column is mixed. This avoids
    # DataFrame.applymap, which is deprecated in recent pandas releases,
    # and also works on an empty dataframe (nothing is printed).
    value_types = df_test[col].map(type)
    if value_types.nunique() > 1:
        print(col)

mix


### 04. Checking for missing values

In [9]:
# Check for missing values
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [10]:
# Create a subset containing only the rows with a missing product name
# (isnull() already returns a boolean mask, so no '== True' is needed)
df_nan = df_prods[df_prods['product_name'].isnull()]

In [11]:
#checking output
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [12]:
df_prods.shape

(49693, 5)

In [13]:
# Create a new products dataframe that keeps only the rows where
# product_name is present (notnull() is the direct boolean mask for this)

df_prods_clean = df_prods[df_prods['product_name'].notnull()]

In [14]:
df_prods_clean.shape

(49677, 5)

### 05. Duplicates

In [15]:
# Collect the rows of the cleaned products data that repeat an earlier row
# (the first occurrence of each duplicated row is not included)

df_dups = df_prods_clean.loc[df_prods_clean.duplicated(keep='first')]

In [16]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [17]:
# Build the products dataframe without duplicates, keeping the first
# occurrence of every duplicated row

df_prods_clean_no_dups = df_prods_clean.drop_duplicates(keep='first')

In [18]:
df_prods_clean_no_dups.shape

(49672, 5)

### 06. Exercise Task

In [19]:
#descriptive stats for prods df
df_prods.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49693.0,49693.0,49693.0,49693.0
mean,24844.345139,67.770249,11.728433,9.994136
std,14343.717401,38.316774,5.850282,453.519686
min,1.0,1.0,1.0,1.0
25%,12423.0,35.0,7.0,4.1
50%,24845.0,69.0,13.0,7.1
75%,37265.0,100.0,17.0,11.2
max,49688.0,134.0,21.0,99999.0


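The maximum value for prices (99999.0) is far above the 75th percentile (11.2) and inflates the standard deviation (453.5), so it should be investigated further as a possible data error.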

In [20]:
# Check for mixed types in orders: print the name of every column whose
# values are not all of the same Python type

for col in df_ords.columns:
    # Series.map(type) gives the Python type of each value; more than one
    # distinct type means the column is mixed. This avoids
    # DataFrame.applymap, which is deprecated in recent pandas releases,
    # and also works on an empty dataframe (nothing is printed).
    value_types = df_ords[col].map(type)
    if value_types.nunique() > 1:
        print(col)

No mixed data types were found in the orders dataframe, so there is nothing to fix.

In [21]:
# Check for missing values in orders

df_ords.isnull().sum()

Unnamed: 0                        0
order_id                          0
user_id                           0
eval_set                          0
order_number                      0
orders_day_of_week                0
order_hour_of_day                 0
days_since_previous_order    206209
dtype: int64

In [22]:
# Create a subset of the orders with a missing days_since_previous_order
# (isnull() already returns a boolean mask, so no '== True' is needed)
df_ords_nan = df_ords[df_ords['days_since_previous_order'].isnull()]

In [23]:
df_ords_nan

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_previous_order
0,0,2539329,1,prior,1,2,8,
11,11,2168274,2,prior,1,2,11,
26,26,1374495,3,prior,1,1,14,
39,39,3343014,4,prior,1,6,11,
45,45,2717275,5,prior,1,3,12,
...,...,...,...,...,...,...,...,...
3420930,3420930,969311,206205,prior,1,4,12,
3420934,3420934,3189322,206206,prior,1,3,18,
3421002,3421002,2166133,206207,prior,1,6,19,
3421019,3421019,2227043,206208,prior,1,1,15,


There are 206209 rows with missing values in the days_since_previous_order column. The rows displayed above all have order_number 1, so these are most likely each customer's first order, for which there is no previous order to measure from.

### Addressing missing values

As the missing values are likely first-time customers with no previous orders, I will address them by creating a new boolean column stating whether the order comes from a new customer or not.

In [24]:
# Work on an independent copy: a plain assignment would only create a second
# name for the same object, so adding columns to df_ords_clean would also
# modify df_ords
df_ords_clean = df_ords.copy()

In [25]:
# Flag orders without a previous order: True where days_since_previous_order
# is missing, False otherwise (isnull() is already boolean)
df_ords_clean['new_customer'] = df_ords_clean['days_since_previous_order'].isnull()

In [26]:
df_ords_clean

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_previous_order,new_customer
0,0,2539329,1,prior,1,2,8,,True
1,1,2398795,1,prior,2,3,7,15.0,False
2,2,473747,1,prior,3,3,12,21.0,False
3,3,2254736,1,prior,4,4,7,29.0,False
4,4,431534,1,prior,5,4,15,28.0,False
...,...,...,...,...,...,...,...,...,...
3421078,3421078,2266710,206209,prior,10,5,18,29.0,False
3421079,3421079,1854736,206209,prior,11,4,10,30.0,False
3421080,3421080,626363,206209,prior,12,1,12,18.0,False
3421081,3421081,2977660,206209,prior,13,1,12,7.0,False


In [27]:
# Collect the rows of the cleaned orders data that repeat an earlier row
# (the first occurrence of each duplicated row is not included)

df_ords_dups = df_ords_clean.loc[df_ords_clean.duplicated(keep='first')]

In [28]:
#checking output
df_ords_dups

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_previous_order,new_customer


There are no duplicate rows in the cleaned orders dataframe.

In [29]:
df_ords_clean.shape

(3421083, 9)

### 07. Exporting Data

In [30]:
# Export the cleaned, de-duplicated products data to the Prepared Data folder
# NOTE(review): to_csv writes the dataframe index as an extra unnamed first
# column by default; consider index=False if downstream notebooks do not
# rely on that column - confirm before changing

df_prods_clean_no_dups.to_csv(
    path_or_buf=os.path.join(path, 'Data', 'Prepared Data', 'products_checked.csv')
)

In [31]:
# Export the checked orders data to the Prepared Data folder
# NOTE(review): to_csv writes the dataframe index as an extra unnamed first
# column by default; consider index=False if downstream notebooks do not
# rely on that column - confirm before changing

df_ords_clean.to_csv(
    path_or_buf=os.path.join(path, 'Data', 'Prepared Data', 'orders_checked.csv')
)