In [None]:
# File name: Exercise 4.5 - Consistency Checks
# Author: Sam Abrams
# Created: 10/23/24
# Description: This notebook contains consistency checks for the orders and products dataframes.

# Consistency Checks Notes

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os

### Dataframe Uploads

In [3]:
df_prod = pd.read_csv(r'/Users/samabrams/Data Analysis Projects/Instacart Basket Analysis/02 Data/Original Data/products.csv')

In [4]:
df_ord = pd.read_csv(r'/Users/samabrams/Data Analysis Projects/Instacart Basket Analysis/02 Data/Prepared Data/orders_wrangled.csv')

In [14]:
df_ord.describe()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_last_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [16]:
# Create a test dataframe
df_test = pd.DataFrame()

In [20]:
# Create mixed type column
df_test['mix'] = ['a', 'b', 1, True]

In [22]:
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [26]:
# Finding Mixed Type Columns
for col in df_test.columns.tolist():
  weird = (df_test[[col]].map(type) != df_test[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_test[weird]) > 0:
    print (col)

mix


In [28]:
df_test['mix'] = df_test['mix'].astype('str')

In [30]:
df_test['mix'].dtype

dtype('O')

## Finding Missing Values

In [33]:
df_prod.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [35]:
# Creating subset of missing values
df_nan = df_prod[df_prod['product_name'].isnull() == True]

In [37]:
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [39]:
df_prod.shape

(49693, 5)

In [41]:
# Create new dataframe excluding missing values
df_prod_clean = df_prod[df_prod['product_name'].isnull() == False]

In [43]:
df_prod_clean.shape

(49677, 5)

## Handling Duplicates

In [50]:
# Finding Duplicates
df_dups = df_prod_clean[df_prod_clean.duplicated()]

In [48]:
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [52]:
df_prod_clean.shape

(49677, 5)

In [64]:
# Dropping Duplicates

In [54]:
df_prod_clean_no_dups = df_prod_clean.drop_duplicates()

In [56]:
df_prod_CND = df_prod_clean_no_dups

In [58]:
df_prod_CND.shape()

TypeError: 'tuple' object is not callable

In [60]:
df_prod_clean_no_dups.shape()

TypeError: 'tuple' object is not callable

In [62]:
df_prod_CND.shape

(49672, 5)

In [98]:
df_prod_CND.to_csv(os.path.join('/Users/samabrams/Data Analysis Projects/Instacart Basket Analysis/02 Data/Prepared Data', 'products_checked'), index=False)

# Beginning of Task 4.5

In [8]:
## Checking descriptive stats on orders dataframe
df_ord.describe()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_last_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [10]:
df_ord.describe().round(2)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_last_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.21,17.15,2.78,13.45,11.11
std,987581.74,59533.72,17.73,2.05,4.23,9.21
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.5,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


The minimum and maximum values for weekday and hour seemed funky at first, but then I realized they were setting 0 as Sunday or Monday (not sure how to figure that out though). The same is true for hours - 0 represents midnight, while 23 represents 11 PM. There's also a lot of missing values in the "days_since_last_order" column, around 200,000. When you look at the table using the head() function, though, it looks like that's due to the way the data is grouped. Grouping by user_id means that the first order by that user won't have a value for the days_since_last_order column.

I'm also curious why there are 3 million order_ids but only 100 order_numbers. However, again using the head() function below, the order number is specfiic to the user, and is only used to represent that particular customers order number, not the TOTAL order number, which is represented by order_id.

That being said, I don't think anything should necessarily be investigated further, but I would suggest renaming the order_number column to be less obscure.

In [13]:
df_ord.head(50)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_last_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


In [15]:
# Checking for mixed type data
for col in df_ord.columns.tolist():
  weird = (df_ord[[col]].map(type) != df_ord[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_ord[weird]) > 0:
    print (col)

Nothing was returned from the code above, therefore there isn't any mixed type data in the dataframe.

In [17]:
# Check for Missing Values
df_ord.isnull().sum()

order_id                      0
user_id                       0
order_number                  0
order_day_of_week             0
order_hour_of_day             0
days_since_last_order    206209
dtype: int64

As mentioned earlier, there are over 200,000 missing values in the days_since_last_order column, but considering how the data is grouped, those missing values make sense.

Nothing should be done about this discrepancy. The NaN values aren't being used in calculations, so there is no need to alter anything here.

In [19]:
# Checking for Duplicates
df_ord_no_dups = df_ord[df_ord.duplicated()]

In [20]:
df_ord_no_dups

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_last_order


Assuming I wrote the code correctly above, there are no duplicates in the Orders dataframe, so there is nothing to address.

In [100]:
## Saving to folder
df_ord.to_csv(os.path.join('/Users/samabrams/Data Analysis Projects/Instacart Basket Analysis/02 Data/Prepared Data', 'orders_checked'), index=False)

In [26]:
df_ord.shape

(3421083, 6)