 # 4.5 Data Consistency Checks

## This script contains the following points:
#### 01. Importing libraries
#### 02. Importing data
#### 03. Creating a small test dataframe
#### 04. Finding Missing Values
#### 05. Finding Duplicates
#### 06. Descriptive statistics
#### 07. Checking for mixed-type data
#### 08. Check for missing values
#### 09. Addressing the missing values
#### 10. Checking for duplicates
#### 11. Addressing duplicates
#### 12. Exporting changes

## 01. Importing libraries

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os

## 02. Importing data

In [2]:
# Set path: root folder of the Instacart project; every import and export below is joined onto it

path = r'C:\Users\Asus\OneDrive\Documents\Data Analytics\Data Immersion\4. Python Fundamentals for Data Analysts\05-05-2023 Instacart Basket Analysis'

In [3]:
# Load the wrangled orders data set from the Prepared Data folder

df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'),
    index_col = False,
)

In [4]:
# Load the original products data set from the Original Data folder

df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col = False,
)

In [5]:
# Check descriptive statistics of the numeric columns in the orders data

df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


## 03. Creating a small test dataframe

In [6]:
# Create an empty dataframe to experiment with mixed-type columns

df_test = pd.DataFrame()

In [7]:
# Create a mixed-type column holding strings, an integer and a boolean

df_test['mix'] = ['a', 'b', 1, True]

In [8]:
# Display the test dataframe (it only has 4 rows, so head() shows all of them)

df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [9]:
# Check for mixed types: print every column in which at least one value
# has a different Python type than the value in the column's first row

for column_name in df_test.columns.tolist():
    first_value_type = df_test[[column_name]].iloc[0].apply(type)
    differs_from_first = (df_test[[column_name]].applymap(type) != first_value_type).any(axis = 1)
    if differs_from_first.any():
        print (column_name)

mix


In [10]:
# Change data type of "mix" from mixed (str, int, bool) to string

df_test['mix'] = df_test['mix'].astype('str')

# PRODUCTS DATA SET

## 04. Finding Missing Values

In [11]:
# Count the missing values per column in the products data set

df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [12]:
# Subset the 16 products whose "product_name" is missing

df_nan = df_prods[df_prods['product_name'].isnull()]

In [13]:
# Check df_nan: display the products with a missing "product_name"

df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [14]:
# Check the number of rows and columns in df_prods before removing the missing values

df_prods.shape

(49693, 5)

In [15]:
# Keep only the products that have a "product_name" (drops the rows with missing names)

df_prods_clean = df_prods[df_prods['product_name'].notnull()]

In [16]:
# Check the number of rows and columns in df_prods_clean (after removing missing values)

df_prods_clean.shape

(49677, 5)

## 05. Finding Duplicates 

In [17]:
# Collect the rows of df_prods_clean that are full duplicates of an earlier row

df_dups = df_prods_clean.loc[df_prods_clean.duplicated()]

In [18]:
# Display the duplicated rows in df_dups

df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [19]:
# Addressing duplicates: check the number of rows and columns before dropping them

df_prods_clean.shape

(49677, 5)

In [20]:
# Create a new dataframe that doesn't include the duplicated rows

df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [21]:
# Check the number of rows and columns in df_prods_clean_no_dups (after dropping duplicates)

df_prods_clean_no_dups.shape

(49672, 5)

# ORDERS DATA SET

## 06. Descriptive Statistics

In [22]:
# Descriptive statistics of the numeric columns in the orders data set

df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


#### The "orders_day_of_week" column can't have a maximum larger than 6: there are only 7 days in a week, encoded as 0 to 6, so the minimum should be 0 and the maximum 6, for a total of exactly 7 values. The 50th percentile should be 3, which it is. This column doesn't have any strange or incorrect values in it.

#### We don't expect to see a negative minimum value for "days_since_prior_order", as this would imply a negative number of days. The observed minimum is 0, so the column is consistent.

#### We don't expect to see a maximum larger than 23 for "order_hour_of_day", as the hours of a day run from 0 to 23. The observed minimum (0) and maximum (23) match that range.

## 07. Checking for mixed-type data

In [23]:
# Check the orders data for mixed types: print every column in which at least
# one value has a different Python type than the value in the column's first row.
# The check must run on df_ords; df_test was already cast to string above,
# so checking it here would always come back empty.

for col in df_ords.columns.tolist():
    weird = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
    if weird.any():
        print (col)


#### No mixed-type data.

## 08. Check for missing values

In [24]:
# Count the missing values per column in the orders data set

df_ords.isnull().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

##### The only column with missing values is the "days_since_prior_order" column, which is missing 206,209 values.

In [25]:
# Subset the orders whose "days_since_prior_order" is missing

df_nan = df_ords[df_ords['days_since_prior_order'].isnull()]

In [26]:
# Check df_nan: display the orders with a missing "days_since_prior_order"

df_nan

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,1,2,8,
11,11,2168274,2,1,2,11,
26,26,1374495,3,1,1,14,
39,39,3343014,4,1,6,11,
45,45,2717275,5,1,3,12,
...,...,...,...,...,...,...,...
3420930,3420930,969311,206205,1,4,12,
3420934,3420934,3189322,206206,1,3,18,
3421002,3421002,2166133,206207,1,6,19,
3421019,3421019,2227043,206208,1,1,15,


#### We can observe that every row with a missing "days_since_prior_order" is a customer's first order (order_number = 1). The "days_since_prior_order" column tracks how many days have passed between a customer's previous order and the current one. A customer placing their first order has no prior order, so there is nothing to measure from and the value is missing.

## 09. Addressing the missing values

#### We can observe that there are 206,209 user_ids and the same number of missing values for "days_since_prior_order". Because of that, I would not simply remove the missing values: a missing value in that column identifies a customer's first order, so it serves as a useful check. I would instead create a new column containing the string value "A new customer" to flag that information.

## 10. Checking for Duplicates

In [27]:
# Collect the rows of df_ords that are full duplicates of an earlier row

df_ords_dups = df_ords.loc[df_ords.duplicated()]

In [28]:
# Check df_ords_dups: display the duplicated orders (an empty result means no duplicates)

df_ords_dups

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


#### Although there are no duplicate rows within our dataframe, we can observe that the "Unnamed: 0" column is a duplicate of the index column.

## 11. Addressing duplicates

In [29]:
# Check the number of rows and columns before dropping the "Unnamed: 0" column

df_ords.shape

(3421083, 7)

In [30]:
# Remove the 'Unnamed: 0' column from the df_ords dataframe

df_ords = df_ords.drop('Unnamed: 0', axis = 1)

In [31]:
# Check df_ords: display the dataframe after dropping the "Unnamed: 0" column

df_ords

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


In [32]:
# Check the number of rows and columns in df_ords (one column fewer after the drop)

df_ords.shape

(3421083, 6)

#### I dropped "Unnamed: 0" column as it was a duplicate of index column.

## 12. Exporting Changes

In [33]:
# Write the cleaned, de-duplicated products data to the Prepared Data folder
# NOTE(review): to_csv writes the index by default, so re-importing this file
# yields an extra unnamed index column - consider index = False after confirming
# that downstream scripts do not rely on that column

df_prods_clean_no_dups.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv')
)

In [34]:
# Write the checked orders data to the Prepared Data folder
# NOTE(review): to_csv writes the index by default, which recreates the
# "Unnamed: 0" column on the next import - consider index = False after
# confirming that downstream scripts do not rely on that column

df_ords.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv')
)