**This script contains the following points:**

    1. Importing libraries
    2. Importing data
    3. Consistency checks
        3.1 products dataframe
            3.1.1 Missing values
            3.1.2 Mixed-type data
            3.1.3 Duplicates
        3.2 orders_wrangled dataframe
            3.2.1 Missing values
            3.2.2 Mixed-type data
            3.2.3 Duplicates   
    4. Exporting data

# 1. Importing libraries

In [48]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 2. Importing data

In [49]:
# Store the project folder path as a raw string; it is joined with sub-folders via os.path.join
# NOTE(review): this is a hard-coded absolute path — adjust it when running on another machine
path = r'/Users/sarahtischer/Desktop/CareerFoundry/Data Immersion/Achievement 4/01-2024_Instacart_Basket_Analysis'

In [50]:
# Load "products.csv" from the Original_data folder into df_prods
df_prods = pd.read_csv(
    os.path.join(path, '02_Data', 'Original_data', 'products.csv'),
    index_col=False,
)

In [51]:
# Print dimensions of df_prods
df_prods.shape

(49693, 5)

In [52]:
# Load "orders_wrangled.csv" from the Prepared_data folder into df_ords
df_ords = pd.read_csv(
    os.path.join(path, '02_Data', 'Prepared_data', 'orders_wrangled.csv'),
    index_col=False,
)

In [53]:
# Print dimensions of df_ords
df_ords.shape

(3421083, 6)

# 3. Consistency checks

## 3.1 products dataframe

In [54]:
# Display floats with thousands separators and two decimals instead of scientific notation
pd.set_option('display.float_format', '{:,.2f}'.format)

# Summary statistics of the numeric columns (Question 2)
df_prods.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49693.0,49693.0,49693.0,49693.0
mean,24844.35,67.77,11.73,9.99
std,14343.72,38.32,5.85,453.52
min,1.0,1.0,1.0,1.0
25%,12423.0,35.0,7.0,4.1
50%,24845.0,69.0,13.0,7.1
75%,37265.0,100.0,17.0,11.2
max,49688.0,134.0,21.0,99999.0


#### *<mark>Answer:</mark> The descriptive statistics for the products dataframe show a maximum value for the 'prices' column of 99,999.00, which seems unusually high for grocery prices. It is possible that there is an outlier or an error in the data. Realistic prices are typically not that high.*

### 3.1.1 Missing values

In [55]:
# Count the missing values in each column of df_prods
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [56]:
# Create subset of the rows whose product_name is missing
# (isnull() already returns a boolean mask, so no comparison with True is needed)
df_nan = df_prods[df_prods['product_name'].isnull()]

df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [57]:
# Print dimensions of df_prods (row count while the rows with a missing product_name are still included)
df_prods.shape

(49693, 5)

In [58]:
# Create subset without missing observations: keep only the rows that have a product_name
df_prods_clean = df_prods[df_prods['product_name'].notnull()]

# Print dimensions of df_prods_clean
df_prods_clean.shape

(49677, 5)

### 3.1.2 Mixed-type data

In [59]:
# Check for mixed types (Question 3):
# print every column in which at least one value has a different Python type
# than the first value of that column
for col in df_prods_clean.columns.tolist():
    # Type of every value in the column
    col_types = df_prods_clean[col].map(type)
    # Compare against the mapped type of the first row (not type(.iloc[0]),
    # which would be a NumPy scalar type for numeric columns)
    mixed = col_types != col_types.iloc[0]
    # Test the mask directly instead of building a filtered copy of the dataframe
    if mixed.any():
        print(col)

### 3.1.3 Duplicates

In [60]:
# Collect the rows that are exact copies of an earlier row
df_dups = df_prods_clean.loc[df_prods_clean.duplicated()]

df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [61]:
# Print dimensions of df_prods_clean (row count while the full duplicates are still included)
df_prods_clean.shape

(49677, 5)

In [62]:
# Remove full-row duplicates, keeping the first occurrence of each row
df_prods_clean_no_dups = df_prods_clean.drop_duplicates(keep='first')

# Print dimensions of df_prods_clean_no_dups
df_prods_clean_no_dups.shape

(49672, 5)

## 3.2 orders_wrangled dataframe

In [63]:
# Display floats with thousands separators and two decimals instead of scientific notation
pd.set_option('display.float_format', '{:,.2f}'.format)

# Summary statistics of the numeric columns (Question 2)
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.21,17.15,2.78,13.45,11.11
std,987581.74,59533.72,17.73,2.05,4.23,9.21
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.5,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


#### *<mark>Answer:</mark> The descriptive statistics for the orders dataframe don't show any unreasonable values.*

### 3.2.1 Missing values

In [64]:
# Count the missing values in each column of df_ords (Question 5)
df_ords.isna().sum()

order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [65]:
# Create subset of the rows whose days_since_prior_order is missing
# (isnull() already returns a boolean mask, so no comparison with True is needed)
df_empty = df_ords[df_ords['days_since_prior_order'].isnull()]

df_empty

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
11,2168274,2,1,2,11,
26,1374495,3,1,1,14,
39,3343014,4,1,6,11,
45,2717275,5,1,3,12,
...,...,...,...,...,...,...
3420930,969311,206205,1,4,12,
3420934,3189322,206206,1,3,18,
3421002,2166133,206207,1,6,19,
3421019,2227043,206208,1,1,15,


#### *<mark>Answer:</mark> The NaN values in the days_since_prior_order column are not really missing values. They occur where order_number = 1, i.e. on a user's first order, so there is no prior order and no elapsed time to record. The number of NaN values (206,209) also equals the highest user_id, which is consistent with one first order per user. Hence, these NaN values should be kept.*

#### *<mark>Note:</mark> No action required for Question 6.*

### 3.2.2 Mixed-type data

In [66]:
# Check for mixed types (Question 3):
# print every column in which at least one value has a different Python type
# than the first value of that column
for col in df_ords.columns.tolist():
    # Type of every value in the column
    col_types = df_ords[col].map(type)
    # Compare against the mapped type of the first row (not type(.iloc[0]),
    # which would be a NumPy scalar type for numeric columns)
    nonconforming = col_types != col_types.iloc[0]
    # Test the mask directly instead of building a filtered copy of the dataframe
    if nonconforming.any():
        print(col)

#### *<mark>Answer:</mark> The orders dataframe doesn't contain any mixed-type columns.*

#### *<mark>Note:</mark> No action required for Question 4.*

### 3.2.3 Duplicates

In [67]:
# Collect the rows that are exact copies of an earlier row (Question 7)
df_dups_ords = df_ords.loc[df_ords.duplicated()]

df_dups_ords

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


#### *<mark>Answer:</mark> There are no duplicates in the orders dataframe.*

#### *<mark>Note:</mark> No action required for Question 8.*

# 4. Exporting data

In [68]:
# Print dimensions of df_prods_clean_no_dups
df_prods_clean_no_dups.shape

(49672, 5)

In [69]:
# Export df_prods_clean_no_dups as "products_clean.csv" (index = False leaves the row index out of the file)
df_prods_clean_no_dups.to_csv(os.path.join(path, '02_Data','Prepared_data', 'products_clean.csv'), index = False)

In [70]:
# Print dimensions of df_ords
df_ords.shape

(3421083, 6)

In [71]:
# Export df_ords as "orders_clean.csv" (index = False leaves the row index out of the file)
df_ords.to_csv(os.path.join(path, '02_Data','Prepared_data', 'orders_clean.csv'), index = False)