# 01 — Build Analytic Dataset (Instacart)

Creates a reusable analytic dataset from raw Instacart tables.

**What’s inside:** imports, data quality checks, merges, and export steps.

**Output:** a single merged dataset saved locally as a pickle file (`ords_prods_merge.pkl`); the intermediate checked tables are saved as CSV.

## 4.3 IC Data Import

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [None]:
# Root folder of the Instacart project; data files are located relative to it.
path = (
    r'/Users/spencer/Documents/Career Foundry/Data Immersion/'
    r'4 Python Fundamentals for Data Analysts/Instacart Basket Analysis'
)

In [None]:
# Columns to keep when reading orders.csv (passed to read_csv via usecols).
vars_list = [
    'order_id',
    'user_id',
    'order_number',
    'order_dow',
    'order_hour_of_day',
    'days_since_prior_order',
]

In [None]:
# Read only the columns named in vars_list from the original orders file.
df = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'orders.csv'),
    usecols=vars_list,
)

In [None]:
df.shape

In [None]:
df.head()

In [None]:
# Read the original products file; index_col=False keeps every column as data.
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)

In [None]:
df_prods.head(20)

In [None]:
df_prods.tail(35)

In [None]:
df_prods.columns

In [None]:
df_prods.shape

In [None]:
df_prods.describe()

In [None]:
df_prods.info()

## 4.5 Data Consistency Checks

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [None]:
# Load the wrangled orders, original products and wrangled departments tables.
path = (
    r'/Users/spencer/Documents/Career Foundry/Data Immersion/'
    r'4 Python Fundamentals for Data Analysts/Instacart Basket Analysis'
)
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'),
    index_col=False,
)
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)
df_dep = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv'),
    index_col=False,
)

In [None]:
# Create an empty dataframe to use as a small test case

df_test = pd.DataFrame()

In [None]:
# Create a mixed-type column: two strings, an int and a bool

df_test['mix'] = ['a', 'b', 1, True]

In [None]:
df_test.head()

In [None]:
# Print the name of every column whose cells do not all share one Python type.
# Series.map(type) is used per column, so this does not depend on DataFrame.map
# (pandas >= 2.1) and does not index row 0, which would fail on an empty frame.
for col in df_test.columns:
    # More than one distinct cell type means the column is mixed-type.
    if df_test[col].map(type).nunique() > 1:
        print(col)

In [None]:
# Cast every value in 'mix' to a string so the column holds a single type
df_test['mix'] = df_test['mix'].astype('str')

In [None]:
df_prods.isnull().sum()

In [None]:
# Rows of df_prods with a missing product_name.
# isnull() already returns a boolean mask, so no comparison with True is needed.
df_nan = df_prods[df_prods['product_name'].isnull()]

In [None]:
df_nan

In [None]:
df_prods.shape

In [None]:
# Make new dataframe, removing null 'product_name' values.
# notnull() is the direct inverse mask of isnull(), replacing `isnull() == False`.
df_prods_clean = df_prods[df_prods['product_name'].notnull()]

In [None]:
df_prods_clean.shape

In [None]:
# Rows of df_prods_clean flagged by duplicated() (all columns compared)
df_dups = df_prods_clean[df_prods_clean.duplicated()]

In [None]:
df_dups

In [None]:
df_prods_clean.shape

In [None]:
# New dataframe with the fully duplicated rows removed
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [None]:
df_prods_clean_no_dups.shape

In [None]:
df_ords.describe()

In [None]:
# Sanity check: look at the first rows to make sure it wasn't a trick question
df_ords.head(10)

In [None]:
# Print the name of every df_ords column whose cells do not all share one Python type.
# Series.map(type) is used per column, so this does not depend on DataFrame.map
# (pandas >= 2.1) and does not index row 0, which would fail on an empty frame.
for col in df_ords.columns:
    # More than one distinct cell type means the column is mixed-type.
    if df_ords[col].map(type).nunique() > 1:
        print(col)

In [None]:
df_ords.isnull().sum()

In [None]:
# Add a boolean 'first_order' column: True where 'days_since_prior_order' is missing
missing_prior = df_ords['days_since_prior_order'].isna()
df_ords['first_order'] = missing_prior
del missing_prior  # keep the notebook namespace unchanged

# Display the first rows to confirm the new column
df_ords.head()

In [None]:
# Rows of df_ords flagged by duplicated() (all columns compared)
df_ords_dups = df_ords[df_ords.duplicated()]

In [None]:
df_ords_dups

In [None]:
# Write the cleaned, de-duplicated products table to Prepared Data (no index column).
df_prods_clean_no_dups.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'),
    index=False,
)

In [None]:
# Write the checked orders table to Prepared Data (no index column).
df_ords.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv'),
    index=False,
)

## 4.6 Combining & Exporting Data

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [None]:
# Load the checked orders/products tables, the departments table and the
# original orders_products_prior file.
path = (
    r'/Users/spencer/Documents/Career Foundry/Data Immersion/'
    r'4 Python Fundamentals for Data Analysts/Instacart Basket Analysis'
)
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv'),
    index_col=False,
)
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'),
    index_col=False,
)
df_dep = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv'),
    index_col=False,
)
df_ords_prior = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'orders_products_prior.csv'),
    index_col=False,
)

In [None]:
df_ords_prior.head()

In [None]:
df_ords.head()

In [None]:
df_ords_prior.shape

In [None]:
df_ords.shape

In [None]:
# Combine df_ords with df_ords_prior on the shared 'order_id' key.
# how='inner' is the pandas default, spelled out here; with an inner join the
# '_merge' indicator column can only ever report 'both'.
df_merged_large = df_ords.merge(
    df_ords_prior,
    how='inner',
    on='order_id',
    indicator=True,
)

In [None]:
# check output
df_merged_large.head()

In [None]:
df_merged_large.shape

In [None]:
df_merged_large['_merge'].value_counts()

In [None]:
# Remove the '_merge' indicator column now that the join has been inspected
df_merged_large = df_merged_large.drop('_merge', axis='columns')

In [None]:
df_merged_large.head()

In [None]:
# Save the combined orders/products dataframe as a pickle in Prepared Data.
df_merged_large.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl')
)

## 4.6 part 2 Combining & Exporting Data

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [None]:
# Load the prepared tables plus the combined orders/products pickle.
# NOTE(review): df_ords, df_dep and df_ords_prior do not appear to be used again
# in the visible cells of this section — confirm they are needed before loading them.
path = (
    r'/Users/spencer/Documents/Career Foundry/Data Immersion/'
    r'4 Python Fundamentals for Data Analysts/Instacart Basket Analysis'
)
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv'),
    index_col=False,
)
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'),
    index_col=False,
)
df_dep = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv'),
    index_col=False,
)
df_ords_prior = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'orders_products_prior.csv'),
    index_col=False,
)
df_ords_prods_comb = pd.read_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl')
)

In [None]:
# verify shape
df_ords_prods_comb.shape

In [None]:
df_ords_prods_comb.head()

In [None]:
df_prods.head()

In [None]:
df_prods.shape

In [None]:
# Show any rows of df_prods flagged by duplicated() (all columns compared)
print(df_prods[df_prods.duplicated()])

In [None]:
# Rows whose product_id is flagged as a duplicate, regardless of the other columns
df_dups = df_prods.loc[df_prods.duplicated(subset=['product_id'])]
print(df_dups)

In [None]:
# Keep one row per product_id; keep='first' is the pandas default, spelled out here.
df_prods = df_prods.drop_duplicates(subset=['product_id'], keep='first')

In [None]:
df_prods.shape

In [None]:
# Attach product attributes to every order line with a left join on 'product_id'.
# validate='many_to_one' makes pandas raise a MergeError if product_id is not
# unique in df_prods, instead of silently multiplying order rows.
# indicator=True adds '_merge' so unmatched product_ids show up as 'left_only'.
df_merged = df_ords_prods_comb.merge(
    df_prods,
    on='product_id',
    how='left',
    indicator=True,
    validate='many_to_one',
)

In [None]:
df_merged['_merge'].value_counts()

In [None]:
# expect 32434489 rows
df_merged.shape

In [None]:
# Remove the '_merge' indicator column now that the join has been inspected
df_merged = df_merged.drop('_merge', axis='columns')

In [None]:
df_merged.head()

In [None]:
# Save the final merged dataframe as a pickle in Prepared Data.
df_merged.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge.pkl')
)