# Table of Contents
1. Import libraries
2. Import data: orders_products_combined.pkl, products_checked2.csv
3. Fix duplicate product_ids in products_checked2
4. Merging datasets: orders_products_combined and products_checked2
5. Export merged data: ords_prods_merge_extrarows.pkl

# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import os

# Importing Data

In [2]:
# Project root folder; the data files used in this notebook live under
# <path>/Data/Prepared Data.
# The location can be overridden with the INSTACART_PROJECT_PATH environment variable
# so the notebook can run on another machine; when the variable is not set, the
# original local folder is used, so existing behaviour is unchanged.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'/users/stephenhelvig/documents/python projects/instacart basket analysis',
)

In [3]:
# Load the combined orders/products frame from the prepared-data folder.
# Note: unpickling can execute arbitrary code, so only read pickle files you created yourself.
df_ords_prods = pd.read_pickle(
    os.path.join(path, 'Data', 'Prepared Data', 'orders_products_combined.pkl')
)

In [13]:
# Load the checked product table (products_checked2.csv) from the prepared-data folder
df_prods = pd.read_csv(
    os.path.join(path, 'Data', 'Prepared Data', 'products_checked2.csv')
)

# Fixing duplicates in df_prods (products_checked2.csv)
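A first attempt at the merge failed, so before merging I need to investigate `df_prods` and clean it further. The merge uses `validate='m:1'`, which requires every `product_id` in `df_prods` to be unique; the checks below show that two `product_id` values are duplicated and that two IDs are missing from the sequence.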

In [15]:
# Count the rows whose product_id has already appeared earlier in the table
df_prods.duplicated(subset='product_id').sum()

np.int64(2)

In [16]:
# Select every row whose product_id occurs more than once
# (keep=False flags all occurrences, not just the repeats)
is_repeated_id = df_prods['product_id'].duplicated(keep=False)
dupes = df_prods.loc[is_repeated_id]
print(dupes.head())

       product_id                                       product_name  \
6798         6800                          Revive Zero Vitamin Water   
6799         6800                 Sprouted Quinoa Flakes Baby Cereal   
26518       26520  Clinical Advanced Solid Ultimate Fresh Anti-Pe...   
26519       26520       Cheese Shredded Sharp Cheddar Reduced Fat 2%   

       aisle_id  department_id  prices  
6798         64              7     6.4  
6799         92             18    14.0  
26518        80             11    10.6  
26519        21             16     2.9  


In [18]:
# Work out which IDs are absent from the product_id range min..max
product_ids = df_prods['product_id']

# Smallest and largest product_id present
min_id, max_id = product_ids.min(), product_ids.max()

# Every ID expected if the sequence had no gaps (both bounds included)
full_range = set(range(min_id, max_id + 1))

# IDs that actually occur in the table
actual_ids = set(product_ids)

# Expected-but-absent IDs, in ascending order
missing_ids = sorted(full_range.difference(actual_ids))

print('First 20 missing IDs:', missing_ids[:20])
print('Total missing:', len(missing_ids))

First 20 missing IDs: [6799, 26519]
Total missing: 2


In [19]:
# Check whether any order line refers to one of the IDs missing from df_prods
references_missing_id = df_ords_prods['product_id'].isin(missing_ids)
used_missing = df_ords_prods.loc[references_missing_id]

print(used_missing['product_id'].unique())

[ 6799 26519]


In [20]:
# Show the rows that currently share product_id 6800 or 26520
df_prods.loc[lambda frame: frame['product_id'].isin([6800, 26520])]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
6798,6800,Revive Zero Vitamin Water,64,7,6.4
6799,6800,Sprouted Quinoa Flakes Baby Cereal,92,18,14.0
26518,26520,Clinical Advanced Solid Ultimate Fresh Anti-Pe...,80,11,10.6
26519,26520,Cheese Shredded Sharp Cheddar Reduced Fat 2%,21,16,2.9


In [21]:
# Repair the two product_ids that appear twice by moving one row of each pair onto
# the unused ID directly below it (6800 -> 6799, 26520 -> 26519).
#
# Which row to move: the duplicate pairs sit at index 6798/6799 and 26518/26519.
# With a 0-based index and an otherwise gap-free ID sequence, the row at index k
# lines up with product_id k + 1, so the FIRST row of each pair (index 6798 / 26518)
# is the one carrying the wrong ID. keep='last' flags that first occurrence;
# keep='first' would flag the second row instead and attach the wrong product to
# the reassigned ID.
# NOTE(review): this relies on the rows being stored in product_id order — confirm
# against the raw products file that 6799 / 26519 belong to these products.
duplicate_id_fixes = {6800: 6799, 26520: 26519}

# True for every occurrence of a repeated product_id except the last one
first_of_pair = df_prods.duplicated(subset='product_id', keep='last')

for duplicated_id, missing_id in duplicate_id_fixes.items():
    # Once an ID is unique again nothing is flagged, so re-running this cell is a no-op
    rows_to_fix = (df_prods['product_id'] == duplicated_id) & first_of_pair
    df_prods.loc[rows_to_fix, 'product_id'] = missing_id

In [22]:
# Repeat both checks after the reassignment
remaining_duplicates = df_prods.duplicated(subset='product_id').sum()
print(remaining_duplicates)  # should be 0

id_column = df_prods['product_id']
expected_ids = set(range(id_column.min(), id_column.max() + 1))
missing_ids = sorted(expected_ids.difference(id_column))

print(missing_ids)  # should NOT contain 6799 or 26519 anymore

0
[]


# Merging Datasets: orders_products_combined and products_checked2

In [24]:
# Drop the existing _merge column (presumably left over from the merge that produced
# orders_products_combined.pkl). The merge in this notebook uses indicator=True, and
# pandas cannot add a new _merge indicator while a column of that name already exists.
# errors='ignore' keeps this cell safe to re-run: without it a second run raises
# KeyError because the column has already been removed.
df_ords_prods = df_ords_prods.drop(columns=['_merge'], errors='ignore')

In [25]:
# Left-join the product attributes onto every order line.
#   validate='m:1'  -> pandas raises if product_id is not unique in df_prods
#   indicator=True  -> adds a _merge column recording where each row was found
df_orders_products_large = pd.merge(
    df_ords_prods,
    df_prods,
    how='left',
    on='product_id',
    indicator=True,
    validate='m:1',
)

In [26]:
# Tally the merge outcomes; 'left_only' rows would be order lines with no matching product
merge_flags = df_orders_products_large['_merge']
merge_flags.value_counts(dropna=False)

_merge
both          32434489
left_only            0
right_only           0
Name: count, dtype: int64

In [27]:
# Row counts on either side of the merge
len_before = df_ords_prods.shape[0]
len_after = df_orders_products_large.shape[0]

# should be equal for a left merge without duplication on the right
(len_before, len_after)

(32434489, 32434489)

In [34]:
# Preview the first five rows of the merged frame
df_orders_products_large.head(n=5)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0,both
1,2539329,1,1,2,8,,True,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,both
2,2539329,1,1,2,8,,True,12427,3,0,Original Beef Jerky,23,19,4.4,both
3,2539329,1,1,2,8,,True,26088,4,0,Aged White Cheddar Popcorn,23,19,4.7,both
4,2539329,1,1,2,8,,True,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,both


In [35]:
# Number of merged rows that have no product_name.
# In the saved run every row is flagged 'both' in _merge, so a non-zero count here
# comes from df_prods rows whose product_name is itself missing, not from unmatched IDs.
df_orders_products_large['product_name'].isna().sum()

np.int64(28171)

In [36]:
# Remove the _merge indicator now that the merge has been checked, so it is not exported.
# errors='ignore' keeps this cell safe to re-run: without it a second run raises
# KeyError because the column has already been removed.
df_orders_products_large = df_orders_products_large.drop(columns=['_merge'], errors='ignore')

# Export Data

In [37]:
# Save the merged frame to the prepared-data folder as a pickle
df_orders_products_large.to_pickle(
    os.path.join(path, 'Data', 'Prepared Data', 'ords_prods_merge_extrarows.pkl')
)