In [1]:
import kagglehub

# Download latest version of the dataset identified by the handle below;
# the call returns the local directory that holds the files.
DATASET_HANDLE = "psparks/instacart-market-basket-analysis"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\DELL\.cache\kagglehub\datasets\psparks\instacart-market-basket-analysis\versions\1


In [2]:
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

In [3]:
def _read_table(stem):
    """Read ``<path>/<stem>.csv`` from the downloaded dataset directory."""
    return pd.read_csv(f"{path}/{stem}.csv")


# One DataFrame per CSV file shipped with the dataset.
aisles = _read_table("aisles")
dept = _read_table("departments")
products = _read_table("products")
orders = _read_table("orders")
order_prior = _read_table("order_products__prior")
order_train = _read_table("order_products__train")

# Preview first 5 rows of one file
aisles.head()

Unnamed: 0,aisle_id,aisle
0,1,prepared soups salads
1,2,specialty cheeses
2,3,energy granola bars
3,4,instant foods
4,5,marinades meat preparation


In [4]:
# Only the order -> user mapping is needed from the orders table.
order_user_lookup = orders[['order_id', 'user_id']]

# Left-join product details onto every prior-order line, then left-join the
# user who placed each order.
order_products = (
    order_prior
    .merge(products, on="product_id", how="left")
    .merge(order_user_lookup, on="order_id", how="left")
)

order_products.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,user_id
0,2,33120,1,1,Organic Egg Whites,86,16,202279
1,2,28985,2,1,Michigan Organic Kale,83,4,202279
2,2,9327,3,0,Garlic Powder,104,13,202279
3,2,45918,4,1,Coconut Butter,19,13,202279
4,2,30035,5,0,Natural Sweetener,17,13,202279


In [5]:
# Collect the product names of each order into one list per order_id,
# then flatten the grouped result into a plain list of lists.
names_per_order = order_products.groupby("order_id")["product_name"].apply(list)
transactions = names_per_order.tolist()

print(transactions[:5])


[['Organic Egg Whites', 'Michigan Organic Kale', 'Garlic Powder', 'Coconut Butter', 'Natural Sweetener', 'Carrots', 'Original Unflavored Gelatine Mix', 'All Natural No Stir Creamy Almond Butter', 'Classic Blend Cole Slaw'], ['Total 2% with Strawberry Lowfat Greek Strained Yogurt', 'Unsweetened Almondmilk', 'Lemons', 'Organic Baby Spinach', 'Unsweetened Chocolate Almond Breeze Almond Milk', 'Organic Ginger Root', 'Air Chilled Organic Boneless Skinless Chicken Breasts', 'Organic Ezekiel 49 Bread Cinnamon Raisin'], ['Plain Pre-Sliced Bagels', 'Honey/Lemon Cough Drops', 'Chewy 25% Low Sugar Chocolate Chip Granola', 'Oats & Chocolate Chewy Bars', "Kellogg's Nutri-Grain Apple Cinnamon Cereal", 'Nutri-Grain Soft Baked Strawberry Cereal Breakfast Bars', "Kellogg's Nutri-Grain Blueberry Cereal", 'Tiny Twists Pretzels', 'Traditional Snack Mix', 'Goldfish Cheddar Baked Snack Crackers', 'Original Orange Juice', 'Sugarfree Energy Drink', 'Energy Drink'], ['Bag of Organic Bananas', 'Just Crisp, Parm

In [7]:
# Keep only the first MAX_TRANSACTIONS baskets for the steps that follow.
MAX_TRANSACTIONS = 10000
transactions = transactions[:MAX_TRANSACTIONS]

In [8]:
from mlxtend.preprocessing import TransactionEncoder

# Fit the encoder on the baskets (fit returns the encoder itself), then
# transform the same baskets into an array with one column per product.
te = TransactionEncoder().fit(transactions)
te_ary = te.transform(transactions)

# Label the columns with the product names the encoder learned.
df = pd.DataFrame(data=te_ary, columns=te.columns_)

df.head()

Unnamed: 0,#2 Coffee Filters,& Go! Hazelnut Spread + Pretzel Sticks,0 Calorie Strawberry Dragonfruit Water Beverage,0% Fat Black Cherry Greek Yogurt y,0% Fat Blueberry Greek Yogurt,0% Fat Free Organic Milk,0% Fat Organic Greek Vanilla Yogurt,0% Fat Superfruits Greek Yogurt,0% Greek Strained Yogurt,0% Greek Yogurt Black Cherry on the Bottom,...,of Hanover 100 Calorie Pretzels Mini,of Norwich Original English Mustard Powder Double Superfine,smartwater® Electrolyte Enhanced Water,vitaminwater® XXX Acai Blueberry Pomegranate,w/Banana Pulp Free Juice,with Crispy Almonds Cereal,with Olive Oil Mayonnaise,with Olive Oil Mayonnaise Dressing,with Xylitol Cinnamon 18 Sticks Sugar Free Gum,with Xylitol Minty Sweet Twist 18 Sticks Sugar Free Gum
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [None]:
from mlxtend.frequent_patterns import fpgrowth, association_rules
# Mine itemsets with support >= 0.01 from the one-hot basket frame.
frequent_itemsets = fpgrowth(df, min_support=0.01, use_colnames=True)

# Derive rules with lift >= 1 and order them by descending confidence.
rules = (
    association_rules(frequent_itemsets, metric="lift", min_threshold=1)
    .sort_values(by="confidence", ascending=False)
)

In [28]:
# Show at most the first 100 rules (rules is already sorted by confidence).
top_rules = rules.head(100)
print(top_rules)

Empty DataFrame
Columns: [antecedents, consequents, antecedent support, consequent support, support, confidence, lift, representativity, leverage, conviction, zhangs_metric, jaccard, certainty, kulczynski]
Index: []


In [29]:
def recommend_products(cart_items, rules, top_n=5):
    """
    Recommend products for a cart using association rules.

    A rule contributes when at least one cart item appears in its
    antecedents; every consequent of that rule that is not already in the
    cart becomes a candidate. Candidates are ranked by confidence, then
    lift (both descending), and each product is reported once, with its
    best-ranked (confidence, lift) pair.

    Parameters
    ----------
    cart_items : list of str
        Products currently in the user's cart.
    rules : pd.DataFrame
        Association rules dataframe from mlxtend. The columns read are
        'antecedents', 'consequents', 'confidence' and 'lift'.
    top_n : int
        Maximum number of recommendations to return. Values <= 0 yield an
        empty list.

    Returns
    -------
    list of tuple
        Up to ``top_n`` tuples ``(product, confidence, lift)``, best first.
        Empty when the cart is empty, ``rules`` has no rows, or no rule
        matches.
    """
    # Drop repeated cart entries but keep their order: repeats would only
    # produce duplicate candidates that are discarded again below.
    cart = list(dict.fromkeys(cart_items))

    # Nothing to do for a non-positive limit, an empty cart or no rules.
    # (Without the top_n guard one item would slip through, because the
    # length check below only runs after the first append.)
    if top_n <= 0 or not cart or rules.empty:
        return []

    in_cart = set(cart)  # constant-time "already in cart" checks

    # Pull the four needed columns out once instead of re-filtering the
    # frame and calling iterrows() for every cart item.
    rule_rows = list(zip(
        rules['antecedents'],
        rules['consequents'],
        rules['confidence'],
        rules['lift'],
    ))

    candidates = []
    for item in cart:
        for antecedents, consequents, confidence, lift in rule_rows:
            if item not in antecedents:
                continue
            for consequent in consequents:
                if consequent not in in_cart:  # avoid suggesting same item
                    candidates.append((consequent, confidence, lift))

    # Sort by confidence, then lift; the sort is stable, so equally scored
    # candidates keep their collection order.
    candidates.sort(key=lambda rec: (rec[1], rec[2]), reverse=True)

    # Keep only the first (best-scored) entry per product, up to top_n.
    seen, final_recs = set(), []
    for rec in candidates:
        if rec[0] in seen:
            continue
        seen.add(rec[0])
        final_recs.append(rec)
        if len(final_recs) >= top_n:
            break

    return final_recs


In [30]:
# Example cart to query the rules with
cart = ["Bag of Organic Bananas", "Whole Milk"]

# Ask for at most five suggestions for this cart
recs = recommend_products(cart_items=cart, rules=rules, top_n=5)

print("Cart:", cart)
print("\nRecommended Products:")
for rec in recs:
    # Each recommendation is a (product, confidence, lift) tuple.
    item, conf, lift = rec
    print(f"- {item} (confidence={conf:.2f}, lift={lift:.2f})")


Cart: ['Bag of Organic Bananas', 'Whole Milk']

Recommended Products:
