In [1]:
!pip install annoy

Collecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/647.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.5/647.5 kB[0m [31m8.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.3-cp311-cp311-linux_x86_64.whl size=551657 sha256=c70381cb3bb9d1964e913ee49f4f1229fd77477cf50c78a7cd790ec5fd9de424
  Stored in directory: /root/.cache/pip/wheels/33/e5/58/0a3e34b92bedf09b4c57e37a63ff395ade6f6c1099ba59877c
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.3


In [2]:
# %%
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Item Clustering and Recommendation Script (Using KNN with Combined Features and Multi‐Product Combinations)
-----------------------------------------------------------------------------------------------------------
Optimized with:
1. PCA to reduce dimensionality;
2. Approximate Nearest Neighbors with Annoy (instead of scikit-learn NearestNeighbors);
3. Single bulk building of the Annoy index for all items;
4. Multi-product combos on demand or a reduced subset of products.
"""

import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from itertools import combinations
from annoy import AnnoyIndex  # pip install annoy

# ----------------------------
# 1. Load and Prepare the Transaction Data
# ----------------------------

# Parquet file holding the transaction records.
data_file = 'final_df.parquet'

# Read the transactions, coerce TransactionDate to datetime, then derive BasketID as
# "<ClientID>_<YYYY-MM-DD>" so that all lines of one client on one day share a basket.
df = (
    pd.read_parquet(data_file)
    .assign(TransactionDate=lambda t: pd.to_datetime(t['TransactionDate']))
    .assign(
        BasketID=lambda t: (
            t['ClientID'].astype(str) + '_' + t['TransactionDate'].dt.strftime('%Y-%m-%d')
        )
    )
)

print("Number of unique products in transactions:", df['ProductID'].nunique())


Number of unique products in transactions: 20638


In [3]:

# ----------------------------
# 2. Filter Transactions by Stock Availability
# ----------------------------

# Stock listing; only its ProductID column is used for filtering here.
stocks_data_file = 'stocks_data.csv'
stocks_df = pd.read_csv(stocks_data_file)
print("\nStocks Data Sample:", stocks_df.head(), sep="\n")

# Distinct product IDs that appear anywhere in the stocks file.
stock_product_ids = stocks_df['ProductID'].unique()
stock_product_ids_set = set(stock_product_ids)

# Keep only the transaction rows whose product appears in the stocks file.
df = df.loc[df['ProductID'].isin(stock_product_ids_set)]
print("\nFiltered transactions shape:", df.shape)
print("Filtered transactions sample:", df[['BasketID', 'ProductID', 'Quantity_sold']].head(), sep="\n")

# (Optional) Total quantity per (basket, product) pair, as a sanity check on the aggregation.
grouped = df.groupby(['BasketID', 'ProductID'])['Quantity_sold'].sum()
print("\nGrouped data shape:", grouped.shape)
print(grouped.head())



Stocks Data Sample:
  StoreCountry            ProductID  Quantity
0          AUS  1284651161701379667       2.0
1          AUS  6076274819885027797       2.0
2          AUS  6019386668821120661       2.0
3          AUS  2122575437123245322       2.0
4          AUS  5901681811213086415       2.0

Filtered transactions shape: (590037, 59)
Filtered transactions sample:
                         BasketID            ProductID  Quantity_sold
0  4388436561084682799_2023-01-01  3260004767786243986              2
1  5475934562856106533_2023-01-01  4081002095016762501              1
2  7571493122530801912_2023-01-01  6392464777854173474              1
4  7828763863563966653_2023-01-01  1064014581685647413              5
6  8027320461473133237_2023-01-01  7562536307774112492              1

Grouped data shape: (589659,)
BasketID                        ProductID          
1000031093718265133_2024-07-18  1789767510212328793     2
1000031093718265133_2025-01-28  5758680089498959695     2
10000451027

In [4]:

# ----------------------------
# 3. Build the Sparse Basket Matrix
# ----------------------------
# Goal: a (baskets x products) 0/1 purchase-indicator matrix, then its transpose so that
# every product is described by the set of baskets it appears in.

# Get unique baskets and products.
# Position i in these arrays is the row / column i of the matrix built below.
baskets = df['BasketID'].unique()
products = df['ProductID'].unique()

# Map baskets and products to indices.
basket_to_idx = {basket: i for i, basket in enumerate(baskets)}
product_to_idx = {product: i for i, product in enumerate(products)}

# One (row, col) coordinate per transaction line.
row_indices = df['BasketID'].map(basket_to_idx).values
col_indices = df['ProductID'].map(product_to_idx).values

# Create binary data: 1 if purchased.
# Lines with Quantity_sold <= 0 contribute a 0 instead of being dropped.
data_values = (df['Quantity_sold'] > 0).astype(int).values

# Build the sparse matrix.
# NOTE(review): scipy sums repeated (row, col) coordinates when building from triplets, so a
# product bought on several lines of the same basket can end up > 1 here; the `> 0` step
# further down turns the values back into 0/1.
basket_sparse = csr_matrix((data_values, (row_indices, col_indices)),
                           shape=(len(baskets), len(products)))
print("\nSparse basket matrix shape:", basket_sparse.shape)

# Convert to a pandas Sparse DataFrame.
basket_encoded = pd.DataFrame.sparse.from_spmatrix(
    basket_sparse,
    index=baskets,
    columns=products
)
print("Basket encoded shape (sparse DataFrame):", basket_encoded.shape)

# Convert columns to string just to be safe.
# The metadata table in section 4 is indexed by str(ProductID) and is reindexed against these
# labels, so both sides must use the same (string) label type.
basket_encoded.columns = basket_encoded.columns.astype(str)
# Re-binarise: any positive cell becomes 1, everything else 0.
basket_encoded = (basket_encoded > 0).astype(int)

# Transpose so that rows represent products (each row => product, each column => basket).
product_features = basket_encoded.T
print("Product features shape (from baskets):", product_features.shape)



Sparse basket matrix shape: (553436, 4517)
Basket encoded shape (sparse DataFrame): (553436, 4517)
Product features shape (from baskets): (4517, 553436)


In [5]:

# ----------------------------
# 4. Load and Process Additional Product Metadata
# ----------------------------

# Descriptive attributes carried on every transaction row.
metadata_cols = ['ProductID', 'Category', 'FamilyLevel1', 'FamilyLevel2', 'Brand']

# One row per product (first occurrence wins), indexed by the string form of ProductID,
# with missing attribute values replaced by the literal 'Unknown'.
metadata = (
    df[metadata_cols]
    .drop_duplicates(subset='ProductID')
    .assign(ProductID=lambda frame: frame['ProductID'].astype(str))
    .set_index('ProductID')
    .fillna('Unknown')
)

# One-hot encode the attributes, then put the rows in the same order as product_features;
# products without a metadata row end up as all-zero rows.
metadata_encoded = (
    pd.get_dummies(metadata)
    .reindex(product_features.index)
    .fillna(0)
)
print("Metadata encoded shape:", metadata_encoded.shape)


Metadata encoded shape: (4517, 170)


In [6]:

# ----------------------------
# 5. Combine Basket-Based Features with Metadata Features
# ----------------------------

# Column-wise join: basket co-occurrence columns first, one-hot metadata columns after.
# Both frames are indexed by product, so rows line up one-to-one.
combined_features = pd.concat(
    objs=[product_features, metadata_encoded],
    axis='columns',
)
print("Combined features shape:", combined_features.shape)

# Scale data: standardise every feature column and keep the original row/column labels.
# NOTE(review): the scaler presumably materialises a dense (products x features) array here,
# which is very large at this width - confirm memory headroom before re-running.
scaler = StandardScaler()
combined_features_scaled = pd.DataFrame(
    data=scaler.fit_transform(X=combined_features),
    index=combined_features.index,
    columns=combined_features.columns,
)


Combined features shape: (4517, 553606)


In [7]:

# ----------------------------
# 5.1 (New) Dimensionality Reduction with PCA
# ----------------------------
# (You can adjust n_components according to your data size/performance needs)
n_components = 50

# Fixed seed for reproducibility: for inputs of this size scikit-learn may pick a randomized
# SVD solver, and without a seed the components (and therefore every downstream neighbour
# list) can differ from one Restart & Run All to the next.
pca_random_state = 42

pca = PCA(n_components=n_components, random_state=pca_random_state)
# Rows of combined_reduced follow the row order of combined_features_scaled (one row per product).
combined_reduced = pca.fit_transform(combined_features_scaled)

print(f"Shape after PCA reduction: {combined_reduced.shape}")


Shape after PCA reduction: (4517, 50)


In [8]:

# ----------------------------
# 6. Build Approximate Nearest Neighbors Index (Annoy)
# ----------------------------

# Each product => an n-dimensional vector in "combined_reduced".
# We'll build an Annoy index of dimension = n_components
annoy_index = AnnoyIndex(n_components, metric='angular')
# 'angular' effectively approximates cosine distance in Annoy

# Annoy builds its trees with random splits; seeding (before any item is added) makes the
# index, and hence the saved recommendations, reproducible across runs.
annoy_seed = 42
annoy_index.set_seed(annoy_seed)

product_index_list = combined_features_scaled.index.tolist()  # ProductIDs

# Row i of combined_reduced must describe product_index_list[i]; fail loudly if the PCA
# output and the label list ever get out of sync (e.g. after out-of-order cell execution).
assert len(product_index_list) == combined_reduced.shape[0], \
    "combined_reduced rows do not match the number of products"

# Two-way lookup between product IDs and their integer slot in the Annoy index.
productID_to_annoyIndex = {}
annoyIndex_to_productID = {}

# Add items to the Annoy index
for i, pid in enumerate(product_index_list):
    vector = combined_reduced[i]
    annoy_index.add_item(i, vector.tolist())
    productID_to_annoyIndex[pid] = i
    annoyIndex_to_productID[i] = pid

# Build the index. "n_trees" ~ 10..50 can be tuned for speed/accuracy tradeoff
annoy_index.build(n_trees=10)
print("Annoy index built.")


Annoy index built.


In [9]:

# ----------------------------
# 7. Recommendation Function (Single & Multi-Product) using Annoy
# ----------------------------

def recommend_for_single_product(pid, annoy_idx, top_n=5):
    """
    Look up the products closest to ``pid`` in the Annoy index.

    Parameters
    ----------
    pid : product ID; must be a key of the module-level ``productID_to_annoyIndex``.
    annoy_idx : index object exposing ``get_nns_by_item``.
    top_n : maximum number of recommendations to return.

    Returns
    -------
    list
        Up to ``top_n`` product IDs in the order the index returned them, never
        including ``pid`` itself. Empty when ``pid`` is not in the index mapping.
    """
    if pid not in productID_to_annoyIndex:
        return []

    own_slot = productID_to_annoyIndex[pid]

    # Request one extra neighbour: the query item is normally returned as its own nearest hit.
    candidate_slots = annoy_idx.get_nns_by_item(own_slot, top_n + 1, include_distances=False)

    picked = []
    for slot in candidate_slots:
        if slot != own_slot:
            picked.append(annoyIndex_to_productID[slot])
        # Stop as soon as enough recommendations have been collected.
        if len(picked) >= top_n:
            break
    return picked


def recommend_for_combinations(basket_products, annoy_idx, top_n=5, combination_sizes=(1, 2, 3)):
    """
    For a given set of products (basket_products), compute approximate recommendations
    for every combination (of sizes in combination_sizes) by averaging the PCA vectors and querying Annoy.

    Parameters
    ----------
    basket_products : iterable of product IDs. IDs that are not keys of the module-level
        ``productID_to_annoyIndex`` are ignored; repeated IDs are counted once.
    annoy_idx : index object exposing ``get_nns_by_vector``.
    top_n : maximum number of recommendations per combination.
    combination_sizes : iterable of combination sizes to enumerate (default ``(1, 2, 3)``).
        Sizes larger than the number of valid products simply yield no combinations.

    Returns
    -------
    dict
        Maps each combination (a tuple of product IDs) to a list of up to ``top_n``
        recommended product IDs, none of which belong to the combination.

    NOTE: Doing this for large sets is expensive. Typically do it on-demand (e.g. for a user's actual cart).
    """
    # Keep only indexed products and drop repeats (first-seen order preserved); otherwise a
    # basket like [A, A] would produce the degenerate combination (A, A).
    valid_products = list(dict.fromkeys(
        p for p in basket_products if p in productID_to_annoyIndex
    ))

    results = {}
    for r in combination_sizes:
        for combo in combinations(valid_products, r):
            # Represent the combination by the mean of its members' reduced vectors.
            member_vectors = [combined_reduced[productID_to_annoyIndex[pid]] for pid in combo]
            mean_vec = np.mean(member_vectors, axis=0)

            # Over-fetch by len(combo) so that top_n candidates remain even if every
            # member of the combination is returned among the nearest neighbours.
            neighbor_indices = annoy_idx.get_nns_by_vector(mean_vec.tolist(), top_n + len(combo))

            # Filter out the products in combo
            neighbor_ids = []
            for ni in neighbor_indices:
                candidate_pid = annoyIndex_to_productID[ni]
                if candidate_pid not in combo:
                    neighbor_ids.append(candidate_pid)
                if len(neighbor_ids) >= top_n:
                    break
            results[combo] = neighbor_ids
    return results


In [12]:
def combos_to_serializable(combos_dict):
    """
    Convert combos_dict from:
       {('A',): [rec1, rec2, ...], ('B','C'): [rec1, rec2, ...], ...}
    to a list of dicts with string keys:
       [
         {'combo': "A", 'recommendations': [rec1, rec2, ...]},
         {'combo': "B,C", 'recommendations': [...]},
         ...
       ]

    Each combination tuple is flattened to a comma-separated string. Members are passed
    through ``str`` first, so non-string product IDs (e.g. integers) are accepted instead
    of raising ``TypeError`` in ``str.join``. Entries keep the iteration order of
    ``combos_dict``; the recommendation lists are passed through unchanged.
    """
    return [
        {'combo': ",".join(map(str, combo_tuple)), 'recommendations': recs}
        for combo_tuple, recs in combos_dict.items()
    ]

In [14]:

# ----------------------------
# 8. Generate and Save Recommendations
# ----------------------------
if __name__ == '__main__':
    # -----------------------------------------------------------------
    # A) Single-product recommendations for ALL products (fast approach)
    # -----------------------------------------------------------------
    # One record per indexed product: its ID plus its top-5 nearest neighbours.
    recommendations_list = [
        {
            'ProductID': pid,
            'SingleProduct_Recommendations': recommend_for_single_product(pid, annoy_index, top_n=5),
        }
        for pid in product_index_list
    ]

    single_df = pd.DataFrame(recommendations_list)
    single_df.to_parquet('product_recommendations_single_ANN_with_PCA.parquet', index=False)
    print("\nSingle-product recommendations saved to product_recommendations_single_ANN_with_PCA.parquet")

    # B) Multi-product combos for ALL products
    # WARNING: Enumerating multi-product combinations can be computationally expensive.
    # Each call below receives a one-item basket, so only the size-1 combination exists;
    # sizes 2-5 yield nothing and the result is effectively the single-product case again.
    # In a real-world scenario, multi-product combos are usually computed on-demand
    # (e.g. for a user's cart).
    combos_recommendations = [
        {
            'ProductID': pid,
            'Combination_Recommendations': combos_to_serializable(
                recommend_for_combinations([pid], annoy_index, top_n=5, combination_sizes=[1,2,3,4,5])
            ),
        }
        for pid in product_index_list
    ]

    combos_df = pd.DataFrame(combos_recommendations)
    combos_df.to_parquet('multi_product_combo_recommendations_ANN_with_PCA.parquet', index=False)
    print("\nMulti-product combo recommendations for ALL products saved to multi_product_combo_recommendations_ANN_with_PCA.parquet")

    print("\nRecommendation generation complete.")


Single-product recommendations saved to product_recommendations_single_ANN_with_PCA.parquet

Multi-product combo recommendations for ALL products saved to multi_product_combo_recommendations_ANN_with_PCA.parquet

Recommendation generation complete.
