In [1]:
# -*- coding: utf-8 -*-
# Smart Grocery List Optimizer - Data Preprocessing Pipeline
# Saves embeddings & configs into multiple formats

import os, json, yaml, pickle, warnings
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import h5py

# =========================
# CONFIG
# =========================
BASE_DIR = r"C:\Users\sagni\Downloads\Smart Grocery List Optimizer"
ARCHIVE  = os.path.join(BASE_DIR, "archive")

OUT_H5   = os.path.join(BASE_DIR, "recipes_embeddings.h5")
OUT_PKL  = os.path.join(BASE_DIR, "recipes_embeddings.pkl")
OUT_JSON = os.path.join(BASE_DIR, "recipes_embeddings.json")
OUT_YAML = os.path.join(BASE_DIR, "recipes_embeddings.yaml")

# Dataset paths
INGR_MAP   = os.path.join(ARCHIVE, "ingr_map.pkl")
RAW_RECIP  = os.path.join(ARCHIVE, "RAW_recipes.csv")
RAW_INTER  = os.path.join(ARCHIVE, "RAW_interactions.csv")
PP_RECIP   = os.path.join(ARCHIVE, "PP_recipes.csv")
PP_USERS   = os.path.join(ARCHIVE, "PP_users.csv")
INTER_TRAIN = os.path.join(ARCHIVE, "interactions_train.csv")
INTER_TEST  = os.path.join(ARCHIVE, "interactions_test.csv")
INTER_VAL   = os.path.join(ARCHIVE, "interactions_validation.csv")

# =========================
# Load datasets
# =========================
print("[INFO] Loading datasets...")
df_recipes = pd.read_csv(RAW_RECIP)
df_inter   = pd.read_csv(RAW_INTER)
df_pp_rec  = pd.read_csv(PP_RECIP)
df_pp_users= pd.read_csv(PP_USERS)

df_train   = pd.read_csv(INTER_TRAIN)
df_test    = pd.read_csv(INTER_TEST)
df_val     = pd.read_csv(INTER_VAL)

with open(INGR_MAP, "rb") as f:
    ingr_map = pickle.load(f)

print(f"Recipes: {df_recipes.shape}, Interactions: {df_inter.shape}, Ingr_map: {len(ingr_map)}")

# =========================
# Clean and preprocess recipes
# =========================
def clean_ingredients(x):
    if isinstance(x, str):
        return x.lower().replace("[","").replace("]","").replace("'","").replace(","," ")
    return ""

df_recipes["clean_ingredients"] = df_recipes["ingredients"].astype(str).map(clean_ingredients)

# =========================
# Vectorize recipes (TF-IDF on ingredients)
# =========================
print("[INFO] Computing TF-IDF embeddings...")
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X = vectorizer.fit_transform(df_recipes["clean_ingredients"].tolist())
X = normalize(X)

print(f"[OK] Embedding matrix shape: {X.shape}")

# =========================
# Save artifacts
# =========================

# Save H5
with h5py.File(OUT_H5, "w") as hf:
    hf.create_dataset("embeddings", data=X.toarray())
    hf.create_dataset("recipe_ids", data=df_recipes["id"].astype(str).values.astype("S"))
print(f"[OK] Saved H5 → {OUT_H5}")

# Save PKL
with open(OUT_PKL, "wb") as f:
    pickle.dump({
        "embeddings": X,
        "recipe_ids": df_recipes["id"].tolist(),
        "vectorizer": vectorizer
    }, f)
print(f"[OK] Saved PKL → {OUT_PKL}")

# Save JSON
json_data = {
    "recipe_ids": df_recipes["id"].tolist()[:200],  # store sample to keep size small
    "ingredients": df_recipes["clean_ingredients"].tolist()[:200],
    "shape": X.shape
}
with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(json_data, f, indent=2)
print(f"[OK] Saved JSON → {OUT_JSON}")

# Save YAML
yaml_data = {
    "n_recipes": int(X.shape[0]),
    "embedding_dim": int(X.shape[1]),
    "vectorizer_params": vectorizer.get_params(),
    "columns": list(df_recipes.columns)
}
with open(OUT_YAML, "w", encoding="utf-8") as f:
    yaml.dump(yaml_data, f)
print(f"[OK] Saved YAML → {OUT_YAML}")

print("[DONE] All artifacts saved successfully.")


[INFO] Loading datasets...


ModuleNotFoundError: No module named 'pandas.core.indexes.numeric'

In [2]:
pip install pandas==1.5.3


Collecting pandas==1.5.3
  Downloading pandas-1.5.3-cp311-cp311-win_amd64.whl.metadata (12 kB)
Downloading pandas-1.5.3-cp311-cp311-win_amd64.whl (10.3 MB)
   ---------------------------------------- 0.0/10.3 MB ? eta -:--:--
   -------- ------------------------------- 2.1/10.3 MB 13.0 MB/s eta 0:00:01
   ------------------ --------------------- 4.7/10.3 MB 11.9 MB/s eta 0:00:01
   --------------------------- ------------ 7.1/10.3 MB 11.8 MB/s eta 0:00:01
   ------------------------------------ --- 9.4/10.3 MB 12.0 MB/s eta 0:00:01
   ---------------------------------------- 10.3/10.3 MB 11.5 MB/s eta 0:00:00
Installing collected packages: pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 2.3.1
    Uninstalling pandas-2.3.1:
      Successfully uninstalled pandas-2.3.1
Successfully installed pandas-1.5.3
Note: you may need to restart the kernel to use updated packages.


  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
geopandas 1.1.1 requires pandas>=2.0.0, but you have pandas 1.5.3 which is incompatible.
stable-baselines3 2.6.0 requires torch<3.0,>=2.3, but you have torch 2.2.2 which is incompatible.
tsfresh 0.21.0 requires scipy>=1.14.0; python_version >= "3.10", but you have scipy 1.13.1 which is incompatible.
yfinance 0.2.65 requires websockets>=13.0, but you have websockets 10.4 which is incompatible.

[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# -*- coding: utf-8 -*-
# Smart Grocery List Optimizer - Data Preprocessing Pipeline
# Saves embeddings & configs into multiple formats

import os, json, yaml, pickle, warnings
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import h5py

# =========================
# CONFIG
# =========================
BASE_DIR = r"C:\Users\sagni\Downloads\Smart Grocery List Optimizer"
ARCHIVE  = os.path.join(BASE_DIR, "archive")

OUT_H5   = os.path.join(BASE_DIR, "recipes_embeddings.h5")
OUT_PKL  = os.path.join(BASE_DIR, "recipes_embeddings.pkl")
OUT_JSON = os.path.join(BASE_DIR, "recipes_embeddings.json")
OUT_YAML = os.path.join(BASE_DIR, "recipes_embeddings.yaml")

# Dataset paths
INGR_MAP   = os.path.join(ARCHIVE, "ingr_map.pkl")
RAW_RECIP  = os.path.join(ARCHIVE, "RAW_recipes.csv")
RAW_INTER  = os.path.join(ARCHIVE, "RAW_interactions.csv")
PP_RECIP   = os.path.join(ARCHIVE, "PP_recipes.csv")
PP_USERS   = os.path.join(ARCHIVE, "PP_users.csv")
INTER_TRAIN = os.path.join(ARCHIVE, "interactions_train.csv")
INTER_TEST  = os.path.join(ARCHIVE, "interactions_test.csv")
INTER_VAL   = os.path.join(ARCHIVE, "interactions_validation.csv")

# =========================
# Load datasets
# =========================
print("[INFO] Loading datasets...")
df_recipes = pd.read_csv(RAW_RECIP)
df_inter   = pd.read_csv(RAW_INTER)
df_pp_rec  = pd.read_csv(PP_RECIP)
df_pp_users= pd.read_csv(PP_USERS)

df_train   = pd.read_csv(INTER_TRAIN)
df_test    = pd.read_csv(INTER_TEST)
df_val     = pd.read_csv(INTER_VAL)

with open(INGR_MAP, "rb") as f:
    ingr_map = pickle.load(f)

print(f"Recipes: {df_recipes.shape}, Interactions: {df_inter.shape}, Ingr_map: {len(ingr_map)}")

# =========================
# Clean and preprocess recipes
# =========================
def clean_ingredients(x):
    if isinstance(x, str):
        return x.lower().replace("[","").replace("]","").replace("'","").replace(","," ")
    return ""

df_recipes["clean_ingredients"] = df_recipes["ingredients"].astype(str).map(clean_ingredients)

# =========================
# Vectorize recipes (TF-IDF on ingredients)
# =========================
print("[INFO] Computing TF-IDF embeddings...")
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X = vectorizer.fit_transform(df_recipes["clean_ingredients"].tolist())
X = normalize(X)

print(f"[OK] Embedding matrix shape: {X.shape}")

# =========================
# Save artifacts
# =========================

# Save H5
with h5py.File(OUT_H5, "w") as hf:
    hf.create_dataset("embeddings", data=X.toarray())
    hf.create_dataset("recipe_ids", data=df_recipes["id"].astype(str).values.astype("S"))
print(f"[OK] Saved H5 → {OUT_H5}")

# Save PKL
with open(OUT_PKL, "wb") as f:
    pickle.dump({
        "embeddings": X,
        "recipe_ids": df_recipes["id"].tolist(),
        "vectorizer": vectorizer
    }, f)
print(f"[OK] Saved PKL → {OUT_PKL}")

# Save JSON
json_data = {
    "recipe_ids": df_recipes["id"].tolist()[:200],  # store sample to keep size small
    "ingredients": df_recipes["clean_ingredients"].tolist()[:200],
    "shape": X.shape
}
with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(json_data, f, indent=2)
print(f"[OK] Saved JSON → {OUT_JSON}")

# Save YAML
yaml_data = {
    "n_recipes": int(X.shape[0]),
    "embedding_dim": int(X.shape[1]),
    "vectorizer_params": vectorizer.get_params(),
    "columns": list(df_recipes.columns)
}
with open(OUT_YAML, "w", encoding="utf-8") as f:
    yaml.dump(yaml_data, f)
print(f"[OK] Saved YAML → {OUT_YAML}")

print("[DONE] All artifacts saved successfully.")


[INFO] Loading datasets...
Recipes: (231637, 12), Interactions: (1132367, 5), Ingr_map: 11659
[INFO] Computing TF-IDF embeddings...
[OK] Embedding matrix shape: (231637, 4171)
[OK] Saved H5 → C:\Users\sagni\Downloads\Smart Grocery List Optimizer\recipes_embeddings.h5
[OK] Saved PKL → C:\Users\sagni\Downloads\Smart Grocery List Optimizer\recipes_embeddings.pkl
[OK] Saved JSON → C:\Users\sagni\Downloads\Smart Grocery List Optimizer\recipes_embeddings.json
[OK] Saved YAML → C:\Users\sagni\Downloads\Smart Grocery List Optimizer\recipes_embeddings.yaml
[DONE] All artifacts saved successfully.
