
# Retail Product Discovery & Recommendation – Multimodal Chatbot

This single notebook implements:
- Text-based recommendation (Groq LLM)
- Image-based similarity search (CLIP + FAISS)
- Voice-based search (Whisper STT)
- Interactive UI using Jupyter widgets
- Evaluation plots and tables

This notebook is designed for **academic submission and viva demonstration**.


## 1. Install & Import Dependencies

In [None]:

# Run once if required
# !pip install ipywidgets torch torchvision torchaudio
# !pip install faiss-cpu pandas matplotlib scikit-learn
# !pip install git+https://github.com/openai/CLIP.git
# !pip install openai-whisper soundfile pydub groq python-dotenv

import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display, Image, clear_output


## 2. Environment & Utility Functions

In [None]:

from dotenv import load_dotenv
# Load variables from a local .env file (if present) before GroqClient reads
# GROQ_API_KEY via os.getenv.
load_dotenv()

from groq import Groq

def extract_intent_entities(text):
    """Derive a coarse intent and an entity dict from a free-text query.

    The intent is always ``"search"``. The only entity recognised is the
    ``"jacket"`` category, detected by a case-insensitive substring match.

    Args:
        text: The raw user query.

    Returns:
        A ``(intent, entities)`` tuple; ``entities`` is
        ``{"category": "jacket"}`` when the query mentions a jacket and an
        empty dict otherwise.
    """
    lowered = text.lower()
    entities = {"category": "jacket"} if "jacket" in lowered else {}
    return "search", entities

class GroqClient:
    """Thin wrapper around the Groq chat-completions API.

    The API key is read from the ``GROQ_API_KEY`` environment variable.

    Args:
        model: Model identifier sent with every request.
        system_prompt: System message placed before the user prompt.
    """

    DEFAULT_MODEL = "llama-3.1-8b-instant"
    DEFAULT_SYSTEM_PROMPT = "You are a retail recommendation assistant."

    def __init__(self, model=DEFAULT_MODEL, system_prompt=DEFAULT_SYSTEM_PROMPT):
        self.client = Groq(api_key=os.getenv("GROQ_API_KEY"))
        self.model = model
        self.system_prompt = system_prompt

    def chat(self, prompt):
        """Send ``prompt`` as a single user message and return the reply text.

        Args:
            prompt: Text of the user message.

        Returns:
            The content of the first completion choice, or ``""`` when the
            API returns a message without content.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": self.system_prompt},
                {"role": "user", "content": prompt},
            ],
        )
        content = response.choices[0].message.content
        # Callers print the result directly, so always hand back a string.
        return content if content is not None else ""

# Shared client instance; the text and voice search callbacks call llm.chat().
llm = GroqClient()


## 3. Image Embeddings (CLIP) + FAISS

In [None]:

import clip
import torch
from PIL import Image as PILImage
import faiss

# Use the GPU when PyTorch reports CUDA as available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the ViT-B/32 CLIP checkpoint onto `device`. `preprocess` is the second
# value returned by clip.load and is applied to PIL images in encode_image.
clip_model, preprocess = clip.load("ViT-B/32", device=device)

def encode_image(path):
    """Encode an image file into a CLIP embedding.

    Args:
        path: Path of the image file to open with PIL.

    Returns:
        A float32 NumPy array holding the output of
        ``clip_model.encode_image`` for a batch containing only this image
        (batch axis first).
    """
    # The context manager closes the file handle as soon as preprocessing is
    # done instead of leaving it open until garbage collection.
    with PILImage.open(path) as img:
        batch = preprocess(img).unsqueeze(0).to(device)
    with torch.no_grad():
        features = clip_model.encode_image(batch)
    # CLIP may return half-precision features on CUDA; FAISS works on float32,
    # so normalise the dtype here (a no-op when features are already float32).
    return features.float().cpu().numpy()

class AmazonFashionIndexer:
    """FAISS L2 index over CLIP embeddings of a product image dataset.

    ``dataset_dir`` must contain ``metadata.json`` and an ``images`` folder.
    Each metadata record is indexed by its ``"image"`` field (a file name
    inside ``images``) and identified by its ``"product_id"`` field.
    """

    def __init__(self, dataset_dir):
        """Load the metadata; the index itself is created by ``build()``.

        Args:
            dataset_dir: Directory holding ``metadata.json`` and ``images``.
        """
        self.image_dir = os.path.join(dataset_dir, "images")
        with open(os.path.join(dataset_dir, "metadata.json"), "r", encoding="utf-8-sig") as f:
            self.metadata = json.load(f)
        self.embeddings = []
        self.products = []
        self.index = None

    def build(self):
        """Embed every product whose image file exists and index the vectors.

        Records whose image file is missing are skipped. Calling ``build()``
        again rebuilds from scratch instead of appending duplicates.

        Raises:
            ValueError: If no metadata record points at an existing image.
        """
        embeddings = []
        products = []
        for item in self.metadata:
            img_path = os.path.join(self.image_dir, item["image"])
            if os.path.exists(img_path):
                embeddings.append(encode_image(img_path)[0])
                products.append(item)
        if not embeddings:
            raise ValueError(
                f"No product images found under {self.image_dir!r}; cannot build the index."
            )
        # FAISS expects a contiguous float32 matrix of shape (n, dim).
        matrix = np.ascontiguousarray(np.stack(embeddings), dtype=np.float32)
        index = faiss.IndexFlatL2(int(matrix.shape[1]))
        index.add(matrix)
        # Publish the new state only once everything succeeded.
        self.embeddings = embeddings
        self.products = products
        self.index = index

    def search(self, query, k=5):
        """Return up to ``k`` nearest products for a query embedding.

        Args:
            query: Query embedding, either 1-D or with a leading batch axis;
                only the first row of a batch is used.
            k: Number of neighbours requested from the index.

        Returns:
            A list of ``{"product": record, "confidence": float}`` dicts in
            the order returned by the index, with at most one entry per
            ``product_id``. ``confidence`` is ``100 / (1 + distance)`` rounded
            to two decimals.

        Raises:
            RuntimeError: If ``build()`` has not been called yet.
        """
        if getattr(self, "index", None) is None:
            raise RuntimeError("Index has not been built; call build() first.")
        vectors = np.ascontiguousarray(query, dtype=np.float32)
        if vectors.ndim == 1:
            vectors = vectors.reshape(1, -1)
        distances, indices = self.index.search(vectors, k)
        results = []
        seen = set()
        for d, i in zip(distances[0], indices[0]):
            # FAISS pads with label -1 when fewer than k vectors are indexed;
            # without this guard -1 would silently select the last product.
            if i < 0:
                continue
            product = self.products[i]
            pid = product["product_id"]
            if pid in seen:
                continue
            seen.add(pid)
            # Clamp tiny negative distances caused by floating-point error.
            distance = max(float(d), 0.0)
            results.append({
                "product": product,
                "confidence": round(100 * (1 / (1 + distance)), 2)
            })
        return results


## 4. Whisper Speech-to-Text

In [None]:

import whisper
# Load the "base" Whisper model once; transcribe_audio reuses this instance.
whisper_model = whisper.load_model("base")

def transcribe_audio(path):
    """Transcribe the audio file at ``path`` with the shared Whisper model.

    Args:
        path: Location of the audio file handed to ``whisper_model.transcribe``.

    Returns:
        The ``"text"`` entry of the transcription result.
    """
    transcription = whisper_model.transcribe(path)
    return transcription["text"]


## 5. Interactive Widgets (Text, Image, Voice)

In [None]:

# TEXT SEARCH
# Query input, trigger button, and the Output area that text_search prints into.
text_query = widgets.Text(description="Query:", layout=widgets.Layout(width="70%"))
text_button = widgets.Button(description="Search", button_style="primary")
text_out = widgets.Output()

def text_search(b):
    """Handle a click on the Search button.

    Reads the query from ``text_query``, extracts intent and entities, asks
    the LLM for a recommendation, and prints the reply into ``text_out``.

    Args:
        b: The button instance passed by ``on_click`` (unused).
    """
    with text_out:
        clear_output()
        query = text_query.value.strip()
        if not query:
            # Avoid spending an API call on an empty prompt.
            print("Please enter a search query.")
            return
        intent, entities = extract_intent_entities(query)
        # The raw query must be part of the prompt: intent and entities alone
        # are usually just ("search", {}), which tells the LLM nothing about
        # what the user actually asked for.
        print(llm.chat(f"User query: {query}\nUser intent: {intent}, entities: {entities}"))

# Run text_search on every button click, then render the controls stacked vertically.
text_button.on_click(text_search)
display(widgets.VBox([text_query, text_button, text_out]))


In [None]:

# IMAGE SEARCH
# Single-file uploader restricted to .jpg/.png, plus the Output area that
# image_search renders into.
image_upload = widgets.FileUpload(accept=".jpg,.png", multiple=False)
img_out = widgets.Output()

# Built indexers keyed by dataset directory, so the (slow) CLIP encoding of the
# whole catalogue happens once per kernel session rather than once per upload.
_INDEXER_CACHE = {}


def _get_indexer(dataset_dir):
    """Return a built AmazonFashionIndexer for ``dataset_dir``, building it on first use."""
    if dataset_dir not in _INDEXER_CACHE:
        indexer = AmazonFashionIndexer(dataset_dir)
        indexer.build()
        _INDEXER_CACHE[dataset_dir] = indexer
    return _INDEXER_CACHE[dataset_dir]


def image_search(change):
    """Handle an image upload: show the query image and its nearest products.

    The uploaded bytes are written to ``query.jpg``, embedded with
    ``encode_image``, and searched against the catalogue index. Results are
    rendered into ``img_out``.

    Args:
        change: The traitlets change dict passed by ``observe`` (unused; the
            current widget value is read directly).
    """
    with img_out:
        clear_output()
        uploaded = image_upload.value
        # ipywidgets 7 exposes a dict keyed by file name, ipywidgets 8 a tuple
        # of per-file dicts; both carry the raw bytes under "content".
        files = list(uploaded.values()) if isinstance(uploaded, dict) else list(uploaded)
        if not files:
            # The observer also fires when the widget value is cleared.
            print("No image uploaded.")
            return
        up = files[0]
        with open("query.jpg", "wb") as f:
            f.write(up["content"])
        display(Image("query.jpg", width=200))

        indexer = _get_indexer("data/amazon_fashion")
        results = indexer.search(encode_image("query.jpg"))

        for r in results:
            p = r["product"]
            # Reuse the indexer's image directory instead of repeating the path.
            display(Image(os.path.join(indexer.image_dir, p["image"]), width=180))
            print(p["category"], "| €", p["price"], "|", r["confidence"], "%")

# Call image_search whenever the uploader's `value` trait changes, then render
# the uploader above its output area.
image_upload.observe(image_search, names="value")
display(widgets.VBox([image_upload, img_out]))


In [None]:

# VOICE SEARCH
# Single-file uploader restricted to .wav/.mp3, plus the Output area that
# voice_search prints into.
audio_upload = widgets.FileUpload(accept=".wav,.mp3", multiple=False)
voice_out = widgets.Output()

def voice_search(change):
    """Handle an audio upload: transcribe it and ask the LLM about the transcript.

    The uploaded bytes are written to ``voice.wav``, transcribed with
    ``transcribe_audio``, and the transcript is sent to ``llm.chat``. All
    output is printed into ``voice_out``.

    Args:
        change: The traitlets change dict passed by ``observe`` (unused; the
            current widget value is read directly).
    """
    with voice_out:
        clear_output()
        uploaded = audio_upload.value
        # ipywidgets 7 exposes a dict keyed by file name, ipywidgets 8 a tuple
        # of per-file dicts; both carry the raw bytes under "content".
        files = list(uploaded.values()) if isinstance(uploaded, dict) else list(uploaded)
        if not files:
            # The observer also fires when the widget value is cleared.
            print("No audio uploaded.")
            return
        up = files[0]
        with open("voice.wav", "wb") as f:
            f.write(up["content"])
        text = transcribe_audio("voice.wav").strip()
        print("Recognized:", text)
        if not text:
            # Nothing was recognised, so there is nothing useful to send.
            print("No speech recognized; please try another recording.")
            return
        print(llm.chat(text))

# Call voice_search whenever the uploader's `value` trait changes, then render
# the uploader above its output area.
audio_upload.observe(voice_search, names="value")
display(widgets.VBox([audio_upload, voice_out]))


## 6. Evaluation Plots

In [None]:

# Read with utf-8-sig, matching how AmazonFashionIndexer opens the same file,
# so a leading byte-order mark does not break JSON parsing.
df = pd.read_json("data/amazon_fashion/metadata.json", encoding="utf-8-sig")

df["price"] = df["price"].astype(float)

# Each figure uses its own explicit Axes and carries axis labels so it can be
# read on its own when the notebook is skimmed.
fig, ax = plt.subplots()
df["category"].value_counts().plot(kind="bar", ax=ax, title="Category Distribution")
ax.set_xlabel("Category")
ax.set_ylabel("Number of products")
plt.show()

fig, ax = plt.subplots()
df["gender"].value_counts().plot(kind="bar", ax=ax, title="Gender Distribution")
ax.set_xlabel("Gender")
ax.set_ylabel("Number of products")
plt.show()

fig, ax = plt.subplots()
ax.hist(df["price"], bins=5)
ax.set_title("Price Distribution")
ax.set_xlabel("Price (€)")
ax.set_ylabel("Number of products")
plt.show()
