In [1]:
!pip install pandas numpy torch sentence-transformers pinecone-client transformers tqdm

[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/dill-0.3.9-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/texttable-1.7.0-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/nvfuser-0.2.13a0+0d33366-py3.12-linux-x86_64.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-package

In [3]:
!pip install pinecone
import pinecone

pinecone.init(api_key="", environment="us-east1-gcp")
index_name = "furniture-recommendations"

if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=384, metric="cosine")

index = pinecone.Index(index_name)

[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/dill-0.3.9-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/texttable-1.7.0-py3.12.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-packages/nvfuser-0.2.13a0+0d33366-py3.12-linux-x86_64.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330[0m[33m
[0m[33mDEPRECATION: Loading egg at /usr/local/lib/python3.12/dist-package

AttributeError: init is no longer a top-level attribute of the pinecone package.

Please create an instance of the Pinecone class instead.

Example:

    import os
    from pinecone import Pinecone, ServerlessSpec

    pc = Pinecone(
        api_key=os.environ.get("PINECONE_API_KEY")
    )

    # Now do stuff
    if 'my_index' not in pc.list_indexes().names():
        pc.create_index(
            name='my_index',
            dimension=1536,
            metric='euclidean',
            spec=ServerlessSpec(
                cloud='aws',
                region='us-west-2'
            )
        )



In [1]:
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.16.2
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# # Product Recommendation System: Model Training & Vector DB Setup
#
# **Objective:** This notebook covers the end-to-end machine learning pipeline for the product recommendation web app.
#
# **Key Tasks:**
# 1.  **Data Loading & Preprocessing:** Load the furniture dataset and clean it for model consumption.
# 2.  **Feature Extraction (Embeddings):** Use OpenAI's CLIP model to generate powerful multi-modal (text and image) embeddings for each product.
# 3.  **Vector Database Setup:** Initialize and configure a Pinecone vector database to store these embeddings.
# 4.  **Upsert to Pinecone:** Populate the Pinecone index with our product embeddings for efficient similarity search.
# 5.  **Semantic Search:** Implement and test a function to find similar products based on a text query or image.
# 6.  **Generative AI for Descriptions:** Use a lightweight GenAI model via LangChain to create creative product descriptions.
#
# **Folder Structure Context:**
# ```


# ## Step 1: Setup and Installations
#
# First, let's install the necessary libraries for this project. We've added `safetensors` to ensure models are loaded securely and avoid potential `torch.load` errors.

# +
# !pip install -q -U pandas torch transformers safetensors pinecone-client sentence-transformers pillow requests tqdm langchain langchain-core langchain-huggingface
# -

# ## Step 2: Import Libraries and Load Data

import io
import os

import ipywidgets.widgets as widgets  # type: ignore

# +
import pandas as pd
import requests
import torch
from IPython.display import display  # type: ignore
from langchain_core.prompts import (
    PromptTemplate,  # Updated import for recent langchain versions
)
from langchain_huggingface import HuggingFacePipeline
from PIL import Image
from pinecone import Pinecone, ServerlessSpec  # Updated Pinecone import
from tqdm.auto import tqdm
from transformers import CLIPModel, CLIPProcessor

# Check for GPU availability for faster processing
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# -

# Load the dataset. Make sure the path is correct based on the folder structure.
try:
    df = pd.read_csv('intern_data_ikarus.csv')
    print("Dataset loaded successfully.")
    print("Dataset shape:", df.shape)
except FileNotFoundError:
    print("Error: Dataset file not found. Please ensure 'intern_data_ikarus.csv' is in the same directory as the notebook or provide the correct path.")

df.head()

# ## Step 3: Data Cleaning and Preprocessing
#
# We need to prepare the data for our models. This involves handling missing values and creating a unified text field for each product that will be used to generate text embeddings. We also need to clean the image URLs.

# +
# --- Handle Missing Values ---
# Fill missing values in key text columns with an empty string or a placeholder.
# This ensures that our combined text field doesn't fail on NaN values.
text_cols = ['title', 'description', 'brand', 'categories', 'material', 'color']
for col in text_cols:
    df[col] = df[col].fillna('')

# --- Create a Combined Text Feature ---
# We create a single, rich text description for each product. This helps the model
# understand the product from multiple textual attributes simultaneously.
def combine_text_features(row):
    return (
        f"Title: {row['title']}. "
        f"Brand: {row['brand']}. "
        f"Categories: {row['categories']}. "
        f"Description: {row['description']}. "
        f"Material: {row['material']}. "
        f"Color: {row['color']}."
    )

df['combined_text'] = df.apply(combine_text_features, axis=1)

print("Created 'combined_text' feature.")
df[['uniq_id', 'combined_text']].head()
# -

# --- Clean Image URLs ---
# The 'images' column is a string representation of a list. We need to parse it
# and extract the first valid image URL.
def extract_first_image_url(images_str):
    try:
        # The string looks like "['url1', 'url2', ...]". We strip brackets and quotes.
        image_list = images_str.strip("[]").split("', '")
        first_image = image_list[0].strip("'")
        return first_image if first_image else None
    except (IndexError, AttributeError):
        return None

df['main_image_url'] = df['images'].apply(extract_first_image_url)

# Drop rows where we couldn't find a valid image URL, as they can't be processed by CLIP.
df_clean = df.dropna(subset=['main_image_url']).copy()
df_clean = df_clean.reset_index(drop=True)

print(f"Original dataframe rows: {len(df)}")
print(f"Cleaned dataframe rows (with valid image URLs): {len(df_clean)}")
df_clean[['uniq_id', 'main_image_url']].head()


# ## Step 4: Initialize CLIP Model
#
# We'll use the `openai/clip-vit-base-patch32` model. This model is a powerful multi-modal encoder that can process both text and images into the same embedding space, making it perfect for our use case.

# +
model_name = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(model_name)
# Use `use_safetensors=True` to avoid the torch.load vulnerability error with older torch versions.
model = CLIPModel.from_pretrained(model_name, use_safetensors=True).to(device)

print("CLIP model and processor loaded.")
# -

# ## Step 5: Generate Embeddings
#
# Now, we'll process our entire dataset. For each product, we'll generate a text embedding, an image embedding, and then combine them.

# ### Helper function to download and process images
def get_image_from_url(url):
    """Downloads an image from a URL and returns a PIL Image."""
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()
        image = Image.open(io.BytesIO(response.content)).convert("RGB")
        return image
    except Exception as e:
        # print(f"Warning: Could not download or process image from {url}. Error: {e}")
        return None


# ### Generate Text and Image Embeddings in Batches
# We process in batches to manage memory usage, especially if using a GPU.

# +
batch_size = 64
all_combined_embeddings = []
valid_ids = []

# We'll iterate through the dataframe in chunks (batches)
for i in tqdm(range(0, len(df_clean), batch_size), desc="Generating Embeddings"):
    batch_df = df_clean.iloc[i:i+batch_size]
    
    # --- Process Texts ---
    texts = batch_df['combined_text'].tolist()
    text_inputs = processor(text=texts, return_tensors="pt", padding=True, truncation=True, max_length=77).to(device)
    with torch.no_grad():
        text_features = model.get_text_features(**text_inputs)
        
    # --- Process Images ---
    image_urls = batch_df['main_image_url'].tolist()
    images = [get_image_from_url(url) for url in image_urls]
    
    # Filter out any images that failed to download
    valid_images = [img for img in images if img is not None]
    valid_indices = [idx for idx, img in enumerate(images) if img is not None]

    if not valid_images:
        continue # Skip batch if no images are valid
        
    image_inputs = processor(images=valid_images, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        image_features_valid = model.get_image_features(**image_inputs)
    
    # Create a tensor for all images in the batch, filling failed ones with zeros
    image_features = torch.zeros(len(batch_df), image_features_valid.shape[1]).to(device)
    image_features[valid_indices, :] = image_features_valid

    # --- Combine and Normalize Embeddings ---
    # Concatenate text and image features to create a multi-modal embedding
    combined_features = torch.cat((text_features, image_features), dim=1)
    
    # Normalize the embeddings to have a unit length (magnitude of 1).
    # This is a best practice for similarity search with cosine distance.
    normalized_features = combined_features / combined_features.norm(dim=1, keepdim=True)
    
    all_combined_embeddings.append(normalized_features.cpu())
    valid_ids.extend(batch_df['uniq_id'].tolist())

# Concatenate all batch results into a single tensor
final_embeddings = torch.cat(all_combined_embeddings, dim=0)

print("Embedding generation complete.")
print("Shape of final embeddings tensor:", final_embeddings.shape)
# -

# ## Step 6: Setup Pinecone Vector Database (Updated Syntax)
#
# **Action Required:** You need to sign up for a free Pinecone account at [pinecone.io](https://www.pinecone.io/) and get your API key.

# +
# --- Pinecone Initialization ---
# PLEASE REPLACE with your actual Pinecone API key
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", ")

if PINECONE_API_KEY == "YOUR_API_KEY":
    print("🚨 Please replace 'YOUR_API_KEY' with your actual Pinecone API key.")
else:
    # New initialization method
    pc = Pinecone(api_key=PINECONE_API_KEY)
    print("Pinecone client initialized.")

# -

# --- Create a Pinecone Index ---
# An index is where our vectors will be stored. We define its name, dimension, and spec.
index_name = "furniture-recommendations"
embedding_dim = final_embeddings.shape[1]

# New method to check if an index exists
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    print(f"Creating index '{index_name}'...")
    # New method to create an index
    pc.create_index(
        name=index_name,
        dimension=embedding_dim,
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1' # You can choose a region that is close to you
        )
    )
    print("Index created successfully.")
else:
    print(f"Index '{index_name}' already exists.")

# Connect to the index (this is also updated)
index = pc.Index(index_name)
index.describe_index_stats()

# ## Step 7: Upsert Embeddings to Pinecone
#
# Now we'll upload our generated embeddings into the Pinecone index. We'll do this in batches for efficiency. We will also store the `combined_text` as metadata.

# +
# We need to associate each vector with its unique product ID and text.
vectors_to_upsert = []
for i, (uniq_id, embedding) in enumerate(zip(valid_ids, final_embeddings)):
    # Pinecone expects a list of tuples: (id, vector, metadata)
    metadata = {'text': df_clean.loc[i, 'combined_text']}
    vectors_to_upsert.append((uniq_id, embedding.tolist(), metadata))

# Upsert in batches to avoid overwhelming the API
upsert_batch_size = 100
for i in tqdm(range(0, len(vectors_to_upsert), upsert_batch_size), desc="Upserting to Pinecone"):
    batch = vectors_to_upsert[i:i+upsert_batch_size]
    index.upsert(vectors=batch)

print("Upserting complete.")
index.describe_index_stats()
# -

# ## Step 8: Implement Semantic Search
#
# Let's test our system! We'll create a function that takes a text query, generates a combined embedding for it (we'll use a blank image for a pure text query), and searches Pinecone.

# +
def find_similar_products(query_text, top_k=5):
    """
    Finds similar products in the Pinecone index based on a text query.
    """
    # 1. Generate text embedding for the query
    text_inputs = processor(text=[query_text], return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        query_text_embedding = model.get_text_features(**text_inputs)
        
    # 2. Create a "zero" image embedding as we are only searching with text
    # CORRECTED: Use `model.config.projection_dim` which is the final output dimension (512)
    # for both text and image embeddings, ensuring the dimensions match.
    # The original error was caused by using a different internal dimension.
    query_image_embedding = torch.zeros(1, model.config.projection_dim).to(device)
    
    # 3. Combine and normalize
    query_embedding = torch.cat((query_text_embedding, query_image_embedding), dim=1)
    query_embedding = query_embedding / query_embedding.norm(dim=1, keepdim=True)
    
    # 4. Query Pinecone
    results = index.query(
        vector=query_embedding.cpu().tolist(),
        top_k=top_k,
        include_metadata=True
    )
    
    return results['matches']

# --- Let's test it! ---
query = "a comfortable wooden chair for the living room"
matches = find_similar_products(query)

print(f"Query: '{query}'\n")
for match in matches:
    print(f"  - ID: {match['id']}")
    print(f"    Score: {match['score']:.4f}")
    # print(f"    Text: {match['metadata']['text'][:150]}...")
    print("-" * 20)
# -

# ## Step 9: Generative AI for Creative Descriptions
#
# As per the assignment, we need to use a GenAI model to generate creative descriptions for the recommended products. We'll use LangChain for this.
#
# **Note:** We'll use a lightweight, open-source model from HuggingFace to avoid API costs. `google/gemma-2b-it` is a great choice. You may need to log in to HuggingFace for access.

# +
# from huggingface_hub import notebook_login

# notebook_login() # Run this if you need to authenticate with HuggingFace

# +
# Initialize the LLM pipeline
# Using a smaller, faster model like gemma-2b-it is ideal for this task.
llm_model_name = "google/gemma-2b-it"
llm = HuggingFacePipeline.from_model_id(
    model_id=llm_model_name,
    task="text-generation",
    device=device,
    pipeline_kwargs={"max_new_tokens": 100, "torch_dtype":torch.bfloat16},
    use_safetensors=True # Add this to ensure secure loading
)

# Create a prompt template using LangChain
prompt_template = """
You are a creative copywriter for a high-end furniture store.
Write a short, engaging, and creative product description based on the following details.
Do not just repeat the details, but weave them into a compelling narrative.

Product Details:
- Title: {title}
- Brand: {brand}
- Material: {material}
- Color: {color}

Creative Description:
"""

prompt = PromptTemplate(template=prompt_template, input_variables=["title", "brand", "material", "color"])

# Create the LangChain chain using the modern LCEL (LangChain Expression Language) pipe syntax.
# This is more robust with recent langchain versions.
chain = prompt | llm

print("LangChain with Gemma model initialized.")


# -

# ### Function to get product details and generate new description
def generate_creative_description(product_id):
    """
    Fetches product details and uses the LLM to generate a new description.
    """
    # Find the product details from our original dataframe
    product_details = df_clean[df_clean['uniq_id'] == product_id].iloc[0]
    
    # The input for the chain is a dictionary with keys matching the prompt's input variables
    input_data = {
        'title': product_details['title'],
        'brand': product_details['brand'],
        'material': product_details['material'],
        'color': product_details['color']
    }
    
    # Run the chain. The result of a simple `prompt | llm` chain is the generated string.
    creative_description = chain.invoke(input_data)
    
    return creative_description


# --- Test the full recommendation-to-generation pipeline ---
print("--- Testing the Full Pipeline ---")
query = "a stylish metal table for outdoor dining"
search_results = find_similar_products(query, top_k=2)

for result in search_results:
    product_id = result['id']
    print(f"\nProduct ID: {product_id}")
    print(f"Similarity Score: {result['score']:.4f}")
    
    # Generate a new description
    creative_desc = generate_creative_description(product_id)
    print("\nGenerated Creative Description:")
    print(creative_desc)
    print("="*30)

# ## Conclusion
#
# This notebook has successfully:
# 1. Processed and cleaned the furniture dataset.
# 2. Generated multi-modal embeddings using the CLIP model.
# 3. Set up a Pinecone index and populated it with the product embeddings.
# 4. Implemented a semantic search function to find relevant products.
# 5. Integrated a generative AI model using LangChain to create compelling product descriptions.
#
# The assets and logic created here are now ready to be integrated into the FastAPI backend. The `find_similar_products` and `generate_creative_description` functions form the core logic for your API endpoints.

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Using device: cuda
Dataset loaded successfully.
Dataset shape: (312, 12)
Created 'combined_text' feature.
Original dataframe rows: 312
Cleaned dataframe rows (with valid image URLs): 312
CLIP model and processor loaded.


Generating Embeddings:   0%|          | 0/5 [00:00<?, ?it/s]

Embedding generation complete.
Shape of final embeddings tensor: torch.Size([312, 1024])
Pinecone client initialized.
Index 'furniture-recommendations' already exists.


Upserting to Pinecone:   0%|          | 0/4 [00:00<?, ?it/s]

Upserting complete.
Query: 'a comfortable wooden chair for the living room'

  - ID: 0acd8c1c-d689-5fc2-b0ba-9909a60f47dd
    Score: 0.6779
--------------------
  - ID: e0ea5029-8dae-5261-9c57-e98bd40e5bdb
    Score: 0.6618
--------------------
  - ID: 487adf3a-9485-5500-9c98-bcc391eda169
    Score: 0.6611
--------------------
  - ID: 0583ef58-47cd-509b-9e6d-89a0ad8490b2
    Score: 0.6560
--------------------
  - ID: ce921425-0121-53b8-9ce8-1f455be7c9e8
    Score: 0.6527
--------------------


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/google/gemma-2b-it.
401 Client Error. (Request ID: Root=1-68f2c93d-1b3ec6894f41bd5b45693053;ef1492c0-11a1-493d-8a40-f58ea752a741)

Cannot access gated repo for url https://huggingface.co/google/gemma-2b-it/resolve/main/config.json.
Access to model google/gemma-2b-it is restricted. You must have access to it and be authenticated to access it. Please log in.

In [4]:
# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#       jupytext_version: 1.16.2
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# # Product Recommendation System: Model Training & Vector DB Setup
#
# **Objective:** This notebook covers the end-to-end machine learning pipeline for the product recommendation web app.
#
# **Key Tasks:**
# 1.  **Data Loading & Preprocessing:** Load the furniture dataset and clean it for model consumption.
# 2.  **Feature Extraction (Embeddings):** Use OpenAI's CLIP model to generate powerful multi-modal (text and image) embeddings for each product.
# 3.  **Vector Database Setup:** Initialize and configure a Pinecone vector database to store these embeddings.
# 4.  **Upsert to Pinecone:** Populate the Pinecone index with our product embeddings for efficient similarity search.
# 5.  **Semantic Search:** Implement and test a function to find similar products based on a text query or image.
# 6.  **Generative AI for Descriptions:** Use a lightweight GenAI model via LangChain to create creative product descriptions.
#
# **Folder Structure Context:**
# ```
# AI-ML-Assignment/
# │
# ├── notebooks/
# │   ├── model_training.ipynb  <-- YOU ARE HERE
# │   └── analytics.ipynb
# │
# ├── data/
# │   └── intern_data_ikarus.csv
# ```

# ## Step 1: Setup and Installations
#
# First, let's install the necessary libraries for this project. We've added `safetensors` to ensure models are loaded securely and avoid potential `torch.load` errors.

# +
# !pip install -q -U pandas torch transformers safetensors pinecone-client sentence-transformers pillow requests tqdm langchain langchain-core langchain-huggingface
# -

# ## Step 2: Import Libraries and Load Data

# +
import pandas as pd
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import requests
import io
import os
from tqdm.auto import tqdm
from pinecone import Pinecone, ServerlessSpec # Updated Pinecone import
from langchain_huggingface import HuggingFacePipeline
from langchain_core.prompts import PromptTemplate # Updated import for recent langchain versions

# Check for GPU availability for faster processing
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
# -

# Load the dataset. Make sure the path is correct based on the folder structure.
try:
    df = pd.read_csv('intern_data_ikarus.csv')
    print("Dataset loaded successfully.")
    print("Dataset shape:", df.shape)
except FileNotFoundError:
    print("Error: Dataset file not found. Please ensure 'intern_data_ikarus.csv' is in the same directory as the notebook or provide the correct path.")

df.head()

# ## Step 3: Data Cleaning and Preprocessing
#
# We need to prepare the data for our models. This involves handling missing values and creating a unified text field for each product that will be used to generate text embeddings. We also need to clean the image URLs.

# +
# --- Handle Missing Values ---
# Fill missing values in key text columns with an empty string or a placeholder.
# This ensures that our combined text field doesn't fail on NaN values.
text_cols = ['title', 'description', 'brand', 'categories', 'material', 'color']
for col in text_cols:
    df[col] = df[col].fillna('')

# --- Create a Combined Text Feature ---
# We create a single, rich text description for each product. This helps the model
# understand the product from multiple textual attributes simultaneously.
def combine_text_features(row):
    return (
        f"Title: {row['title']}. "
        f"Brand: {row['brand']}. "
        f"Categories: {row['categories']}. "
        f"Description: {row['description']}. "
        f"Material: {row['material']}. "
        f"Color: {row['color']}."
    )

df['combined_text'] = df.apply(combine_text_features, axis=1)

print("Created 'combined_text' feature.")
df[['uniq_id', 'combined_text']].head()
# -

# --- Clean Image URLs ---
# The 'images' column is a string representation of a list. We need to parse it
# and extract the first valid image URL.
def extract_first_image_url(images_str):
    try:
        # The string looks like "['url1', 'url2', ...]". We strip brackets and quotes.
        image_list = images_str.strip("[]").split("', '")
        first_image = image_list[0].strip("'")
        return first_image if first_image else None
    except (IndexError, AttributeError):
        return None

df['main_image_url'] = df['images'].apply(extract_first_image_url)

# Drop rows where we couldn't find a valid image URL, as they can't be processed by CLIP.
df_clean = df.dropna(subset=['main_image_url']).copy()
df_clean = df_clean.reset_index(drop=True)

print(f"Original dataframe rows: {len(df)}")
print(f"Cleaned dataframe rows (with valid image URLs): {len(df_clean)}")
df_clean[['uniq_id', 'main_image_url']].head()


# ## Step 4: Initialize CLIP Model
#
# We'll use the `openai/clip-vit-base-patch32` model. This model is a powerful multi-modal encoder that can process both text and images into the same embedding space, making it perfect for our use case.

# +
model_name = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(model_name)
# Use `use_safetensors=True` to avoid the torch.load vulnerability error with older torch versions.
model = CLIPModel.from_pretrained(model_name, use_safetensors=True).to(device)

print("CLIP model and processor loaded.")
# -

# ## Step 5: Generate Embeddings
#
# Now, we'll process our entire dataset. For each product, we'll generate a text embedding, an image embedding, and then combine them.

# ### Helper function to download and process images
def get_image_from_url(url):
    """Downloads an image from a URL and returns a PIL Image."""
    try:
        response = requests.get(url, timeout=5)
        response.raise_for_status()
        image = Image.open(io.BytesIO(response.content)).convert("RGB")
        return image
    except Exception as e:
        # print(f"Warning: Could not download or process image from {url}. Error: {e}")
        return None


# ### Generate Text and Image Embeddings in Batches
# We process in batches to manage memory usage, especially if using a GPU.

# +
batch_size = 64
all_combined_embeddings = []
valid_ids = []

# We'll iterate through the dataframe in chunks (batches)
for i in tqdm(range(0, len(df_clean), batch_size), desc="Generating Embeddings"):
    batch_df = df_clean.iloc[i:i+batch_size]
    
    # --- Process Texts ---
    texts = batch_df['combined_text'].tolist()
    text_inputs = processor(text=texts, return_tensors="pt", padding=True, truncation=True, max_length=77).to(device)
    with torch.no_grad():
        text_features = model.get_text_features(**text_inputs)
        
    # --- Process Images ---
    image_urls = batch_df['main_image_url'].tolist()
    images = [get_image_from_url(url) for url in image_urls]
    
    # Filter out any images that failed to download
    valid_images = [img for img in images if img is not None]
    valid_indices = [idx for idx, img in enumerate(images) if img is not None]

    if not valid_images:
        continue # Skip batch if no images are valid
        
    image_inputs = processor(images=valid_images, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        image_features_valid = model.get_image_features(**image_inputs)
    
    # Create a tensor for all images in the batch, filling failed ones with zeros
    image_features = torch.zeros(len(batch_df), image_features_valid.shape[1]).to(device)
    image_features[valid_indices, :] = image_features_valid

    # --- Combine and Normalize Embeddings ---
    # Concatenate text and image features to create a multi-modal embedding
    combined_features = torch.cat((text_features, image_features), dim=1)
    
    # Normalize the embeddings to have a unit length (magnitude of 1).
    # This is a best practice for similarity search with cosine distance.
    normalized_features = combined_features / combined_features.norm(dim=1, keepdim=True)
    
    all_combined_embeddings.append(normalized_features.cpu())
    valid_ids.extend(batch_df['uniq_id'].tolist())

# Concatenate all batch results into a single tensor
final_embeddings = torch.cat(all_combined_embeddings, dim=0)

print("Embedding generation complete.")
print("Shape of final embeddings tensor:", final_embeddings.shape)
# -

# ## Step 6: Setup Pinecone Vector Database (Updated Syntax)
#
# **Action Required:** You need to sign up for a free Pinecone account at [pinecone.io](https://www.pinecone.io/) and get your API key.

# +
# --- Pinecone Initialization ---
# PLEASE REPLACE with your actual Pinecone API key
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "pcsk_3uhm1m_DyXFRsMZxZpf7Q5m1Q7hrN7Fj7s97PpSup2Eo3EryDySx2Eu3z3Yhyywr1XDdZd")

if PINECONE_API_KEY == "YOUR_API_KEY":
    print("🚨 Please replace 'YOUR_API_KEY' with your actual Pinecone API key.")
else:
    # New initialization method
    pc = Pinecone(api_key=PINECONE_API_KEY)
    print("Pinecone client initialized.")

# -

# --- Create a Pinecone Index ---
# An index is where our vectors will be stored. We define its name, dimension, and spec.
index_name = "furniture-recommendations"
embedding_dim = final_embeddings.shape[1]

# New method to check if an index exists
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    print(f"Creating index '{index_name}'...")
    # New method to create an index
    pc.create_index(
        name=index_name,
        dimension=embedding_dim,
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws',
            region='us-east-1' # You can choose a region that is close to you
        )
    )
    print("Index created successfully.")
else:
    print(f"Index '{index_name}' already exists.")

# Connect to the index (this is also updated)
index = pc.Index(index_name)
index.describe_index_stats()

# ## Step 7: Upsert Embeddings to Pinecone
#
# Now we'll upload our generated embeddings into the Pinecone index. We'll do this in batches for efficiency. We will also store the `combined_text` as metadata.

# +
# We need to associate each vector with its unique product ID and text.
vectors_to_upsert = []
for i, (uniq_id, embedding) in enumerate(zip(valid_ids, final_embeddings)):
    # Pinecone expects a list of tuples: (id, vector, metadata)
    metadata = {'text': df_clean.loc[i, 'combined_text']}
    vectors_to_upsert.append((uniq_id, embedding.tolist(), metadata))

# Upsert in batches to avoid overwhelming the API
upsert_batch_size = 100
for i in tqdm(range(0, len(vectors_to_upsert), upsert_batch_size), desc="Upserting to Pinecone"):
    batch = vectors_to_upsert[i:i+upsert_batch_size]
    index.upsert(vectors=batch)

print("Upserting complete.")
index.describe_index_stats()
# -

# ## Step 8: Implement Semantic Search
#
# Let's test our system! We'll create a function that takes a text query, generates a combined embedding for it (we'll use a blank image for a pure text query), and searches Pinecone.

# +
def find_similar_products(query_text, top_k=5):
    """
    Finds similar products in the Pinecone index based on a text query.
    """
    # 1. Generate text embedding for the query
    text_inputs = processor(text=[query_text], return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        query_text_embedding = model.get_text_features(**text_inputs)
        
    # 2. Create a "zero" image embedding as we are only searching with text
    # CORRECTED: Use `model.config.projection_dim` which is the final output dimension (512)
    # for both text and image embeddings, ensuring the dimensions match.
    # The original error was caused by using a different internal dimension.
    query_image_embedding = torch.zeros(1, model.config.projection_dim).to(device)
    
    # 3. Combine and normalize
    query_embedding = torch.cat((query_text_embedding, query_image_embedding), dim=1)
    query_embedding = query_embedding / query_embedding.norm(dim=1, keepdim=True)
    
    # 4. Query Pinecone
    results = index.query(
        vector=query_embedding.cpu().tolist(),
        top_k=top_k,
        include_metadata=True
    )
    
    return results['matches']

# --- Let's test it! ---
query = "a comfortable wooden chair for the living room"
matches = find_similar_products(query)

print(f"Query: '{query}'\n")
for match in matches:
    print(f"  - ID: {match['id']}")
    print(f"    Score: {match['score']:.4f}")
    # print(f"    Text: {match['metadata']['text'][:150]}...")
    print("-" * 20)
# -

# ## Step 9: Generative AI for Creative Descriptions
#
# As per the assignment, we need to use a GenAI model to generate creative descriptions for the recommended products. We'll use LangChain for this.
#
# **Note:** We are switching to a non-gated model, `TinyLlama/TinyLlama-1.1B-Chat-v1.0`, to avoid the authentication error caused by the gated `google/gemma-2b-it` model. This model is open-access and does not require a Hugging Face token.

# +
# Initialize the LLM pipeline
# Using a smaller, public model like TinyLlama is ideal to avoid authentication issues.
llm_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# CORRECTED: Arguments for the underlying model (`torch_dtype`) should go in `model_kwargs`,
# and arguments for the pipeline itself (`max_new_tokens`) go in `pipeline_kwargs`.
# The `use_safetensors` argument that caused the error is removed, as it's not a valid
# top-level parameter and is handled by default.
llm = HuggingFacePipeline.from_model_id(
    model_id=llm_model_name,
    task="text-generation",
    device=0 if device == "cuda" else -1,
    model_kwargs={"torch_dtype": torch.bfloat16},
    pipeline_kwargs={"max_new_tokens": 100},
)

# Create a prompt template using LangChain
prompt_template = """
You are a creative copywriter for a high-end furniture store.
Write a short, engaging, and creative product description based on the following details.
Do not just repeat the details, but weave them into a compelling narrative.

Product Details:
- Title: {title}
- Brand: {brand}
- Material: {material}
- Color: {color}

Creative Description:
"""

prompt = PromptTemplate(template=prompt_template, input_variables=["title", "brand", "material", "color"])

# Create the LangChain chain using the modern LCEL (LangChain Expression Language) pipe syntax.
# This is more robust with recent langchain versions.
chain = prompt | llm

print("LangChain with TinyLlama model initialized.")


# -

# ### Function to get product details and generate new description
def generate_creative_description(product_id):
    """
    Fetches product details and uses the LLM to generate a new description.
    """
    # Find the product details from our original dataframe
    product_details = df_clean[df_clean['uniq_id'] == product_id].iloc[0]
    
    # The input for the chain is a dictionary with keys matching the prompt's input variables
    input_data = {
        'title': product_details['title'],
        'brand': product_details['brand'],
        'material': product_details['material'],
        'color': product_details['color']
    }
    
    # Run the chain. The result of a simple `prompt | llm` chain is the generated string.
    creative_description = chain.invoke(input_data)
    
    return creative_description


# --- Test the full recommendation-to-generation pipeline ---
print("--- Testing the Full Pipeline ---")
query = "a stylish metal table for outdoor dining"
search_results = find_similar_products(query, top_k=2)

for result in search_results:
    product_id = result['id']
    print(f"\nProduct ID: {product_id}")
    print(f"Similarity Score: {result['score']:.4f}")
    
    # Generate a new description
    creative_desc = generate_creative_description(product_id)
    print("\nGenerated Creative Description:")
    print(creative_desc)
    print("="*30)

# ## Conclusion
#
# This notebook has successfully:
# 1. Processed and cleaned the furniture dataset.
# 2. Generated multi-modal embeddings using the CLIP model.
# 3. Set up a Pinecone index and populated it with the product embeddings.
# 4. Implemented a semantic search function to find relevant products.
# 5. Integrated a generative AI model using LangChain to create compelling product descriptions.
#
# The assets and logic created here are now ready to be integrated into the FastAPI backend. The `find_similar_products` and `generate_creative_description` functions form the core logic for your API endpoints.



Using device: cuda
Dataset loaded successfully.
Dataset shape: (312, 12)
Created 'combined_text' feature.
Original dataframe rows: 312
Cleaned dataframe rows (with valid image URLs): 312
CLIP model and processor loaded.


Generating Embeddings:   0%|          | 0/5 [00:00<?, ?it/s]

Embedding generation complete.
Shape of final embeddings tensor: torch.Size([312, 1024])
Pinecone client initialized.
Index 'furniture-recommendations' already exists.


Upserting to Pinecone:   0%|          | 0/4 [00:00<?, ?it/s]

Upserting complete.
Query: 'a comfortable wooden chair for the living room'

  - ID: 0acd8c1c-d689-5fc2-b0ba-9909a60f47dd
    Score: 0.6779
--------------------
  - ID: e0ea5029-8dae-5261-9c57-e98bd40e5bdb
    Score: 0.6618
--------------------
  - ID: 487adf3a-9485-5500-9c98-bcc391eda169
    Score: 0.6611
--------------------
  - ID: 0583ef58-47cd-509b-9e6d-89a0ad8490b2
    Score: 0.6560
--------------------
  - ID: ce921425-0121-53b8-9ce8-1f455be7c9e8
    Score: 0.6527
--------------------


`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use cuda:0


LangChain with TinyLlama model initialized.
--- Testing the Full Pipeline ---

Product ID: 9e82e445-4d8d-5fd1-9dcf-a92d9783365c
Similarity Score: 0.6438

Generated Creative Description:

You are a creative copywriter for a high-end furniture store.
Write a short, engaging, and creative product description based on the following details.
Do not just repeat the details, but weave them into a compelling narrative.

Product Details:
- Title: Need Fold Wall Mounted Workbench Folding Wall Table Length 47.2" Width 20" Perfect Addition to Garage & Shed/Home Office/Laundry Room/Home Bar/Kitchen & Dining Room
- Brand: Need Store
- Material: Metal
- Color: Teak Color Desktop & Warm White Folding Brackets

Creative Description:
The Need Fold Wall Mounted Workbench Folding Wall Table is an excellent addition to any garage, shed, home office, laundry room, home bar, kitchen, and dining room. Its sleek design is perfect for those who love to work in a compact space. The table folds flat for easy stor