# Furniture Product Recommendation Model Training

This notebook implements the ML models for product recommendation, NLP clustering, CV classification, and GenAI description generation.

In [None]:
import os
import warnings
from io import BytesIO

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pinecone
import requests
import seaborn as sns
import torch
from langchain.chains import LLMChain
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from PIL import Image
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from transformers import pipeline
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG and PyTorch's default generator with one shared
# value so repeated runs draw the same random numbers.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

## 1. Load Cleaned Dataset

In [None]:
# Load cleaned dataset
df = pd.read_csv('../data/furniture_dataset_cleaned.csv')

# Fail fast with a readable message if a column that the later cells index
# directly ('title' and 'description' for embeddings and text generation,
# 'images' for image classification) is absent from the file.
required_columns = {'title', 'description', 'images'}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(
        f"Cleaned dataset is missing required columns: {sorted(missing_columns)}"
    )

print(f"Dataset shape: {df.shape}")
print("Columns:", list(df.columns))
df.head()

## 2. Text Embedding and Recommendation Model

Using SentenceTransformers to create embeddings for semantic search.

In [None]:
# Initialize sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Create combined text for embedding: "<title>. <description>".
# Missing values are replaced with '' first. Concatenating a string with NaN
# yields NaN, which is not valid text input for the encoder, and pandas reads
# empty CSV fields back as NaN even if they were empty strings when written.
title_text = df['title'].fillna('').astype(str)
description_text = df['description'].fillna('').astype(str)
df['text_for_embedding'] = title_text + '. ' + description_text

# Generate embeddings (one vector per row, in row order)
print("Generating text embeddings...")
embeddings = model.encode(df['text_for_embedding'].tolist(), show_progress_bar=True)
print(f"Embeddings shape: {embeddings.shape}")

# Save embeddings; create the output directory so a fresh checkout works
os.makedirs('../models', exist_ok=True)
np.save('../models/text_embeddings.npy', embeddings)
print("Embeddings saved to ../models/text_embeddings.npy")

## 3. NLP Clustering for Similar Products

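K-means clustering on the text embeddings groups similar products. The number of clusters is chosen by the highest silhouette score among the candidate odd values of k from 5 to 19.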

In [None]:
# Determine optimal number of clusters using silhouette score
silhouette_scores = []
fitted_models = []  # (fitted KMeans, labels) for each candidate k, same order as k_range

# KMeans needs at least k samples and silhouette_score needs fewer distinct
# labels than samples, so drop candidates the dataset is too small for.
n_samples = len(embeddings)
k_range = [k for k in range(5, 21, 2) if k < n_samples]
if not k_range:
    raise ValueError(
        f"Need more than 5 samples to evaluate clustering, got {n_samples}"
    )

for k in k_range:
    candidate = KMeans(n_clusters=k, random_state=42, n_init=10)
    cluster_labels = candidate.fit_predict(embeddings)
    silhouette_avg = silhouette_score(embeddings, cluster_labels)
    silhouette_scores.append(silhouette_avg)
    fitted_models.append((candidate, cluster_labels))
    print(f"k={k}, Silhouette Score: {silhouette_avg:.4f}")

# Plot silhouette scores
plt.figure(figsize=(10, 6))
plt.plot(k_range, silhouette_scores, marker='o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score vs Number of Clusters')
plt.grid(True)
plt.show()

# Choose optimal k (highest silhouette score; the first one wins on ties)
best_index = int(np.argmax(silhouette_scores))
optimal_k = k_range[best_index]
print(f"Optimal number of clusters: {optimal_k}")

# Reuse the model already fitted for optimal_k instead of fitting it again:
# the saved model is then exactly the one whose score was reported above.
kmeans, best_labels = fitted_models[best_index]
df['cluster'] = best_labels

# Save cluster model; create the output directory so a fresh checkout works
os.makedirs('../models', exist_ok=True)
joblib.dump(kmeans, '../models/kmeans_model.pkl')
print("K-means model saved to ../models/kmeans_model.pkl")

In [None]:
# Number of products assigned to each cluster, ordered by cluster id.
cluster_counts = df['cluster'].value_counts().sort_index()
print("Cluster distribution:", cluster_counts, sep="\n")

# Preview up to three titles from each of the first (at most five) clusters.
clusters_to_preview = min(5, optimal_k)
for cluster_id in range(clusters_to_preview):
    print(f"\nCluster {cluster_id} sample products:")
    in_cluster = df['cluster'] == cluster_id
    cluster_products = df.loc[in_cluster, 'title'].head(3)
    for product_title in cluster_products:
        print(f"- {product_title[:60]}...")

## 4. Computer Vision Model for Image Classification

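A pre-trained Vision Transformer (`google/vit-base-patch16-224`) labels each product image, and the top predicted label is stored as the product's image category. Products without an image are marked "No Image"; images that cannot be downloaded or classified are marked "Unknown".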

In [None]:
# Build the image-classification pipeline around the pre-trained ViT checkpoint.
image_classifier = pipeline(
    task="image-classification",
    model="google/vit-base-patch16-224",
)

# Function to classify image
def classify_image(image_url):
    """Download an image and return the classifier's top predicted label.

    Args:
        image_url: URL of the image to fetch.

    Returns:
        The ``'label'`` of the first result returned by ``image_classifier``,
        or ``"Unknown"`` when the URL is empty or not a string, the download
        fails or returns an HTTP error status, or the image cannot be decoded
        or classified. Failures are reported with ``print`` rather than raised.
    """
    try:
        # Reject unusable input up front so the log says what is wrong
        # instead of surfacing a low-level error from the HTTP client.
        if not isinstance(image_url, str) or not image_url.strip():
            raise ValueError("image URL is empty or not a string")
        response = requests.get(image_url, timeout=10)
        # Report 4xx/5xx responses by status; otherwise the error page body
        # would be handed to the image decoder and fail there.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        # Resize if too large (thumbnail shrinks in place, keeping aspect ratio)
        if img.size[0] > 224 or img.size[1] > 224:
            img.thumbnail((224, 224))
        results = image_classifier(img)
        return results[0]['label']
    except Exception as e:
        # Best effort: one bad image must not abort the whole batch.
        print(f"Error classifying image {image_url}: {e}")
        return "Unknown"

def _label_or_no_image(image_ref):
    """Classify one entry of the 'images' column; missing entries map to "No Image"."""
    if pd.notnull(image_ref):
        return classify_image(image_ref)
    return "No Image"


# Label every product image, one row at a time.
print("Classifying product images...")
df['image_category'] = df['images'].map(_label_or_no_image)

# Persist the dataframe including the new 'image_category' column.
df.to_csv('../data/furniture_dataset_with_categories.csv', index=False)
print("Dataset with image categories saved.")

In [None]:
# Ten most frequent predicted labels.
image_cat_counts = df['image_category'].value_counts().head(10)
print("Top image categories:", image_cat_counts, sep="\n")

# Horizontal bar chart: count on the x axis, label on the y axis.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=image_cat_counts.values, y=image_cat_counts.index, ax=ax)
ax.set_title('Top 10 Image Categories')
ax.set_xlabel('Count')
plt.show()

## 5. Generative AI for Creative Descriptions

Using LangChain with a HuggingFace model for generating creative product descriptions.

In [None]:
# Initialize text generation pipeline.
# - max_new_tokens bounds only the generated continuation. The previous
#   max_length=100 also counted the prompt tokens, so a prompt containing a
#   full product description could leave no room to generate anything.
# - temperature only takes effect when sampling is enabled, hence do_sample=True.
text_generator = pipeline(
    "text-generation",
    model="distilgpt2",
    max_new_tokens=100,
    do_sample=True,
    temperature=0.7,
)

# Create LangChain LLM
llm = HuggingFacePipeline(pipeline=text_generator)

# Create prompt template for creative descriptions.
# The trailing "Creative Description:" marker is what the caller splits on
# to isolate the generated text.
description_prompt = PromptTemplate(
    input_variables=["product_title", "original_description"],
    template="""Create a creative and engaging product description for:
Product: {product_title}
Original: {original_description}

Creative Description:"""
)

# Create chain
description_chain = LLMChain(llm=llm, prompt=description_prompt)

# Function to generate creative description
def generate_creative_description(title, original_desc):
    """Generate a creative description for one product via ``description_chain``.

    Args:
        title: Product title inserted into the prompt.
        original_desc: Existing product description inserted into the prompt.

    Returns:
        The text following the last ``'Creative Description:'`` marker in the
        chain output, stripped of surrounding whitespace. Falls back to the
        original description (as a string, ``""`` if it was missing) when
        generation raises or produces no text.
    """
    # Missing values (None / NaN) would otherwise be rendered into the prompt
    # as "None"/"nan" and handed back to the caller as a non-string fallback.
    prompt_title = title if isinstance(title, str) else ""
    fallback_desc = original_desc if isinstance(original_desc, str) else ""
    try:
        result = description_chain.run(
            product_title=prompt_title, original_description=fallback_desc
        )
        # Clean up the generated text
        creative_desc = result.split('Creative Description:')[-1].strip()
        # An empty generation is not useful to callers; keep the original text.
        return creative_desc or fallback_desc
    except Exception as e:
        # Best effort: a failed generation must not abort a batch run.
        print(f"Error generating description: {e}")
        return fallback_desc

# Generate creative descriptions for a sample (first 10 rows)
print("Generating creative descriptions for sample products...")
sample_df = df.head(10).copy()
# Iterate the two columns directly rather than using a row-wise apply:
# apply(axis=1) on an empty frame returns a DataFrame, which cannot be
# assigned to a single column.
sample_df['creative_description'] = [
    generate_creative_description(title, description)
    for title, description in zip(sample_df['title'], sample_df['description'])
]


def _preview(text, limit=100):
    """Return the first `limit` characters of `text`, or '' for missing values."""
    return text[:limit] if isinstance(text, str) else ""


# Display sample results; missing descriptions (NaN) print as empty text
# instead of raising TypeError when sliced.
for idx, row in sample_df.iterrows():
    print(f"\nOriginal: {_preview(row['description'])}...")
    print(f"Creative: {_preview(row['creative_description'])}...")

## 6. Pinecone Vector Database Setup

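Store the text embeddings in Pinecone for semantic search. The upload code in the cell below is commented out; uncomment it and supply your own API key to run it.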

In [None]:
# Pinecone upload template. Everything in this cell except the two print()
# calls at the bottom is commented out, so running the cell as-is does not
# contact Pinecone or modify any index.
#
# Initialize Pinecone (you'll need to set your API key)
# Prefer reading the key from an environment variable over pasting it into
# the notebook, so it is not saved with the file.
# NOTE(review): pinecone.init(...) / pinecone.Index(...) look like the older
# module-level client API; confirm against the installed pinecone version.
# pinecone.init(api_key="your-api-key", environment="your-environment")

# Create index (dimension is taken from the embedding matrix computed earlier)
# index_name = "furniture-recommendations"
# if index_name not in pinecone.list_indexes():
#     pinecone.create_index(index_name, dimension=embeddings.shape[1])

# index = pinecone.Index(index_name)

# Prepare data for Pinecone
# `i` is the positional row number used to pick the matching embedding row;
# `idx` is the dataframe index label and is not used.
# NOTE(review): presumably ids must be strings and metadata must not contain
# NaN; verify 'uniq_id', 'price', 'brand', etc. before uploading.
# vectors = []
# for i, (idx, row) in enumerate(df.iterrows()):
#     vector = {
#         "id": row['uniq_id'],
#         "values": embeddings[i].tolist(),
#         "metadata": {
#             "title": row['title'],
#             "description": row['description'],
#             "price": float(row['price']),
#             "brand": row['brand'],
#             "categories": row['categories'],
#             "image": row['images'],
#             "cluster": int(row['cluster'])
#         }
#     }
#     vectors.append(vector)

# Upload to Pinecone in batches
# The total batch count uses ceiling division; `len(vectors)//batch_size + 1`
# would report one batch too many when len(vectors) is an exact multiple of
# batch_size.
# batch_size = 100
# total_batches = -(-len(vectors) // batch_size)
# for i in range(0, len(vectors), batch_size):
#     batch = vectors[i:i+batch_size]
#     index.upsert(vectors=batch)
#     print(f"Uploaded batch {i//batch_size + 1}/{total_batches}")

# The only statements that actually execute in this cell:
print("Pinecone setup code prepared. Uncomment and run with your API key.")
print("Note: Pinecone integration will be completed in the backend implementation.")

## 7. Model Evaluation and Summary

In [None]:
# Headline numbers gathered from the objects built in the earlier cells.
summary_lines = [
    "Model Training Summary:",
    f"- Dataset size: {len(df)} products",
    f"- Embedding dimension: {embeddings.shape[1]}",
    f"- Number of clusters: {optimal_k}",
    f"- Silhouette score: {max(silhouette_scores):.4f}",
    f"- Products with images: {df['images'].notnull().sum()}",
    f"- Image categories identified: {df['image_category'].nunique()}",
]
print("\n".join(summary_lines))

# Write the dataframe, including every column added by this notebook, to disk.
df.to_csv('../data/furniture_dataset_final.csv', index=False)
print("\nFinal dataset saved to ../data/furniture_dataset_final.csv")

# Persist the sentence-transformer used to build the embeddings.
model.save('../models/sentence_transformer_model')
print("Sentence transformer model saved to ../models/sentence_transformer_model")

print("\nNext steps:")
next_steps = (
    "1. Set up Pinecone API key and run vector database setup",
    "2. Implement backend API endpoints",
    "3. Build frontend interface",
    "4. Test end-to-end functionality",
)
for step in next_steps:
    print(step)