In [23]:
# Code for loading data, preprocessing text, applying TF-IDF vectorization,
# K-means clustering, and recommending similar products.
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Read the product catalogue from CSV.
# NOTE: `file_path` is not assigned in this cell; it is presumably set
# earlier in the notebook, e.g. file_path = 'path_to_your_file.csv'.
# The positional row index is promoted to a column and exposed as
# 'product_id', giving every product a stable integer identifier.
df = (
    pd.read_csv(file_path)
    .reset_index()
    .rename(columns={'index': 'product_id'})
)

print("Data loaded successfully:")
print(df.head())

# Function to preprocess text
def preprocess_text(text):
    """Normalise a raw description for TF-IDF vectorisation.

    Non-word characters are replaced by spaces, runs of whitespace are
    collapsed to a single space, and the result is lower-cased.

    Args:
        text: The raw description. Usually a ``str``; missing values
            (``None`` or a float NaN, as pandas produces for empty CSV
            cells) and other non-string scalars are accepted too.

    Returns:
        The cleaned string. Missing values yield ``''`` so that they do not
        contribute a spurious "nan" token to the vocabulary; any other
        non-string value is converted with ``str()`` before cleaning.
    """
    if not isinstance(text, str):
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)
    text = re.sub(r'\W', ' ', text)  # Replace special characters with spaces
    text = re.sub(r'\s+', ' ', text)  # Collapse runs of whitespace
    return text.lower()

# Clean every product description and keep the result in a new column,
# leaving the original 'Description' column untouched.
df['processed_description'] = df['Description'].map(preprocess_text)

print("Text preprocessing completed:")
print(df['processed_description'].head())

# Turn the cleaned descriptions into TF-IDF feature vectors.
# Row i of `tfidf_matrix` corresponds to row i of `df`.
vectorizer = TfidfVectorizer(
    max_features=500,  # Vocabulary cap; adjust as needed
)
tfidf_matrix = vectorizer.fit_transform(df['processed_description'])

print("TF-IDF matrix shape:", tfidf_matrix.shape)

# Apply K-means clustering
# K-means cannot fit more clusters than there are samples, so the requested
# count (300; adjust as needed) is capped at the number of rows in the
# TF-IDF matrix. `num_clusters` therefore holds the count actually used.
num_clusters = min(300, tfidf_matrix.shape[0])
kmeans = KMeans(n_clusters=num_clusters, random_state=42)  # fixed seed for reproducibility
kmeans.fit(tfidf_matrix)

# Assign cluster labels to products (labels_ is in the same row order as df)
df['cluster'] = kmeans.labels_

print("K-means clustering completed:")
print(df[['product_id', 'cluster']].head())

# Function to recommend similar products
def recommend_similar_products(product_id, n=5):
    """Recommend products similar to ``product_id``.

    Candidates are the other products in the same K-means cluster. They are
    ranked by cosine similarity of their TF-IDF vectors to the query
    product, the best ``max(20, n)`` are shortlisted, and within that
    shortlist products sharing the query's 'Category' are moved to the
    front (similarity order is preserved inside each group).

    Relies on the module-level ``df`` (with 'product_id', 'cluster',
    'Category', 'Name' and 'Description' columns) and ``tfidf_matrix``,
    whose rows must be in the same order as the rows of ``df``.

    Args:
        product_id: Value from ``df['product_id']`` identifying the query.
        n: Maximum number of recommendations to return.

    Returns:
        A 2-D NumPy object array with up to ``n`` rows of
        ``[product_id, Name, Description]``.

    Raises:
        IndexError: If ``product_id`` is not present in ``df``.
    """
    is_query = (df['product_id'] == product_id).to_numpy()
    query_positions = np.flatnonzero(is_query)
    if query_positions.size == 0:
        raise IndexError(f"product_id {product_id!r} not found in df")
    query_pos = query_positions[0]

    # Look the query row up once instead of re-filtering df per attribute.
    query_row = df.iloc[query_pos]
    cluster_label = query_row['cluster']
    category = query_row['Category']

    # Same cluster, excluding the query product itself.
    in_cluster = (df['cluster'] == cluster_label).to_numpy() & ~is_query
    candidate_positions = np.flatnonzero(in_cluster)

    # TfidfVectorizer L2-normalises rows by default, so the dot product of
    # two rows is their cosine similarity.
    similarities = (
        tfidf_matrix[candidate_positions]
        .dot(tfidf_matrix[query_pos].T)
        .toarray()
        .ravel()
    )

    # Stable sort keeps the original file order among equally similar items.
    order = np.argsort(-similarities, kind='stable')
    # Shortlist at least 20, but never fewer than the caller asked for.
    shortlist_size = max(20, n)
    shortlist = df.iloc[candidate_positions[order[:shortlist_size]]]

    # Prioritise same-category products within the shortlist.
    same_category = shortlist['Category'] == category
    ranked = pd.concat([shortlist[same_category], shortlist[~same_category]])

    return ranked.head(n)[['product_id', 'Name', 'Description']].values

# Example: show recommendations for the product stored at row 1789.
example_product_id = df['product_id'].iloc[1789]

# Fetch the matching row once and read both display fields from it.
example_row = df.loc[df['product_id'] == example_product_id].iloc[0]
example_product_name = example_row['Name']
example_product_description = example_row['Description']

similar_products = recommend_similar_products(example_product_id, n=5)

print(f"Product ID: {example_product_id}")
print(f"Product Name: {example_product_name}")
print(f"Product Description: {example_product_description}\n")

print("Top recommended products:")
# Each row is [product_id, Name, Description].
for rec_id, rec_name, rec_description in similar_products:
    print(f"Product ID: {rec_id}")
    print(f"Product Name: {rec_name}")
    print(f"Product Description: {rec_description}\n")


Data loaded successfully:
   product_id                               Name   Brand   Price  \
0           0             Premia Badam (Almonds)  Premia   451.0   
1           1             Premia Badam (Almonds)  Premia   109.0   
2           2             Premia Badam (Almonds)  Premia   202.0   
3           3  Nutraj California Almonds (Badam)  Nutraj   599.0   
4           4  Nutraj California Almonds (Badam)  Nutraj  1549.0   

   DiscountedPrice Category         SubCategory Quantity Description  \
0            329.0  Grocery  Grocery/Dry Fruits   500 gm       India   
1             85.0  Grocery  Grocery/Dry Fruits   100 gm       India   
2            175.0  Grocery  Grocery/Dry Fruits   200 gm       India   
3            349.0  Grocery          Dry Fruits   500 gm         USA   
4            659.0  Grocery          Dry Fruits     1 kg         USA   

                    BreadCrumbs  
0  Grocery > Grocery/Dry Fruits  
1  Grocery > Grocery/Dry Fruits  
2  Grocery > Grocery/Dry Fruit