In [24]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Load the dataset
file_path = '/content/BigBasket Products.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
data.head()


Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."
3,4,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.0,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...
4,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.0,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...


In [25]:
# Checking for missing values
missing_values = data.isnull().sum()

# Checking for duplicate entries
duplicate_entries = data.duplicated().sum()

# Overview of the rating distribution
rating_distribution = data['rating'].describe()

missing_values, duplicate_entries, rating_distribution


(index              0
 product            1
 category           0
 sub_category       0
 brand              1
 sale_price         1
 market_price       1
 type               1
 rating          5958
 description       81
 dtype: int64,
 0,
 count    12987.000000
 mean         3.938654
 std          0.744717
 min          1.000000
 25%          3.700000
 50%          4.100000
 75%          4.300000
 max          5.000000
 Name: rating, dtype: float64)

In [26]:

# Drop rows with missing 'rating' as they are critical for popularity-based recommendations
data.dropna(subset=['rating'], inplace=True)

categorical_features = ['category','sub_category','brand']
label_encoders = {}

for feature in categorical_features:
    le = LabelEncoder()
    data[feature] = le.fit_transform(data[feature].astype(str))  # Use 'le' instead of 'encoder'
    label_encoders[feature] = le

data['description'] = data['description'].astype(str)

# Now, proceed with tokenizing the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['description'])
text_sequences = tokenizer.texts_to_sequences(data['description'])
text_padded = pad_sequences(text_sequences, maxlen=100)

# Normalize numerical features
scaler = MinMaxScaler()
data['rating'] = scaler.fit_transform(data[['rating']])

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(text_padded, data['rating'], test_size=0.2, random_state=42)


In [27]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Flatten, Dropout

input_dim = text_padded.shape[1]  # This is the maxlen of your text sequences

# Define the encoder
input_layer = Input(shape=(input_dim,))
encoded = Dense(128, activation='relu')(input_layer)
encoded = Dense(64, activation='relu')(encoded)

# Define the decoder
decoded = Dense(64, activation='relu')(encoded)
decoded = Dense(128, activation='relu')(decoded)
output_layer = Dense(input_dim, activation='sigmoid')(decoded)

# Compile the autoencoder
autoencoder = Model(input_layer, output_layer)
autoencoder.compile(optimizer='adam', loss='binary_crossentropy')

# Summary of the model
autoencoder.summary()


Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 100)]             0         
                                                                 
 dense_10 (Dense)            (None, 128)               12928     
                                                                 
 dense_11 (Dense)            (None, 64)                8256      
                                                                 
 dense_12 (Dense)            (None, 64)                4160      
                                                                 
 dense_13 (Dense)            (None, 128)               8320      
                                                                 
 dense_14 (Dense)            (None, 100)               12900     
                                                                 
Total params: 46564 (181.89 KB)
Trainable params: 46564 (18

In [28]:
autoencoder.fit(X_train, X_train, epochs=50, batch_size=256, validation_split=0.2)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7ffa50505a80>

In [29]:
# Extract the encoder part of the autoencoder
encoder_model = Model(inputs=autoencoder.input, outputs=autoencoder.layers[-3].output)  # Assuming -3 is the encoded layer

# Generate embeddings for the entire dataset
item_embeddings = encoder_model.predict(text_padded)




In [35]:
def recommend_items(item_id, similarity_matrix, items, label_encoders, top_k=3):
    # Fetch the category of the specified item
    item_category = items.iloc[item_id]['category']

    # Get similarity scores for the item
    similarity_scores = list(enumerate(similarity_matrix[item_id]))

    # Sort the items based on the similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Initialize a list to hold indices of items that match the category constraint
    filtered_indices = []

    # Iterate over sorted similarity scores and filter by category
    for idx, score in similarity_scores:
        if items.iloc[idx]['category'] == item_category and idx != item_id:
            filtered_indices.append(idx)
        if len(filtered_indices) == top_k:
            break

    # Fetch the recommended items
    recommended_items = items.iloc[filtered_indices]

    # Decode the categorical features for readability
    for feature in ['category', 'sub_category', 'brand']:
        recommended_items[feature] = label_encoders[feature].inverse_transform(recommended_items[feature])

    # Sort the recommended items in descending order of rating
    recommended_items_sorted = recommended_items.sort_values(by='rating', ascending=False)

    return recommended_items_sorted


In [36]:
# Updated example usage
recommended_items = recommend_items(6, similarity_matrix, data, label_encoders, top_k=3)
print(recommended_items[['product', 'category', 'sub_category', 'brand', 'rating']])


                                        product          category  \
259  Aeda Glycerine Bathing Bar - Natural Green  Beauty & Hygiene   
186             Herbal Wine Grapefruit Lip Balm  Beauty & Hygiene   
354                 Active Fresh Gel Toothpaste  Beauty & Hygiene   

         sub_category             brand  rating  
259  Bath & Hand Wash  K.P. Namboodiris   0.925  
186         Skin Care     Khadi Natural   0.850  
354         Oral Care          Himalaya   0.825  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommended_items[feature] = label_encoders[feature].inverse_transform(recommended_items[feature])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommended_items[feature] = label_encoders[feature].inverse_transform(recommended_items[feature])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recomme

In [32]:
# Evaluate the model on the test set
test_loss = autoencoder.evaluate(X_test, X_test)
print(f'Test Loss: {test_loss}')


Test Loss: -2.1792618705049354e+18


In [33]:
item_0_data = data.iloc[0]
item_0_data

index                                                           1
product                    Garlic Oil - Vegetarian Capsule 500 mg
category                                                        2
sub_category                                                   42
brand                                                        1489
sale_price                                                  220.0
market_price                                                220.0
type                                             Hair Oil & Serum
rating                                                      0.775
description     This Product contains Garlic Oil that is known...
Name: 0, dtype: object

In [38]:
from tensorflow.keras.models import load_model
import joblib
import pickle

# Save the autoencoder model
autoencoder.save('/content/autoencoder_model.h5')  # Adjust path as needed

# Save the encoder part of the model
encoder_model.save('/content/encoder_model.h5')  # Adjust path as needed

# Save the tokenizer
with open('/content/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Save the label encoders
joblib.dump(label_encoders, '/content/label_encoders.joblib')

# Save the scaler
joblib.dump(scaler, '/content/scaler.joblib')


  saving_api.save_model(


['/content/scaler.joblib']

In [39]:
# Load the autoencoder model
autoencoder = load_model('/content/autoencoder_model.h5')

# Load the encoder model
encoder_model = load_model('/content/encoder_model.h5')

# Load the tokenizer
with open('/content/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Load the label encoders
label_encoders = joblib.load('/content/label_encoders.joblib')

# Load the scaler
scaler = joblib.load('/content/scaler.joblib')




In [43]:
def get_recommendations_for_product(product_name, data, similarity_matrix, label_encoders, top_k=3):
    # Find the index of the product
    product_indices = data[data['product'].str.contains(product_name, case=False, na=False)].index
    if not product_indices.empty:
        product_index = product_indices[0]
    else:
        return f"No product found matching name '{product_name}'"

    # Generate recommendations
    recommended_items = recommend_items(product_index, similarity_matrix, data, label_encoders, top_k=top_k)

    # Return recommended product details
    return recommended_items[['product', 'category', 'sub_category', 'brand', 'rating']]

# Example usage
product_name = 'Skin Oil'
recommendations = get_recommendations_for_product(product_name, data, similarity_matrix, label_encoders, top_k=3)
print(recommendations)


                                product          category      sub_category  \
224  Pure Neem Skin Purifying Face Wash  Beauty & Hygiene         Skin Care   
247           Skincare Hand Wash Refill  Beauty & Hygiene  Bath & Hand Wash   
6                          Multani Mati  Beauty & Hygiene         Skin Care   

         brand  rating  
224        Joy   0.825  
247     Dettol   0.825  
6    Satinance   0.650  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommended_items[feature] = label_encoders[feature].inverse_transform(recommended_items[feature])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommended_items[feature] = label_encoders[feature].inverse_transform(recommended_items[feature])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recomme