## **Dataset and Details**

In [None]:
import pandas as pd
from google.colab import drive
# Mount Google Drive so files stored there are readable from this Colab session.
drive.mount('/content/drive')

# Read the BigBasket product CSV from the mounted drive and preview the first rows.
products_path = "/content/drive/MyDrive/BigBasket Products.csv"
products_df = pd.read_csv(filepath_or_buffer=products_path)
products_df.head(n=3)

Mounted at /content/drive


Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."


In [None]:
# Remove the "index" column that was read from the CSV; the frame already has its own row index.
# errors="ignore" keeps this cell re-runnable: on a second execution the column is
# already gone and the drop becomes a no-op instead of raising KeyError.
products_df = products_df.drop(columns=["index"], errors="ignore")
products_df.head(3)

Unnamed: 0,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."


In [None]:
products_df.shape

(27555, 9)

In [None]:
products_df.columns

Index(['product', 'category', 'sub_category', 'brand', 'sale_price',
       'market_price', 'type', 'rating', 'description'],
      dtype='object')

In [None]:
products_df.nunique()

product         23540
category           11
sub_category       90
brand            2313
sale_price       3256
market_price     1348
type              426
rating             40
description     21944
dtype: int64

In [None]:
products_df.describe()

Unnamed: 0,sale_price,market_price,rating
count,27555.0,27555.0,18929.0
mean,322.514808,382.056664,3.94341
std,486.263116,581.730717,0.739063
min,2.45,3.0,1.0
25%,95.0,100.0,3.7
50%,190.0,220.0,4.1
75%,359.0,425.0,4.3
max,12500.0,12500.0,5.0


## **Null Values**

In [None]:
null_df =  products_df.copy(deep=True)

In [None]:
import numpy as np
# Summarise missing data: one row per column that contains at least one NaN,
# giving the absolute count and its share of all rows (percent, 2 decimals).
missing_counts = null_df.isna().sum()
na_cols = missing_counts[missing_counts > 0].index.tolist()
null_values = missing_counts[na_cols].to_frame(name='Number')
null_values['Percentage'] = (100 * null_values['Number'] / len(null_df)).round(2)
print(null_values)

             Number  Percentage
product           1        0.00
brand             1        0.00
rating         8626       31.30
description     115        0.42


In [None]:
# Rows lacking a description or a product name are discarded in a single pass
# (a row is dropped when either of the two fields is NaN).
null_df = null_df.dropna(subset=['description', 'product'], axis=0)

# Missing ratings are imputed with the median rating of the rows that remain.
rating_median = null_df['rating'].median()
null_df['rating'] = null_df['rating'].fillna(rating_median)

## **Encoding**

In [None]:
encoded_df = null_df.copy(deep=True)

### **Category:**

In [None]:
# One-hot encode the top-level category: one indicator column per category value,
# named "category_<value>", appended to the right of the existing columns.
onehot_BC = pd.get_dummies(data=encoded_df['category'], prefix='category')
encoded_df = pd.concat(objs=[encoded_df, onehot_BC], axis='columns')

In [None]:
encoded_df = encoded_df.drop(["category"],axis=1)
encoded_df.head(3)

Unnamed: 0,product,sub_category,brand,sale_price,market_price,type,rating,description,category_Baby Care,"category_Bakery, Cakes & Dairy",category_Beauty & Hygiene,category_Beverages,category_Cleaning & Household,"category_Eggs, Meat & Fish","category_Foodgrains, Oil & Masala",category_Fruits & Vegetables,category_Gourmet & World Food,"category_Kitchen, Garden & Pets",category_Snacks & Branded Foods
0,Garlic Oil - Vegetarian Capsule 500 mg,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...,0,0,1,0,0,0,0,0,0,0,0
1,Water Bottle - Orange,Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ...",0,0,0,0,0,0,0,0,0,1,0
2,"Brass Angle Deep - Plain, No.2",Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m...",0,0,0,0,1,0,0,0,0,0,0


In [None]:
encoded_df.shape

(27439, 19)

### **Subcategory:**

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace each sub_category string with an integer code learned from the column itself.
label_encoder = LabelEncoder()
label_encoder.fit(encoded_df['sub_category'])
encoded_df['sub_category'] = label_encoder.transform(encoded_df['sub_category'])

### **Brand:**

In [None]:
# Frequency-encode the brand: each row receives the share (in percent) of rows
# that carry the same brand, then the original string column is removed.
brand_counts = encoded_df["brand"].value_counts(normalize=True)
encoded_df = (
    encoded_df
    .assign(brand_freq_encoded=encoded_df["brand"].map(brand_counts) * 100)
    .drop(columns=["brand"])
)

In [None]:
encoded_df.head(3)

Unnamed: 0,product,sub_category,sale_price,market_price,type,rating,description,category_Baby Care,"category_Bakery, Cakes & Dairy",category_Beauty & Hygiene,category_Beverages,category_Cleaning & Household,"category_Eggs, Meat & Fish","category_Foodgrains, Oil & Masala",category_Fruits & Vegetables,category_Gourmet & World Food,"category_Kitchen, Garden & Pets",category_Snacks & Branded Foods,brand_freq_encoded
0,Garlic Oil - Vegetarian Capsule 500 mg,49,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...,0,0,1,0,0,0,0,0,0,0,0,0.043733
1,Water Bottle - Orange,86,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ...",0,0,0,0,0,0,0,0,0,1,0,0.211378
2,"Brass Angle Deep - Plain, No.2",73,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m...",0,0,0,0,1,0,0,0,0,0,0,0.153067


In [None]:
encoded_df.shape

(27439, 19)

### **Type:**

In [None]:
# Integer-encode the product type. This re-fits the `label_encoder` object that was
# already fitted on sub_category, so afterwards the encoder only holds the `type` classes.
encoded_df['type'] = label_encoder.fit_transform(encoded_df['type'])

### **Description:**

In [None]:
# Pull the description strings into a plain Python list for the length inspection below.
description = encoded_df['description'].values.tolist()
# Preview a few entries only; echoing the whole list would dump every description
# into the cell output.
description[:3]

In [None]:
# Collect the distinct character lengths that occur among the descriptions.
unique_lengths = {len(text) for text in description}
print(unique_lengths)

{7, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 

In [None]:
len(unique_lengths)

1895

In [None]:
min(unique_lengths)

7

In [None]:
max(unique_lengths)

4486

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
def word_embeding(max_w, max_l, column_):
  """Tokenize a collection of texts into integer word-index sequences.

  Despite the name, no embedding is computed here: the texts are only passed
  through a Keras ``Tokenizer``.

  Args:
    max_w: Passed to the ``Tokenizer`` as ``num_words``.
    max_l: Currently unused; kept so existing calls keep working. No padding
      or truncation is applied by this function.
    column_: The texts to fit the tokenizer on and to convert.

  Returns:
    A ``(word_index, sequences)`` tuple: the tokenizer's ``word_index``
    mapping and the result of ``texts_to_sequences`` for ``column_``.

  Side effects:
    Prints the number of entries in ``word_index``.
  """
  tokenizer = Tokenizer(num_words=max_w)
  tokenizer.fit_on_texts(column_)
  sequences = tokenizer.texts_to_sequences(column_)
  word_index = tokenizer.word_index
  print('Found %s unique tokens.' % len(word_index))
  return word_index, sequences

In [None]:
word_index, sequences = word_embeding(20000, 4500, encoded_df['description'])

Found 32831 unique tokens.


In [None]:
encoded_df['description_sequences'] = sequences

In [None]:
encoded_df.head(3)

Unnamed: 0,product,sub_category,sale_price,market_price,type,rating,description,category_Baby Care,"category_Bakery, Cakes & Dairy",category_Beauty & Hygiene,category_Beverages,category_Cleaning & Household,"category_Eggs, Meat & Fish","category_Foodgrains, Oil & Masala",category_Fruits & Vegetables,category_Gourmet & World Food,"category_Kitchen, Garden & Pets",category_Snacks & Branded Foods,brand_freq_encoded,description_sequences
0,Garlic Oil - Vegetarian Capsule 500 mg,49,220.0,220.0,204,4.1,This Product contains Garlic Oil that is known...,0,0,1,0,0,0,0,0,0,0,0,0.043733,"[15, 62, 71, 574, 29, 14, 5, 144, 6, 83, 1582,..."
1,Water Bottle - Orange,86,180.0,180.0,420,2.3,"Each product is microwave safe (without lid), ...",0,0,0,0,0,0,0,0,0,1,0,0.211378,"[210, 62, 5, 599, 96, 87, 329, 1664, 96, 607, ..."
2,"Brass Angle Deep - Plain, No.2",73,119.0,250.0,249,3.4,"A perfect gift for all occasions, be it your m...",0,0,0,0,1,0,0,0,0,0,0,0.153067,"[4, 48, 667, 10, 30, 1284, 27, 8, 11, 1466, 45..."


In [None]:
import numpy as np
# Sanity check after encoding: list every column that still contains NaN together
# with its count and percentage of rows (an empty table means nothing is missing).
NoNull_df = encoded_df.copy(deep=True)
columns_ = [col for col in NoNull_df.columns if NoNull_df[col].isna().any()]
remaining_nulls = NoNull_df[columns_].isna().sum()
Nan_value = pd.DataFrame(remaining_nulls, columns=['Number of null values'])
Nan_value['Percentage'] = np.round(100 * Nan_value['Number of null values'] / len(NoNull_df), 2)
Nan_value

Unnamed: 0,Number of null values,Percentage


In [None]:
encoded_df = encoded_df.drop('description', axis = 1)
encoded_df.head()

Unnamed: 0,product,sub_category,sale_price,market_price,type,rating,category_Baby Care,"category_Bakery, Cakes & Dairy",category_Beauty & Hygiene,category_Beverages,category_Cleaning & Household,"category_Eggs, Meat & Fish","category_Foodgrains, Oil & Masala",category_Fruits & Vegetables,category_Gourmet & World Food,"category_Kitchen, Garden & Pets",category_Snacks & Branded Foods,brand_freq_encoded,description_sequences
0,Garlic Oil - Vegetarian Capsule 500 mg,49,220.0,220.0,204,4.1,0,0,1,0,0,0,0,0,0,0,0,0.043733,"[15, 62, 71, 574, 29, 14, 5, 144, 6, 83, 1582,..."
1,Water Bottle - Orange,86,180.0,180.0,420,2.3,0,0,0,0,0,0,0,0,0,1,0,0.211378,"[210, 62, 5, 599, 96, 87, 329, 1664, 96, 607, ..."
2,"Brass Angle Deep - Plain, No.2",73,119.0,250.0,249,3.4,0,0,0,0,1,0,0,0,0,0,0,0.153067,"[4, 48, 667, 10, 30, 1284, 27, 8, 11, 1466, 45..."
3,Cereal Flip Lid Container/Storage Jar - Assort...,9,149.0,176.0,250,3.7,0,0,0,0,1,0,0,0,0,0,0,0.375378,"[1016, 366, 7, 23, 684, 208, 1, 24, 17, 40, 19..."
4,Creme Soft Soap - For Hands & Body,8,162.0,162.0,39,4.4,0,0,1,0,0,0,0,0,0,0,0,0.317067,"[1481, 1028, 73, 223, 97, 11, 16, 2, 54, 107, ..."


In [None]:
def conversion(con_df,column):
  """Expand a column of variable-length sequences into zero-padded columns.

  Every sequence in ``con_df[column]`` is right-padded with ``0`` up to the
  length of the longest sequence, and the padded values are written back into
  ``con_df`` as new columns named ``f"{column}1"`` ... ``f"{column}N"``.

  Args:
    con_df: DataFrame to modify in place.
    column: Name of the column holding the sequences (e.g. lists of ints).

  Returns:
    None. ``con_df`` is mutated in place and then printed.

  Notes:
    The padded table is built once from plain lists instead of creating one
    ``pd.Series`` per row, which is far slower on large frames. An empty frame
    (or one whose sequences are all empty) adds no columns instead of failing.
  """
  sequences = con_df[column].tolist()
  # default=0 covers an empty frame, where max() would otherwise raise ValueError.
  max_length = max((len(seq) for seq in sequences), default=0)
  if max_length > 0:
    new_columns = [f'{column}{i + 1}' for i in range(max_length)]
    padded = [list(seq) + [0] * (max_length - len(seq)) for seq in sequences]
    # Reuse con_df's own index so the assignment aligns row for row.
    con_df[new_columns] = pd.DataFrame(padded, index=con_df.index, columns=new_columns)
  print(con_df)

In [None]:
conversion(encoded_df,'description_sequences')

In [None]:
encoded_df.head(3)

Unnamed: 0,product,sub_category,sale_price,market_price,type,rating,category_Baby Care,"category_Bakery, Cakes & Dairy",category_Beauty & Hygiene,category_Beverages,...,description_sequences721,description_sequences722,description_sequences723,description_sequences724,description_sequences725,description_sequences726,description_sequences727,description_sequences728,description_sequences729,description_sequences730
0,Garlic Oil - Vegetarian Capsule 500 mg,49,220.0,220.0,204,4.1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Water Bottle - Orange,86,180.0,180.0,420,2.3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Brass Angle Deep - Plain, No.2",73,119.0,250.0,249,3.4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
encoded_df = encoded_df.drop('description_sequences', axis = 1)
encoded_df.head()

Unnamed: 0,product,sub_category,sale_price,market_price,type,rating,category_Baby Care,"category_Bakery, Cakes & Dairy",category_Beauty & Hygiene,category_Beverages,...,description_sequences721,description_sequences722,description_sequences723,description_sequences724,description_sequences725,description_sequences726,description_sequences727,description_sequences728,description_sequences729,description_sequences730
0,Garlic Oil - Vegetarian Capsule 500 mg,49,220.0,220.0,204,4.1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Water Bottle - Orange,86,180.0,180.0,420,2.3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Brass Angle Deep - Plain, No.2",73,119.0,250.0,249,3.4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Cereal Flip Lid Container/Storage Jar - Assort...,9,149.0,176.0,250,3.7,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Creme Soft Soap - For Hands & Body,8,162.0,162.0,39,4.4,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## **Recommender System**

In [None]:
recomm_df = encoded_df.copy(deep=True)

In [None]:
def vectorize_rows(dataframe, columns_to_exclude=None):
    """Return the rows of ``dataframe`` as a list of lists.

    Args:
        dataframe: Source DataFrame; it is not modified.
        columns_to_exclude: Optional collection of column names to leave out.
            When it is ``None`` or empty, all columns are kept.

    Returns:
        One inner list per row, holding that row's values in column order.
    """
    kept = dataframe.drop(columns=columns_to_exclude) if columns_to_exclude else dataframe
    return kept.values.tolist()

In [None]:
# Build one feature vector per row, leaving out the product name and the sale price.
columns_for_vector = recomm_df.columns  # NOTE(review): not used in the visible cells
columns_to_exclude = ['product', 'sale_price']
vectorized_rows = vectorize_rows(recomm_df, columns_to_exclude)

# Show the first three vectors as a quick sanity check.
for row_number, row in enumerate(vectorized_rows[:3], start=1):
    print(f"Row {row_number}: {row}")

Row 1: [49.0, 220.0, 204.0, 4.1, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.04373337220744196, 15.0, 62.0, 71.0, 574.0, 29.0, 14.0, 5.0, 144.0, 6.0, 83.0, 1582.0, 570.0, 419.0, 1582.0, 372.0, 481.0, 837.0, 4094.0, 1.0, 33.0, 1142.0, 406.0, 10.0, 138.0, 195.0, 250.0, 46.0, 317.0, 235.0, 133.0, 193.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 

In [None]:
recomm_df.head(3)

Unnamed: 0,product,sub_category,sale_price,market_price,type,rating,category_Baby Care,"category_Bakery, Cakes & Dairy",category_Beauty & Hygiene,category_Beverages,...,description_sequences721,description_sequences722,description_sequences723,description_sequences724,description_sequences725,description_sequences726,description_sequences727,description_sequences728,description_sequences729,description_sequences730
0,Garlic Oil - Vegetarian Capsule 500 mg,49,220.0,220.0,204,4.1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Water Bottle - Orange,86,180.0,180.0,420,2.3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Brass Angle Deep - Plain, No.2",73,119.0,250.0,249,3.4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Create a dictionary where keys are product names and values are the corresponding vectorized rows.
# NOTE(review): product names are not unique (the outputs in this notebook show 27439 rows but
# only 23449 dictionary entries), so for a repeated name only the last row's vector is kept.
product_vector_dict = dict(zip(recomm_df['product'], vectorized_rows))

In [None]:
dict_size = len(product_vector_dict)
print(f"The size of the dictionary is: {dict_size}")

The size of the dictionary is: 23449


In [None]:
recomm_df.shape

(27439, 748)

### **Jaccard Similarity**


#### **Example**:

In [69]:
def jaccard_similarity(set1, set2):
    """Compute the Jaccard index |set1 ∩ set2| / |set1 ∪ set2|.

    Args:
        set1: First set.
        set2: Second set.

    Returns:
        The ratio of shared elements to all distinct elements, or ``0`` when
        both sets are empty (the union has no elements).
    """
    combined = set1.union(set2)
    if len(combined) == 0:
        return 0
    shared = set1.intersection(set2)
    return len(shared) / len(combined)

def find_similar_products(customer_product, product_vectors, threshold=0.325):
    """List the products whose feature values overlap enough with a given product.

    Each vector is reduced to the set of its distinct values before comparison,
    so the position of a value inside the vector plays no role in the score.

    Args:
        customer_product: Key of the reference product in ``product_vectors``.
            A key that is missing is treated as having an empty vector.
        product_vectors: Mapping from product name to its feature vector.
        threshold: A product is returned only when its Jaccard similarity to
            the reference is strictly greater than this value.

    Returns:
        Names of the matching products, in the iteration order of
        ``product_vectors``; the reference product itself is never included.
    """
    reference_values = set(product_vectors.get(customer_product, []))
    return [
        name
        for name, features in product_vectors.items()
        if name != customer_product
        and jaccard_similarity(reference_values, set(features)) > threshold
    ]


In [70]:
customer_wanted_product = "Garlic Oil - Vegetarian Capsule 500 mg"
similar_products = find_similar_products(customer_wanted_product, product_vector_dict, threshold=0.33)

In [71]:
similar_products

['Evening Primrose Oil - Vegetarian Capsule (500 mg)',
 'Cotton Balls',
 'Coconut Oil - 100 % Pure',
 'Flaxseed Oil - Omega-3, Omega-6, Omega-9 Vegetarian Capsule',
 'Hair Serum - Anti Danduruff',
 'Hair Oil - Amla',
 'Shanti - Badam Amla Hair Oil']

#### **The System:**

In [72]:
# Interactive prompt: show the catalogue, read a product name, print the products
# that find_similar_products returns for it, and repeat while the user answers 'y'.
# NOTE(review): the full product list is printed on every round, which makes the
# cell output very large.
column_value = recomm_df['product'].tolist()  # does not change between rounds, so build it once
while True:
  print("This is the list of our products:", column_value)
  user_input = input("Please enter your considered product: ")  # e.g. Strawberry Conserve
  if user_input in product_vector_dict:
    outputs = find_similar_products(user_input, product_vector_dict, threshold=0.325)
    print("Here you go:")
    for item in outputs:
      print(item)
  else:
    # An unknown name has no vector to compare against, so say so explicitly
    # rather than printing an empty result.
    print(f"Unknown product: {user_input!r}")
  print("connect again? y/n")
  user_input = input()
  if user_input != 'y':
    break

This is the list of our products: ['Garlic Oil - Vegetarian Capsule 500 mg', 'Water Bottle - Orange', 'Brass Angle Deep - Plain, No.2', 'Cereal Flip Lid Container/Storage Jar - Assorted Colour', 'Creme Soft Soap - For Hands & Body', 'Germ - Removal Multipurpose Wipes', 'Multani Mati', 'Hand Sanitizer - 70% Alcohol Base', 'Biotin & Collagen Volumizing Hair Shampoo + Biotin & Collagen Hair Conditioner', 'Scrub Pad - Anti- Bacterial, Regular', 'Wheat Grass Powder - Raw', 'Butter Cookies Gold Collection', 'Face Wash - Oil Control, Active', 'Mold & Mildew Remover with Bleach', 'Just Spray - Mosquito Repellent Room Spray', 'Dove Plastic Soap Case - Assorted Colour', 'Smooth Skin Oil - For Dry Skin', 'Salted Pumpkin', 'Flax Seeds - Roasted', 'Organic Tofu - Soy Paneer', 'Ceramic Barrel Brush - Colour May Vary', 'Instant Noodles - Chicken Satay Flavor', 'Chia Seeds', 'Cleanse Green Tea - Whole Leaf Loose Tea', 'Veggie Cutter', 'Insulated Hot Fresh Casserole For Roti/Chapati - White', 'Granola 