In [1]:
#Basic Libraries
import numpy as np
import pandas as pd

#Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

#Text Handling Libraries
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# clustering
from sklearn.cluster import KMeans

#NLP Libraries (stop words and tokenizer)
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [2]:
# Function for removing NonAscii characters
def _removeNonAscii(s):
    """Return ``s`` with every character whose code point is 128 or above removed.

    Characters are dropped, not replaced, so surrounding text closes up
    around them.
    """
    ascii_only = filter(lambda ch: ord(ch) < 128, s)
    return "".join(ascii_only)

# Function for converting into lower case
def make_lower_case(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

# Function for removing stop words
def remove_stop_words(text):
    """Drop English stop words from ``text``.

    The text is split on whitespace; every token that is an exact member of
    NLTK's English stop-word list is discarded, and the remaining tokens are
    re-joined with single spaces. Membership is an exact string match, so
    callers wanting case-insensitive filtering should lower-case first.
    """
    english_stops = set(stopwords.words("english"))
    kept_tokens = (token for token in text.split() if token not in english_stops)
    return " ".join(kept_tokens)

# Function for removing punctuation
def remove_punctuation(text):
    """Keep only the runs of word characters in ``text``, joined by single spaces.

    Anything that is not a word character (punctuation, symbols, whitespace)
    acts as a separator and is discarded. Underscores count as word
    characters and are therefore kept.
    """
    word_tokens = RegexpTokenizer(r'\w+').tokenize(text)
    return " ".join(word_tokens)

# Function for removing the html tags
def remove_html(text):
    """Strip ``<...>`` spans (HTML-style tags) from ``text``.

    The match is non-greedy and does not cross line breaks, so an opening
    ``<`` without a closing ``>`` on the same line is left untouched.
    """
    return re.sub('<.*?>', '', text)

In [5]:
# Load the catalogue, keep only the first row for each product name, then
# discard any row that still has a missing value in any column.
df = (
    pd.read_csv('products.csv')
    .drop_duplicates(subset='product')
    .dropna()
)

In [6]:
# Renumber rows 0..n-1 so a row's label equals its position; the old index is
# kept as an extra column by reset_index().
df_clean = df.reset_index()
products = df_clean['product']
# Reverse lookup: product name -> row position in df_clean.
indices = pd.Series(data=df_clean.index, index=products)
df_clean.head()

Unnamed: 0,level_0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."
3,3,4,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.0,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...
4,4,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.0,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...


In [8]:
# Build one free-text "soup" per product: the description followed by the
# category, sub-category, brand and type, separated by single spaces.
df_clean['soup'] = (
    df_clean['description']
    + ' ' + df_clean['category']
    + ' ' + df_clean['sub_category']
    + ' ' + df_clean['brand']
    + ' ' + df_clean['type']
)
# Show the first product's soup as a sanity check.
df_clean.loc[0, 'soup']

'This Product contains Garlic Oil that is known to help proper digestion, maintain proper cholesterol levels, support cardiovascular and also build immunity.  For Beauty tips, tricks & more visit https://bigbasket.blog/ Beauty & Hygiene Hair Care Sri Sri Ayurveda  Hair Oil & Serum'

In [11]:
# Normalise the soup text in place: drop non-ASCII characters, lower-case,
# strip HTML tags, then strip punctuation.
# Order matters: HTML tags must be removed BEFORE punctuation, because
# remove_punctuation discards the '<' and '>' that remove_html relies on to
# recognise a tag; run the other way round, tag names such as "br" or "p"
# would survive as ordinary words.
# Stop words are not removed here; the TF-IDF vectorizer is configured with
# stop_words='english' and handles them.
df_clean['soup'] = (
    df_clean['soup']
    .apply(_removeNonAscii)
    .apply(make_lower_case)
    .apply(remove_html)
    .apply(remove_punctuation)
)
df_clean['soup'][0]

'this product contains garlic oil that is known to help proper digestion maintain proper cholesterol levels support cardiovascular and also build immunity for beauty tips tricks more visit https bigbasket blog beauty hygiene hair care sri sri ayurveda hair oil serum'

In [12]:
# Cleaned soup text alongside the product name it belongs to.
df_clean.loc[:, ['soup', 'product']]

Unnamed: 0,soup,product
0,this product contains garlic oil that is known...,Garlic Oil - Vegetarian Capsule 500 mg
1,each product is microwave safe without lid ref...,Water Bottle - Orange
2,a perfect gift for all occasions be it your mo...,"Brass Angle Deep - Plain, No.2"
3,multipurpose container with an attractive desi...,Cereal Flip Lid Container/Storage Jar - Assort...
4,nivea creme soft soap gives your skin the best...,Creme Soft Soap - For Hands & Body
...,...,...
16019,this round toilet brush is made up of virgin q...,Toilet Cleaning Brush - Round With Holder (Big)
16020,organic tattva garam masala is a famous spice ...,Organic Powder - Garam Masala
16021,layerr brings you wottagirl classic fragrant b...,"Wottagirl! Perfume Spray - Heaven, Classic"
16022,we have taken the richness of sweet potatoes s...,Peri-Peri Sweet Potato Chips


In [13]:
# recommendation
def get_tfid_recommendation(title, sim, top_n=10):
    """Return the products most similar to ``title``.

    Parameters
    ----------
    title : str
        Product name; resolved to a row position through the module-level
        ``indices`` Series.
    sim : 2-D array-like
        Pairwise similarity scores. ``sim[i]`` must hold row ``i``'s score
        against every row, in the same row order as ``products``.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of the module-level ``products`` Series,
        ordered from most to least similar. The query product itself is
        never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # NOTE(review): assumes product names are unique so this is a scalar
    # position; duplicates in `indices` would yield a Series here.
    idx = indices[title]

    # Stable sort: rows with equal scores keep their original relative order.
    ranked = sorted(enumerate(sim[idx]), key=lambda pair: pair[1], reverse=True)

    # Exclude the query row by position rather than assuming it sorts first.
    # Another product with an identical soup ties for the top score and, if it
    # sits earlier in the frame, would otherwise be dropped in place of the
    # query, leaving the query to recommend itself.
    neighbour_positions = [pos for pos, _ in ranked if pos != idx][:top_n]
    return products.iloc[neighbour_positions]

In [14]:
# Unigram + bigram TF-IDF over the cleaned soup text, with scikit-learn's
# built-in English stop-word list removed.
# min_df=1 is the lowest valid integer threshold and keeps every term. It
# selects the same vocabulary as min_df=0 (every vocabulary term occurs in at
# least one document), but an integer 0 is rejected by parameter validation in
# recent scikit-learn releases.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df_clean['soup'])

In [15]:
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel equals the cosine similarity between products.
# NOTE(review): the result is a dense (n_products x n_products) array; check
# memory headroom before running this on a much larger catalogue.
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)

In [16]:
# Example query: a snack product.
searchstr = 'Peri-Peri Sweet Potato Chips'
get_tfid_recommendation(title=searchstr, sim=cosine_sim)

2945                  Salsa Sweet Potato Chips
3905            Potato Chips - Honey & Mustard
5177                  Potato Crisps - Jalapeno
10595                     Potato Chips - Pizza
13222                     Potato Chips - Ranch
11441    South African Style Peri Peri Flavour
3385                   High Protein Soya Chips
1087            Chips - Keralas Nendran Banana
178                         Sour Cream & Onion
7862                          Chia Seeds Chips
Name: product, dtype: object

In [17]:
# Example query: a personal-care product.
searchstr = 'Creme Soft Soap - For Hands & Body'
get_tfid_recommendation(title=searchstr, sim=cosine_sim)

2661                  Bathing Soap (Lavender & Milk Cream)
4519       Fruity Soap Enriched with Narural Grape Extract
6227        Fruity Soap Enriched with Narural Lime Extract
6578       Fruity Soap Enriched with Narural Lemon Extract
15433          Bathing Soap - with Neem, Tulsi & Aloe Vera
3160                    Creme Care Soap - For Hands & Body
7265                                     Soap - Creme Care
8302     Crème Care Women Body Wash - Shower Gel For So...
3444                       Laboratory Reagent CH3, CO, CH3
3419                       Moisturise Lotion - Body Cocoon
Name: product, dtype: object