In [1]:
#Basic Libraries
import numpy as np
import pandas as pd

#Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

#Text Handling Libraries
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# clustering
from sklearn.cluster import KMeans

# NLP helpers (NLTK): stop-word lists and regex tokenizer
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [2]:
# Function for removing NonAscii characters
def _removeNonAscii(s):
    """Return ``s`` with every non-ASCII character dropped.

    A character is kept only when its code point is below 128; the
    surviving characters are concatenated in their original order.
    """
    ascii_only = [ch for ch in s if ord(ch) <= 127]
    return "".join(ascii_only)

# Function for converting into lower case
def make_lower_case(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

# Function for removing stop words
def remove_stop_words(text, language="english"):
    """Return ``text`` with stop words removed.

    The text is split on whitespace, every token found in the NLTK stop-word
    list for ``language`` is discarded, and the remaining tokens are joined
    with single spaces.

    The membership test is case-sensitive, so lower-case the text first if
    capitalised stop words ("The", "And", ...) should be removed as well.

    Parameters
    ----------
    text : str
        Text to filter.
    language : str, default "english"
        Name of the NLTK stop-word list to use.

    Returns
    -------
    str
        The filtered text.
    """
    stops = _stop_word_set(language)
    return " ".join(word for word in text.split() if word not in stops)


# Stop-word sets keyed by language, filled lazily on first use.
_STOP_WORD_CACHE = {}


def _stop_word_set(language):
    """Return the stop words for ``language`` as a frozenset, loading them once."""
    # Building the set on every call is wasted work when this function is
    # applied row by row to a DataFrame column, so it is cached per language.
    if language not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORD_CACHE[language]

# Function for removing punctuation
def remove_punctuation(text):
    """Strip punctuation from ``text``.

    Only runs of word characters (letters, digits and underscore) are kept;
    everything between them collapses to a single space.
    """
    words = RegexpTokenizer(r'\w+').tokenize(text)
    return " ".join(words)

# Function for removing the html tags
def remove_html(text):
    """Delete every ``<...>`` span from ``text``.

    The match is non-greedy, so each tag is removed individually, and it
    does not cross line breaks.
    """
    return re.sub('<.*?>', '', text)

In [3]:
# Load the catalogue, keep only the first row seen for each product name,
# then discard every row that still has a missing value in any column.
df = (
    pd.read_csv('products.csv')
    .drop_duplicates(subset='product')
    .dropna()
)

In [4]:
# Renumber the surviving rows 0..n-1; the previous row labels are kept as a column.
df_new = df.reset_index()

# Product names by row position, and the reverse lookup name -> row position.
products = df_new['product']
indices = pd.Series(df_new.index, index=products)

df_new[['description', 'product']]

Unnamed: 0,description,product
0,This Product contains Garlic Oil that is known...,Garlic Oil - Vegetarian Capsule 500 mg
1,"Each product is microwave safe (without lid), ...",Water Bottle - Orange
2,"A perfect gift for all occasions, be it your m...","Brass Angle Deep - Plain, No.2"
3,Multipurpose container with an attractive desi...,Cereal Flip Lid Container/Storage Jar - Assort...
4,Nivea Creme Soft Soap gives your skin the best...,Creme Soft Soap - For Hands & Body
...,...,...
16019,This round toilet brush is made up of virgin q...,Toilet Cleaning Brush - Round With Holder (Big)
16020,Organic Tattva Garam masala is a famous spice ...,Organic Powder - Garam Masala
16021,Layerr brings you Wottagirl Classic fragrant b...,"Wottagirl! Perfume Spray - Heaven, Classic"
16022,We have taken the richness of Sweet Potatoes (...,Peri-Peri Sweet Potato Chips


In [5]:
# recommendation
def get_tfid_recommendation(title, sim, top_n=10):
    """Return the products most similar to ``title``.

    Relies on the notebook-level ``indices`` (product name -> row position)
    and ``products`` (row position -> product name) lookups.

    Parameters
    ----------
    title : str
        Product name to look up in ``indices``.
    sim : 2-D array-like
        Pairwise similarity scores; row ``i`` holds the similarity of item
        ``i`` to every item.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` product names, most similar first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    idx = indices[title]

    # Drop the query item by position instead of assuming it sorts first:
    # another product with an equal score (e.g. an identical description)
    # and a lower row position would otherwise push the query item into
    # its own recommendation list.
    candidates = [(pos, score) for pos, score in enumerate(sim[idx]) if pos != idx]

    # sorted() is stable, so equally scored items keep their row order.
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)

    product_indices = [pos for pos, _ in ranked[:top_n]]
    return products.iloc[product_indices]

In [6]:
# Preview the first five rows of the re-indexed catalogue.
df_new.head(n=5)

Unnamed: 0,level_0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."
3,3,4,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.0,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...
4,4,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.0,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...


In [7]:
# Clean every description and store the result as cleaned_desc.
# HTML tags are removed *before* punctuation: remove_punctuation discards the
# '<' and '>' delimiters, after which remove_html has nothing left to match
# and the tag names would survive as ordinary words.
df_new['cleaned_desc'] = (
    df_new['description']
    .apply(_removeNonAscii)
    .apply(make_lower_case)
    .apply(remove_html)
    .apply(remove_punctuation)
)

# Show the first cleaned description as a sanity check.
df_new.loc[0, 'cleaned_desc']

'this product contains garlic oil that is known to help proper digestion maintain proper cholesterol levels support cardiovascular and also build immunity for beauty tips tricks more visit https bigbasket blog'

In [8]:
# Unigram + bigram TF-IDF over the cleaned descriptions, English stop words removed.
# An integer min_df must be at least 1 (scikit-learn's parameter validation
# rejects 0); min_df=1 keeps every term that occurs in any document, so no
# vocabulary is lost.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df_new['cleaned_desc'])

In [9]:
# Pairwise dot products of the TF-IDF rows. TfidfVectorizer L2-normalises each
# row unless `norm` is overridden, so these dot products are cosine similarities.
cosine_sim = linear_kernel(tfidf_matrix)

In [10]:
# Example query: products closest to a snack item.
searchstr = 'Peri-Peri Sweet Potato Chips'
get_tfid_recommendation(title=searchstr, sim=cosine_sim)

2945                          Salsa Sweet Potato Chips
11441            South African Style Peri Peri Flavour
10442                      Mad Angles - Very Peri Peri
6486              Peri Peri Crackers - 100% Wholewheat
268      Peri Peri Sauce - Bird Eye Chili, Hot & Tangy
7097                               Banana Chips Masala
2573                       Nachoz - Sizzling Peri Peri
1087                    Chips - Keralas Nendran Banana
7903                       African Peri Peri Seasoning
10308                     Tapioca Chips - Classic Salt
Name: product, dtype: object

In [11]:
# Example query: products closest to a personal-care item.
searchstr = 'Creme Soft Soap - For Hands & Body'
get_tfid_recommendation(title=searchstr, sim=cosine_sim)

2661               Bathing Soap (Lavender & Milk Cream)
3419                    Moisturise Lotion - Body Cocoon
3444                    Laboratory Reagent CH3, CO, CH3
3591                                       Cotton Balls
4519    Fruity Soap Enriched with Narural Grape Extract
6227     Fruity Soap Enriched with Narural Lime Extract
6578    Fruity Soap Enriched with Narural Lemon Extract
7647                 Pure Herbs - Skin Kasturi Turmeric
7708                    Classic Deodorant Spray for Men
8428           Deodorant Body Spray - Magnetism for Men
Name: product, dtype: object