In [122]:
import pandas as pd
import numpy as np

In [123]:
import nltk

In [124]:
# Read the product catalogue from the CSV next to the notebook and preview the first five rows
amazon_df = pd.read_csv(filepath_or_buffer='amazon_product.csv')
amazon_df.head()

Unnamed: 0,id,Title,Description,Category
0,1,Swissmar Capstore Select Storage Rack for 18-...,Swissmar's capstore select 18 storage unit kee...,Home & Kitchen Kitchen & Dining Kitchen Utens...
1,2,Gemini200 Delta CV-880 Gold Crown Livery Airc...,Welcome to the exciting world of GeminiJets! O...,Toys & Games Hobbies Models & Model Kits Pre-...
2,5,Superior Threads 10501-2172 Magnifico Cream P...,"For quilting and embroidery, this product is m...","Arts, Crafts & Sewing Sewing Thread & Floss S..."
3,6,Fashion Angels Color Rox Hair Chox Kit,Experiment with the haute trend of hair chalki...,Beauty & Personal Care Hair Care Hair Colorin...
4,8,Union Creative Giant Killing Figure 05: Daisu...,From Union Creative. Turn your display shelf i...,Toys & Games › Action Figures & Statues › Sta...


In [125]:
# Summarize column dtypes and non-null counts to check for missing data
amazon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 668 entries, 0 to 667
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           668 non-null    int64 
 1   Title        668 non-null    object
 2   Description  668 non-null    object
 3   Category     668 non-null    object
dtypes: int64(1), object(3)
memory usage: 21.0+ KB


In [126]:
# Count the missing values in each column
amazon_df.isna().sum()

id             0
Title          0
Description    0
Category       0
dtype: int64

In [127]:
# The id column is not needed for the text-similarity search, so drop it.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second run no longer raises KeyError.
amazon_df = amazon_df.drop(columns='id', errors='ignore')

In [128]:
# Preview the frame again now that the id column has been dropped
amazon_df.head(5)

Unnamed: 0,Title,Description,Category
0,Swissmar Capstore Select Storage Rack for 18-...,Swissmar's capstore select 18 storage unit kee...,Home & Kitchen Kitchen & Dining Kitchen Utens...
1,Gemini200 Delta CV-880 Gold Crown Livery Airc...,Welcome to the exciting world of GeminiJets! O...,Toys & Games Hobbies Models & Model Kits Pre-...
2,Superior Threads 10501-2172 Magnifico Cream P...,"For quilting and embroidery, this product is m...","Arts, Crafts & Sewing Sewing Thread & Floss S..."
3,Fashion Angels Color Rox Hair Chox Kit,Experiment with the haute trend of hair chalki...,Beauty & Personal Care Hair Care Hair Colorin...
4,Union Creative Giant Killing Figure 05: Daisu...,From Union Creative. Turn your display shelf i...,Toys & Games › Action Figures & Statues › Sta...


In [129]:
# Smoke-test NLTK's word and sentence tokenizers on two short strings.
# NOTE(review): these calls presumably need the 'punkt' tokenizer data;
# uncomment the download line below on a machine that does not have it yet.
#nltk.download('punkt')
print(nltk.word_tokenize('hello world!!!'))
print(nltk.sent_tokenize('how are you. are you well. tell me'))

['hello', 'world', '!', '!', '!']
['how are you.', 'are you well.', 'tell me']


In [130]:
from nltk.stem.snowball import SnowballStemmer

# Shared English Snowball stemmer used by the text-normalization helper
stemmer = SnowballStemmer(language='english')


In [140]:
# Pipeline: text --> lowercase --> word tokens --> stemmed tokens --> single space-joined string
def tokenize_stem(text):
    """Normalize ``text`` into a space-separated string of stemmed tokens.

    The text is lower-cased, split with ``nltk.word_tokenize``, each token is
    reduced with the module-level ``stemmer``, and the stems are joined back
    together with single spaces.

    Parameters
    ----------
    text : str
        Raw text to normalize.

    Returns
    -------
    str
        The stemmed tokens joined by ``" "``.
    """
    lowered = text.lower()
    return " ".join(map(stemmer.stem, nltk.word_tokenize(lowered)))


In [141]:
# Tokenize and stem the combined Title + Description text of every product.
# The two columns are concatenated with a vectorized string operation, so the
# stemming helper is applied to one Series instead of going through a
# row-by-row DataFrame.apply(axis=1).
amazon_df['stem_token'] = (amazon_df['Title'] + " " + amazon_df['Description']).apply(tokenize_stem)

In [142]:
# Inspect the stemmed, space-joined text produced for each product
amazon_df['stem_token']

0      swissmar capstor select storag rack for 18-pac...
1      gemini200 delta cv-880 gold crown liveri aircr...
2      superior thread 10501-2172 magnifico cream puf...
3      fashion angel color rox hair chox kit experi w...
4      union creativ giant kill figur 05 : daisuk tsu...
                             ...                        
663    rosemeri ( rosemari ) - box of six 20 stick he...
664    interdesign linus stack organ bin , extra larg...
665    gourmet rubber stamp diagon stripe stencil , 6...
666    spenco rx arch cushion full length comfort sup...
667                                  car kit kit for car
Name: stem_token, Length: 668, dtype: object

In [143]:
# Preview the frame with the new stem_token column next to the original columns
amazon_df.head(5)

Unnamed: 0,Title,Description,Category,stem_token
0,Swissmar Capstore Select Storage Rack for 18-...,Swissmar's capstore select 18 storage unit kee...,Home & Kitchen Kitchen & Dining Kitchen Utens...,swissmar capstor select storag rack for 18-pac...
1,Gemini200 Delta CV-880 Gold Crown Livery Airc...,Welcome to the exciting world of GeminiJets! O...,Toys & Games Hobbies Models & Model Kits Pre-...,gemini200 delta cv-880 gold crown liveri aircr...
2,Superior Threads 10501-2172 Magnifico Cream P...,"For quilting and embroidery, this product is m...","Arts, Crafts & Sewing Sewing Thread & Floss S...",superior thread 10501-2172 magnifico cream puf...
3,Fashion Angels Color Rox Hair Chox Kit,Experiment with the haute trend of hair chalki...,Beauty & Personal Care Hair Care Hair Colorin...,fashion angel color rox hair chox kit experi w...
4,Union Creative Giant Killing Figure 05: Daisu...,From Union Creative. Turn your display shelf i...,Toys & Games › Action Figures & Statues › Sta...,union creativ giant kill figur 05 : daisuk tsu...


In [144]:
%pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [145]:
from sklearn.feature_extraction.text import TfidfVectorizer   #count the occurences of text word in the document
                                                              #evaluate the importance of a word in a document relative to a collection of documents (corpus)
from sklearn.metrics.pairwise import cosine_similarity     # determine how similar two vectors are, irrespective of their magnitude.
def _stem_tokens(text):
    """Adapt tokenize_stem to the list-of-tokens contract scikit-learn expects from a tokenizer."""
    # tokenize_stem returns one space-joined string; handing that string to the
    # vectorizer directly would make it treat every character as a token.
    return tokenize_stem(text).split()


# token_pattern=None: the pattern is unused once a custom tokenizer is given,
# and passing None explicitly avoids scikit-learn's "will not be used" warning.
tfidf = TfidfVectorizer(tokenizer=_stem_tokens, token_pattern=None)

def cosine_sim(txt1,txt2):
    """Return the TF-IDF cosine similarity between two texts.

    Parameters
    ----------
    txt1, txt2 : str or iterable of str
        Either a complete text, or a sequence of words that is joined with
        single spaces before vectorizing.

    Returns
    -------
    float
        Cosine similarity of the two TF-IDF vectors; ``0.0`` when either
        text is blank (there is nothing to compare).
    """
    # Only join non-string inputs: ' '.join() on a plain string would insert
    # a space between every character.
    documents = [txt if isinstance(txt, str) else ' '.join(txt) for txt in (txt1, txt2)]
    if not all(doc.strip() for doc in documents):
        return 0.0
    # fit_transform expects an iterable of documents; the second positional
    # argument is the ignored ``y``, so both texts must go in a single list.
    tfidf_matrix = tfidf.fit_transform(documents)
    return cosine_similarity(tfidf_matrix)[0][1]

### Recommender model to recommend the best products

In [146]:
from sklearn.feature_extraction.text import CountVectorizer
def search_product(query, top_n=10):
    """Return the products whose stemmed text is most similar to ``query``.

    The query is normalized with ``tokenize_stem`` and vectorized together with
    the ``stem_token`` column of ``amazon_df`` using bag-of-words counts. The
    products are then ranked by cosine similarity to the query.

    Parameters
    ----------
    query : str
        Free-text search string entered by the user.
    top_n : int, default 10
        Maximum number of products to return.

    Returns
    -------
    pandas.DataFrame
        The ``Title``, ``Description`` and ``Category`` columns of the best
        matches, most similar first. Empty when the query is blank.
    """
    result_columns = ['Title', 'Description', 'Category']

    stemmed_q = tokenize_stem(query)
    if not stemmed_q.strip():
        # A blank query has no terms to match, so return no rows rather than
        # an arbitrary set of zero-similarity products.
        return amazon_df.head(0)[result_columns]

    # Row 0 is the query; the remaining rows are the catalogue documents.
    corpus = [stemmed_q] + amazon_df['stem_token'].tolist()
    doc_term_matrix = CountVectorizer().fit_transform(corpus)

    # cosine_similarity accepts sparse input, so the matrix is not densified.
    similarities = cosine_similarity(doc_term_matrix[:1], doc_term_matrix[1:]).ravel()

    # Rank on a derived copy so repeated searches never add or overwrite a
    # 'similarity' column on the shared amazon_df frame.
    ranked = amazon_df.assign(similarity=similarities).sort_values(by='similarity', ascending=False)
    return ranked.head(top_n)[result_columns]


In [153]:
# Look up the title of the product at index label 0 to use as a sample query
amazon_df.loc[0, 'Title']

' Swissmar Capstore Select Storage Rack for 18-Pack '

In [154]:
# Search with the first product's own title as the query
search_product(' Swissmar Capstore Select Storage Rack for 18-Pack ')

Unnamed: 0,Title,Description,Category
0,Swissmar Capstore Select Storage Rack for 18-...,Swissmar's capstore select 18 storage unit kee...,Home & Kitchen Kitchen & Dining Kitchen Utens...
541,Remington SP290 for F4790 Shaver (2-Pack),Technical Features for Remington SP290-2 The R...,Beauty & Personal Care › Shave & Hair Removal...
463,Versio Mobile 3-Pack Screen Protector for LG ...,Clear 3 Pack Screen Protector For LG G2,Cell Phones & Accessories Accessories Screen ...
410,Maglite Replacement Lamps for 2-Cell AA Mini ...,Maglite Replacement Lamps for 2-Cell AA Mini F...,Tools & Home Improvement › Light Bulbs › Halo...
160,"Dixie 8.5""Medium-Weight Paper Plates by GP PR...",The WiseSize product offering provides a packa...,Health & Household › Household Supplies › Pap...
349,Fixodent Free Denture Adhesive Cream 2.40 Oun...,"For the most up to date information, we recomm...",Beauty & Personal Care Oral Care Denture Care...
427,"St. Ives Naturally Clear Apricot Scrub, Blemi...","For the most up to date information, we recomm...",Beauty & Personal Care › Skin Care › Face › C...
240,C-Line Poly 3-Compartment Storage Box with Sn...,C-Line's durable storage box features three co...,Office Products Office & School Supplies Offi...
381,"Zoom 6"" Lizard Plastic Fishing Baits 9-Pack",The Zoom Lizard gives anglers a wide color sel...,Sports & Outdoors Sports & Fitness Hunting & ...
469,"InterDesign Axis Metal Loop Scarf Hanger, No ...",The iDesign Axis 18 Loop Scarf Hanger helps ke...,Home & Kitchen Storage & Organization Clothin...
