In [1]:
#import libraries
import numpy as np
import pandas as pd 
# Notebook-wide pandas display settings: show every row and column, never
# truncate cell text, and render floats with thousands separators and 2 decimals.
display_options = {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.width': 2000,
    'display.float_format': '{:20,.2f}'.format,
    'display.max_colwidth': None,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [2]:
# Load the Amazon product catalogue from the Kaggle input directory
csv_path = r'/kaggle/input/amazon-productcsv/amazon products.csv'
dataset = pd.read_csv(csv_path)

# Preview the first few category paths (levels are separated by " | ")
dataset["Category"].head()

0    Sports & Outdoors | Outdoor Recreation | Skates, Skateboards & Scooters | Skateboarding | Standard Skateboards & Longboards | Longboards
1                                                                                   Toys & Games | Learning & Education | Science Kits & Toys
2                                                                                                   Toys & Games | Arts & Crafts | Craft Kits
3                                                             Toys & Games | Hobbies | Models & Model Kits | Model Kits | Airplane & Jet Kits
4                                                                                                     Toys & Games | Puzzles | Jigsaw Puzzles
Name: Category, dtype: object

In [3]:
# Summarise the frame: row count plus per-column non-null counts and dtypes
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10002 entries, 0 to 10001
Data columns (total 28 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Uniq Id                10002 non-null  object 
 1   Product Name           10002 non-null  object 
 2   Brand Name             0 non-null      float64
 3   Asin                   0 non-null      float64
 4   Category               9172 non-null   object 
 5   Upc Ean Code           34 non-null     object 
 6   List Price             0 non-null      float64
 7   Selling Price          9895 non-null   object 
 8   Quantity               0 non-null      float64
 9   Model Number           8232 non-null   object 
 10  About Product          9729 non-null   object 
 11  Product Specification  8370 non-null   object 
 12  Technical Details      9212 non-null   object 
 13  Shipping Weight        8864 non-null   object 
 14  Product Dimensions     479 non-null    object 
 15  Im

In [4]:
# Parse the raw "Selling Price" text into a float column used later for sorting.
# Both the currency symbol and thousands separators are stripped, so a value such
# as "$1,299.00" parses instead of raising. Anything that still is not a plain
# number (missing prices, free text) becomes NaN rather than aborting the cell.
dataset['Selling Price_processed'] = pd.to_numeric(
    dataset['Selling Price']
    .astype(str)
    .str.replace(r'[$,]', '', regex=True)
    .str.strip(),
    errors='coerce',
).astype(float)

**TF-IDF**

In [5]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Swap missing categories for empty strings so every row can be vectorised
category_text = dataset["Category"].fillna("")
dataset["Category"] = category_text

#Create the TF-IDF vectoriser; common English stop words ('the', 'a', ...) are dropped
tfidf = TfidfVectorizer(stop_words='english')

#Learn the vocabulary and build the TF-IDF matrix in a single pass
tfidf_matrix = tfidf.fit_transform(dataset["Category"])

#Show the matrix dimensions: (number of products, vocabulary size)
tfidf_matrix.shape

(10002, 1133)

In [6]:
#First 20 vocabulary terms, in feature-index order.
#get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
#prefer get_feature_names_out() and fall back to the old name only on older installs.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

#Convert to plain Python strings so the display is the same for both code paths
[str(term) for term in feature_names[:20]]



['accent',
 'accents',
 'accessories',
 'accessory',
 'action',
 'activities',
 'activity',
 'additives',
 'adhesives',
 'adirondack',
 'adult',
 'advent',
 'agility',
 'aids',
 'air',
 'airbrush',
 'aircraft',
 'airplane',
 'airplanes',
 'albums']

**Content-Based Filtering**

In [7]:
# Import linear_kernel, cosine_similarity, and sigmoid_kernel
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import sigmoid_kernel

# Build three pairwise product-by-product similarity matrices from the same
# TF-IDF features: plain dot product, cosine similarity and the sigmoid kernel
linear, cosine_sim, sig_score = (
    kernel(tfidf_matrix, tfidf_matrix)
    for kernel in (linear_kernel, cosine_similarity, sigmoid_kernel)
)

In [8]:
# Report the dimensions of each similarity matrix, in the order they were built
for matrix in (linear, cosine_sim, sig_score):
    print(matrix.shape)

(10002, 10002)
(10002, 10002)
(10002, 10002)


In [9]:
# Compare the similarity row of product 1 across the three kernels
for matrix in (linear, cosine_sim, sig_score):
    print(matrix[1])

[0.         1.         0.27876877 ... 0.25244281 0.27876877 0.        ]
[0.         1.         0.27876877 ... 0.25244281 0.27876877 0.        ]
[0.76159416 0.76196458 0.76169747 ... 0.76168771 0.76169747 0.76159416]


In [10]:
#Construct a reverse map from product name to its row label in the dataset.
#Listings may share a name; keep only the first occurrence of each so that
#indices[name] always yields a single integer rather than a Series of matches.
indices = pd.Series(dataset.index, index=dataset["Product Name"])
indices = indices[~indices.index.duplicated(keep="first")]

In [11]:
indices[:20]

Product Name
DB Longboards CoreFlex Crossbow 41" Bamboo Fiberglass Longboard Complete                                                                             0
Electronic Snap Circuits Mini Kits Classpack, FM Radio, Motion Detector, Music Box (Set of 5)                                                        1
3Doodler Create Flexy 3D Printing Filament Refill Bundle (X5 Pack, Over 1000'. of Extruded Plastics! - Innovate                                      2
Guillow Airplane Design Studio with Travel Case Building Kit                                                                                         3
Woodstock- Collage 500 pc Puzzle                                                                                                                     4
Terra by Battat – 4 Dinosaur Toys, Medium – Dinosaurs for Kids & Collectors, Scientifically Accurate & Designed by A Paleo-Artist; Age 3+ (4 Pc)     5
Rubie's Child's Pokemon Deluxe Pikachu Costume, X-Small                          

**UFD**

In [12]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

In [13]:
# Candidate strings for fuzzy matching: every product name in the reverse map
choices = indices.index.tolist()

In [14]:
# Smoke test: fuzzy-match a short query against every product name and show
# the name of the single best candidate (each result is a (name, score) pair)
extracted = process.extract("lego", choices, limit=1)
top_match = extracted[0]
top_match[0]

'LEGO Lunch Box, Medium Pink'

In [15]:
# Function that takes in product name as input and outputs the most similar products
def rec_lin(user_input, linear=linear, top_n=10):
    """Recommend products most similar to the product whose name best matches a query.

    Parameters
    ----------
    user_input : str
        Free-text query. The closest product name in ``choices`` (fuzzy match)
        is used as the seed product.
    linear : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the scores of product ``i``
        against every product. Defaults to the notebook-level ``linear`` matrix.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.DataFrame
        "Product Name" and "Selling Price" of the recommended products, ordered
        by ascending parsed price. Empty when no product name could be matched.
    """
    result_columns = ["Product Name", "Selling Price"]

    # Use fuzzywuzzy to grab the product whose name is closest to the user input
    extracted = process.extract(user_input, choices, limit=1)
    if not extracted:
        # Nothing to match against: return an empty frame instead of raising IndexError
        return dataset[result_columns].iloc[0:0]
    product_name = extracted[0][0]

    # A name shared by several listings makes the lookup return a Series;
    # atleast_1d handles both the scalar and the Series case, and we keep the first hit
    idx = int(np.atleast_1d(indices[product_name])[0])

    # Similarity of the seed product against every product
    scores = np.asarray(linear[idx]).ravel()

    # Rank by descending score; the stable sort keeps ties in ascending row order
    ranked = np.argsort(-scores, kind="stable")

    # Drop the seed product explicitly. Skipping "the first result" is not enough:
    # products with an identical category string tie with the seed at the top score,
    # so the seed is not guaranteed to sort first.
    ranked = ranked[ranked != idx]
    product_indices = ranked[:max(top_n, 0)].tolist()

    df_return = dataset.loc[product_indices, result_columns + ["Selling Price_processed"]]
    # Present the cheapest recommendations first; the helper price column is dropped
    return df_return.sort_values(by="Selling Price_processed", ascending=True)[result_columns]

In [16]:
# Ask the shopper for a free-text query, then display the recommendations for it
search_prompt = "What would you like to search for today? "
name = input(search_prompt)
rec_lin(name)

What would you like to search for today?  cube


Unnamed: 0,Product Name,Selling Price
1156,"ETA hand2mind Blue Plastic Base Ten Rods, Set of 50",$6.95
3165,Melissa & Doug Disney Baby Mickey Mouse and Donald Duck Wooden Stacker Toy (12 pcs),$8.99
433,"Loftus SW-0249 4 Pc Classic Wooden Games in A Tin Set, 6 inches Long, Brown",$12.54
172,BeginAgain BuddyBlocks Safari Animals - Matching and Problem Solving - Kids 18 Months and Up,$12.99
3098,Constructive Playthings CPX-594 Animal Stack & Count/Number Sort Animal Counters,$13.99
1628,Infantino Stack & Spin Seal,$14.23
214,TOMY John Deere Learn 'n Pop Farmyard Friends Toy,$14.99
1382,Little Tikes Lil' Ocean Explorers - Ball Chase Octopus,$14.99
2956,Melissa & Doug Counting Shape Stacker (Wooden Educational Toy with 55 Shapes and 10 Number Tiles),$19.19
666,Bigjigs Toys Stacking Cubes,$31.50
