In [None]:
# Change the source file path and the Model embeddings
!pip install sentence-transformers

In [11]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import re
import h5py
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Shared Porter stemmer instance; data_preprocessor calls ps.stem() on every token.
ps = PorterStemmer()

In [12]:
# Make sure the NLTK stop-word corpus is available locally.
nltk.download("stopwords")
# Look the corpus up through the `nltk` package instead of the imported
# `stopwords` name: this assignment rebinds `stopwords` to a plain list, so
# `stopwords.words(...)` would raise AttributeError the second time the cell
# is executed in the same kernel.
stopwords = nltk.corpus.stopwords.words("english")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# Load the 'bert-base-uncased' checkpoint through SentenceTransformer. `model` is the
# encoder used for both the product names and the search queries further down.
# NOTE(review): this looks like a general-purpose BERT checkpoint rather than one
# tuned for sentence similarity — confirm it is the intended choice for this search.
model = SentenceTransformer('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


.gitattributes:   0%|          | 0.00/491 [00:00<?, ?B/s]

LICENSE:   0%|          | 0.00/11.4k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

(…)kage/Data/com.apple.CoreML/model.mlmodel:   0%|          | 0.00/165k [00:00<?, ?B/s]

weight.bin:   0%|          | 0.00/532M [00:00<?, ?B/s]

(…)sk/float32_model.mlpackage/Manifest.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/532M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



In [17]:
# Read the product-name CSV; lines the parser cannot handle are skipped.
df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/product_names.csv',
    engine='python',
    on_bad_lines='skip',
)

In [18]:
# Peek at the first ten rows of the raw frame.
df.head(10)

Unnamed: 0,Product Name
0,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,..."
1,Kindle Oasis E-reader with Leather Charging Co...
2,"Amazon Kindle Lighted Leather Cover,,,\r\nAmaz..."
3,"Amazon Kindle Lighted Leather Cover,,,\r\nKind..."
4,"Kindle Keyboard,,,\r\nKindle Keyboard,,,"
5,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,..."
6,"Fire HD 8 Tablet with Alexa, 8 HD Display, 32 ..."
7,Amazon 5W USB Official OEM Charger and Power A...
8,"All-New Kindle E-reader - Black, 6 Glare-Free ..."
9,"Amazon Kindle Fire Hd (3rd Generation) 8gb,,,\..."


In [19]:
def data_preprocessor(text):
  """Normalize a product name or search query before it is embedded.

  The text is lower-cased, line breaks and hyphens are turned into spaces,
  commas are dropped, English stop words are removed and the remaining
  tokens are stemmed with the module-level Porter stemmer `ps`.

  Args:
    text: Raw string. A missing value (None or NaN) is treated as empty.

  Returns:
    The cleaned tokens joined by single spaces ("" when nothing is left).
  """
  # Missing cells from the CSV arrive as None/NaN; NaN is the only float that
  # differs from itself.
  if text is None or (isinstance(text, float) and text != text):
    return ""
  # Convert the text to lower case for consistency
  text = text.lower()
  # Replace line breaks with a space; deleting them outright would fuse the
  # words on either side of the break into a single token.
  text = re.sub(r'[\r\n]+', ' ', text)
  # Remove unwanted commas
  text = re.sub(r',', '', text)
  # Treat hyphenated words as separate tokens
  text = re.sub(r'-', ' ', text)
  # split() with no argument collapses runs of whitespace, so no empty
  # tokens reach the stemmer.
  tokens = [ps.stem(t) for t in text.split() if t not in stopwords]
  return " ".join(tokens)

In [20]:
# Lower-cased, whitespace-trimmed copy of the name; used below as the de-duplication key.
df['Normalized Product Name'] = df['Product Name'].str.lower().str.strip()

In [21]:
# Frame dimensions before de-duplication.
df.shape

(2397873, 2)

In [22]:
# Keep the first row for each distinct normalized name.
# NOTE(review): the embedding and lookup cells further down read from `df`, not `df_1` —
# confirm whether the de-duplicated frame was meant to be used there.
df_1 = df.drop_duplicates(subset='Normalized Product Name')

In [23]:
# Frame dimensions after de-duplication.
df_1.shape

(2394761, 2)

In [24]:
# Display the de-duplicated frame.
df_1

Unnamed: 0,Product Name,Normalized Product Name
0,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...","all-new fire hd 8 tablet, 8 hd display, wi-fi,..."
1,Kindle Oasis E-reader with Leather Charging Co...,kindle oasis e-reader with leather charging co...
2,"Amazon Kindle Lighted Leather Cover,,,\r\nAmaz...","amazon kindle lighted leather cover,,,\r\namaz..."
3,"Amazon Kindle Lighted Leather Cover,,,\r\nKind...","amazon kindle lighted leather cover,,,\r\nkind..."
4,"Kindle Keyboard,,,\r\nKindle Keyboard,,,","kindle keyboard,,,\r\nkindle keyboard,,,"
...,...,...
2397868,Blooming Circles Quilt Collection,blooming circles quilt collection
2397869,"Sunwashed Percale Sheet Collection, Print","sunwashed percale sheet collection, print"
2397870,Lakeside Percale Sheet Collection,lakeside percale sheet collection
2397871,Classic Colors Down Comforter,classic colors down comforter


In [None]:
# Spot-check the preprocessing on the first ten product names.
df['Product Name'].head(10).apply(data_preprocessor)

In [25]:
def create_embedings(data):
  """Encode `data` with the notebook-level `model` and return what `model.encode` produces."""
  return model.encode(data)

In [26]:
def store_embeddings(embeddings, path='product_embeddings.h5', dataset='embeddings'):
  """Write `embeddings` to an HDF5 file.

  Args:
    embeddings: Data passed to `create_dataset` as the dataset contents.
    path: Destination file. It is opened in 'w' mode, so an existing file
      at this location is replaced. Defaults to the original hard-coded name.
    dataset: Name of the dataset created inside the file.
  """
  # Save to HDF5
  with h5py.File(path, 'w') as hf:
      hf.create_dataset(dataset, data=embeddings)

In [None]:
# Preprocess every product name, embed the cleaned strings, then persist the result.
embeds = create_embedings([data_preprocessor(name) for name in df['Product Name']])
store_embeddings(embeds)

In [27]:
def get_inference(search_query, embed, top_n=10):
  """Return the product names whose embeddings are most similar to a query.

  The query goes through `data_preprocessor`, is encoded with the
  notebook-level `model`, and is compared to every row of `embed` by cosine
  similarity. Matches are looked up positionally in the notebook-level `df`,
  so row i of `embed` is taken to describe row i of `df`.

  Args:
    search_query: Free-text query string.
    embed: 2-D collection of product embeddings, one row per product.
    top_n: Number of suggestions to return (default 10, the previous
      hard-coded value). Non-positive values yield an empty list.

  Returns:
    List of 'Product Name' values, most similar first.
  """
  if top_n <= 0:
    return []

  search_embed = model.encode([data_preprocessor(search_query)])[0]
  # Calculate cosine similarity between the search query and product names
  similarities = cosine_similarity([search_embed], embed)[0]

  # Indices ordered by descending similarity, truncated to the requested count
  top_indices = similarities.argsort()[::-1][:top_n]

  # Get the corresponding product names
  suggestions = df.iloc[top_indices]['Product Name'].tolist()
  return suggestions

In [29]:
import h5py
# Read the stored embedding matrix into memory once and close the file straight
# away; leaving the handle open leaks it, and passing the lazy dataset around
# makes every query re-read the whole matrix from disk.
# NOTE(review): this file name differs from the one store_embeddings writes by
# default ('product_embeddings.h5') — confirm it holds embeddings for the same `df`.
with h5py.File('/content/drive/MyDrive/Colab Notebooks/product_embeddings_overall.h5', 'r') as mf:
  embeds = mf['embeddings'][:]
get_inference("fire tv", embeds)

['Zebra Outdoors',
 'CatBird Boots',
 'Space Women Heels',
 'Chazer',
 'Kittens Boys Flats',
 'Glamour Casuals',
 'Babylon Women Flats',
 'Nell Women Heels',
 'Liza Women Flats',
 'Rockshose Lace Up']

In [30]:
# Observation: short queries do not retrieve good matches,
# whereas longer, more descriptive queries perform well.
get_inference("Fire TV", embeds)

['Zebra Outdoors',
 'CatBird Boots',
 'Space Women Heels',
 'Chazer',
 'Kittens Boys Flats',
 'Glamour Casuals',
 'Babylon Women Flats',
 'Nell Women Heels',
 'Liza Women Flats',
 'Rockshose Lace Up']

In [31]:
# Short query written as a single word.
get_inference("ereader", embeds)

['Assort Women Flats',
 'Justanned Pouch for iPad',
 'Fuse',
 'Nike',
 'Just Wow Women Flats',
 'Adelee Women Wedges',
 'Hitch Boots',
 'Yahe Women Heels',
 'Liars dice',
 'Carlton Boots']

In [32]:
# Short query written as two separate tokens.
get_inference("e reader", embeds)

['Fuse',
 'Carlton Boots',
 'Nike',
 'Zebra Outdoors',
 'product not found',
 'Max T-Shirt',
 'Chazer',
 'Kiara Backpack',
 'Steppings Women Flats',
 'Chef Outfit']

In [33]:
# Generic one-word query.
get_inference("black", embeds)

['Nike',
 'Fuse',
 'Carlton Boots',
 'Chazer',
 'Glamour Casuals',
 'product not found',
 'Kiara Backpack',
 'Max T-Shirt',
 'Liars dice',
 'Zebra Outdoors']

In [None]:
# Query containing a transposed-letter typo ("balck") followed by a number fragment.
get_inference("balck 97", embeds)

['Blackberry 8310',
 'BlackBerry Bold 9700',
 'Blackberry 9530 Storm',
 'Apple Iphone 5 16gb Cricket - Black',
 'Bb-storm-9530-pb-r',
 'Amazon Echo Show - Black',
 'Blackberry Style 9670 Smartphone - Sprint',
 'Echo Dot (Previous generation)',
 'At&t Blackberry Bold 9900',
 'Apple iPhone 4s 16GB White -Cricket']

In [34]:
# Longer, descriptive query; the top suggestion below is an exact name match.
get_inference("Certified Refurbished Amazon Echo", embeds)

['Certified Refurbished Amazon Echo',
 'Certified Refurbished Amazon Fire TV with Alexa Voice Remote',
 'Certified Refurbished Amazon Fire TV Stick (Previous Generation - 1st),,,\r\nKindle Paperwhite,,,',
 'Certified Refurbished Amazon Fire TV (Previous Generation - 1st),,,\r\nCertified Refurbished Amazon Fire TV (Previous Generation - 1st),,,',
 'Certified Refurbished Amazon Fire TV with Alexa Voice Remote,,,\r\nCertified Refurbished Amazon Fire TV with Alexa Voice Remote,,,',
 'Certified Refurbished Amazon Fire TV Stick (Previous Generation - 1st),,,\r\nCertified Refurbished Amazon Fire TV Stick (Previous Generation - 1st),,,',
 'Apple iPhone 6 Plus Silver 16GB Unlocked Smartphone (Certified Refurbished)',
 'Motorola V551 Refurbishd Cell Phone Unlocked',
 'Apple iPhone 5 16GB - Unlocked - White (Certified Refurbished)',
 'Apple iPhone 6 Plus Gold 64GB Unlocked Smartphone (Certified Refurbished)']