In [None]:
# Download the three WANDS dataset files (label, product, query) from the
# wayfair/WANDS GitHub repository; -q silences wget's progress output.
# NOTE(review): the files land in the current working directory, while the
# loading cell reads from /content — assumes the notebook runs with /content
# as its working directory; confirm.
!wget -q https://raw.githubusercontent.com/wayfair/WANDS/main/dataset/label.csv
!wget -q https://raw.githubusercontent.com/wayfair/WANDS/main/dataset/product.csv
!wget -q https://raw.githubusercontent.com/wayfair/WANDS/main/dataset/query.csv

In [None]:
import pandas as pd

In [None]:
# Load the downloaded files into DataFrames. pd.read_table splits on tabs by
# default.
# NOTE(review): the files carry a .csv extension; confirm they really are
# tab-delimited before switching this to read_csv.
df_label = pd.read_table('/content/label.csv')
df_product = pd.read_table('/content/product.csv')
df_query = pd.read_table('/content/query.csv')

In [None]:
# Preview the first two label rows to check that the columns loaded as expected.
df_label.head(2)

Unnamed: 0,id,query_id,product_id,label
0,0,0,25434,Exact
1,1,0,12088,Irrelevant


In [None]:
# Preview the first two product rows (name, class, description, features, ratings).
df_product.head(2)

Unnamed: 0,product_id,product_name,product_class,category hierarchy,product_description,product_features,rating_count,average_rating,review_count
0,0,solid wood platform bed,Beds,Furniture / Bedroom Furniture / Beds & Headboa...,"good , deep sleep can be quite difficult to ha...",overallwidth-sidetoside:64.7|dsprimaryproducts...,15.0,4.5,15.0
1,1,all-clad 7 qt . slow cooker,Slow Cookers,Kitchen & Tabletop / Small Kitchen Appliances ...,"create delicious slow-cooked meals , from tend...",capacityquarts:7|producttype : slow cooker|pro...,100.0,2.0,98.0


In [None]:
# Preview the first two search queries and their query classes.
df_query.head(2)

Unnamed: 0,query_id,query,query_class
0,0,salon chair,Massage Chairs
1,1,smart coffee table,Coffee & Cocktail Tables


In [None]:
def cat_map(x):
  """Convert a relevance label into a numeric similarity target.

  "Exact" maps to 1 and "Partial" to 0.75. "Irrelevant" and every other
  value (for example a missing label) map to 0.
  """
  if x == "Exact":
    return 1
  # Everything that is not an exact or partial match scores zero.
  return 0.75 if x == "Partial" else 0

In [None]:
# Replace the text labels with numeric targets. The column is overwritten in
# place, so guard against running this cell twice: once the labels are numeric,
# cat_map no longer recognises any of them and would turn every value into 0.
if not pd.api.types.is_numeric_dtype(df_label['label']):
  df_label['label'] = df_label['label'].apply(cat_map)

In [None]:
# Check that the label column now holds numeric scores instead of text.
df_label.head(2)

Unnamed: 0,id,query_id,product_id,label
0,0,0,25434,1.0
1,1,0,12088,0.0


In [None]:
# Keep only the columns needed for search. .copy() makes df_search an
# independent frame, so adding a column to it later does not raise pandas'
# SettingWithCopyWarning.
df_search = df_product[['product_id','product_name','product_description']].copy()

In [None]:
# Use the description as the searchable text, falling back to the product name
# when the description is missing. assign() returns a new frame instead of
# writing into a slice of df_product, so no SettingWithCopyWarning is raised
# and the cell can be re-run safely.
df_search = df_search.assign(
  product_text=df_search['product_description'].fillna(df_search['product_name'])
)
df_search.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_search['product_text'] = df_search['product_description'].fillna(df_search['product_name'])


Unnamed: 0,product_id,product_name,product_description,product_text
42989,42989,malibu pressure balanced diverter fixed shower...,the malibu pressure balanced diverter fixed sh...,the malibu pressure balanced diverter fixed sh...
42990,42990,emmeline 5 piece breakfast dining set,,emmeline 5 piece breakfast dining set
42991,42991,maloney 3 piece pub table set,this pub table set includes 1 counter height t...,this pub table set includes 1 counter height t...
42992,42992,fletcher 27.5 '' wide polyester armchair,"bring iconic , modern style to your space in a...","bring iconic , modern style to your space in a..."
42993,42993,griffin 29 '' bar stool,this set of two barstools features a minimalis...,this set of two barstools features a minimalis...


In [None]:
!pip install langchain

In [None]:
from langchain.vectorstores import FAISS

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
!pip install HuggingFace

In [None]:
from langchain.document_loaders import DataFrameLoader

In [None]:
!pip install sentence_transformers

In [None]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [None]:
# Turn every row of df_search into a document whose content is the
# 'product_text' column (predict() later reads product_id / product_name from
# each document's metadata, so the other columns presumably end up there).
# NOTE(review): the name `input` shadows Python's built-in input(); the cells
# that build the FAISS indexes reference it, so a rename needs a coordinated change.
input = DataFrameLoader(df_search,'product_text').load()

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Embedding wrapper around the pretrained 'all-MiniLM-L6-v2' model; used for the baseline index.
Embedding_base = HuggingFaceEmbeddings(model_name =  'all-MiniLM-L6-v2')

In [None]:
# Build the baseline FAISS index from the product documents using the untuned embeddings.
db_vector_base = FAISS.from_documents(input,Embedding_base)

In [None]:
def predict(db, query_df, k=5):
  """Search a vector store for the products closest to a query.

  Args:
    db: vector store exposing similarity_search_with_score(query, k=...),
      returning (document, score) pairs. Each document must provide
      page_content and a metadata mapping with 'product_name' and 'product_id'.
    query_df: DataFrame with a 'query' column; only its first row is used.
    k: number of results to request from the store (defaults to 5).

  Returns:
    DataFrame with columns product_id, product_name, product_description and
    score, sorted by ascending score. If the store returns no hits the frame
    is empty but still has these four columns.
  """
  # only one query is expected at a time, so take the first value
  query_text = query_df['query'].values[0]

  # perform search on embeddings
  raw_results = db.similarity_search_with_score(query_text, k=k)

  # One record per hit. Building rows explicitly, instead of unpacking
  # zip(*results), keeps the function working when there are no hits.
  rows = [
      {
          'product_id': doc.metadata['product_id'],
          'product_name': doc.metadata['product_name'],
          'product_description': doc.page_content,
          'score': score,
      }
      for doc, score in raw_results
  ]

  # reorganize results as a pandas df, sorted on score
  columns = ['product_id', 'product_name', 'product_description', 'score']
  results_pd = pd.DataFrame(rows, columns=columns)
  return results_pd.sort_values(axis=0, by='score', ascending=True)

In [None]:
# Sanity-check the baseline index with a single example query.
search = pd.DataFrame({'query':['keychain']})
predict(db_vector_base,search)

Unnamed: 0,product_id,product_name,product_description,score
0,14188,pineapple theme keychain favor,pineapple theme keychain favor,0.788458
1,5554,digital key cabinet with electronic lock,enjoy the protection and convenience of access...,0.956205
2,34476,asdsit flask vase key potion table accessory,fill the space on your shelves and tabletops w...,1.04125
3,5551,storage box key cabinet with combination and k...,secured 200-key cabinet with combination and k...,1.094755
4,19780,igloohome smart key cabinet with electronic lock,the smart keybox 3 is a smart box that stores ...,1.112405


In [None]:
# Load a fresh copy of the pretrained base model; it is fine-tuned by the
# fit() call in the training cell.
tuned_model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
from sentence_transformers import InputExample, losses, util

In [None]:
# Attach every relevance label to its product text, then to its query text.
result_df = pd.merge(df_search, df_label, how='inner', on='product_id')
search_pd = pd.merge(result_df, df_query, how='inner', on='query_id')

# Only the query, the product text and the numeric label are needed for training.
search_pd = search_pd.loc[:, ['query', 'product_text', 'label']]
search_pd.head()

Unnamed: 0,query,product_text,label
0,boho bed frame,"good , deep sleep can be quite difficult to ha...",0.75
1,boho bed frame,"liven it up with colorful duvets and pillows ,...",0.75
2,boho bed frame,your bed is the central part of your restful r...,0.75
3,boho bed frame,your bed is the central part of your restful r...,0.75
4,boho bed frame,"get wade logan® , meet someone wade logan® bed...",0.0


In [None]:
def create_input(doc1, doc2, score):
  """Wrap two texts and their target score in a single InputExample."""
  text_pair = [doc1, doc2]
  return InputExample(texts=text_pair, label=score)

# Convert each labelled (query, product text) pair into a training input.
# Iterating the three columns in lockstep yields the same examples as a
# row-wise DataFrame.apply(axis=1) without building a Series for every row.
inputs = [
  create_input(query, product_text, label)
  for query, product_text, label in zip(
    search_pd['query'], search_pd['product_text'], search_pd['label']
  )
]

In [None]:
from torch.utils.data import DataLoader

In [None]:
# Batch the training examples: shuffled, 16 per batch.
# NOTE(review): no random seed is set in the visible cells, so the shuffle
# order (and therefore the tuned weights) may differ between runs.
train_dataloader = DataLoader(inputs, shuffle=True, batch_size=16)
# Loss that trains tuned_model on the examples' numeric labels — presumably by
# pushing the cosine similarity of each text pair toward its label; confirm.
train_loss = losses.CosineSimilarityLoss(tuned_model)
# Fine-tune for a single epoch with 5 warm-up steps.
tuned_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=5)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/14591 [00:00<?, ?it/s]

In [None]:
# Persist the fine-tuned model so it can be reloaded through the embedding wrapper.
# NOTE(review): the path ends in .csv although it holds a saved model, not a CSV
# file; the cell that reloads the model uses the same path, so renaming it needs
# a coordinated change.
tuned_model.save('/content/tuned_model.csv')

In [None]:
# Load the saved fine-tuned model through the same embedding wrapper used for the baseline.
tuned_embedded_model = HuggingFaceEmbeddings(model_name = '/content/tuned_model.csv')

In [None]:
# Build a second FAISS index over the same product documents, embedded with the tuned model.
tuned_db = FAISS.from_documents(input,tuned_embedded_model)

In [None]:
# Run the same example query against the fine-tuned index.
search = pd.DataFrame({'query':['keychain']})
predict(tuned_db,search)

Unnamed: 0,product_id,product_name,product_description,score
0,5554,digital key cabinet with electronic lock,enjoy the protection and convenience of access...,0.595098
1,5551,storage box key cabinet with combination and k...,secured 200-key cabinet with combination and k...,0.638122
2,9607,ivanka laundry hamper,here 's a fresh concept from the minds of our ...,0.644298
3,28144,jewelery holder,this decor piece gives a unique touch to your ...,0.651003
4,28143,jewelry holder,this decor piece gives a unique touch to your ...,0.651003


In [None]:
# Re-run the query on the baseline index for a side-by-side comparison with the tuned results.
search = pd.DataFrame({'query':['keychain']})
predict(db_vector_base,search)

Unnamed: 0,product_id,product_name,product_description,score
0,14188,pineapple theme keychain favor,pineapple theme keychain favor,0.788458
1,5554,digital key cabinet with electronic lock,enjoy the protection and convenience of access...,0.956205
2,34476,asdsit flask vase key potion table accessory,fill the space on your shelves and tabletops w...,1.04125
3,5551,storage box key cabinet with combination and k...,secured 200-key cabinet with combination and k...,1.094755
4,19780,igloohome smart key cabinet with electronic lock,the smart keybox 3 is a smart box that stores ...,1.112405


In [None]:
#compare cosine similarities for original model and tuned model
# The baseline has to be the same checkpoint that was fine-tuned
# ('all-MiniLM-L6-v2'). Using a different checkpoint here would mix the effect
# of tuning with the effect of a different model.
original_model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every query and its paired product text with the untuned model.
query_embeddings = original_model.encode(search_pd['query'].tolist())
product_embeddings = original_model.encode(search_pd['product_text'].tolist())

Downloading (…)5dded/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)4d81d5dded/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)81d5dded/config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)ded/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)5dded/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

Downloading (…)dded/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)4d81d5dded/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)1d5dded/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
import torch
import numpy as np

In [None]:

# Cosine similarity between each query embedding and its paired product embedding.
original_cos_sim_scores = util.pairwise_cos_sim(query_embeddings, product_embeddings)

# Average similarity across all pairs for the untuned model.
original_cos_sim_scores.mean().item()

0.4252978265285492

In [None]:
# Correlation between the baseline similarity scores and the numeric relevance
# labels; [0][1] picks the off-diagonal entry of the 2x2 matrix np.corrcoef returns.
original_corr_coef_score = (
  np.corrcoef(
    original_cos_sim_scores,
    search_pd['label'].values
  )[0][1]
)
# print results
print(original_corr_coef_score)

0.43291588333654885


In [None]:
# Encode the same queries and product texts with the fine-tuned model.
# Note: this rebinds query_embeddings / product_embeddings, replacing the
# baseline embeddings computed earlier.
query_embeddings = tuned_model.encode(search_pd['query'].tolist())
product_embeddings = tuned_model.encode(search_pd['product_text'].tolist())

# Cosine similarity for each query-product pair, then its average.
tuned_cos_sim_scores = util.pairwise_cos_sim(query_embeddings, product_embeddings)
tuned_cos_sim_score = tuned_cos_sim_scores.mean().item()

# Report the average similarity before and after tuning.
print(f"With tuning, avg cosine similarity went from {torch.mean(original_cos_sim_scores).item()} to {tuned_cos_sim_score}")