## Libraries

In [1]:
!which python


/mnt/c/Users/selen/OneDrive/Proyectos/Waywand_search/env/bin/python


In [12]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from pinecone import Pinecone, ServerlessSpec
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from libs.auxiliar_func import *
from libs.visualization import *

In [4]:
#clone the git repo that contains the data and additional information about the dataset
!git clone https://github.com/wayfair/WANDS.git

fatal: destination path 'WANDS' already exists and is not an empty directory.


## Main

In [13]:
# End-to-end pipeline for this notebook:
#   1. read the WANDS product / query / label tables,
#   2. embed the products with a SentenceTransformer model,
#   3. retrieve the top 10 products per query and score them with MAP@10,
#   4. upload the product embeddings to a Pinecone index.
# The helpers (bcolors, create_product_embeddings, get_top_10,
# get_exact_matches_for_query, map_at_k) come from the wildcard imports of
# libs.auxiliar_func / libs.visualization.

# get data
print(f'{bcolors.BOLD_BLUE}Reading WAYFAIR data with pandas {bcolors.ENDC}')
product_df = pd.read_csv("WANDS/dataset/product.csv", sep='\t')
query_df = pd.read_csv("WANDS/dataset/query.csv", sep='\t')
label_df = pd.read_csv("WANDS/dataset/label.csv", sep='\t')

# group the labels for each query to use when identifying exact matches
grouped_label_df = label_df.groupby('query_id')

# Load BERT model
print(f'{bcolors.BOLD_BLUE} Load bert model {bcolors.ENDC}')
model = SentenceTransformer("all-MiniLM-L6-v2")

# Compute BERT embeddings.
# This call used to be commented out, so the cell only worked when
# `product_embeddings` was still alive in the kernel from an earlier run and
# raised NameError on a fresh kernel.
# NOTE(review): this step is presumably slow for the full catalogue — consider
# caching the result on disk instead of commenting the line out again.
print(f'{bcolors.BOLD_GREEN} Creating embeddings ...... {bcolors.ENDC}')
product_embeddings = create_product_embeddings(product_df, model)
print(f'{bcolors.BOLD_GREEN} Done ! {bcolors.ENDC}')

# one embedding (as a plain Python list) per product row
product_df['embeddings'] = product_embeddings.tolist()

# applying the function to obtain top product IDs and adding top K product IDs to the dataframe
print(f'{bcolors.BOLD_BLUE} Getting IDS and top products {bcolors.ENDC}')

# Create a top 10 products based on cosine similarities (the function is stored in auxiliar_func.py)
print(f'{bcolors.BOLD_GREEN} Creating top 10 product ids ...... {bcolors.ENDC}')
query_df['top_product_ids'] = query_df['query'].apply(lambda q: get_top_10(q, model, product_df))


# adding the list of exact match product_IDs from labels_df
print(f'{bcolors.BOLD_GREEN} Creating exact match product IDs...... {bcolors.ENDC}')
query_df['relevant_ids'] = query_df['query_id'].apply(lambda q: get_exact_matches_for_query(q, grouped_label_df))

# now assign the map@k score
print(f'{bcolors.BOLD_GREEN} Creating map@k score...... {bcolors.ENDC}')
query_df['map@k'] = query_df.apply(lambda x: map_at_k(x['relevant_ids'], x['top_product_ids'], k=10), axis=1)

print(f'{bcolors.BOLD_GREEN} Done...... {bcolors.ENDC}')
# calculate the MAP across the entire query set
score = query_df.loc[:, 'map@k'].mean()
print(f'{bcolors.BOLD_MAGENTA} Score of the model: {score} {bcolors.ENDC}')


print(f'{bcolors.BOLD_BLUE} Saving embeddings into Pinecone ... {bcolors.ENDC}')

# API key (read from the environment / a local .env file, never hard-coded)
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV")  # read but not used below
if not PINECONE_API_KEY:
    # Fail here with an actionable message instead of deep inside the client.
    raise RuntimeError("PINECONE_API_KEY is not set; define it in the environment or in a .env file")

# Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

# index name
INDEX_NAME = "product-search-2"

# Create the index only when it does not exist yet.
# list_indexes() yields index descriptions rather than plain names, so the
# membership test has to run against .names(); comparing the string with the
# descriptions never matches and would re-create an existing index.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        INDEX_NAME,
        # vector size is taken from the first stored embedding
        dimension=len(product_df["embeddings"].iloc[0]),
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"))

# Connect to the index
index = pc.Index(INDEX_NAME)

# Converting embeddings into Pinecone format: (id, values, metadata) tuples.
# NOTE(review): the vector id is the positional row number, not a product
# identifier column — confirm that downstream lookups expect this.
vectors = [
    (str(i), np.array(embedding).tolist(), {"product_name": product_name})
    for i, (product_name, embedding) in enumerate(zip(product_df["product_name"], product_df["embeddings"]))
]

# Uploading in batches
batch_size = 100
for i in range(0, len(vectors), batch_size):
    index.upsert(vectors[i:i + batch_size])

print(f'{bcolors.BOLD_BLUE} Uploaded {len(vectors)} embeddings to Pinecone {bcolors.ENDC}')

[1;34mReading WAYFAIR data with pandas [0m
[1;34m Load bert model [0m
[1;32m Creating embeddings ...... [0m
[1;32m Done ! [0m
[1;34m Getting IDS and top products [0m
[1;32m Creating top 10 product ids ...... [0m
[1;32m Creating exact match product IDs...... [0m
[1;32m Creating map@k score...... [0m
[1;32m Done...... [0m
[1;35m Score of the model: 0.3776823743386243 [0m
[1;34m Saving embeddings into Pinecone ... [0m
[1;34m Uploaded 42994 embeddings to Pinecone [0m
