In [18]:
import numpy as np
import pandas as pd
import os
import faiss

Dataset used: Amazon query–product search (Kaggle) — https://www.kaggle.com/datasets/abhishekmungoli/amazon-query-product-search

In [37]:
# Folder that holds every input file read by this notebook.
dataset_path = "../DATASETS/amazon-product-query"

# Combined dataset and the two embedding tables (CSV).
dataset_combined = pd.read_csv(os.path.join(dataset_path, "dataset_150k.csv"))
query_embeddings = pd.read_csv(os.path.join(dataset_path, "query_150k.csv"))
product_embeddings = pd.read_csv(os.path.join(dataset_path, "product_150k.csv"))

# Product table (parquet); it supplies the extra product columns joined below.
product = pd.read_parquet(os.path.join(dataset_path, "shopping_queries_dataset_products.parquet"))

# Attach the product columns to each embedding row. The product table is
# de-duplicated on product_id first so the join cannot multiply embedding rows.
merged_product = product_embeddings.merge(
    product.drop_duplicates(subset='product_id'),
    on='product_id',
)

Checking semantic similarity within the product embeddings and within the query embeddings

In [38]:
def runANN(df: pd.DataFrame, embedding_columns: list[str], k: int = 20):
    """Return, for every row of ``df``, the positions of its nearest rows.

    The embedding columns are loaded into a ``faiss.IndexFlatL2`` index and the
    same vectors are then used as queries. ``IndexFlatL2`` is faiss's
    exhaustive (exact) L2 index, so despite the function name the result is
    not approximate.

    Args:
        df: Frame holding one embedding per row.
        embedding_columns: Names of the numeric columns that make up the vector.
        k: Number of hits to return per row (default 20, the previous
            hard-coded value). Each row is normally its own closest hit, so
            ``k`` hits give ``k - 1`` real neighbours.

    Returns:
        Integer array of shape ``(len(df), min(k, len(df)))``. Values are row
        *positions* in ``df`` (use them with ``iloc`` on that same frame).

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")

    # faiss wants a C-contiguous float32 matrix; the array coming out of
    # pandas is not guaranteed to be laid out that way, so normalise it here.
    embeddings = np.ascontiguousarray(df[embedding_columns].to_numpy(), dtype=np.float32)
    num_vectors, dimension = embeddings.shape
    print(f"Dimension of embeddings : {dimension}")

    if num_vectors == 0:
        # Nothing to index or query.
        return np.empty((0, 0), dtype=np.int64)

    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)

    # Requesting more hits than indexed vectors makes faiss pad the result
    # with -1, which callers would silently treat as "last row" when used
    # with iloc. Clamp so every returned position is real.
    _, indices = index.search(embeddings, min(k, num_vectors))
    return indices
    

In [40]:
# Every column of product_embeddings except the key is part of the vector.
embedding_columns = product_embeddings.columns.to_list()
embedding_columns.remove('product_id')

# Search over merged_product rather than product_embeddings: the indices that
# come back are row positions, and the titles are read from merged_product.
# merged_product is an inner join, so it can have fewer rows than
# product_embeddings; searching one frame and looking up in the other would
# then pair embeddings with the wrong titles.
indices = runANN(merged_product, embedding_columns)

# One vectorised lookup instead of an iloc call per (row, neighbour) pair.
product_titles = merged_product['product_title'].to_numpy()
similar_product_df = pd.DataFrame({
    # First hit is taken as the item itself (distance 0); with exact duplicate
    # embeddings it may be a duplicate row instead.
    'Item': product_titles[indices[:, 0]],
    'Neighbors': product_titles[indices[:, 1:]].tolist(),
})

similar_product_df.head()

Dimension of embeddings : 160


Unnamed: 0,Item,Neighbors
0,"Crocs Kids' Handle It Rain Boots , Candy Pink,...","[Crocs Kids' Handle It Rain Boots , Candy Pink..."
1,"Hatley Kids' Little Classic Rain Boots, Pink &...","[Hatley unisex child Classic Boots Raincoat, N..."
2,Outee Rubber Kids Rain Boots,[Winthome Changing Towel Poncho Robe with Hood...
3,Hope & Henry Girls' Red Milano Stitch Cardigan,[Hope & Henry Girls' Milano Stitch Cardigan Na...
4,Spring&Gege Youth Solid Full Zipper Hoodies So...,[Spring&Gege Youth Solid Pullover Sport Hoodie...


In [None]:
# Every column of query_embeddings except the query text is part of the vector.
embedding_columns = query_embeddings.columns.to_list()
embedding_columns.remove('query')

# Indices are row positions in query_embeddings, the same frame the text is
# read from below, so positions and text stay aligned.
indices = runANN(query_embeddings, embedding_columns)

# One vectorised lookup instead of an iloc call per (row, neighbour) pair.
query_texts = query_embeddings['query'].to_numpy()
similar_query_df = pd.DataFrame({
    # First hit is taken as the query itself (distance 0).
    'Item': query_texts[indices[:, 0]],
    'Neighbors': query_texts[indices[:, 1:]].tolist(),
})

similar_query_df.head()

Dimension of embeddings : 32


Unnamed: 0,Item,Neighbors
0,child proof cabinet locks,"[child locks for cabinets, cabinet locks, cabi..."
1,ankle stockings for women sheer,"[cargo pants for men, short cowboy booties for..."
2,gluten free snacks,"[gluten free lemon cookie, heartland gluten fr..."
3,hair geow,"[arhletic sweatshirts, pink earrinfs, snoogle,..."
4,"1 by one, amplified, outdoor hdtv antenna","[lighting wired headphones, lego 14 years and ..."


The semantic search over products and queries yields reasonably good results, so the embeddings appear to capture the semantic meaning of the titles and queries. Going ahead with model creation and training.

In [None]:
from torch import nn

class TwoTower(nn.Module):
    """Layer definitions for a two-tower query/product network.

    Three identically shaped stacks are created:

    * ``query_embedding_layers``   - input size ``query_embedding_dimension``
    * ``product_embedding_layers`` - input size ``product_embedding_dimension``
    * ``combined_dense_layers``    - input size ``2 * hidden_dim``, presumably
      the concatenation of the two tower outputs (no ``forward`` is visible
      here to confirm how the stacks are wired together).

    Each stack is three ``Linear -> BatchNorm1d -> ReLU`` groups followed by a
    single ``Dropout``, and outputs ``hidden_dim`` features.

    Args:
        query_embedding_dimension: Width of the query input vectors.
        product_embedding_dimension: Width of the product input vectors.
        *args: Forwarded to ``nn.Module.__init__``.
        hidden_dim: Width of every hidden/output layer (keyword-only,
            default 32, the previously hard-coded value).
        dropout: Dropout probability at the end of each stack (keyword-only,
            default 0.3, the previously hard-coded value).
        **kwargs: Forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, query_embedding_dimension, product_embedding_dimension,
                 *args, hidden_dim=32, dropout=0.3, **kwargs):
        super().__init__(*args, **kwargs)
        # Built in the original order (query, product, combined) so parameter
        # initialisation under a fixed seed is unchanged.
        self.query_embedding_layers = self._build_stack(
            query_embedding_dimension, hidden_dim, dropout
        )
        self.product_embedding_layers = self._build_stack(
            product_embedding_dimension, hidden_dim, dropout
        )
        self.combined_dense_layers = self._build_stack(
            2 * hidden_dim, hidden_dim, dropout
        )

    @staticmethod
    def _build_stack(input_dimension, hidden_dim, dropout, depth=3):
        """Build ``depth`` Linear/BatchNorm1d/ReLU groups plus a final Dropout."""
        layers = []
        in_features = input_dimension
        for _ in range(depth):
            layers.extend([
                nn.Linear(in_features, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
            ])
            in_features = hidden_dim
        layers.append(nn.Dropout(dropout))
        return nn.Sequential(*layers)
        

