In [1]:
import pandas as pd

In [2]:
shopping_queries_dataset_examples = pd.read_parquet("Data/shopping_queries_dataset_examples.parquet")

In [3]:
shopping_queries_dataset_products = pd.read_parquet("Data/shopping_queries_dataset_products.parquet")

In [6]:
df_examples_products = pd.merge(
    shopping_queries_dataset_examples,
    shopping_queries_dataset_products,
    how='left',
    left_on=['product_locale','product_id'],
    right_on=['product_locale', 'product_id']
)

In [7]:
df = df_examples_products[df_examples_products["large_version"] == 1]
df = df[['product_title', 'product_description', 'product_bullet_point','product_brand','product_color','product_id','query','split']]
df = df.drop_duplicates(subset='product_id', keep='first')

# df_train = df[df["split"] == "train"]
# df_test = df[df["split"] == "test"]

In [14]:
df.isnull().sum()

product_title                 0
product_description     1348674
product_bullet_point     364761
product_brand            166854
product_color            910895
product_id                    0
query                         0
split                         0
dtype: int64

In [15]:
# Convert all string columns to lowercase, except 'split' and 'product_id'
df.loc[:, df.columns.difference(['split', 'product_id'])] = df.loc[:, df.columns.difference(['split', 'product_id'])].applymap(lambda x: x.lower() if isinstance(x, str) else x)

# Replace null values with an empty string, except in 'split' and 'product_id'
df.loc[:, df.columns.difference(['split', 'product_id'])] = df.loc[:, df.columns.difference(['split', 'product_id'])].fillna('')


In [16]:
df.head()

Unnamed: 0,product_title,product_description,product_bullet_point,product_brand,product_color,product_id,query,split
0,panasonic fv-20vq3 whisperceiling 190 cfm ceil...,,whisperceiling fans feature a totally enclosed...,panasonic,white,B000MOO21W,revent 80 cfm,train
1,homewerks 7141-80 bathroom fan integrated led ...,,outstanding performance: this homewerk's bath ...,homewerks,80 cfm,B07X3Y6B1V,revent 80 cfm,train
2,homewerks 7140-80 bathroom fan ceiling mount e...,,outstanding performance: this homewerk's bath ...,homewerks,white,B07WDM7MQQ,revent 80 cfm,train
3,delta electronics rad80l breezradiance 80 cfm ...,this pre-owned or refurbished product has been...,quiet operation at 1.5 sones\nbuilt-in thermos...,delta electronics (americas) ltd.,white,B07RH6Z8KW,revent 80 cfm,train
4,panasonic fv-08vre2 ventilation fan with reces...,,the design solution for fan/light combinations...,panasonic,white,B07QJ7WYFQ,revent 80 cfm,train


In [17]:
df.to_csv("Data/dataset.csv")

### Steps
- loading embedding model (bert)
- function to reshape embedding
- keep only product cols = ['product_title', 'product_description', 'product_bullet_point','product_brand','product_color','product_id']
- drop duplicates on product_id
- convert to lower case, count null and replace with ''
- create embedding of unique query and product cols

In [2]:
df = pd.read_csv("Data/dataset.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,product_title,product_description,product_bullet_point,product_brand,product_color,product_id,query,split
0,0,panasonic fv-20vq3 whisperceiling 190 cfm ceil...,,whisperceiling fans feature a totally enclosed...,panasonic,white,B000MOO21W,revent 80 cfm,train
1,1,homewerks 7141-80 bathroom fan integrated led ...,,outstanding performance: this homewerk's bath ...,homewerks,80 cfm,B07X3Y6B1V,revent 80 cfm,train
2,2,homewerks 7140-80 bathroom fan ceiling mount e...,,outstanding performance: this homewerk's bath ...,homewerks,white,B07WDM7MQQ,revent 80 cfm,train
3,3,delta electronics rad80l breezradiance 80 cfm ...,this pre-owned or refurbished product has been...,quiet operation at 1.5 sones\nbuilt-in thermos...,delta electronics (americas) ltd.,white,B07RH6Z8KW,revent 80 cfm,train
4,4,panasonic fv-08vre2 ventilation fan with reces...,,the design solution for fan/light combinations...,panasonic,white,B07QJ7WYFQ,revent 80 cfm,train


In [3]:
df.isnull().sum()

Unnamed: 0                    0
product_title                 0
product_description     1348675
product_bullet_point     364762
product_brand            166893
product_color            911029
product_id                    0
query                         0
split                         0
dtype: int64

In [26]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [29]:
sentences = ["This is an example sentence."]
embeddings = model.encode(sentences, convert_to_tensor=True)

# The 'embeddings' variable now contains the sentence embeddings as PyTorch tensors
print(embeddings.shape)

torch.Size([1, 384])
