In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from joblib import Parallel, delayed
import polars as pl
import os
import utils

# select model and dataset

In [2]:
model_folder = 'openai'
#model_folder = 'text-exp0307'
#model_folder = 'text-004'

#dataset = 'wands'
dataset = 'esci-us'


# load embeddings and ground truth

In [3]:
q = pd.read_parquet(f'{model_folder}/{dataset}-queries.parquet')

p = pl.read_parquet(f'{model_folder}/{dataset}-products.parquet').to_pandas()
p.set_index('__index_level_0__', inplace=True)

r = pd.read_parquet(f'{dataset}/relevance.parquet')

# index ground truth with query and product ids
r.index = [f'{row.query_id}|{row.product_id}' for _,row in tqdm(r.iterrows(), total=len(r))]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1818825/1818825 [01:00<00:00, 30143.01it/s]


In [4]:
q.shape, p.shape, r.shape

((97345, 2), (1215851, 2), (1818825, 5))

In [5]:
r.head()

Unnamed: 0,example_id,query_id,product_id,esci_label,relevance
0|B000MOO21W,0,0,B000MOO21W,I,1
0|B07X3Y6B1V,1,0,B07X3Y6B1V,E,4
0|B07WDM7MQQ,2,0,B07WDM7MQQ,E,4
0|B07RH6Z8KW,3,0,B07RH6Z8KW,E,4
0|B07QJ7WYFQ,4,0,B07QJ7WYFQ,E,4


In [6]:
q.head()

Unnamed: 0_level_0,query,embeddings
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,revent 80 cfm,"[-0.05282729119062424, 0.0162424948066473, -0...."
1,!awnmower tires without rims,"[-0.015263666398823261, -0.023168815299868584,..."
2,!qscreen fence without holes,"[0.019541103392839432, 0.025528443977236748, -..."
5,# 10 self-seal envelopes without window,"[-0.010724861174821854, 0.0026416631881147623,..."
6,# 2 pencils not sharpened,"[-0.013498208485543728, -0.011514219455420971,..."


In [7]:
p.head()

Unnamed: 0_level_0,product,embeddings
__index_level_0__,Unnamed: 1_level_1,Unnamed: 2_level_1
306045,<TITLE>WhiteCoat Clipboard- Pink - Respiratory...,"[-0.027882717549800873, -0.021610360592603683,..."
2159511,<TITLE>A Day in the Life of Ireland</TITLE>\n<...,"[0.03405754268169403, 0.005424880422651768, -0..."
3002764,<TITLE>HENRY'S AWFUL MISTAKE</TITLE>,"[0.022827627137303352, -0.014154857955873013, ..."
4126475,<TITLE>David Bellamy's Watercolour Landscape C...,"[0.049933984875679016, 0.028047902509570122, -..."
4127579,<TITLE>Developing Your Watercolours</TITLE>\n<...,"[0.030019331723451614, 0.031795453280210495, -..."


# rank topn according dot product on embeddings

In [8]:
qe = np.stack(q.embeddings.values.copy())
pe = np.stack(p.embeddings.values.copy())

qe.shape, pe.shape

((97345, 3072), (1215851, 3072))

## vector search using `faiss`

In [9]:
import faiss

In [10]:
# search in normalized vector space is equivalent to dot product
kqe = qe / np.linalg.norm(qe, axis=1).reshape(-1,1)
kpe = pe / np.linalg.norm(pe, axis=1).reshape(-1,1)

In [11]:
index = faiss.IndexFlatL2(kqe.shape[1])   # build the index
index.add(kpe)
print(index.is_trained, index.ntotal)

True 1215851


In [12]:
chunk_size = 2048
qnn = []
for i in tqdm(range(0,len(qe), chunk_size)):
    _,idxs = index.search(kqe[i:i+chunk_size], 40) # sanity check
    for ii in idxs:
        qnn.append(ii)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [2:14:04<00:00, 167.59s/it]


In [13]:
q['topn'] = [list(p.index[qnni]) for qnni in tqdm(qnn)]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 97345/97345 [00:03<00:00, 31283.33it/s]


## brute force approach with numpy dotproduct and `numpy.argsort`


In [None]:
# brute force approach with numpy dotproduct and argsort

def get_dotp_closests_idxs(q_embedding,p_embeddings_matrix, topn=40):
    return np.argsort( p_embeddings_matrix.dot(q_embedding) )[-topn:][::-1]

#qnn = Parallel(n_jobs=-1, verbose=5)(delayed(get_dotp_closests_idxs)(qi,pe, topn=40) for qi in qe)
#q['topn'] = [list(p.index[qnni]) for qnni in tqdm(qnn)]

In [1]:
1

1

# create rankings file together with ground truth

In [14]:
qr = []
for query_id, qi in tqdm(q.iterrows()):
    for i, product_id in enumerate(qi.topn):
        qr.append([query_id, product_id, i+1])
qr = pd.DataFrame(qr, columns = ['query_id', 'product_id', 'model_rank'])
qr.index = [f'{row.query_id}|{row.product_id}' for _,row in tqdm(qr.iterrows(), total=len(qr))]
qr = pd.merge(qr, r[['relevance']], how='left', left_index=True, right_index=True).fillna(0)
qr.shape

97345it [00:08, 11493.23it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3893800/3893800 [02:08<00:00, 30260.06it/s]


(3893800, 4)

In [15]:
qr.head()

Unnamed: 0,query_id,product_id,model_rank,relevance
0|B016UQ2YQ0,0,B016UQ2YQ0,1,0.0
0|B078W573T9,0,B078W573T9,2,0.0
0|B016UQ2YLK,0,B016UQ2YLK,3,0.0
0|B007PD7WOW,0,B007PD7WOW,4,0.0
0|B07613VSB4,0,B07613VSB4,5,0.0


In [16]:
qr.relevance.value_counts()

relevance
0.0    3100859
4.0     638877
2.0     118518
1.0      24187
3.0      11359
Name: count, dtype: int64

In [17]:
qr.to_parquet(f'{model_folder}/{dataset}-ranking_with_relevance.parquet', index=False)

In [None]:
qr[qr.product_id=='B000MOO21W']

In [None]:
q