# Batch-run all queries to get the top 40 relevant products for each query

In [1]:
import os

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm

import utils

In [2]:
# Show the path of the Python interpreter this kernel is running on,
# so the environment behind the outputs in this notebook is recorded.
import sys
sys.executable

'/opt/conda/envs/p312/bin/python'

In [3]:
# Run configuration. Both values are interpolated into the parquet input
# paths and the CSV output path; `locale` is also passed to utils.load_examples.
model = 'gemini'
locale = 'es'

In [4]:
# Location of the ESCI data handed to utils.load_examples. Set the
# ESCI_DATASET_ROOT environment variable to override it; the fallback is the
# path this notebook was originally run with.
ESCI_DATASET_ROOT = os.environ.get(
    'ESCI_DATASET_ROOT',
    '/usr/local/google/home/raulramos/projects/esci-data',
)

# Examples table for the selected locale; later cells read its `query_id`
# and `product_id` columns to find the products attached to each query.
dgt = utils.load_examples(ESCI_DATASET_ROOT=ESCI_DATASET_ROOT, locale=locale)

In [5]:
# Query table for this model/locale. Later cells use its index as the query
# ids and read the per-query vector from its `embeddings` column.
q = pd.read_parquet(f'{model}/queries-{locale}.parquet')

In [6]:
# Product table for this model/locale. Later cells look rows up by product id
# through its index and read the per-product vector from `embeddings`.
p = pd.read_parquet(f'{model}/products-{locale}.parquet')

In [7]:
# Sanity check: (rows, columns) of the query table and of the product table.
q.shape, p.shape

((15180, 2), (259973, 2))

In [8]:
# Stack the per-row embedding vectors into dense 2-D matrices
# (one row per query / per product); np.stack allocates a fresh array.
qe = np.stack(q['embeddings'].to_numpy())
pe = np.stack(p['embeddings'].to_numpy())
(qe.shape, pe.shape)

((15180, 768), (259973, 768))

In [9]:
def get_dotp_closests_idxs_onlyannotated(query_id, queries=None, products=None, examples=None):
    """Rank the products attached to ``query_id`` by embedding dot product.

    Only the products listed for ``query_id`` in ``examples`` are scored;
    every other product is ignored.

    Parameters
    ----------
    query_id
        Index label of the query in ``queries``.
    queries : pandas.DataFrame, optional
        Indexed by query id, with an ``embeddings`` column. Defaults to the
        notebook-level ``q``.
    products : pandas.DataFrame, optional
        Indexed by product id, with an ``embeddings`` column. Defaults to the
        notebook-level ``p``.
    examples : pandas.DataFrame, optional
        Table with ``query_id`` and ``product_id`` columns. Defaults to the
        notebook-level ``dgt``.

    Returns
    -------
    list
        Product ids ordered from the highest to the lowest dot product with
        the query embedding. Empty when the query has no products in
        ``examples``.

    Raises
    ------
    KeyError
        If ``query_id`` is missing from ``queries`` or one of its product ids
        is missing from ``products``.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    queries = q if queries is None else queries
    products = p if products is None else products
    examples = dgt if examples is None else examples

    q_embeddings = queries.loc[query_id].embeddings

    prod_ids = examples[examples.query_id == query_id].product_id.values
    if len(prod_ids) == 0:
        # np.stack raises ValueError on an empty sequence; nothing to rank.
        return []

    pq = products.loc[prod_ids]
    p_embeddings = np.stack(pq.embeddings.values)
    scores = p_embeddings.dot(q_embeddings)
    # argsort is ascending, so reverse it to put the best match first.
    return list(pq.index[np.argsort(scores)[::-1]])


In [11]:
# Rank the products of every query in q.index, one task per query.
# n_jobs=-1 uses all available workers; prefer='threads' asks joblib for its
# threading backend. Results come back as a list, one entry per query.
qnn = Parallel(n_jobs=-1, verbose=5, prefer='threads')(
    delayed(get_dotp_closests_idxs_onlyannotated)(query_id)
    for query_id in q.index
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 64 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 322 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 520 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 754 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 1024 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 1330 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 1672 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 2050 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 2464 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 2914 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 3400 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done 3922 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 4480 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done 5074 tasks   

In [12]:
# Attach each query's ranked product ids as a new column (mutates `q` in place).
# qnn was produced by iterating q.index, so positional assignment lines up with
# the rows — assumes Parallel returns results in input order (joblib's default).
# NOTE(review): despite the column name, the lists are not truncated to 40 here;
# each holds every product ranked for that query.
q['top40_products'] = qnn

In [13]:
# Display the query table with the new ranking column (pandas abbreviates the rendering).
q

Unnamed: 0_level_0,query,embeddings,top40_products
query_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,!solid camiseta sin manga,"[-0.04055759, 0.0099594435, 0.020765519, 0.002...","[B07Q3251CF, B088SR4RS3, B07W5H99YX, B095G3N3X..."
4,"""vitamina c""","[0.012071465, 0.002740369, -0.014514849, -0.03...","[B073Q4Z7XL, B073GSMPNW, B08QZ78HGB, B08CZ7F6N..."
31,#27 rubio oscuro extensiones sin clip,"[-0.039591387, 0.015862327, -0.01659879, -0.03...","[B08X6M87Y5, B087FWY7LW, B08R17MCP9, B08M8PDX3..."
110,% pura manteca de karite sin aditivos sin olor,"[0.012794739, 0.01580022, 0.0056419247, -0.040...","[B08V4TTSFG, B08V1ZZY32, B00X1JJLJ8, B00MY7O91..."
130,'el verano sin hombres',"[-0.035371274, -0.020628797, 0.032183185, 0.03...","[8432236950, B098899B4X, 8491819959, B094DQFQX..."
...,...,...,...
115942,árbol navidad,"[-0.024797823, 0.022182858, 0.00029902192, 0.0...","[B07XC16LJ5, B07Z3P86DP, B07GF5VLV1, B07X9182W..."
115943,árbol navidad infantil,"[-0.0296712, 0.003702432, -0.01044168, -0.0066...","[B07Z3P86DP, B07GF5VLV1, B07YKHMF9Y, B07WC578H..."
115944,árbol navidad madera,"[-0.030937139, 0.05824799, 0.0020262604, -0.01...","[B07K2K9JTF, B07L9ZLNVS, B077Q8L3JX, B07ZNQL76..."
115946,árvore de natal,"[-0.0084783295, 0.023883473, 0.0031354125, -0....","[B08GG5GZ9P, B09HRNH652, B01N4HT4UX, B09DXKZ7T..."


In [14]:
# Flatten the per-query rankings into [query_id, product_id] pairs,
# keeping query order and, within each query, the ranked product order.
qr = [
    [query_id, product_id]
    for query_id, row in tqdm(q.iterrows())
    for product_id in row.top40_products
]
    

15180it [00:01, 13174.56it/s]


In [15]:
# Write the flattened ranking, one (query_id, product_id) pair per row,
# without the DataFrame index.
(
    pd.DataFrame(qr, columns=['query_id', 'product_id'])
    .to_csv(
        f'{model}/embeddings_dotp_ranking_onlyannotated_{locale}.csv',
        index=False,
    )
)