In [1]:
import torch
import pandas as pd
from app.utils.logger import setup_colored_logger
import numpy as np
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import gc
from torch.amp.autocast_mode import autocast
from scipy.sparse import lil_matrix

  from tqdm.autonotebook import tqdm, trange


In [2]:
SIMILARITY_MATRIX_PATH = 'app/data/similarity_matrix.csv'
SIMILARITIES_PATH = '../app/data/similarities.csv'
DATASET_PATH = '../app/data/openfoodfacts_sample.pkl'

In [3]:
logger = setup_colored_logger(__name__)

In [6]:
logger.info(f'Loading products from OpenFoodFacts')
product_df = pd.read_pickle(DATASET_PATH)
product_df = product_df.reset_index(drop=True)
logger.info(f'Products loaded from OpenFoodFacts')
logger.info(f'Products shape: {product_df.shape}')

[32m2024-12-09 18:22:29 [INFO] __main__: Loading products from OpenFoodFacts[0m
[32m2024-12-09 18:22:30 [INFO] __main__: Products loaded from OpenFoodFacts[0m
[32m2024-12-09 18:22:30 [INFO] __main__: Products shape: (14664, 45)[0m


In [None]:
# DATASET_PART = 0.05

# sample_size = int(DATASET_PART * product_df.shape[0])
# product_df = product_df.sample(sample_size)
# product_df = product_df.reset_index(drop=True)

In [None]:
# product_df.to_pickle(DATASET_PATH)

In [15]:
for _, row in product_df[product_df['countries_tags'] == 'en:poland'].iterrows():
    print(f"Name: {row['product_name']}, Code: {row['code']}")

Name: Jogurt typu greckiego z jagodami 2,4% tłuszczu, Code: 5900531003400
Name: Holenderski ser kozi półtwardy ser podpuszczkowy z mleka koziego, w plastrach., Code: 5907627471532
Name: Lody truskawkowe, Code: 5907439112074
Name: JoguVege Coco Jagoda, Code: 8595588200956
Name: Bahlsen Hitczekoladowy 220G, Code: 5901414203467
Name: Sorbet mango, Code: 5907180312334
Name: nan, Code: 5900562227134
Name: nan, Code: 04998358
Name: Chleb żytni bez drożdzy, Code: 5907577250461
Name: Pestki Dyni Łuskane, Code: 5905784358000
Name: Berlinki Machos á la Kabanos, Code: 5900567021676
Name: Czekoladowy zająć, Code: 5400265040837
Name: Powidła śliwkowe z odmiany śliwki węgierki, Code: 20496913
Name: Owsiane smoothie owoce lata, Code: 5900168530959
Name: Makaron Staropolski, Code: 5906940007039
Name: Szynka Harnasia Miodowa, Code: 5906190370969
Name: Śmietana mleczna dolina, Code: 5900820009090
Name: Kiełbasa bamberska, Code: 5904215159087
Name: Ketchup łagodny, Code: 5901986081050
Name: Bulion warzyw

In [18]:
product_df["labels_tags"].value_counts().head(15)

labels_tags
en:green-dot                                                        633
en:no-gluten                                                        234
en:vegetarian,en:vegan                                              179
en:made-in-france                                                   130
en:no-preservatives                                                 107
en:nutriscore,en:nutriscore-grade-a                                  88
en:vegetarian                                                        78
en:pdo                                                               70
en:sustainable,en:sustainable-fishery,en:sustainable-seafood-msc     68
en:organic,en:eu-organic,fr:ab-agriculture-biologique                67
en:green-dot,en:made-in-france                                       63
en:organic                                                           62
fr:triman                                                            59
en:nutriscore,en:nutriscore-grade-b                 

In [None]:
product_df

In [None]:
def calculate_similarity_matrix(df, product_col='code', categories_col='categories_en', batch_size=128, model_name='all-MiniLM-L6-v2'):
    """
    Similarity matrix calculation using categorical encoding for product IDs
    """
    # Convert product column to categorical and ensure proper type
    df[product_col] = df[product_col].astype('category')
    product_codes = df[product_col].cat.codes.astype('int32')# Używamy int32 zamiast int64
    product_categories = df[product_col].cat.categories
        
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # Initialize model
    model = SentenceTransformer(model_name)
    model.to(device)
    
    if torch.cuda.is_available():
        model.half()
    
    n_products = len(df)
    categories = df[categories_col].tolist()
    
    print("Calculating embeddings...")
    all_embeddings = []
    
    for i in tqdm(range(0, n_products, batch_size)):
        batch_categories = categories[i:i + batch_size]
        with torch.no_grad(), autocast(device_type="cuda" if torch.cuda.is_available() else "cpu", dtype=torch.float16):
            batch_embeddings = model.encode(
                batch_categories,
                convert_to_tensor=True,
                device=device,
                normalize_embeddings=True,
                batch_size=32
            )
            if torch.cuda.is_available():
                batch_embeddings = batch_embeddings.half()
            all_embeddings.append(batch_embeddings)
        
        if torch.cuda.is_available() and i % (batch_size * 10) == 0:
            torch.cuda.empty_cache()
    
    embeddings = torch.cat(all_embeddings)
    
    print("Calculating similarities...")
    
    similarity_matrix = lil_matrix((n_products, n_products), dtype=np.float32)
    
    for i in tqdm(range(0, n_products, batch_size)):
        batch_start, batch_end = i, min(i + batch_size, n_products)
        batch_embeddings = embeddings[batch_start:batch_end]
        
        with torch.no_grad(), autocast(device_type="cuda" if torch.cuda.is_available() else "cpu", dtype=torch.float16):
            batch_similarities = util.pytorch_cos_sim(batch_embeddings, embeddings)
            batch_similarities = batch_similarities.float()
            similarity_matrix[batch_start:batch_end] = batch_similarities.cpu().numpy()
        
        if torch.cuda.is_available():
            del batch_similarities
            torch.cuda.empty_cache()
    
    del embeddings, all_embeddings
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    return similarity_matrix, product_codes, product_categories

def save_similarity_matrix(similarity_matrix, product_codes, product_categories, output_file, threshold=0.9, batch_size=10000):
    """
    Memory-efficient similarity matrix saving with categorical product codes
    """
    print("Saving results...")
    
    with open(output_file, 'w') as f:
        f.write('product1,product2,similarity\n')
        logger.info(f'Saving results to {output_file}') 
        n_products = len(product_codes)
        for i in tqdm(range(0, n_products, batch_size)):
            batch_end = min(i + batch_size, n_products)
            logger.info(f'Processing batch {i} - {batch_end}')
            
            for j in range(i, batch_end):
                for k in range(j + 1, n_products):
                    
                    similarity = float(similarity_matrix[j, k])
                    if similarity > threshold:
                        
                        product1 = product_categories[product_codes[j]]
                        product2 = product_categories[product_codes[k]]
                        f.write(f'{product1},{product2},{similarity:.4f}\n')
            
            if i % (batch_size * 10) == 0:
                gc.collect()

In [None]:
batch_size = 128  # Adjust based on your GPU memory
similarity_matrix, product_codes, product_categories = calculate_similarity_matrix(
    product_df,
    batch_size=batch_size
)

save_similarity_matrix(
    similarity_matrix,
    product_codes,
    product_categories,
    SIMILARITY_MATRIX_PATH,
    threshold=0.9,
    batch_size=10000
)

Calculating embeddings...


100%|██████████| 115/115 [01:03<00:00,  1.82it/s]


Calculating similarities...


100%|██████████| 115/115 [00:49<00:00,  2.34it/s]
[32m2024-12-03 14:24:03 [INFO] __main__: Saving results to similarity_matrix.csv[0m


Saving results...


  0%|          | 0/2 [00:00<?, ?it/s][32m2024-12-03 14:24:03 [INFO] __main__: Processing batch 0 - 10000[0m
 50%|█████     | 1/2 [08:32<08:32, 512.24s/it][32m2024-12-03 14:32:35 [INFO] __main__: Processing batch 10000 - 14665[0m
100%|██████████| 2/2 [09:29<00:00, 284.51s/it]


In [None]:
optimized_df = pd.read_csv(SIMILARITY_MATRIX_PATH)

  optimized_df = pd.read_csv('../data/similarity_matrix.csv')


In [18]:
optimized_df

Unnamed: 0,product1,product2,similarity
0,8888626005935,23041103,0.9150
1,8888626005935,3222472572954,0.9077
2,8888626005935,3350031695452,0.9229
3,8888626005935,3080920988456,0.9136
4,8888626005935,600350117825,0.9424
...,...,...,...
992794,5903018640006,7896283000072,0.9023
992795,6191327600012,5400113572237,0.9390
992796,6191327600012,3760074417148,0.9409
992797,8436006913266,3564700784815,0.9399


In [19]:
df_sorted = optimized_df.sort_values(['product1', 'similarity'], ascending=[True, False])

In [20]:
df_sorted = df_sorted.reset_index(drop=True)

In [21]:
df_sorted

Unnamed: 0,product1,product2,similarity
0,28271,3555081196526,1.0000
1,28271,5000128982917,1.0000
2,28271,5010909004509,1.0000
3,28271,39047011304,1.0000
4,28271,3472860001706,0.9922
...,...,...,...
992794,99482493332,8717953127495,0.9321
992795,99482493332,21222686506,0.9253
992796,99482493332,3760211820718,0.9131
992797,99482493332,20284251,0.9111


In [None]:
df_sorted['product1'] = df_sorted['product1'].astype(str).apply(lambda x: x.zfill(8))
df_sorted['product2'] = df_sorted['product2'].astype(str).apply(lambda x: x.zfill(8))

df_sorted.to_csv(SIMILARITIES_PATH, index=False)

In [None]:
df_sorted

Unnamed: 0,product1,product2,similarity
0,00028271,3555081196526,1.0000
1,00028271,5000128982917,1.0000
2,00028271,5010909004509,1.0000
3,00028271,39047011304,1.0000
4,00028271,3472860001706,0.9922
...,...,...,...
992794,99482493332,8717953127495,0.9321
992795,99482493332,21222686506,0.9253
992796,99482493332,3760211820718,0.9131
992797,99482493332,20284251,0.9111


In [1]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118
Note: you may need to restart the kernel to use updated packages.
