In [None]:
import torch
import pandas as pd
from app.utils.logger import setup_colored_logger
import numpy as np
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
import gc
from torch.amp.autocast_mode import autocast
from scipy.sparse import lil_matrix

  from tqdm.autonotebook import tqdm, trange


In [10]:
SIMILARITY_MATRIX_PATH = '../data/similarity_matrix.csv'
SIMILARITIES_PATH = '../data/similarities.csv'
DATASET_PATH = '../data/openfoodfacts_sample.pkl'

In [3]:
logger = setup_colored_logger(__name__)

In [None]:
logger.info(f'Loading products from OpenFoodFacts')
product_df = pd.read_pickle(DATASET_PATH)
product_df = product_df.reset_index(drop=True)
logger.info(f'Products loaded from OpenFoodFacts')
logger.info(f'Products shape: {product_df.shape}')

[32m2024-12-03 14:21:31 [INFO] __main__: Loading products from OpenFoodFacts[0m
[32m2024-12-03 14:21:33 [INFO] __main__: Products loaded from OpenFoodFacts[0m
[32m2024-12-03 14:21:33 [INFO] __main__: Products shape: (293311, 44)[0m


In [11]:
DATASET_PART = 0.05

sample_size = int(DATASET_PART * product_df.shape[0])
product_df = product_df.sample(sample_size)
product_df = product_df.reset_index(drop=True)

In [None]:
# product_df.to_pickle(DATASET_PATH)

In [11]:
product_df = pd.read_pickle(DATASET_PATH)

In [12]:
product_df

Unnamed: 0,code,product_name,quantity,categories_tags,categories_en,labels_tags,labels_en,stores,countries_tags,ingredients_tags,...,popularity_tags,completeness,main_category,main_category_en,energy_100g,saturated-fat_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g
0,8888626005935,Pineapple in heavy syrup,565 g net,"en:plant-based-foods-and-beverages,en:plant-ba...","Plant-based foods and beverages,Plant-based fo...","en:halal,en:no-artificial-flavors,en:no-preser...","Halal,No artificial flavors,No preservatives,N...",,en:singapore,"en:pineapple,en:fruit,en:water,en:sugar,en:add...",...,"bottom-25-percent-scans-2021,top-80-percent-sc...",0.8750,en:canned-pineapple-in-pineapple-juice-and-syr...,Canned pineapple in pineapple juice and syrup ...,389.0,0.0,18.0,4.0,0.6,0.0200
1,4026285002733,Bissinger Limette,0.5 l,"en:beverages,en:carbonated-drinks,en:sodas,en:...","Beverages,Carbonated drinks,Sodas,Sweetened be...","en:vegetarian,en:vegan,de:mehrweg","Vegetarian,Vegan,de:mehrweg",,en:germany,"en:natural-mineral-water,en:water,en:mineral-w...",...,"bottom-25-percent-scans-2021,bottom-20-percent...",0.7875,en:sweetened-beverages,Sweetened beverages,186.0,0.0,11.0,,0.0,0.0160
2,03838705,Jamón Serrano,400 g,"en:meats-and-their-products,en:meats,en:prepar...","Meats and their products,Meats,Prepared meats,...",en:green-dot,Green Dot,,en:france,"en:ham,en:animal,en:meat,en:pork,en:pork-meat,...",...,"bottom-25-percent-scans-2019,bottom-20-percent...",0.7750,en:raw-cured-ham,Raw-cured-ham,962.0,3.6,0.1,,26.8,3.5000
3,4337185003639,Pomazánka s lilkem,180 g,"en:plant-based-foods-and-beverages,en:plant-ba...","Plant-based foods and beverages,Plant-based fo...","en:organic,en:vegetarian,en:eu-organic,en:non-...","Organic,Vegetarian,EU Organic,Non-EU Agricultu...",Kaufland,"en:bulgaria,en:czech-republic,en:germany,en:ro...","en:sunflower-oil,en:oil-and-fat,en:vegetable-o...",...,"top-75-percent-scans-2022,top-80-percent-scans...",0.9000,en:vegan-vegetable-spreads,Vegan vegetable spreads,1452.0,3.8,3.6,3.2,3.9,1.1000
4,3596710450381,Allumettes fumees,2 x 100 g,"en:meats-and-their-products,en:prepared-meats,...","Meats and their products,Prepared meats,fr:Cha...",,,Auchan,en:france,,...,"bottom-25-percent-scans-2019,top-80-percent-sc...",0.8875,en:smoked-lardons,Smoked lardons,1071.0,6.0,0.8,,18.0,2.2000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14660,3831051015180,LCA probiotični tekoči jogurt malina,500g,"en:dairies,en:fermented-foods,en:fermented-mil...","Dairies,Fermented foods,Fermented milk product...","sl:brez-dodanega-sladkorja-in-sladil,sl:brez-g...","sl:brez-dodanega-sladkorja-in-sladil,sl:brez-g...",,en:slovenia,"sl:pasterizirano-fermentirano-mleko,sl:oligofr...",...,"bottom-25-percent-scans-2019,bottom-20-percent...",0.9875,en:yogurts,Yogurts,180.0,0.8,4.4,2.8,3.6,0.1000
14661,7613033113697,Chocolat Noir dessert,200 g,"en:snacks,en:desserts,en:sweet-snacks,en:cocoa...","Snacks,Desserts,Sweet snacks,Cocoa and its pro...","en:green-dot,en:pure-cocoa-butter","Green Dot,Pure cocoa butter",,"en:belgium,en:france,en:switzerland","en:sugar,en:added-sugar,en:disaccharide,en:coc...",...,"top-100000-scans-2019,at-least-5-scans-2019,to...",0.8875,fr:chocolats-noirs-dessert,fr:chocolats-noirs-dessert,2287.0,20.9,46.7,7.4,4.9,0.0254
14662,8480013081005,Galletas María Rustica,800 g,"es:galletas-tipo-maria,es:pan-y-reposteria","es:galletas-tipo-maria,es:pan-y-reposteria",,,"SPAR,UPPER",en:spain,"en:wheat-flour,en:cereal,en:flour,en:wheat,en:...",...,"bottom-25-percent-scans-2021,top-80-percent-sc...",0.9875,es:pan-y-reposteria,es:pan-y-reposteria,1837.0,5.0,22.0,2.8,7.0,0.7300
14663,3564700784815,Miel fleur d'oranger,375 g,"en:breakfasts,en:spreads,en:sweet-spreads,en:b...","Breakfasts,Spreads,Sweet spreads,Bee products,...",en:green-dot,Green Dot,Leclerc,en:france,"en:blend-of-eu-and-non-eu-honeys,en:added-suga...",...,"top-50000-scans-2019,top-100000-scans-2019,at-...",0.9875,en:orange-blossom-honeys,Orange blossom honeys,1360.0,0.0,80.0,0.0,0.0,0.0000


In [None]:
def calculate_similarity_matrix(df, product_col='code', categories_col='categories_en', batch_size=128, model_name='all-MiniLM-L6-v2'):
    """
    Similarity matrix calculation using categorical encoding for product IDs
    """
    # Convert product column to categorical and ensure proper type
    df[product_col] = df[product_col].astype('category')
    product_codes = df[product_col].cat.codes.astype('int32')# Używamy int32 zamiast int64
    product_categories = df[product_col].cat.categories
        
    # Set device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # Initialize model
    model = SentenceTransformer(model_name)
    model.to(device)
    
    if torch.cuda.is_available():
        model.half()
    
    n_products = len(df)
    categories = df[categories_col].tolist()
    
    print("Calculating embeddings...")
    all_embeddings = []
    
    for i in tqdm(range(0, n_products, batch_size)):
        batch_categories = categories[i:i + batch_size]
        with torch.no_grad(), autocast(device_type="cuda" if torch.cuda.is_available() else "cpu", dtype=torch.float16):
            batch_embeddings = model.encode(
                batch_categories,
                convert_to_tensor=True,
                device=device,
                normalize_embeddings=True,
                batch_size=32
            )
            if torch.cuda.is_available():
                batch_embeddings = batch_embeddings.half()
            all_embeddings.append(batch_embeddings)
        
        if torch.cuda.is_available() and i % (batch_size * 10) == 0:
            torch.cuda.empty_cache()
    
    embeddings = torch.cat(all_embeddings)
    
    print("Calculating similarities...")
    
    similarity_matrix = lil_matrix((n_products, n_products), dtype=np.float32)
    
    for i in tqdm(range(0, n_products, batch_size)):
        batch_start, batch_end = i, min(i + batch_size, n_products)
        batch_embeddings = embeddings[batch_start:batch_end]
        
        with torch.no_grad(), autocast(device_type="cuda" if torch.cuda.is_available() else "cpu", dtype=torch.float16):
            batch_similarities = util.pytorch_cos_sim(batch_embeddings, embeddings)
            batch_similarities = batch_similarities.float()
            similarity_matrix[batch_start:batch_end] = batch_similarities.cpu().numpy()
        
        if torch.cuda.is_available():
            del batch_similarities
            torch.cuda.empty_cache()
    
    del embeddings, all_embeddings
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    return similarity_matrix, product_codes, product_categories

def save_similarity_matrix(similarity_matrix, product_codes, product_categories, output_file, threshold=0.9, batch_size=10000):
    """
    Memory-efficient similarity matrix saving with categorical product codes
    """
    print("Saving results...")
    
    with open(output_file, 'w') as f:
        f.write('product1,product2,similarity\n')
        logger.info(f'Saving results to {output_file}') 
        n_products = len(product_codes)
        for i in tqdm(range(0, n_products, batch_size)):
            batch_end = min(i + batch_size, n_products)
            logger.info(f'Processing batch {i} - {batch_end}')
            
            for j in range(i, batch_end):
                for k in range(j + 1, n_products):
                    
                    similarity = float(similarity_matrix[j, k])
                    if similarity > threshold:
                        
                        product1 = product_categories[product_codes[j]]
                        product2 = product_categories[product_codes[k]]
                        f.write(f'{product1},{product2},{similarity:.4f}\n')
            
            if i % (batch_size * 10) == 0:
                gc.collect()

In [None]:
batch_size = 128  # Adjust based on your GPU memory
similarity_matrix, product_codes, product_categories = calculate_similarity_matrix(
    product_df,
    batch_size=batch_size
)

save_similarity_matrix(
    similarity_matrix,
    product_codes,
    product_categories,
    SIMILARITY_MATRIX_PATH,
    threshold=0.9,
    batch_size=10000
)

Calculating embeddings...


100%|██████████| 115/115 [01:03<00:00,  1.82it/s]


Calculating similarities...


100%|██████████| 115/115 [00:49<00:00,  2.34it/s]
[32m2024-12-03 14:24:03 [INFO] __main__: Saving results to similarity_matrix.csv[0m


Saving results...


  0%|          | 0/2 [00:00<?, ?it/s][32m2024-12-03 14:24:03 [INFO] __main__: Processing batch 0 - 10000[0m
 50%|█████     | 1/2 [08:32<08:32, 512.24s/it][32m2024-12-03 14:32:35 [INFO] __main__: Processing batch 10000 - 14665[0m
100%|██████████| 2/2 [09:29<00:00, 284.51s/it]


In [None]:
optimized_df = pd.read_csv(SIMILARITY_MATRIX_PATH)

  optimized_df = pd.read_csv('../data/similarity_matrix.csv')


In [18]:
optimized_df

Unnamed: 0,product1,product2,similarity
0,8888626005935,23041103,0.9150
1,8888626005935,3222472572954,0.9077
2,8888626005935,3350031695452,0.9229
3,8888626005935,3080920988456,0.9136
4,8888626005935,600350117825,0.9424
...,...,...,...
992794,5903018640006,7896283000072,0.9023
992795,6191327600012,5400113572237,0.9390
992796,6191327600012,3760074417148,0.9409
992797,8436006913266,3564700784815,0.9399


In [19]:
df_sorted = optimized_df.sort_values(['product1', 'similarity'], ascending=[True, False])

In [20]:
df_sorted = df_sorted.reset_index(drop=True)

In [21]:
df_sorted

Unnamed: 0,product1,product2,similarity
0,28271,3555081196526,1.0000
1,28271,5000128982917,1.0000
2,28271,5010909004509,1.0000
3,28271,39047011304,1.0000
4,28271,3472860001706,0.9922
...,...,...,...
992794,99482493332,8717953127495,0.9321
992795,99482493332,21222686506,0.9253
992796,99482493332,3760211820718,0.9131
992797,99482493332,20284251,0.9111


In [None]:
df_sorted['product1'] = df_sorted['product1'].astype(str).apply(lambda x: x.zfill(8))
df_sorted['product2'] = df_sorted['product2'].astype(str).apply(lambda x: x.zfill(8))

df_sorted.to_csv(SIMILARITIES_PATH, index=False)

In [None]:
df_sorted


Unnamed: 0,product1,product2,similarity
0,00028271,3555081196526,1.0000
1,00028271,5000128982917,1.0000
2,00028271,5010909004509,1.0000
3,00028271,39047011304,1.0000
4,00028271,3472860001706,0.9922
...,...,...,...
992794,99482493332,8717953127495,0.9321
992795,99482493332,21222686506,0.9253
992796,99482493332,3760211820718,0.9131
992797,99482493332,20284251,0.9111


In [1]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118
Note: you may need to restart the kernel to use updated packages.
