# Product matching example

In [1]:
import re
import pandas as pd
import numpy as np
import pysnooper
from glob import glob
import time
from datetime import datetime
from pathlib import Path
import sys
from io import StringIO
import configparser

sys.path.append('..')
from modules.normalize_text import normalize_text
from modules.distance_metrics import levenshtein_and_dice_ratio, jaccard_distance

In [2]:
%load_ext autoreload
%autoreload 2

%load_ext memory_profiler

## Web Scrapping Catalog

Scraped `csv` files are located in the `./scraped` directory.
- Each file must contain the following three columns:
    1. `Cadena`: Competitor's name
    2. `SKU`: Name of the product, as shown in the competitor's web page
    3. `Precio final`: Price of the product that will be used to compare against the matching ABI product.


In [45]:
df_scraping_dates.reset_index()

Unnamed: 0,index,datetime,str_date,week_num
0,1,2023-07-28,28-07-23,30
1,0,2023-08-04,04-08-23,31


In [39]:
num_date_to_process = 2

In [48]:
from azure.storage.blob import BlobServiceClient

# Define la conexión a tu cuenta de almacenamiento de Azure Blob
config = configparser.ConfigParser()
config.read('../azure_storage_conf.ini')

account_name = config['AzureBlobStorage']['account_name']
account_key  = config['AzureBlobStorage']['account_key']
blob_service_client = BlobServiceClient(
                        account_url = f"https://{account_name}.blob.core.windows.net", 
                        credential = account_key)

# Define la ruta de la carpeta dentro del contenedor
container_name = config['AzureBlobStorage']['container_name']
container_client = blob_service_client.get_container_client(container= container_name)


# Check avalaible dates
blob_files = list(container_client.list_blob_names())
available_dates = sorted(list(set([i[:8] for i in blob_files])))

# ***********************
# Añadido sólo para evitar errores en el formato de fecha
# TODO: Solicitar que el formato de fecha sea uniforme en todos los archivos
# df_scraping_dates = pd.to_datetime(available_dates, format = "%d-%m-%y").sort_values().to_frame(name = 'datetime')
available_dates_s = pd.Series(available_dates)
df_scraping_dates = pd.to_datetime(available_dates_s, 
                                   format = "%d-%m-%y", errors = 'coerce')\
                      .fillna(pd.to_datetime(available_dates_s, 
                                   format = "%Y-%m-%d", errors = 'coerce'))
df_scraping_dates = df_scraping_dates.sort_values().to_frame(name = 'datetime')
# ***********************

df_scraping_dates['str_date'] = df_scraping_dates['datetime'].dt.strftime("%d-%m-%y")
df_scraping_dates['week_num'] = df_scraping_dates['datetime'].dt.strftime("%U").apply(int)

# BLOB names
_, date_sf, week_num = df_scraping_dates.iloc[num_date_to_process - 1, :]
# blob_names = [i for i in blob_files if i.startswith(date_sf)]
blob_names = blob_files

#######
print(f'Scraping num > {num_date_to_process}')
print('Scripng date:', date_sf)
print('Week num:', week_num)
print('Num of files', len(blob_names))

Scraping num > 2
Scripng date: 04-08-23
Week num: 31
Num of files 19


In [44]:
blob_files

['2023-8-4-Azua.csv',
 '2023-8-4-BodegasAlianza.csv',
 '2023-8-4-CocaCola.csv',
 '2023-8-4-Consuvino.csv',
 '2023-8-4-Dislicores.csv',
 '2023-8-4-FarmaTodo.csv',
 '2023-8-4-Frubana.csv',
 '2023-8-4-Garis.csv',
 '2023-8-4-LaCastellana.csv',
 '2023-8-4-LaEuropea.csv',
 '2023-8-4-LaPlaya.csv',
 '2023-8-4-LaRebajaVirtual.csv',
 '2023-8-4-LaVioleta.csv',
 '2023-8-4-MayoreoTotal.csv',
 '2023-8-4-Scorpion.csv',
 '2023-8-4-SuperMay.csv',
 '2023-8-4-SurteTodo.csv',
 '2023-8-4-SurtiTienda.csv',
 '28-07-23-Alcca_MX.csv']

In [51]:
def get_csvs_from_blob(container_client, 
                       blob_name, encoding = 'utf-8'):
    blob_client = container_client.get_blob_client(blob_name)
    stream = blob_client.download_blob()
    data = stream.content_as_text(encoding = encoding)
    return StringIO(data)

In [52]:
b = blob_files[0]
_df = pd.read_csv(
               get_csvs_from_blob(container_client, 
                                  blob_name = b)
             )
_df

Unnamed: 0,type,company,date,country,site,url,zone,price,name,url.1,image
0,Online,Azua,2023-8-4,MX,www.azua.com.mx,https://www.azua.com.mx/productos/marlboro-sum...,unique,68,Marlboro summer vista 1 cajetilla,//d3ugyf2ht6aenh.cloudfront.net/assets/themes/...,
1,Online,Azua,2023-8-4,MX,www.azua.com.mx,https://www.azua.com.mx/productos/marlboro-roj...,unique,75,Marlboro rojo 1 cajetilla,//d3ugyf2ht6aenh.cloudfront.net/assets/themes/...,
2,Online,Azua,2023-8-4,MX,www.azua.com.mx,https://www.azua.com.mx/productos/pall-mall-to...,unique,68,Pall mall Tokio (1 cajetilla),//d3ugyf2ht6aenh.cloudfront.net/assets/themes/...,
3,Online,Azua,2023-8-4,MX,www.azua.com.mx,https://www.azua.com.mx/productos/corona-light...,unique,CORONA LIGHT LATA C/24 355 ML,//d3ugyf2ht6aenh.cloudfront.net/assets/themes/...,350,
4,Online,Azua,2023-8-4,MX,www.azua.com.mx,https://www.azua.com.mx/productos/estrella-lat...,unique,ESTRELLA LATON C/24 473 ML,//d3ugyf2ht6aenh.cloudfront.net/assets/themes/...,470,
...,...,...,...,...,...,...,...,...,...,...,...
124,Online,Azua,2023-8-4,MX,www.azua.com.mx,https://www.azua.com.mx/productos/pacifico-cla...,unique,Pacífico clara Lata 355ml 24pzs,//d3ugyf2ht6aenh.cloudfront.net/assets/themes/...,380,
125,Online,Azua,2023-8-4,MX,www.azua.com.mx,https://www.azua.com.mx/productos/modelo-laton...,unique,Modelo laton 710 ml-12 pzs,//d3ugyf2ht6aenh.cloudfront.net/assets/themes/...,319,
126,Online,Azua,2023-8-4,MX,www.azua.com.mx,https://www.azua.com.mx/productos/victoria-355...,unique,Victoria 355ml lata 24 Pz,//d3ugyf2ht6aenh.cloudfront.net/assets/themes/...,380,
127,Online,Azua,2023-8-4,MX,www.azua.com.mx,https://www.azua.com.mx/productos/michelob-vid...,unique,MICHELOB VIDRIO 24 UDS 355 ML/ NO RETORNABLE,//d3ugyf2ht6aenh.cloudfront.net/assets/themes/...,418,


In [53]:
_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   type     129 non-null    object 
 1   company  129 non-null    object 
 2   date     129 non-null    object 
 3   country  129 non-null    object 
 4   site     129 non-null    object 
 5   url      129 non-null    object 
 6   zone     129 non-null    object 
 7   price    129 non-null    object 
 8   name     129 non-null    object 
 9   url.1    128 non-null    object 
 10  image    1 non-null      float64
dtypes: float64(1), object(10)
memory usage: 11.2+ KB


# Requerimientos

1. <mark>Nomenclatura uniforme de los archivos.</mark>
    - `fecha_CompetidorName_PaisCode.csv`
    - fecha formato `"dd-mm-yy"`
    - Ejemplo:
        - `04-08-23_Alcca_MX.csv`
        
        
2. <mark>Garantizar que la codificación (encoding) de los archivos sea la misma.</mark>
    - "utf-8"
    - `df.to_csv("file.csv", encoding="utf-8")`
    
    
3. <mark>Estructura uniforme de los archivos.</mark>
    - Tener las mismas columnas (nombre y número) y orden en todos los archivos.
    - Usar uno o más caracteres especiales como separador "`<sp>`"
    
    
4. <mark>En lo posible, dejar el campo de precio únicamente con valores numéricos.</mark>
    - Quitar caractéres como "$", ",", etc.
 

## ABI Marketplace Catalog

In [47]:
df_cat_precios_raw = pd.read_excel(
    '../abi_prices_catalogs/PreciosMarketplaceHistorico.xlsb', 
    sheet_name = 'PRECIOS', engine = 'pyxlsb', skiprows = 3)

print(df_cat_precios_raw.columns)

df_cat_precios = df_cat_precios_raw.copy()
# Sanity check
# Verify that there are no duplicated SKUs?
sku_colname   = 'SKU NUEVO'
sku_desc_name = 'DESCRIPCIÓN'
price_colname = f'PTR_W{15}'
brand_colname = 'Marca'

# Format the SKU material
df_cat_precios[sku_colname] = df_cat_precios[sku_colname]\
                               .apply(lambda x: str(int(x)))
assert not df_cat_precios[sku_colname].duplicated().any()

# Keep only useful columns and rename them
df_cat_precios = df_cat_precios[[sku_colname, sku_desc_name, price_colname, brand_colname]]
df_cat_precios['sku_name'] = df_cat_precios[sku_colname] + ' ' + df_cat_precios[brand_colname]
df_cat_precios = df_cat_precios[[sku_colname, sku_desc_name, price_colname]]
df_cat_precios.columns = ['sku', 'sku_name', 'price']

# Cleaning phase
# Lowercase string fields
df_cat_precios['sku_name_clean'] = df_cat_precios['sku_name']\
    .apply(lambda x: normalize_text(x, encode = 'ascii'))

df_cat_precios.head()

Index(['Categoría', 'SKU NUEVO', 'DESCRIPCIÓN', 'EAN/UPC', 'IMPERDIBLES',
       'IVA', 'IEPS', 'Marca', 'DESCRIPCION_2', 'CUPO', 'PZA', 'PTR',
       'PTR_UNITARIO', 'PTR_W1 ', 'PTR_W2', 'PTR_W3 ', 'PTR_W4', 'PTR_W5',
       'PTR_W6', 'PTR_W7', 'PTR_W8', 'PTR_W9', 'PTR_W10', 'PTR_W11', 'PTR_W12',
       'PTR_W13', 'PTR_W14', 'PTR_W15'],
      dtype='object')


Unnamed: 0,sku,sku_name,price,sku_name_clean
0,3000200,NESTLE PUREZA VITAL 4L - 4PZ,65.84,npv 4lt 4pz
1,3006942,SANTA MARIA 355ML - 24PZS,76.0,santa maria 300ml 24pz
2,3000085,NESTLE PUREZA VITAL 1LT - 12PZS,82.0,npv 1lt 12pz
3,3001443,SANTA MARIA 1L -12PZS,100.0,santa maria 1lt 12pz
4,3000080,SANTA MARIA 1.5L - 12PZS,100.0,santa maria 1.5lt 12pz


# Start with the matching phase

In [49]:
def get_csvs_from_blob(container_client, 
                       blob_name, encoding = 'utf-8'):
    blob_client = container_client.get_blob_client(blob_name)
    stream = blob_client.download_blob()
    data = stream.content_as_text(encoding = encoding)
    return StringIO(data)

In [50]:
date = datetime.strptime(date_sf, "%d-%m-%y").strftime("%d%m%Y")

# scrap_files = glob(f'../scraped/{date}/*.csv')
col_price_name = 'Price'
col_competitor_name = 'Store'
col_sku_name = 'Category'
# Auxiliar cols (Just for Scorpio)
col_quantity = 'Quantity'
#col_desc = 'Description'
col_url = 'Url'

df_files = []
for b in blob_names:
    print(b)
    if ('_COL' in b) | ('LaRebaja' in b):
        print(f'\t{b} es de colombia')
        continue
    if 'Chedraui' in b:
        print('trying')
        _df = pd.read_csv(
               get_csvs_from_blob(container_client, 
                                  blob_name = b)#,
                                  #encoding = 'cp1252'),
               #encoding = 'cp1252', dtype=object#, sep = ";"
             )
    else:
        _df = pd.read_csv(
                  get_csvs_from_blob(container_client, 
                                     blob_name = b), dtype=object
              )
    # Save into the scraping output directory
    scraped_dir = f'../scraped/{date}'
    Path(scraped_dir).mkdir(exist_ok = True)
    _df.to_csv(f'{scraped_dir}/{b}', encoding = 'utf-8')
    if 'Chedraui' in b:
        if 'Estado' in _df.columns:
            _df = _df[_df['Estado'] == 'CDMX y area Metropolitana']
            _df['Store'] = 'Chedraui'
    if 'BodegaAurrera' in b:
        if 'Estado' in _df.columns:
            _df = _df[_df['Estado'] == 'Ciudad de Mexico']
            # Check for null prices:
            print('BA', _df['Price'].isnull().sum(), 'nulos')
            if _df['Price'].isnull().sum() > 100:
                _df['Price'] = _df['Unit_price']
                _df['Store'] = 'Bodega Aurrera'
    _df = _df[[col_competitor_name, 
              col_sku_name, 
              col_price_name,
              col_quantity,
              col_url
             ]]
    df_files.append(_df)

df_scrap = pd.concat(df_files)    
df_scrap['Category'] = df_scrap['Category'] + " " +\
                       df_scrap['Quantity'].fillna('') 
                       #+ " " + df_scrap['Description'].fillna('')
df_scrap.info()

2023-8-4-Azua.csv


KeyError: "None of [Index(['Store', 'Category', 'Price', 'Quantity', 'Url'], dtype='object')] are in the [columns]"

In [12]:
df_scrap['Store'].unique()

array(['Abarrotero', 'Alcca', 'Autoservicio la Playa', 'Alianza',
       'Bodega Aurrera', 'Cava del Duero', 'Consuvino', 'Corpovino',
       'Frubana', 'Ibarra Mayoreo', 'La Europea', 'Mayoreo Total',
       'Scorpio', 'Soriana', 'Surtitienda', 'Walmart', 'La Castellana'],
      dtype=object)

In [13]:
df_scrap['Category'].isnull().groupby(df_scrap['Store']).sum()

Store
Abarrotero               0
Alcca                    0
Alianza                  0
Autoservicio la Playa    0
Bodega Aurrera           0
Cava del Duero           0
Consuvino                0
Corpovino                0
Frubana                  0
Ibarra Mayoreo           0
La Castellana            0
La Europea               0
Mayoreo Total            0
Scorpio                  0
Soriana                  0
Surtitienda              0
Walmart                  0
Name: Category, dtype: int64

In [14]:
df_scrap.groupby('Store')['Store'].count()

Store
Abarrotero                279
Alcca                     599
Alianza                   777
Autoservicio la Playa     821
Bodega Aurrera           1892
Cava del Duero           1996
Consuvino                1378
Corpovino                 234
Frubana                   110
Ibarra Mayoreo           4550
La Castellana            1028
La Europea               1951
Mayoreo Total            6249
Scorpio                  4916
Soriana                   267
Surtitienda              2758
Walmart                  1915
Name: Store, dtype: int64

- The `Precio final` field is not homogeneous across the scraped `.csv` files.
- A cleaning phase for the `Precio final` field is required.

In [15]:
date = datetime.strptime(date_sf, "%d-%m-%y").strftime("%d%m%Y")
# scrap_files = glob(f'../scraped/{date}/*.csv')
col_price_name      = 'Price'
col_competitor_name = 'Store'
col_sku_name        = 'Category'
# Auxiliar cols (Just for Scorpio)
col_quantity        = 'Quantity'
#col_desc = 'Description'
col_url  = 'Url'

In [16]:
%%time
# Cleaning phase
def parse_price(x):
    s = re.sub('\"|\$|\,|[c/u]|[/u]', '', str(x)).strip()
    try:
        price = float(s)
    except ValueError as e:
        price = 0
    return price

cadenas = df_scrap[col_competitor_name].unique()
for cadena in cadenas:
    df_scrap.loc[df_scrap[col_competitor_name] == cadena, col_price_name] = \
        df_scrap.loc[df_scrap[col_competitor_name] == cadena, col_price_name]\
                .apply(parse_price)

# Cast columns
df_scrap['Price'] = df_scrap['Price'].astype(float)

# Cleaning phase
df_scrap['sku_name_clean'] = df_scrap[col_sku_name]\
    .apply(lambda x: normalize_text(x, encode = 'ascii'))
#df_scrap = df_scrap[~ df_scrap['Store'].isnull()]
df_scrap = df_scrap.reset_index(drop = True)
# Save the cleaned parquet
df_scrap.to_parquet(f'{scraped_dir}/{date}_scraped.parquet')
df_scrap.sample(10)

CPU times: user 28.2 s, sys: 1.72 s, total: 29.9 s
Wall time: 30 s


Unnamed: 0,Store,Category,Price,Quantity,Url,sku_name_clean
2121,Alianza,Boligomas de Chamoy Alianza 250 g,61.0,,https://www.bodegasalianza.com/gomitas-alianza...,boligoma chamoy alianza 200g
2900,Bodega Aurrera,Aceite vegetal comestible Sabrosano +30 de so...,45.5,,https://despensa.bodegaaurrera.com.mx/p/Aceite...,aceite vegetal comestible sabrosano +30 soya c...
6613,Consuvino,Vino De Mesa Palacio de Bornos Frizzante Rosad...,239.0,,https://www.consuvino.com.mx/product-page/pala...,vino mesa palacio borno frizzante rosado 700ml
28823,Walmart,Vino Tinto Flor de Pingus 750 ml,4059.0,,https://www.walmart.com.mx/cervezas-vinos-y-li...,vino tinto flor pingu 700ml
28697,Surtitienda,Dulce Duvalin Avellana Vainilla 18 Piezas - Du...,26.9,,https://www.surtitienda.mx/dulce-duvalin-avell...,dulce duvalin avellana vainilla 18pz
2881,Bodega Aurrera,Manteca comestible Aurrera mixta 500 g,35.0,,https://despensa.bodegaaurrera.com.mx/p/Mantec...,manteca comestible aurrera mixta 500g
2565,Bodega Aurrera,Rompope La Holandesa 1 L,125.0,,https://despensa.bodegaaurrera.com.mx/p/Rompop...,rompope holandesa 1lt
1896,Alianza,Whisky The Macallan 25 Years 700ml C/Estuche M...,52000.0,,https://www.bodegasalianza.com/whisky-the-maca...,whisky the macallan 25 years 700ml c estuche m...
28373,Surtitienda,Caramelo Montes Damy Con 100 Caramelos - Montes,54.6,,https://www.surtitienda.mx/caramelo-montes-dam...,caramelo monte damy 100
25622,Scorpio,Yoghurt Danup Bebible Pina Coco 220 Gramos 5 P...,66.0,5 Pieza(s),https://www.scorpion.com.mx/default/yoghurt-da...,yoghurt danup bebible pina coco 200g 5pz


In [17]:
df_scrap[df_scrap['Store'].isnull()]

Unnamed: 0,Store,Category,Price,Quantity,Url,sku_name_clean


In [18]:
df_scrap['Price'].isnull().groupby(df_scrap['Store']).sum()

Store
Abarrotero               0
Alcca                    3
Alianza                  0
Autoservicio la Playa    0
Bodega Aurrera           0
Cava del Duero           0
Consuvino                0
Corpovino                0
Frubana                  0
Ibarra Mayoreo           0
La Castellana            0
La Europea               0
Mayoreo Total            0
Scorpio                  0
Soriana                  0
Surtitienda              0
Walmart                  0
Name: Price, dtype: int64

In [19]:
df_scrap['Category'].isnull().groupby(df_scrap['Store']).sum()

Store
Abarrotero               0
Alcca                    0
Alianza                  0
Autoservicio la Playa    0
Bodega Aurrera           0
Cava del Duero           0
Consuvino                0
Corpovino                0
Frubana                  0
Ibarra Mayoreo           0
La Castellana            0
La Europea               0
Mayoreo Total            0
Scorpio                  0
Soriana                  0
Surtitienda              0
Walmart                  0
Name: Category, dtype: int64

In [20]:
df_scrap.groupby('Store')['Store'].count()

Store
Abarrotero                279
Alcca                     599
Alianza                   777
Autoservicio la Playa     821
Bodega Aurrera           1892
Cava del Duero           1996
Consuvino                1378
Corpovino                 234
Frubana                   110
Ibarra Mayoreo           4550
La Castellana            1028
La Europea               1951
Mayoreo Total            6249
Scorpio                  4916
Soriana                   267
Surtitienda              2758
Walmart                  1915
Name: Store, dtype: int64

In [21]:
df_scrap.shape

(31720, 6)

In [22]:
# Search for incorrect values
error_characters = df_scrap['Category'].apply(lambda x: '√' in str(x))
df_scrap[error_characters]

Unnamed: 0,Store,Category,Price,Quantity,Url,sku_name_clean


In [23]:
df_scrap.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31720 entries, 0 to 31719
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Store           31720 non-null  object 
 1   Category        31720 non-null  object 
 2   Price           31717 non-null  float64
 3   Quantity        15765 non-null  object 
 4   Url             31708 non-null  object 
 5   sku_name_clean  31720 non-null  object 
dtypes: float64(1), object(5)
memory usage: 1.5+ MB


- There are some null values for `Precio Final`

In [24]:
# Are there missing values or 0 values in the price column?
_num_zero = df_scrap[df_scrap['Price'] <= 0].shape[0]
# Number of null values
_num_null = df_scrap['Price'].isnull().sum()
_total_to_omit = _num_zero + _num_null

print(f'There are {_num_zero} zero values and {_num_null} null values.\nTOTAL: {_total_to_omit} from a total of {df_scrap.shape[0]} ({(_total_to_omit / df_scrap.shape[0]) * 100 : .2f}% of values)')

There are 566 zero values and 3 null values.
TOTAL: 569 from a total of 31720 ( 1.79% of values)


In [25]:
#df_scrap = df_scrap.drop(28466)

In [26]:
df_scrap['Price'].isnull().groupby([df_scrap.Store]).sum()

Store
Abarrotero               0
Alcca                    3
Alianza                  0
Autoservicio la Playa    0
Bodega Aurrera           0
Cava del Duero           0
Consuvino                0
Corpovino                0
Frubana                  0
Ibarra Mayoreo           0
La Castellana            0
La Europea               0
Mayoreo Total            0
Scorpio                  0
Soriana                  0
Surtitienda              0
Walmart                  0
Name: Price, dtype: int64

### Drop null values and zero values for Price

In [27]:
df_scrap = df_scrap[~ df_scrap['Category'].isnull()]

In [28]:
df_scrap = df_scrap[df_scrap['Price'] > 0]
df_scrap = df_scrap[~ df_scrap['Price'].isnull()]
df_scrap.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31151 entries, 0 to 31719
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Store           31151 non-null  object 
 1   Category        31151 non-null  object 
 2   Price           31151 non-null  float64
 3   Quantity        15765 non-null  object 
 4   Url             31139 non-null  object 
 5   sku_name_clean  31151 non-null  object 
dtypes: float64(1), object(5)
memory usage: 1.7+ MB


# Matching phase

## Total Matching

In [29]:
# TEST: Combine Jaccard with Lev-Dice distances
def sku_name_conf(a, b) -> float:
    '''Ponderates Levenshtein-Dice distance with 
    Jaccard distances for unit matching'''
    lvd_conf = 1 - levenshtein_and_dice_ratio(a, b)
    jac_dist_units = jaccard_distance(a, b)
    return (lvd_conf) - (jac_dist_units * 0.1)

def exp_matching_value(x, a = 1.2):
    if x == 0:
        return x
    else:
        return np.exp(1 - (1/x**1.1))

def get_confidence(a, b):
    conf = sku_name_conf(a, b)
    conf = exp_matching_value(conf, a = 1.1)
    return round(conf, 2)

In [30]:
def get_performance(s1, s2):
    # Cleaning
    a1 = normalize_text(s1)
    a2 = normalize_text(s2)
    display(a1, a2)
    print()

    # Confidence using only LevDice
    lvd_conf = round(1 - levenshtein_and_dice_ratio(a1, a2), 2)
    jac_conf = round(1 - jaccard_distance(a1, a2), 2)
    comb_conf = get_confidence(a1, a2)
    display(lvd_conf, jac_conf, comb_conf)
    print()
    
    display(sku_name_conf(a1, a2))
    display(get_confidence(a1, a2))

    # Extract units
    b1 = extract_units(a1)
    b2 = extract_units(a2)
    display(b1, b2)
    print()

In [31]:
len(df_scrap[col_competitor_name].unique())

17

In [32]:
df_scrap[col_competitor_name].unique()

array(['Abarrotero', 'Alcca', 'Autoservicio la Playa', 'Alianza',
       'Bodega Aurrera', 'Cava del Duero', 'Consuvino', 'Corpovino',
       'Frubana', 'Ibarra Mayoreo', 'La Europea', 'Mayoreo Total',
       'Scorpio', 'Soriana', 'Surtitienda', 'Walmart', 'La Castellana'],
      dtype=object)

In [33]:
# Total de evaluaciones
_n_evals = df_scrap.shape[0] * df_cat_precios.shape[0]
print(f'[{df_scrap.shape[0]:,} mkp skus] * {df_cat_precios.shape[0]:,}')
print(f'= {_n_evals:,} evaluations')

# SKUs Marketplace n
# SKUs scraping m

# ========
# n * m 
# =======

[31,151 mkp skus] * 247
= 7,694,297 evaluations


In [34]:
col_competitor_name

'Store'

In [35]:
# %%time
# %%memit
# # Mach every ABI SKU with every third-company sku

# __df_tgt    = df_scrap.copy()
# __df_ref    = df_cat_precios.copy()
# col_ref        = 'sku_name_clean'
# target_ref     = 'sku_name_clean'
# target_raw     = 'Category'
# conf_function  = get_confidence
# conf_threshold = 0.75

# # TODO: Vectorize implementation
# for c_num, cadena_name in enumerate(__df_tgt[col_competitor_name].unique()):
#     target_sku_names = []
#     distances        = []
#     prices           = []
#     urls = []
#     start_t = time.time()
#     for ref_sku in __df_ref[col_ref]:
#         __df_slice = __df_tgt[__df_tgt[col_competitor_name] == cadena_name].copy()
#         __df_slice['sku_ref'] = ref_sku
#         __df_slice['conf']    = __df_slice[[target_ref, 'sku_ref']]\
#                                 .apply(lambda x: 
#                                    get_confidence(x[target_ref], x['sku_ref']), 
#                                 axis = 1)
#         best_match = __df_slice.loc[__df_slice['conf'].idxmax()]
#         bm_conf = best_match['conf']
#         # If there are more than one matches a DataFrame will be returned
#         best_match = best_match.to_frame().transpose()
#         if bm_conf >= conf_threshold:
#             target_sku_names.append(best_match[target_raw].values[0])
#             distances.append(best_match['conf'].values[0])
#             prices.append(best_match[col_price_name].values[0])
#             urls.append(best_match[col_url].values[0])
#         else:
#             target_sku_names.append(np.nan)
#             distances.append(np.nan)
#             prices.append(np.nan)
#             urls.append(np.nan)
#     # Append the columns to the main dataframe
    
#     __df_ref[f'c_{cadena_name}_sku']   = target_sku_names
#     __df_ref[f'c_{cadena_name}_conf']  = distances
#     __df_ref[f'c_{cadena_name}_price'] = prices
#     __df_ref[f'c_{cadena_name}_url']   = urls
#     end_t = time.time()
#     print(f'{c_num}) {cadena_name} finished:\n\t' + 
#           f'{len(__df_slice)} skus evaluated. ' +
#           f'{end_t - start_t: .2f} seconds.' +
#           f'\n\t{np.sum(~np.isnan(prices))} matched.')

In [36]:
import concurrent.futures

__df_tgt = df_scrap.copy()
__df_ref = df_cat_precios.copy()
col_ref = 'sku_name_clean'
target_ref = 'sku_name_clean'
target_raw = 'Category'
conf_function = get_confidence
conf_threshold = 0.75
col_competitor_name = 'Store'

import concurrent.futures

# Define a helper function to process the slice for each cadena_name
def process_slice(cadena_name, 
                  col_competitor_name, col_ref, 
                  target_ref, conf_threshold, 
                  col_price_name, col_url, target_raw, 
                  __df_tgt, __df_ref):
    target_sku_names = []
    distances = []
    prices = []
    urls = []
    start_t = time.time()
    for ref_sku in __df_ref[col_ref]:
        __df_slice = __df_tgt[__df_tgt[col_competitor_name] == cadena_name].copy()
        __df_slice['sku_ref'] = ref_sku
        __df_slice['conf'] = __df_slice[[target_ref, 'sku_ref']] \
            .apply(lambda x: get_confidence(x[target_ref], x['sku_ref']),
                   axis=1)
        best_match = __df_slice.loc[__df_slice['conf'].idxmax()]
        bm_conf = best_match['conf']
        best_match = best_match.to_frame().transpose()
        if bm_conf >= conf_threshold:
            target_sku_names.append(best_match[target_raw].values[0])
            distances.append(best_match['conf'].values[0])
            prices.append(best_match[col_price_name].values[0])
            urls.append(best_match[col_url].values[0])
        else:
            target_sku_names.append(np.nan)
            distances.append(np.nan)
            prices.append(np.nan)
            urls.append(np.nan)
    return cadena_name, target_sku_names, distances,\
            prices, urls, time.time() - start_t,\
            np.sum(~np.isnan(prices)), __df_slice.shape[0]

# Define the main function for parallel execution
def parallel_process(__df_tgt, __df_ref, col_competitor_name, col_ref, target_ref, conf_threshold, col_price_name, col_url, target_raw):
    with concurrent.futures.ProcessPoolExecutor() as executor:
        results = []
        for c_num, cadena_name in enumerate(__df_tgt[col_competitor_name].unique()):
            result = executor.submit(process_slice, cadena_name, 
                                     col_competitor_name, col_ref, 
                                     target_ref, conf_threshold, 
                                     col_price_name, col_url, target_raw, 
                                     __df_tgt, __df_ref)
            results.append(result)
        for result in concurrent.futures.as_completed(results):
            cadena_name, target_sku_names, distances, prices, urls, elapsed_time, matched_count, num_skus_evaluated = result.result()
            __df_ref[f'c_{cadena_name}_sku']   = target_sku_names
            __df_ref[f'c_{cadena_name}_conf']  = distances
            __df_ref[f'c_{cadena_name}_price'] = prices
            __df_ref[f'c_{cadena_name}_url'] = urls
            print(f'{cadena_name} finished:\n\t{num_skus_evaluated} skus evaluated. {elapsed_time:.2f} seconds.\n\t{matched_count} matched.')

# Call the main function for parallel execution
parallel_process(__df_tgt, __df_ref, col_competitor_name, 
                 col_ref, target_ref, conf_threshold, 
                 col_price_name, col_url, target_raw)

Corpovino finished:
	234 skus evaluated. 22.07 seconds.
	26 matched.
Abarrotero finished:
	279 skus evaluated. 26.02 seconds.
	12 matched.
Frubana finished:
	110 skus evaluated. 12.29 seconds.
	12 matched.
Alcca finished:
	596 skus evaluated. 53.45 seconds.
	31 matched.
Alianza finished:
	751 skus evaluated. 67.79 seconds.
	32 matched.
Autoservicio la Playa finished:
	821 skus evaluated. 73.35 seconds.
	36 matched.
Soriana finished:
	267 skus evaluated. 25.90 seconds.
	26 matched.
Consuvino finished:
	1378 skus evaluated. 121.81 seconds.
	57 matched.
Bodega Aurrera finished:
	1892 skus evaluated. 167.47 seconds.
	69 matched.
Cava del Duero finished:
	1996 skus evaluated. 176.51 seconds.
	51 matched.
La Europea finished:
	1951 skus evaluated. 172.96 seconds.
	26 matched.
La Castellana finished:
	1028 skus evaluated. 87.47 seconds.
	46 matched.
Surtitienda finished:
	2218 skus evaluated. 183.67 seconds.
	57 matched.
Walmart finished:
	1915 skus evaluated. 161.17 seconds.
	40 matched.
Iba

In [37]:
__df_ref.columns = [c.replace('-', '').replace(' ', '_').replace('__', '_') for c in __df_ref.columns]
__df_ref.shape

(247, 72)

In [38]:
__df_ref

Unnamed: 0,sku,sku_name,price,sku_name_clean,c_Corpovino_sku,c_Corpovino_conf,c_Corpovino_price,c_Corpovino_url,c_Abarrotero_sku,c_Abarrotero_conf,...,c_Ibarra_Mayoreo_price,c_Ibarra_Mayoreo_url,c_Scorpio_sku,c_Scorpio_conf,c_Scorpio_price,c_Scorpio_url,c_Mayoreo_Total_sku,c_Mayoreo_Total_conf,c_Mayoreo_Total_price,c_Mayoreo_Total_url
0,3000200,NESTLE PUREZA VITAL 4L - 4PZ,65.8400,npv 4lt 4pz,,,,,,,...,123.7,https://ibarramayoreo.com/agua-natural-nestle-...,Agua Natural Nestle Pureza Vital Botella 1 Lit...,0.82,9.0,https://www.scorpion.com.mx/default/agua-natur...,,,,
1,3006942,SANTA MARIA 355ML - 24PZS,76.0000,santa maria 300ml 24pz,,,,,,,...,,,Sta Maria Agua Natural 500 Ml 24 Pieza(s),0.88,6.4,https://www.scorpion.com.mx/default/sta-maria-...,Paquete agua Santa Maria 1L/12P 1 litro con 12...,0.82,139.0,https://www.mayoreototal.mx/products/paquete-a...
2,3000085,NESTLE PUREZA VITAL 1LT - 12PZS,82.0000,npv 1lt 12pz,,,,,,,...,90.0,https://ibarramayoreo.com/agua-natural-nestle-...,Agua Natural Nestle Pureza Vital Botella 1 Lit...,1.00,92.4,https://www.scorpion.com.mx/default/agua-natur...,,,,
3,3001443,SANTA MARIA 1L -12PZS,100.0000,santa maria 1lt 12pz,,,,,,,...,,,Sta Maria Agua Natural 500 Ml Pieza,0.82,6.9,https://www.scorpion.com.mx/default/sta-maria-...,Paquete agua Santa Maria 1L/12P 1 litro con 12...,1.00,139.0,https://www.mayoreototal.mx/products/paquete-a...
4,3000080,SANTA MARIA 1.5L - 12PZS,100.0000,santa maria 1.5lt 12pz,,,,,,,...,,,,,,,Paquete agua Santa Maria 1L/12P 1 litro con 12...,0.79,139.0,https://www.mayoreototal.mx/products/paquete-a...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242,3009536,SOPA MARUCHAN RES - 12 PZS,155.0000,sopa maruchan 64g re 12pz,,,,,Sopa Instantanea Sabor Carne de Res 64 g - caj...,0.91,...,143.5,https://ibarramayoreo.com/sopa-maruchan-res-64-g,Sopa Instantanea Maruchan Pollo 64 Gramos 12 P...,0.85,13.7,https://www.scorpion.com.mx/default/sopa-insta...,Caja Sopa Maruchan sabor a carne de res 12P/64...,0.91,207.0,https://www.mayoreototal.mx/products/sopa-maru...
243,3009126,SOPA MARUCHAN CAMARON HABANERO - 12 PZS,162.0000,sopa maruchan 64g camaron habanero 12pz,,,,,"Sopa Instantanea Camaron, Limon y Habanero 64 ...",0.91,...,143.5,https://ibarramayoreo.com/sopa-maruchan-camaro...,Sopa Instantanea Maruchan Camaron Habanero 64 ...,1.00,13.7,https://www.scorpion.com.mx/default/sopa-insta...,Caja sopa Maruchan sabor camaron con limon y c...,0.92,207.0,https://www.mayoreototal.mx/products/maruchan-...
244,3009860,TIEMPO - AIRE,100.0036,tiempo aire,,,,,,,...,,,,,,,,,,
245,3010258,TIEMPO - AIRE D,100.0000,tiempo aire,,,,,,,...,,,,,,,,,,


In [39]:
output_path = f'../outputs/{date}'
Path(output_path).mkdir(parents=True, exist_ok=True)
thr = 0.60
#                   .query(f'c_{seller}_conf > {thr}')\


__dfs = []
for seller in [c.replace('-', '').replace(' ', '_').replace('__', '_') for c in df_scrap[col_competitor_name].unique()]:
    a_cols = ['sku', 'sku_name', 'price']
    b_cols = [c for c in __df_ref.columns if f'c_{seller}' in c]

    __def2 = __df_ref[a_cols + b_cols]
    __def2 = __def2[__def2[f'c_{seller}_sku'].notnull()]\
                    .query('price > 0')\
                    .sort_values(f'c_{seller}_conf')
    __def2['price_diff'] = __def2['price'] - __def2[f'c_{seller}_price']
    __def2 = __def2.round(5)
    __def2.columns = ['mkp_sku_id', 
                      'mkp_sku_name', 
                      'mkp_price', 
                      'competitor_sku_name', 
                      '%_confidence', 
                      'competitor_price', 
                      'competitor_url',
                      'price_diff'
                      ]
    #__def2.to_csv(f'{output_path}/{seller}_top_{top}_matched.csv', index = False)
    __def2['competitor_name'] = seller
    __def2['%_price_diff'] = __def2['price_diff'] / __def2[f'mkp_price']
    __dfs.append(__def2)
    
cols =  ['competitor_name', 
         'mkp_sku_id', 'mkp_sku_name', 'mkp_price', 
         'competitor_sku_name', 'competitor_url', f'competitor_price', 
         'price_diff', '%_price_diff', '%_confidence']
last_df = pd.concat(__dfs)
last_df = last_df[cols]
last_df.competitor_url.fillna('Pending', inplace = True)
# Add scrapped date
last_df['scraping_date'] = pd.to_datetime(date, format = '%d%m%Y')

# total matches
print(last_df.shape)

last_df.to_csv(f'../outputs/{date}/Product_matching_MVP1_{date}.csv', index = False)

(787, 11)


In [40]:
__def2

Unnamed: 0,mkp_sku_id,mkp_sku_name,mkp_price,competitor_sku_name,%_confidence,competitor_price,competitor_url,price_diff,competitor_name,%_price_diff
116,3009553,TEQUILA 100 ANOS REPOSADO 700ML - 1 PZ,170.00809,Tequila Patron Reposado 750 mL,0.78,859.0,https://lacastellana.com/products/tequila-patr...,-688.99191,La_Castellana,-4.052701
104,3009668,BRANDY PRESIDIENTE 500ML - 1PZ,84.01903,Brandy Presidente 700ml,0.8,118.0,https://lacastellana.com/products/brandy-presi...,-33.98097,La_Castellana,-0.404444
109,3009669,BRANDY PRESIDIENTE 900ML - 1PZ,141.52255,Brandy Presidente 700ml,0.8,118.0,https://lacastellana.com/products/brandy-presi...,23.52255,La_Castellana,0.166211
123,3009483,BRANDY PRESIDIENTE 200ML - 6 PZS,214.73305,Brandy Presidente 700ml,0.8,118.0,https://lacastellana.com/products/brandy-presi...,96.73305,La_Castellana,0.45048
96,3009551,BRANDY DON PEDRO CLASICO 200ML - 1PZ,37.59026,Brandy Don Pedro Clasico 750 mL,0.82,149.0,https://lacastellana.com/products/brandy-don-p...,-111.40974,La_Castellana,-2.963793
101,3009673,RON BACARDI CARTA BLANCA 200ML - 1PZ,49.99612,Ron Bacardi Carta Blanca 980ml,0.82,299.0,https://lacastellana.com/products/ron-bacardi-...,-249.00388,La_Castellana,-4.980464
102,3009586,WHISKY JOHNNIE WALKER RED 200ML - 1 PZA,69.99811,Whisky Johnnie Walker Etiqueta Roja 700 ml,0.82,375.0,https://lacastellana.com/products/whisky-johnn...,-305.00189,La_Castellana,-4.357288
108,3010273,WHISKY PASSPORT 350 ML - 1 PZ,113.16125,Whisky Passport 700 mL,0.82,210.0,https://lacastellana.com/products/whisky-passport,-96.83875,La_Castellana,-0.855759
113,3009554,TEQUILA CABRITO REPOSADO 750ML - 1 PZ,146.91794,Tequila Cabrito Reposado 950 mL,0.82,157.0,https://lacastellana.com/products/tequila-cabr...,-10.08206,La_Castellana,-0.068624
127,3009787,Vodka Smirnoff No.21 1000 ml - 1pz,233.26196,Vodka Smirnoff 1000ml,0.88,250.0,https://lacastellana.com/products/vodka-smirno...,-16.73804,La_Castellana,-0.071756
