# Product matching 

In [1]:
import os
import sys
import pandas as pd
import numpy as np
from datetime import datetime
from pathlib import Path

sys.path.append('..')
from modules.normalize_text import normalize_text
from modules.csv_reader import CSVFileReader
from modules.sku_matcher import get_confidence

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `modules` package) are picked up without restarting
# the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Input / output locations (relative to the notebook's working directory).
# Marketplace price workbook directory:
MKPT_PRICE_PATH = './data/mktp_prices/'
# Root directory holding one sub-directory of scraped .csv files per date:
WEBSCRAPING_INPUT_PATH = './data/scrapped/'
# Destination directory for the matched-products parquet files:
MATCHING_OUPUT_PATH = './data/matched_parquets/'

## Load the input files
### ABI Marketplace SKU-Price catalog

In [4]:
# Load the marketplace SKU/price catalog from the `PRECIOS` sheet of the
# historical price workbook (the first 3 rows of the sheet are skipped).
mktp_prices_path = Path(f'{MKPT_PRICE_PATH}/PreciosMarketplaceHistorico.xlsb')

df_mkpt_prices = pd.read_excel(
    mktp_prices_path,
    sheet_name='PRECIOS', engine='pyxlsb', skiprows=3)

print(df_mkpt_prices.columns)

# Source column names used from the workbook
sku_colname   = 'SKU NUEVO'
sku_desc_name = 'DESCRIPCIÓN'
price_colname = f'PTR_W{15}'
# NOTE(review): the brand column is currently NOT used in the matching. The
# previous code concatenated SKU id + brand into a throw-away column that was
# dropped on the very next line (and raised SettingWithCopyWarning). If brand
# should be part of the text to match, it has to be appended to the
# description before normalization -- TODO confirm the intended behavior.
brand_colname = 'Marca'

# Format the SKU material to string
df_mkpt_prices[sku_colname] = df_mkpt_prices[sku_colname].astype(str)
# Sanity check: SKUs must be unique, they are used as the merge key later on
assert not df_mkpt_prices[sku_colname].duplicated().any(), \
    'Duplicated SKUs found in the marketplace price catalog'

# Keep only the useful columns and rename them. `.copy()` makes the result an
# independent frame so the column assignment below does not act on a slice.
df_mkpt_prices = (
    df_mkpt_prices[[sku_colname, sku_desc_name, price_colname]]
    .rename(columns={
        sku_colname: 'mkp_sku_id',
        sku_desc_name: 'mkp_sku_name',
        price_colname: 'mkp_price',
    })
    .copy()
)

# Cleaning phase: normalized (ASCII-encoded) version of the SKU description,
# which is the text compared against the scraped product names
df_mkpt_prices['mkp_sku_name_clean'] = df_mkpt_prices['mkp_sku_name']\
    .apply(lambda x: normalize_text(x, encode='ascii'))

df_mkpt_prices.head()

Index(['Categoría', 'SKU NUEVO', 'DESCRIPCIÓN', 'EAN/UPC', 'IMPERDIBLES',
       'IVA', 'IEPS', 'Marca', 'DESCRIPCION_2', 'CUPO', 'PZA', 'PTR',
       'PTR_UNITARIO', 'PTR_W1 ', 'PTR_W2', 'PTR_W3 ', 'PTR_W4', 'PTR_W5',
       'PTR_W6', 'PTR_W7', 'PTR_W8', 'PTR_W9', 'PTR_W10', 'PTR_W11', 'PTR_W12',
       'PTR_W13', 'PTR_W14', 'PTR_W15'],
      dtype='object')


Unnamed: 0,mkp_sku_id,mkp_sku_name,mkp_price,mkp_sku_name_clean
0,3000200,NESTLE PUREZA VITAL 4L - 4PZ,65.84,npv 4lt 4pz
1,3006942,SANTA MARIA 355ML - 24PZS,76.0,santa maria 300ml 24pz
2,3000085,NESTLE PUREZA VITAL 1LT - 12PZS,82.0,npv 1lt 12pz
3,3001443,SANTA MARIA 1L -12PZS,100.0,santa maria 1lt 12pz
4,3000080,SANTA MARIA 1.5L - 12PZS,100.0,santa maria 1.5lt 12pz


### Load the web scraping files

In [5]:
# Scraping date to process, written as day-month-year (`dd-mm-yy`)
date_sf = '02-05-23'
# Root directory that contains the scraped files
scraped_path = Path(WEBSCRAPING_INPUT_PATH)

# TODO: Create a function to list all available dates,
# select the max date and check whether that date exists in the sku_matched
# table; iterate until all available dates have been evaluated.

# TODO: The format of the file that Elkin's and Michael's teams
# are going to send is not clear yet.

In [6]:
# Each scraping date lives in its own directory named `ddmmYYYY`
scraping_date = datetime.strptime(date_sf, "%d-%m-%y").strftime("%d%m%Y")
date_path = scraped_path / scraping_date

# Collect every .csv file found directly inside the date directory
scraping_files = [
    date_path / fname
    for fname in os.listdir(date_path)
    if fname.endswith('.csv')
]

scraping_files

[PosixPath('data/scrapped/02052023/02-05-23-BodegaAurrera_MX.csv'),
 PosixPath('data/scrapped/02052023/02-05-23-Soriana_MX.csv'),
 PosixPath('data/scrapped/02052023/02-05-23-Walmart_MX.csv'),
 PosixPath('data/scrapped/02052023/02-05-23-Scorpion_MX.csv'),
 PosixPath('data/scrapped/02052023/02-05-23-CavaDelDuero_MX.csv'),
 PosixPath('data/scrapped/02052023/02-05-23-laCastellana_MX.csv'),
 PosixPath('data/scrapped/02052023/02-05-23-Surtitienda_MX.csv'),
 PosixPath('data/scrapped/02052023/02-05-23-MayoreoTotal_MX.csv'),
 PosixPath('data/scrapped/02052023/02-05-23-BodegaAlianza_MX.csv'),
 PosixPath('data/scrapped/02052023/02-05-23-Autoservicio-Laplaya_MX.csv'),
 PosixPath('data/scrapped/02052023/02-05-23-Alcca_MX.csv'),
 PosixPath('data/scrapped/02052023/02-05-23-LaEuropea_MX.csv'),
 PosixPath('data/scrapped/02052023/02-05-23-IbarraMayoreo_MX.csv'),
 PosixPath('data/scrapped/02052023/02-05-23-Frubana_MX.csv'),
 PosixPath('data/scrapped/02052023/02-05-23-Corpovino_MX.csv'),
 PosixPath('data/

In [7]:
# Read and validate every scraped file with `CSVFileReader`, then stack the
# results into one table with a fresh 0..n-1 index
scraped_frames = [CSVFileReader(csv_path).validate_file() for csv_path in scraping_files]
df_scrap = pd.concat(scraped_frames).reset_index(drop=True)

# Rename the columns assuming the following positional pattern:
# `sku_name`, `competitor_name`, `url`, `price`
df_scrap.columns = [
    'competitor_sku_name',
    'competitor_name',
    'competitor_url',
    'competitor_price',
]

# Store the competitor name as a categorical column
df_scrap['competitor_name'] = df_scrap['competitor_name'].astype('category')

# Number of scraped products per competitor
df_scrap.value_counts('competitor_name')

competitor_name
Mayoreo Total            6249
Scorpio                  4916
Ibarra Mayoreo           4209
Surtitienda              2758
Bodega Aurrera           2100
La Europea               1951
Walmart                  1915
Consuvino                1378
Cava del Duero           1086
La Castellana            1028
Autoservicio la Playa     821
Alianza                   777
Alcca                     596
Abarrotero                279
Soriana                   267
Corpovino                 234
Frubana                   110
Name: count, dtype: int64

### Perform the text-cleaning phase of web scraping data

In [8]:
# Cleaning phase: build the normalized (ASCII-encoded) product name that is
# compared against the marketplace catalog
raw_names = df_scrap['competitor_sku_name']
df_scrap['comp_sku_name_clean'] = raw_names.apply(
    lambda name: normalize_text(name, encode='ascii')
)
df_scrap.sample(10)

Unnamed: 0,competitor_sku_name,competitor_name,competitor_url,competitor_price,comp_sku_name_clean
27911,Aceite Pam Aerosol Oliva 141 g Caja -12 articu...,Ibarra Mayoreo,https://ibarramayoreo.com/aceite-pam-aerosol-o...,788.6,aceite pam aerosol oliva 100g 12pz
1294,Alimento liquido de soya AdeS sabor manzana 9...,Bodega Aurrera,https://despensa.bodegaaurrera.com.mx/p/Alimen...,25.0,alimento liquido soya ade manzana 900ml
15272,Plumas de Gel Best Trading 44 Pzas - ZK 44 piezas,Mayoreo Total,https://www.mayoreototal.mx/products/plumas-de...,367.0,pluma gel best trading 44pz zk
6441,Frijoles Enteros Negros La Costena 560 gr 12 P...,Scorpio,https://www.scorpion.com.mx/default/la-coste-a...,13.3,frijole entero black costena 500g 12pz
23998,Vino Tinto Cruz de Alba Tempranillo Crianza 2...,La Europea,https://www.laeuropea.com.mx/vt-cruz-de-alba-t...,798.49,vino tinto cruz alba tempranillo crianza 2015 ...
6504,Nescafe Clasico 42 gr 16 Pieza(s),Scorpio,https://www.scorpion.com.mx/default/nescafe-cl...,499.2,nescafe clasico 42g 16pz
9892,Tequila Viuda de Romero Reposado 1l,Cava del Duero,https://sur.cavadelduero.com/producto/tequila-...,198.47,teq viuda romero rep 1lt
23930,Vino Rosado Cabernet Sauvignon Lorenza -750 ML,La Europea,https://www.laeuropea.com.mx/vr-cab-sauv-rosad...,131.63,vino rosado cabernet sauvignon lorenza 700ml
10769,Vino Rosado Sutter Home White Zinfandel 750 mL,La Castellana,https://lacastellana.com/products/vino-rosado-...,259.0,vino rosado sutter home white zinfandel 700ml
20135,Arroz Blanco Schettino Precocido 5K - ZK 5 kg,Mayoreo Total,https://www.mayoreototal.mx/products/arroz-bla...,202.0,arroz blanco schettino precocido 5k 5kg


## SKU matching phase

### Number of evaluations to perform
Calculate the number of evaluations to perform:

$$N_{evals} = m * n$$
where $m$= *number of Marketplace skus* and $n$ = *number of scraped products*

In [9]:
# Total number of pairwise evaluations (marketplace SKUs x scraped products)
n_mkp_skus = df_mkpt_prices.shape[0]
n_scraped_skus = df_scrap.shape[0]
_n_evals = n_scraped_skus * n_mkp_skus
print(f'[{n_mkp_skus:,} mkp skus] * [{n_scraped_skus:,} web scraping skus] ')
print(f'= {_n_evals:,} evaluations')

[247 mkp skus] * [30,674 web scraping skus] 
= 7,576,478 evaluations


Define a function that, for a given competitor product, computes the matching confidence against every marketplace SKU and keeps only the best match.

In [10]:
def get_best_match_mktp_sku(comp_sku_clean_name: str) -> tuple:
    """
    Get the best matching marketplace SKU and its confidence score
    for a given cleaned competitor's SKU name.

    Args:
        comp_sku_clean_name: Normalized competitor product name.

    Returns:
        A `(mkp_sku_id, confidence)` tuple: the id of the marketplace SKU
        with the highest confidence and that confidence value. On ties the
        first SKU with the maximum confidence is returned.

    Note:
        This function reads the module-level DataFrame `df_mkpt_prices`,
        which must contain the 'mkp_sku_name_clean' and 'mkp_sku_id' columns.
    """
    # Confidence between the competitor name and every marketplace SKU name
    confs = df_mkpt_prices['mkp_sku_name_clean'].apply(
        lambda mkp_sku: get_confidence(mkp_sku, comp_sku_clean_name))
    # `argmax` returns a *position*, so it must be used with positional
    # accessors (`iat`); a label-based lookup would only be correct when the
    # catalog happens to have a default 0..n-1 index.
    best_pos = confs.argmax()
    best_conf = confs.iat[best_pos]
    best_mkp_sku = df_mkpt_prices['mkp_sku_id'].iat[best_pos]
    return best_mkp_sku, best_conf

In [11]:
# For every scraped product, find the closest marketplace SKU and its
# confidence; `result_type='expand'` splits the returned tuple into 2 columns
best_matches = df_scrap.apply(
    lambda row: get_best_match_mktp_sku(row['comp_sku_name_clean']),
    axis=1,
    result_type='expand',
)
df_scrap[['mkp_sku', 'match_conf']] = best_matches

# Enrich the scraped table with the marketplace catalog data of the matched SKU
df_scrap = df_scrap.merge(
    df_mkpt_prices,
    left_on='mkp_sku',
    right_on='mkp_sku_id',
)

### Format the output and save to `.parquet`

1. Keep only relevant matches
2. Include `scraping_date`
3. Keep only relevant columns
4. Compute `price_diff` and `price_index`

In [12]:
# Keep only relevant matches based on the confidence index
conf_thr = 0.65
df_match = df_scrap[df_scrap['match_conf'] >= conf_thr].reset_index(drop=True)

# Include the scraping date
df_match['scraping_date'] = pd.to_datetime(scraping_date, format='%d%m%Y')

# TODO: Include country?

# Keep relevant columns; `.copy()` makes `df_match` an independent frame so
# the column assignments below do not operate on a slice
col_to_keep = ['scraping_date', 'mkp_sku_id', 'mkp_sku_name', 'competitor_name',
               'competitor_sku_name', 'match_conf', 'mkp_price', 'competitor_price',
               'competitor_url']
df_match = df_match[col_to_keep].copy()

# Compute price diff (marketplace price minus competitor price).
# Both operands must come from `df_match`: it was filtered and re-indexed, so
# combining it with `df_scrap` columns would align on index and pair each
# match with the prices of an unrelated row.
df_match['price_diff'] = df_match['mkp_price'].sub(df_match['competitor_price'])

# Compute price index: price difference relative to the marketplace price
df_match['price_index'] = df_match['price_diff'].div(df_match['mkp_price'])

# Flag missing URLs. Assign the result back instead of using a chained
# `inplace=True`, which may act on a temporary copy and leave `df_match` unchanged.
df_match['competitor_url'] = df_match['competitor_url'].fillna('Pending')

# Save to parquet
df_match.to_parquet(f'{MATCHING_OUPUT_PATH}/{scraping_date}_prod_matched.parquet')

In [13]:
# Display the final table of matched products (the table that was saved to parquet)
df_match

Unnamed: 0,scraping_date,mkp_sku_id,mkp_sku_name,competitor_name,competitor_sku_name,match_conf,mkp_price,competitor_price,competitor_url,price_diff,price_index
0,2023-05-02,3010278,LALA LECHE DESLACTOSADA 1L - 12 PZS,Bodega Aurrera,Leche Lala evaporada 250 ml,0.70,261.3000,15.5,https://despensa.bodegaaurrera.com.mx/p/Leche-...,245.8,0.940681
1,2023-05-02,3010278,LALA LECHE DESLACTOSADA 1L - 12 PZS,Bodega Aurrera,Leche Lala deslactosada caja con 6 pzas de 1 l...,0.93,261.3000,138.0,https://despensa.bodegaaurrera.com.mx/p/Leche-...,123.3,0.471871
2,2023-05-02,3010278,LALA LECHE DESLACTOSADA 1L - 12 PZS,Bodega Aurrera,Leche Cafe con Lala deslactosada 960 ml,0.82,261.3000,36.0,https://despensa.bodegaaurrera.com.mx/p/Leche-...,225.3,0.862227
3,2023-05-02,3010278,LALA LECHE DESLACTOSADA 1L - 12 PZS,Bodega Aurrera,Leche Lala deslactosada 1 l,0.95,261.3000,27.0,https://despensa.bodegaaurrera.com.mx/p/Leche-...,234.3,0.89667
4,2023-05-02,3010278,LALA LECHE DESLACTOSADA 1L - 12 PZS,Bodega Aurrera,Leche Lala deslactosada 1.5 l,0.83,261.3000,38.0,https://despensa.bodegaaurrera.com.mx/p/Leche-...,223.3,0.854573
...,...,...,...,...,...,...,...,...,...,...,...
3223,2023-05-02,3009515,VOGUE 600HOJAS 4UND - 10PZS,Ibarra Mayoreo,Papel Higienico Vogue 600 hojas 4 rollos Bolsa...,1.00,258.0000,329.9,https://ibarramayoreo.com/papel-higienico-vogu...,-71.9,-0.278682
3224,2023-05-02,3009515,VOGUE 600HOJAS 4UND - 10PZS,Ibarra Mayoreo,Papel Higienico Vogue 600 hojas 6 rollos Bolsa...,0.90,258.0000,388.1,https://ibarramayoreo.com/papel-higienico-vogu...,-130.1,-0.504264
3225,2023-05-02,3009501,WHISKAS ATUN 85G - 8PZS,Ibarra Mayoreo,Alimento Whiskas Atun 85 g Caja -8 articulo(s),0.76,61.9208,83.2,https://ibarramayoreo.com/alimento-whiskas-atu...,-21.2792,-0.343652
3226,2023-05-02,3009501,WHISKAS ATUN 85G - 8PZS,Ibarra Mayoreo,Alimento Whiskas Atun 85 g Caja -8 articulo(s),0.76,61.9208,83.7,https://ibarramayoreo.com/alimento-whiskas-atu...,-21.7792,-0.351727
