# Product matching 

In [0]:
import os
import pandas as pd
from pathlib import Path
from datetime import datetime
# Import the in-house libraries
from modules.csv_reader import CSVFileReader
from modules.sku_matcher import get_confidence
from modules.normalize_text import normalize_text

In [0]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import StringType, FloatType

# Reuse the Spark session when one is already active (as on a platform that
# provides `spark`); otherwise create one, so the notebook also runs from a
# fresh local kernel. `getOrCreate` never replaces an existing session.
spark = SparkSession.builder.getOrCreate()

In [0]:
# Optional while developing: uncomment to reload edited modules automatically
# %load_ext autoreload
# %autoreload 2

# Install the requirements listed in `./requirements.txt` into the kernel's
# environment.
# NOTE(review): this cell sits after the import cells, so on a fresh
# environment the imports run before the packages are installed; consider
# moving it to the top of the notebook.
%pip install -r ./requirements.txt

In [0]:
# Input and output directories, relative to the working directory.
# The constant names are referenced by later cells and are kept as they are.

# Directory of the Marketplace SKU-price catalog
MKPT_PRICE_PATH = './data/mktp_prices/'
# Directory of the web scraping files
WEBSCRAPING_INPUT_PATH = './data/scrapped/'
# Directory where the matching results are written
MATCHING_OUPUT_PATH = './data/matched_parquets/'

## Load the input files
### ABI Marketplace SKU-Price catalog

In [0]:
# Location of the Marketplace price workbook (binary Excel, `.xlsb`)
mktp_prices_path = Path(MKPT_PRICE_PATH) / 'PreciosMarketplaceHistorico.xlsb'

# Read the `PRECIOS` sheet, skipping the first three rows of the sheet
df_mkpt_prices = pd.read_excel(
    mktp_prices_path, 
    sheet_name = 'PRECIOS', engine = 'pyxlsb', skiprows = 3)

print(df_mkpt_prices.columns)

# Source columns used by the matching
sku_colname   = 'SKU NUEVO'
sku_desc_name = 'DESCRIPCIÓN'
price_colname = f'PTR_W{15}'
brand_colname = 'Marca'

# Format the SKU material to string
df_mkpt_prices[sku_colname] = df_mkpt_prices[sku_colname].astype(str)
# Verify that there are no duplicated SKUs
assert not df_mkpt_prices[sku_colname].duplicated().any(), \
    f'Duplicated values found in column {sku_colname!r}'

# Keep only useful columns. Work on an explicit copy so the column assignment
# below does not write into a slice of the original frame.
df_mkpt_prices = df_mkpt_prices[[sku_colname, sku_desc_name, 
                                 price_colname, brand_colname]].copy()

# Include the `brand` info into the `sku_name`.
# The previous code wrote the concatenation into a separate `mkp_sku_id`
# column that was dropped right after, so the brand never reached the name.
# Missing brands are treated as empty text so they do not turn the whole
# name into a missing value.
brand_text = df_mkpt_prices[brand_colname].fillna('').astype(str)
df_mkpt_prices[sku_desc_name] = df_mkpt_prices[sku_desc_name]\
                                .str.cat(brand_text, sep=" ")\
                                .str.strip()

# Drop the brand column and rename the remaining ones
df_mkpt_prices = df_mkpt_prices[[sku_colname, sku_desc_name, price_colname]]
df_mkpt_prices.columns = ['mkp_sku_id', 'mkp_sku_name', 'mkp_price']

# CONVERT TO SPARK
df_mkpt_prices = spark.createDataFrame(df_mkpt_prices)

# Round price values to two decimals
df_mkpt_prices = df_mkpt_prices.withColumn('mkp_price',
                        F.round(df_mkpt_prices['mkp_price'], 2))

@F.udf(StringType())
def normalize_text_udf(text):
    """Spark UDF wrapper around the in-house `normalize_text` helper.

    Forwards `text` to `normalize_text` with `encode='ascii'` and returns
    its result as the value of a string column.
    """
    cleaned = normalize_text(text, encode='ascii')
    return cleaned

# Cleaning phase
# Add `mkp_sku_name_clean`: the SKU name passed through `normalize_text_udf`
raw_mkp_name = F.col('mkp_sku_name')
df_mkpt_prices = df_mkpt_prices.withColumn(
    'mkp_sku_name_clean', normalize_text_udf(raw_mkp_name))

df_mkpt_prices.show()

### Load the web scraping files

In [0]:
# Date to match, written as day-month-year with a two-digit year
date_sf = '02-05-23'
# Root directory of the web scraping files
scraped_path = Path(WEBSCRAPING_INPUT_PATH)

# TODO: Create a function to list all available dates,
# select the max date and check whether that date exists in the sku_matched
# table, and iterate until all available dates have been evaluated

# TODO: The format of the file that Elkin's and Michael's team will send
# is not clear yet

In [0]:
# Date to match (corresponds to a single directory per date, named `ddmmYYYY`)
scraping_date = datetime.strptime(date_sf, "%d-%m-%y").strftime("%d%m%Y")
date_path = scraped_path / scraping_date

# List of .csv files inside the date directory.
# `os.listdir` returns entries in arbitrary order; sorting makes the load
# order (and therefore the `scrap_id` values and the row kept when dropping
# duplicates) reproducible between runs.
scraping_files = sorted(
    date_path / file_name
    for file_name in os.listdir(date_path)
    if file_name.endswith('.csv')
)

# Fail here with a clear message instead of later inside `pd.concat`
if not scraping_files:
    raise FileNotFoundError(f'No .csv files found in {date_path}')

scraping_files

In [0]:
# Read and validate every scraped csv with `CSVFileReader`, then stack the
# resulting frames under a fresh consecutive index
validated_frames = [CSVFileReader(csv_file).validate_file()
                    for csv_file in scraping_files]
scrap_pdf = pd.concat(validated_frames).reset_index(drop = True)

# Positional rename, assuming the validated files follow the pattern
# `sku_name`, `competitor_name`, `url`, `price`
scrap_pdf.columns = ['competitor_sku_name', 'competitor_name', 
                     'competitor_url', 'competitor_price']

scrap_pdf = (
    scrap_pdf
    # Keep the first row of each (product name, competitor) pair
    .drop_duplicates(subset = ['competitor_sku_name', 'competitor_name'])
    # Store the competitor name as a categorical column
    .assign(competitor_name = lambda pdf: pdf['competitor_name'].astype('category'))
    # Expose the remaining index labels as `scrap_id` for future analysis
    .reset_index(names = 'scrap_id')
)

# CONVERT TO SPARK
df_scrap = spark.createDataFrame(scrap_pdf)

# Number of scraped products per competitor
df_scrap.groupby('competitor_name').count().show()

### Perform the text-cleaning phase of web scraping data

In [0]:
# Cleaning phase
# Add `comp_sku_name_clean`: the competitor SKU name passed through
# `normalize_text_udf`
raw_comp_name = F.col('competitor_sku_name')
df_scrap = df_scrap.withColumn(
    'comp_sku_name_clean', normalize_text_udf(raw_comp_name))

# df_scrap.orderBy(F.rand()).limit(10).show()
df_scrap.show()

## SKU matching phase

### Number of evaluations to perform
Calculate the number of pair-wise evaluations to perform:

$$N_{evals} = m \times n$$
where $m$ = *number of Marketplace SKUs* and $n$ = *number of scraped products*.

In [0]:
# Total number of evaluations: one per (marketplace sku, scraped product) pair
_n_rows_mkp, _n_rows_wsp = df_mkpt_prices.count(), df_scrap.count()
_n_evals = _n_rows_mkp * _n_rows_wsp

# Report both table sizes and their product
print(f'[{_n_rows_mkp:,} mkp skus] * [{_n_rows_wsp:,} web scraping skus] ')
print(f'= {_n_evals:,} evaluations')

Define a function to perform the matching phase and keep, for each competitor SKU, the best-matching Marketplace SKU.

### Perform the matching phase

In [0]:
# Define the UDF for confidence matching 
@F.udf(FloatType())
def get_confidence_udf(mktp_sku_clean_name, comp_sku_clean_name):
    """Spark UDF wrapper around the in-house `get_confidence` helper.

    Args:
        mktp_sku_clean_name: cleaned Marketplace SKU name (may be null).
        comp_sku_clean_name: cleaned competitor SKU name (may be null).

    Returns:
        The confidence as a Python `float`, or `None` (a null cell) when
        either name is null or `get_confidence` returns `None`.
    """
    # A null name cannot be compared; propagate the null instead of letting
    # the helper fail inside the Spark job.
    if mktp_sku_clean_name is None or comp_sku_clean_name is None:
        return None
    conf = get_confidence(mktp_sku_clean_name, comp_sku_clean_name)
    if conf is None:
        return None
    # The column is declared as `FloatType`: return a plain Python float so
    # that other numeric types (e.g. `int`, numpy scalars) are not turned
    # into nulls or serialization errors.
    return float(conf)

In [0]:
# Perform a cross join to obtain all pair combinations of mkp and comp products
# Broadcast the marketplace table to enhance performance. Keep only relevant
# columns to avoid memory overload
cj_df = F.broadcast(df_mkpt_prices.select(['mkp_sku_id', 'mkp_sku_name_clean']))\
        .crossJoin(df_scrap.select(["scrap_id", "comp_sku_name_clean"]))

# Perform the pair-wise evaluation using the 'get_confidence'
cj_df = cj_df.withColumn("confidence", 
                 get_confidence_udf(F.col('mkp_sku_name_clean'), 
                                    F.col('comp_sku_name_clean')))

# Now get the best mktp match for each competitor sku
w = Window.partitionBy('scrap_id')
# Rank the candidates of each scraped product by confidence. Ties are broken
# by `mkp_sku_id`, so the selected match is the same on every run instead of
# an arbitrary row picked by `dropDuplicates`.
ranked_w = w.orderBy(F.col('confidence').desc_nulls_last(),
                     F.col('mkp_sku_id').asc())
match_df = cj_df\
        .withColumn('match_rank', F.row_number().over(ranked_w))\
        .where((F.col('match_rank') == 1) & F.col('confidence').isNotNull())\
        .drop('match_rank') # Exactly one row per matched `scrap_id`

### Format the output and save to `.parquet`

1. Keep only relevant matches (confidence at or above the threshold)
2. Include `scraping_date`
3. Keep only relevant columns
4. Compute `price_diff` and `price_index`

In [0]:
# Columns of the output table, in output order
col_to_keep = ['scraping_date', 'mkp_sku_id', 'scrap_id', 'mkp_sku_name', 'competitor_name', 
               'competitor_sku_name', 'confidence', 'mkp_price', 'competitor_price', 
               'price_diff', 'price_index', 'mkp_sku_name_clean', 'comp_sku_name_clean', 
               'competitor_url']

# Scraping date as a `datetime.date`, used for the date column
formatted_date = pd.to_datetime(scraping_date, format = '%d%m%Y').date()

# Confidence threshold: matches below it are discarded
conf_thr = 0.35

# Filter on the confidence before joining so that only the kept matches are
# enriched with the scraping and marketplace attributes (same result as
# filtering after the left joins).
match_df = match_df\
                .select(['scrap_id', 'mkp_sku_id', 'confidence'])\
                .filter(F.col('confidence') >= conf_thr)\
                .join(df_scrap, ['scrap_id'], "left")\
                .join(df_mkpt_prices, ['mkp_sku_id'], "left")\
                .withColumn('scraping_date', F.lit(formatted_date))\
                .withColumn('price_diff',  F.round(F.col('mkp_price') - F.col('competitor_price'), 2))\
                .withColumn('price_index', F.round(F.col('price_diff') / F.col('mkp_price'), 2))\
                .select(col_to_keep)\
                .fillna('', subset = ['competitor_url'])\
                .orderBy(F.col('mkp_sku_id'), F.col('confidence').desc())

# Save to cache: the table is written and counted below
match_df.cache()
# Strip the trailing slash of the directory to avoid a `//` in the path
output_path = f'{MATCHING_OUPUT_PATH.rstrip("/")}/{scraping_date}_prod_matched_TEST.parquet' 
# Overwrite so that re-running the notebook for the same date does not fail
# because the output path already exists
match_df.write.mode('overwrite').parquet(output_path)
match_df.count()
        

In [0]:
# Print a preview of the matched table
match_df.show()

In [0]:
# Number of rows in the matched table
match_df.count()