## String Match Baseline

This notebook takes the cleaned UPC and EC tables from Notebooks 1 and 2 and uses TF-IDF cosine similarity as a string match baseline. This simple baseline is for demonstration purposes only; when run through the evaluation script, it achieves a success@5 score of 0.661 and an NDCG@5 score of 0.388. For more details about the evaluation, please check the evaluation script in the shared folder.

In [None]:
import pandas as pd 
import re
import heapq

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# These processed files come from notebooks 01 and 02; the PPC table is read from the raw data folder.
# All three tables are loaded with dtype=str, so every column is kept as text.
upc = pd.read_csv('upc_cleaned.csv', dtype=str)
ec = pd.read_csv('ec_cleaned.csv', dtype=str)
ppc = pd.read_csv('./raw_data/ppc20152016.csv', dtype=str)

# Some UPC codes in the PPC table never appear in the UPC table; drop those rows for now.
# Do not apply the same filter against the EC table, because PPC contains custom
# ec codes that are not in the EC table.
ppc = ppc[ppc['upc'].isin(upc['upc_code'])]

Custom EC codes are Westat-created codes that do not exist in FNDDS. Please refer to the Custom EC Codes spreadsheet available in the data documentation folder.

In [None]:
# Randomly sample up to 1K records from the PPC table, because a full match is too
# expensive without blocking. The sample size is capped at the table length so that a
# PPC table with fewer than 1000 rows does not make `sample` raise; the fixed
# random_state keeps the sample reproducible.
ppc_clip = ppc.sample(min(1000, len(ppc)), random_state=2498)

# Clip the UPC and EC tables to the codes that occur in the sample.
upc_set = set(ppc_clip['upc'].tolist())
ec_set = set(ppc_clip['ec'].tolist())

# reset_index(drop=True) renumbers the rows 0..n-1 without adding an 'index' column.
upc_clip = upc[upc['upc_code'].isin(upc_set)].reset_index(drop=True)
ec_clip = ec[ec['ec_code'].isin(ec_set)].reset_index(drop=True)

# Fill missing values in the UPC table with empty strings
upc_clip = upc_clip.fillna("")

In [None]:
# Take another look at the data: first rows of the clipped UPC table
upc_clip.head()

In [None]:
# First rows of the clipped EC table
ec_clip.head()

In [None]:
def ngrams_analyzer(string, n=3):
    """Split a string into overlapping character n-grams.

    Commas, hyphens, periods and slashes are removed first; every other
    character (including spaces and letter case) is kept unchanged.

    Args:
        string: Text to tokenize.
        n: Length of each n-gram. Defaults to 3, the length used by this
            baseline. Must be at least 1.

    Returns:
        A list with every length-``n`` substring of the cleaned text, in
        order of appearance. The list is empty when the cleaned text is
        shorter than ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    # Inside the character class, `,-.` is a range covering exactly ',', '-' and '.'.
    string = re.sub(r'[,-./]', r'', string)
    return [string[start:start + n] for start in range(len(string) - n + 1)]

def closest_description(ec, upc, similarity=0, top_n=5):
    """Find the most similar EC descriptions for every UPC description.

    A TF-IDF model over character n-grams (see ``ngrams_analyzer``) is fitted
    on the EC descriptions. Each UPC description is then projected into the
    same space and compared with every EC description by cosine similarity.

    Args:
        ec: DataFrame with an ``ec_description`` column.
        upc: DataFrame with a ``upc_description`` column.
        similarity: Minimum cosine similarity for a candidate to count as a
            match. Candidates scoring below it are labelled ``'No Match'``.
        top_n: Number of candidates kept per UPC description (default 5).

    Returns:
        DataFrame with columns ``upc_desc``, ``closest_desc`` and
        ``closest_distance`` (the cosine similarity). Each UPC row contributes
        ``min(top_n, len(ec))`` rows, ordered from most to least similar;
        tied scores keep the order of the EC table.
    """
    vectorizer = TfidfVectorizer(analyzer=ngrams_analyzer)

    # Plain lists make candidate lookup positional, so the result does not
    # depend on how the input DataFrames happen to be indexed.
    ec_desc = ec['ec_description'].tolist()
    tfidf_matrix = vectorizer.fit_transform(ec_desc)

    upc_desc_list = []
    closest_desc = []
    closest_distance = []
    # Compare each UPC description with every EC description
    for description in upc['upc_description'].tolist():
        query = vectorizer.transform([description])
        scores = cosine_similarity(query, tfidf_matrix)[0]
        # Rank (position, score) pairs by score. Selecting by value membership
        # and truncating could discard the best match when scores are tied.
        best = heapq.nlargest(top_n, enumerate(scores), key=lambda pair: pair[1])
        closest_desc.extend(ec_desc[position] for position, _ in best)
        closest_distance.extend(score for _, score in best)
        # Repeat the UPC description once per candidate actually found, so the
        # three columns stay aligned even when EC has fewer than top_n rows.
        upc_desc_list.extend([description] * len(best))

    closest_df = pd.DataFrame({
        'upc_desc': upc_desc_list,
        'closest_desc': closest_desc,
        'closest_distance': closest_distance,
    })
    # Label the record 'No Match' when the similarity is below the threshold
    closest_df['closest_desc'] = closest_df['closest_desc'].where(
        closest_df['closest_distance'] >= similarity, 'No Match'
    )
    return closest_df


def with_code(match_df, ec, upc):
    """Attach EC and UPC codes to the matched description pairs.

    Rows labelled 'No Match' are discarded. The remaining pairs are
    left-joined to `ec` on the EC description and to `upc` on the UPC
    description, and exact duplicate rows are removed from the result.
    """
    is_matched = match_df['closest_desc'] != 'No Match'
    matched = match_df[is_matched]
    coded = (
        matched
        .merge(ec, left_on='closest_desc', right_on='ec_description', how='left')
        .merge(upc, left_on='upc_desc', right_on='upc_description', how='left')
    )
    return coded.drop_duplicates()

In [None]:
%%time
# Score every clipped UPC description against the clipped EC descriptions (default threshold of 0)
match_df = closest_description(ec_clip, upc_clip)

In [None]:
# First rows of the candidate matches
match_df.head()

In [None]:
# Show the candidate pair(s) whose similarity equals the overall maximum
match_df.loc[match_df['closest_distance'].eq(match_df['closest_distance'].max())]

In [None]:
%%time
# Attach the codes to the matched descriptions and keep only the code,
# description and similarity columns
result = with_code(match_df, ec_clip, upc_clip).loc[
    :, ['upc_code', 'upc_description', 'ec_code', 'ec_description', 'closest_distance']
]

In [None]:
# First rows of the coded matches
result.head()

In [None]:
# Rename the code columns to the PPC column names and save the (upc, ec) pairs
result = result.rename({'upc_code': 'upc', 'ec_code': 'ec'}, axis='columns')
clean_result = result.loc[:, ['upc', 'ec']]
clean_result.to_csv('result/string_submission.csv', index=False)

In [None]:
# First rows of the submission table
clean_result.head()

In [None]:
# Write the sampled PPC (upc, ec) pairs as the ground truth file for evaluation.
# In practice, the PPC table will contain the same UPC codes as the UPC table.
ppc_clip[['upc', 'ec']].to_csv('result/string_ground_truth.csv', index=False)

In [None]:
# First rows of the sampled PPC table
ppc_clip.head()