### Versions of libraries used
numpy==1.24.3
pandas==1.3.4
rapidfuzz==3.6.1
ipywidgets==8.1.1


In [0]:
%pip install -U ipywidgets==8.1.1
import pandas as pd, numpy as np
import os
import io
import json
import datetime as dt
from rapidfuzz import process, fuzz,utils
from ipywidgets import widgets
import datetime as dt

# Print NumPy arrays in full instead of summarising long ones with "...".
np.set_printoptions(threshold=np.inf)
# Turn off pandas' chained-assignment (SettingWithCopy) warning.
pd.set_option('chained_assignment',None)

### Departments that can be selected for each attribute type.
### Keys are the attribute names; values are the department names, in display order.
attr_dpt_map = {
    'Color': [
        'Kitchen & Dining',
        'Home Decor (excluding Rugs)',
        'Home Decor (Rugs only)',
        'Floral',
    ],
    'Flavor': [
        'Pantry',
        'Beverages',
        'Ice Cream',
        'Candy',
    ],
    'Scent': [
        'Cleaning & Household',
        'Beauty',
        'Personal Care',
    ],
}

### Text substitutions meant to be applied to the input text before fuzzy matching:
### separator characters become spaces and 'COFFEE TABLE' collapses to 'TABLE'.
### NOTE(review): no code visible in this notebook reads this mapping - confirm it is still needed.
replacements = {
    ',': ' ',
    '/': ' ',
    '>': ' ',
    '<': ' ',
    'COFFEE TABLE': 'TABLE',
}

### Remove substrings like 'CHOCOLATE' if 'CHOCOLATE CHIP' is also present as a top result
def remove_substrings(string_list):
    """Drop every string that is contained in another string of the list.

    Example: 'CHOCOLATE' is removed when 'CHOCOLATE CHIP' is also present.

    The strings are first ordered by their number of space-separated words
    (fewest first; ties keep their input order). A string is then dropped when
    it occurs as a plain substring of any string that comes after it in that
    order. For exact duplicates only the last occurrence is kept.

    Parameters
    ----------
    string_list : list of str
        Candidate strings. May be empty.

    Returns
    -------
    list of str
        The surviving strings, ordered by word count.
    """
    # Plain Python instead of a DataFrame: the previous version flagged rows
    # through chained assignment (`df.Drop.loc[j] = ...`), which only worked
    # with the pandas warning silenced and does nothing under Copy-on-Write.
    ordered = sorted(string_list, key=lambda text: len(text.split(' ')))
    return [
        text
        for position, text in enumerate(ordered)
        if not any(text in longer for longer in ordered[position + 1:])
    ]

#Match using 'token set ratio' with threshold
def color_match(item, color_list_org, subs,cutoff = 90):
    """Return the normalized attribute value(s) found in an item description.

    Steps:
      1. Fuzzy-match `item` against `color_list_org` with rapidfuzz's
         `token_set_ratio` (at most 7 candidates scoring >= `cutoff`).
      2. Keep only the candidates that share the best score.
      3. Expand abbreviations (a value without spaces that equals a spaced
         value once its spaces are removed) and apply `subs`.
      4. Re-rank the survivors with `partial_ratio`, keep the best-scoring
         ones and drop values contained in another surviving value.
      5. Order the result by where each value's words appear in `item`.

    Parameters
    ----------
    item : str
        Item description text.
    color_list_org : list of str
        Normalized attribute values to match against.
    subs : dict or None
        Maps a matched value to the value that should be reported instead.
    cutoff : int, optional
        Minimum `token_set_ratio` score for a candidate (default 90).

    Returns
    -------
    str
        The matched values joined with ';', or 'OTHER' when no value reaches
        `cutoff`.
    """
    subs = subs or {}

    # Map each space-free value to its spaced spelling, when the list contains one.
    # The check has to look for a space: testing for '' is true for every
    # string, which left this mapping permanently empty.
    compact_values = {value for value in color_list_org if ' ' not in value}
    abbrev_maps = {}
    for value in color_list_org:
        if ' ' in value:
            collapsed = value.replace(' ', '')
            if collapsed in compact_values:
                abbrev_maps[collapsed] = value

    matched = process.extract(item, color_list_org, score_cutoff= cutoff, scorer = fuzz.token_set_ratio, processor=utils.default_process, limit= 7)
    if not matched:
        return 'OTHER'

    # process.extract returns results best-first, so the first score is the maximum.
    max_score = matched[0][1]
    match_list = [result[0] for result in matched if result[1] == max_score]
    match_list = [abbrev_maps.get(value, value) for value in match_list]
    # Use the `subs` argument rather than a notebook-level global, so the
    # function does not depend on a later cell having been run.
    match_list = [subs.get(value, value) for value in match_list]

    rematched = process.extract(item, match_list,  scorer = fuzz.partial_ratio, processor=utils.default_process, limit = 5)
    re_max_score = rematched[0][1]
    match_list = remove_substrings([result[0] for result in rematched if result[1] == re_max_score])

    # Order the values according to their appearance in the description text.
    # NOTE(review): this containment test is case-sensitive, unlike the fuzzy
    # matching above (default_process lower-cases) - confirm that is intended.
    match_list_ordered = []
    for word in item.split(' '):
        if not word:
            # Consecutive spaces yield '', which is a substring of every value
            # and would otherwise defeat the ordering.
            continue
        for color_token in match_list:
            if word in color_token:
                match_list_ordered.append(color_token)

    final_values = match_list_ordered if match_list_ordered else match_list
    # dict.fromkeys removes duplicates while keeping first-seen order.
    return ';'.join(dict.fromkeys(final_values))

Note: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.
Note: you may need to restart the kernel using dbutils.library.restartPython() to use updated packages.


### Select the attribute

In [0]:
### Select an Attribute
# Build the attribute dropdown first, then prompt and render it.
# 'Color' is pre-selected; the chosen value is read by a later cell.
attrib_w = widgets.Dropdown(
    description='Attribute:',
    options=['Color','Flavor','Scent'],
    value='Color',
    disabled=False,
)
print('Select an attribute')
display(attrib_w)

Select an attribute


Dropdown(description='Attribute:', options=('Color', 'Flavor', 'Scent'), value='Color')

### Select the department / item category

In [0]:
### Select the Customer Facing Department
# Snapshot the attribute chosen in the dropdown above; the departments
# offered below depend on it.
attrib = attrib_w.value
dept_options = attr_dpt_map[attrib]
print('Select a Department for ' + attrib +  ' Attribution')
# The first department of the list is pre-selected.
dpt_w = widgets.Dropdown(
    description='Department:',
    options=dept_options,
    value=dept_options[0],
    disabled=False,
)
display(dpt_w)

Select a Department for Flavor Attribution


Dropdown(description='Department:', options=('Pantry', 'Beverages', 'Ice Cream', 'Candy'), value='Pantry')

### Instructions for Uploading input file containing item descriptions
File Format requirements:
1. Must be .csv
2. All listed items must belong to the category selected above. 
3. Must have a column named 'DESCRIPTION' containing item description.
4. May contain any additional columns (such as 'GTIN')


In [0]:
# Upload control for the input file: restricted to a single .csv file.
# `accept` takes a file extension or MIME pattern, e.g. '.txt', '.pdf', 'image/*', 'image/*,.pdf';
# `multiple=False` allows only one file per upload.
upload_w = widgets.FileUpload(multiple=False, accept='.csv')
print('Upload the file containing item descriptions to be used for attribute extraction')
display(upload_w)

Upload the file containing item descriptions to be used for attribute extraction


FileUpload(value={}, accept='.csv', description='Upload')

#### Run the cell below to generate and save the output file
#### Output path: 
Files will be saved in:
/dbfs/FileStore/tables/NORMALIZED_ATTRIBUTE_OUTPUTS

##### Filename format: 
If the input file is 'XYZ.csv', the output file will be called 'NormalIzed_<Attribute>_<MMDDYYYY>_XYZ.csv', e.g. 'NormalIzed_Color_01112024_XYZ.csv'.


In [0]:
### Read uploaded file
def _first_upload(upload_value):
    """Return (file_name, content_bytes) for the first uploaded file.

    Accepts both shapes of `FileUpload.value`:
      * ipywidgets 7.x - dict keyed by file name, each entry holding
        'metadata' (with 'name') and 'content';
      * ipywidgets 8.x - tuple of per-file dicts holding 'name' and
        'content' (a memoryview).

    Raises ValueError when nothing has been uploaded yet.
    """
    if not upload_value:
        raise ValueError('No file uploaded. Upload a .csv file in the cell above, then re-run this cell.')
    if isinstance(upload_value, dict):
        entry = next(iter(upload_value.values()))
        return entry['metadata']['name'], bytes(entry['content'])
    entry = upload_value[0]
    return entry['name'], bytes(entry['content'])


input_file, upload_bytes = _first_upload(upload_w.value)
# Keep only the base name so the upload cannot redirect the output path.
input_file = os.path.basename(input_file)
items = pd.read_csv(io.BytesIO(upload_bytes))
if 'DESCRIPTION' not in items.columns:
    raise ValueError("The uploaded file must have a column named 'DESCRIPTION'.")

### Load config file
select_dpt = dpt_w.value
# Departments whose config file names differ from their display names.
rename = {'Home Decor (excluding Rugs)':'Home_Decor_NoRugs', 'Home Decor (Rugs only)':'Home_Decor_Rugs'}
select_dpt = rename.get(select_dpt, select_dpt)


### Look up the relevant config file based on attribute and dpt
configs_dir = '/dbfs/FileStore/tables/DATA_SCIENCE/FuzzyConfigs'
configsList =  os.listdir(configs_dir)
# A config matches when its file name contains both the department
# (spaces removed) and the upper-cased attribute name.
dpt_key = select_dpt.replace(' ','')
matching_configs = [i for i in configsList if dpt_key in i and attrib.upper() in i]
if not matching_configs:
    raise FileNotFoundError(
        'No config file for department ' + repr(dpt_key) + ' and attribute '
        + repr(attrib.upper()) + ' found in ' + configs_dir
    )
configfile = matching_configs[0]
### Load configs
with open(os.path.join(configs_dir, configfile), "r") as read_file:
    configs = json.load(read_file)
print('Completed loading up configs.')

### Extract the configs
attb_list = configs['Normalized_list']
attb_subs = configs['Substitutions']

print('Generating the output and saving the results..........')
# Generate output
items['Normalized_'+attrib] = items.DESCRIPTION.apply(lambda x: color_match(x,color_list_org = attb_list , subs = attb_subs))
### Save results
today = dt.datetime.today().strftime("%m%d%Y")
output_path = '/dbfs/FileStore/tables/NORMALIZED_ATTRIBUTE_OUTPUTS/'
# Create the output folder on first use instead of failing in to_csv.
os.makedirs(output_path, exist_ok=True)
outfile = 'NormalIzed_'+ attrib + '_'+ today +'_' +input_file
items.to_csv(output_path+ outfile, index = False)
print('Done. Results saved in '+ output_path + outfile)

Completed loading up configs.
Generating the output and saving the results..........
Done. Results saved in/dbfs/FileStore/tables/NORMALIZED_ATTRIBUTE_OUTPUTS/NormalIzed_Color_01112024_Kitchen&Dining_sample_input.csv
