# Icecream Flavor Extraction Synchup
   - module name: Icecream Flavor Extraction Synchup.ipynb
   - purpose: Detect new icecream flavors for GTIN_NO being  classified as  'OTHER' in Detected_Flavor(s) from the input file  
   - input:
      - ICECREAM_OTHER_FLAVORS_101323.csv
      - ice_cream_items_org.csv
   - output:  
     ICECREAM_OTHER_NEW_DETECTED_FLAVORS_mmddyy.csv
   - functions:
      -  proc_bnj: Get flavors of 'Ben & Jerry' in VND_ECOM_DSC
      -  proc_blue_bunny: Get flavors of ' 'BLUE BUNNY' in VND_ECOM_DSC
      -  proc_oth_res : Get flavors for the GTIN_NO not been identified with flavor
   - Writen by: Sophia Yue  
   - Date: 10-19-23
   

In [1]:

def extract_bnj_flavor(desc):
    pattern = r'[0-9]' #Split on number
    splitted = re.split(pattern, desc.upper())[0]
    if 'ICE CREAM' in splitted: return splitted.split('ICE CREAM')[1]
    else: return splitted 

In [2]:
def proc_bnj(df_oth):
    
    bnjry = df_oth[df_oth.VND_ECOM_DSC.str.contains('Ben &')]
    bnj_flvs_ct = pd.read_csv(path_atb + 'Ben_Jerrys_Flavors_CTribune.csv').Flavor.tolist()
    bnjry['Possible_Flavors'] = bnjry.VND_ECOM_DSC.\
    str.lower().replace('chuncky','chunky')\
    .apply(lambda x: process.extract(x,bnj_flvs_ct , processor= utils.default_process, scorer= fuzz.token_set_ratio, score_cutoff= 90))
    bnjry['New_Detected_Flavor(s)'] = bnjry.Possible_Flavors.apply(lambda x: x[0][0] if len(x) >0 else None)
    
    labeled_descs = bnjry[bnjry['New_Detected_Flavor(s)'].notnull()].VND_ECOM_DSC.unique()
    flavor_maps = dict(zip(bnjry.VND_ECOM_DSC, bnjry['New_Detected_Flavor(s)']))
    bnjry['New_Detected_Flavor(s)'] = bnjry.VND_ECOM_DSC.apply(lambda x: extract_bnj_flavor(x) if not x in labeled_descs else flavor_maps[x])
    bnjry['New_Detected_Flavor(s)'] = bnjry['New_Detected_Flavor(s)'].replace({"BEN & JERRY'S":'','NON-DAIRY':'','FROZEN DESSERT':'','ICE CREAM':'', 'NONE DAIRY':''},regex = True)                                                               
    bnjry = bnjry.drop(columns =['Possible_Flavors'])
    print(f"proc_bnj shape before drop '' = {bnjry.shape}")
    #bnjry = bnjry.dropna(subset=['New_Detected_Flavor(s)'], how = 'all')
    bnjry = bnjry[bnjry['New_Detected_Flavor(s)'] != '']
    
    print(f"proc_bnj shape after drop '' = {bnjry.shape}")
    
    print (f'head: \n {bnjry.head()}')
    print (f"value_count: \n {bnjry['New_Detected_Flavor(s)'].value_counts(dropna = False)}")
    return bnjry

In [3]:
def proc_blue_bunny(df_oth):
    
    pattern = 'BLUE BUNNY'
    blue_bunny = df_oth[df_oth.VND_ECOM_DSC.fillna('').str.upper().str.contains(pattern)]
    blue_bunny['New_Detected_Flavor(s)'] = blue_bunny.VND_ECOM_DSC.str.upper()\
    .replace({'ICE CREAM':'','BLUE BUNNY':'','PREMIUM':'','REDUCED FAT':'','SANDWICH':'','SANDWICHES':''}, regex = True)\
    .apply( extract_bnj_flavor)
    
    print(f"proc_blue_bunny shape before drop '' = {blue_bunny.shape}")
    #blue_bunny = blue_bunny.dropna(subset=['New_Detected_Flavor(s)'], how = 'all')
    blue_bunny= blue_bunny[blue_bunny['New_Detected_Flavor(s)'] != '']
    blue_bunny['New_Detected_Flavor(s)']= blue_bunny['New_Detected_Flavor(s)'].str.rstrip(',') # Drop the last comma
    print(f"proc_blue_bunny shape after drop '' = {blue_bunny.shape}")
    print (f'blue_ bunny head: \n {blue_bunny.head()}')
    print (f"value_count: \n {blue_bunny['New_Detected_Flavor(s)'].value_counts(dropna = False)}")
    return blue_bunny

In [36]:
def proc_oth_res1(df_oth, cutoff = 98,  scorer = fuzz.token_set_ratio ):
   
    df_oth_res = df_oth[~df_oth.GTIN_NO.isin(bnjry.GTIN_NO)]
    df_oth_res = df_oth_res[~df_oth_res.GTIN_NO.isin(blue_bunny.GTIN_NO)] 
    df_oth_res['GTIN_NO'] = df_oth_res.GTIN_NO.apply(lambda x: int(x)).astype(str).apply(lambda x: (14- len(x))*'0'+x)
    keep_col = [ 'GTIN_NO', 'RECEIPT_DESCRIPTION','TAG_DESCRIPTION', 'PRODUCT_DESCRIPTION/MKT_MSG']
    df_oth_res = pd.merge(df_oth_res , ic_gtins[keep_col], on = 'GTIN_NO', how ='left')
    
    df_oth_res['desc'] =  df_oth_res ['VND_ECOM_DSC'] + ' ' +  df_oth_res ['RECEIPT_DESCRIPTION'] \
                          + ' ' + df_oth_res ['TAG_DESCRIPTION']
    df_oth_res ['New_Detected_Flavor(s)'] =  df_oth_res ['desc'].str.upper().apply(lambda x:flavor_match(x, flavor_list = Flavors_combo, cutoff = cutoff, scorer = scorer))
    #df_oth_res.drop(columns =['desc'])
    df_oth_res1 = df_oth_res[df_oth_res['New_Detected_Flavor(s)']   != 'OTHER']
    df_oth_res2 = df_oth_res[df_oth_res['New_Detected_Flavor(s)'] == 'OTHER']
    df_oth_res1['cutooff'] = cutoff
    df_oth_res1   =   df_oth_res1[['GTIN_NO', 'VND_ECOM_DSC', 'Original_Flavor', 'Detected_Flavor(s)', 'New_Detected_Flavor(s)','cutooff']]
    print (f'pr =   df_oth_resoc_oth_res head: \n {df_oth_res1.head()}')
    print (f"value_count: \n {df_oth_res1['New_Detected_Flavor(s)'].value_counts(dropna = False)}")
    return df_oth_res1, df_oth_res2

In [37]:
def proc_oth_res2(df, cutoff = 98,  scorer = fuzz.token_set_ratio ):
    print(f"proc_oth_res_2 shape bef = {df.shape}")

    df ['New_Detected_Flavor(s)'] =  df['desc'].str.upper().apply(lambda x:flavor_match(x, flavor_list = Flavors_combo, cutoff =cutoff, scorer = scorer))
    #df_oth_res.drop(columns =['desc'])))
      
    df['cutooff'] = cutoff
    df  =   df[['GTIN_NO', 'VND_ECOM_DSC', 'Original_Flavor', 'Detected_Flavor(s)', 'New_Detected_Flavor(s)','cutooff']]
    print(f"proc_oth_res_2 shape  = {df.shape}")
    print (f'blue_ bunny head: \n {df.head()}')
    print (f"value_count: \n {df['New_Detected_Flavor(s)'].value_counts(dropna = False)}")
    return df

In [7]:
def proc_oth_res(df_oth):
    """
     Replace flavor_list = IC_flavors_new with flavor_list = Flavors_combo 
     
    """
    df_oth_res = df_oth[~df_oth.GTIN_NO.isin(bnjry.GTIN_NO)]
    df_oth_res = df_oth_res[~df_oth_res.GTIN_NO.isin(blue_bunny.GTIN_NO)] 
    df_oth_res['GTIN_NO'] = df_oth_res.GTIN_NO.apply(lambda x: int(x)).astype(str).apply(lambda x: (14- len(x))*'0'+x)
    keep_col = [ 'GTIN_NO', 'RECEIPT_DESCRIPTION','TAG_DESCRIPTION', 'PRODUCT_DESCRIPTION/MKT_MSG']
    df_oth_res = pd.merge(df_oth_res , ic_gtins[keep_col], on = 'GTIN_NO', how ='left')
    
    df_oth_res['desc'] =  df_oth_res ['VND_ECOM_DSC'] + ' ' +  df_oth_res ['RECEIPT_DESCRIPTION'] \
                          + ' ' + df_oth_res ['TAG_DESCRIPTION']
    df_oth_res ['New_Detected_Flavor(s)'] =  df_oth_res ['desc'].str.upper().apply(lambda x:flavor_match(x, flavor_list = Flavors_combo, cutoff =98))
    #df_oth_res.drop(columns =['desc'])
    df_oth_res =   df_oth_res[['GTIN_NO', 'VND_ECOM_DSC', 'Original_Flavor', 'Detected_Flavor(s)', 'New_Detected_Flavor(s)']]
    print (f'pr =   df_oth_resoc_oth_res head: \n {df_oth_res.head()}')
    print (f"value_count: \n {df_oth_res['New_Detected_Flavor(s)'].value_counts(dropna = False)}")
    return df_oth_res

In [8]:
### Remove substrings like 'CHOCOLATE' if 'CHOCOLATE CHIP' is also present as a top result
def remove_substrings(string_list):
    str_df = pd.DataFrame(string_list, columns= ['Sentence'])
    str_df['Word Length'] = str_df.Sentence.apply(lambda x: len(x.split(' ')))
    str_df['Drop'] = ''
    str_df.sort_values('Word Length', inplace= True)
    str_df.reset_index(drop = True, inplace= True)
    for j in range(len(str_df)):
        str_1 = str_df.Sentence.loc[j]
        for i in range(len(str_df)-(j +1)):
            str_2 = str_df.Sentence.loc[i+j+1]
            if str_1 in str_2: 
                str_df.Drop.loc[j] = 'Yes'
                break
        if len(str_df) <2: 
            break
    return str_df[str_df.Drop != 'Yes'].Sentence.tolist()

In [29]:
#MAtch using 'token set ratio' with threshold
def flavor_match(item, flavor_list, cutoff = 98, scorer = fuzz.token_set_ratio):
    matched = process.extract(item, flavor_list, score_cutoff= cutoff, scorer = scorer, processor=utils.default_process)
    flavor_shortlist = remove_substrings([i[0] for i in matched])
    matched = [i for i in matched if i[0] in flavor_shortlist]
    if len( flavor_shortlist) >0:
        max_score = matched[0][1]
        final_match = [ i for i in matched if i[1]== max_score]
        match_list = [i[0] for i in final_match]
    
        '''
        match_list = [flavor_maps[i] if i in flavor_maps.keys() else i for i in match_list]
        if set(match_list) == {'STRAWBERRY','FRUIT'}: 
            match_list = ['FRUIT']#Drop Strawberry if fruit is the only other flavor
        if  set(match_list) != {'FRUIT'} : match_list = [ i for i in match_list if i != 'FRUIT']#Drop Fruit if it appears with something else
 
        if matched in multiflavor_dict.keys(): #Mapping multi-flavors to intended labels
            matched = multiflavor_dict[matched]
        '''
        matched = ','.join(sorted(list(set(pd.Series(match_list)))))   
    else: matched = 'OTHER'
    return matched

In [11]:
import pandas as pd, numpy as np
from rapidfuzz import process, fuzz,utils
import re
np.set_printoptions(threshold=np.inf)
path = 'C:\\users\\iny2819\\kroger\\Data\\' 
path_atb = 'C:\\users\\iny2819\\kroger\\Data\\ATB\\' 
path_atb_yue = 'C:\\users\\iny2819\\kroger\\Data\\ATB\\atb_yue\\' 

In [12]:
path_code = 'C:\\users\\iny2819\\kroger\\Code\\'  
f_com_code = path_code + "com_code.py"
exec(compile(open(f_com_code , "rb").read(), f_com_code, 'exec' ))

In [13]:
# Flavor List and desired mappings From business (interim) based on "Initial ice cream flavor list.xlsx"
IC_flavors_new =\
['VANILLA','CHOCOLATE','FRUIT','COOKIES AND CREAM','CHOCOLATE CHIP','CARAMEL','TOFFEE','MINT CHOCOLATE CHIP','PEANUT BUTTER','COOKIE DOUGH',\
'BUTTER PECAN','COFFEE','STRAWBERRY','MOOSE TRACKS','ROCKY ROAD','COTTON CANDY','BIRTHDAY CAKE','PUMPKIN','RED VELVET','NEAPOLITAN']
flavor_maps = {'MOCHA':'COFFEE','BROWNIE':'CHOCOLATE','COCOA':'CHOCOLATE','TOFFEE':'CARAMEL','CHERRY':'FRUIT', 'LEMON': 'FRUIT',
 'ORANGE': 'FRUIT',
 'LIME': 'FRUIT',
 'PEACH': 'FRUIT',
 'BANANA': 'FRUIT',
 'PINEAPPLE': 'FRUIT',
 'BERRY': 'FRUIT',
 'APPLE': 'FRUIT',
 'MANGO': 'FRUIT',
 'RASPBERRY': 'FRUIT',
 'PASSION FRUIT': 'FRUIT',
 'POMEGRANATE': 'FRUIT',
 'BLUEBERRY': 'FRUIT'}
IC_flavors_new += flavor_maps.keys()

In [14]:
#### BASE FLAVOR CATEGORIES
# PRIVATE SELECTION
# Yue: added pepper in Spices; not include pvt_slc_flavors
#pvt_slc_flavors = [i.replace('PREMIUM','') for i in pvt_slc.Flavor.unique()]
#Fruits
original_fruits = ['CHERRY', 'LEMON', 'ORANGE', 'LIME', 'PEACH', 'BANANA', 'PINEAPPLE', 'BERRY', 'APPLE', 'MANGO', 'RASPBERRY', 'PASSION FRUIT', 'POMEGRANATE', 'BLUEBERRY']
Fruits =  original_fruits + \
['MARIONBERRY','HUCKLEBERRY','BLACKBERRY','BILBERRY','BOYSENBERRY','LUCUMA','CHERIMOYA','DURIAN','RED BEAN','UBE','GRAPE','GRAPEFRUIT', 'LYCHEE',\
 'RAISIN','APRICOT','PLUM','YUZU','DRAGON FRUIT','HIBISCUS', 'PUMPKIN','COCONUT','CARROT','SWEET POTATO','TARO','WATERMELON','MELON','CANTALOUPE','HONEYDEW','FIGS']
#Nuts /Seeds
Nuts = ['PEANUT','ALMOND','PISTACHIO','WALNUT','BLACK WALNUT','MACADAMIA','CASHEW', 'HAZELNUT','BLACK SESAME','PECAN','NUTS','FLAX','OATMEAL','COCONUT MILK','OAT MILK']
#Candies / Confections
Candy = ['CHOCOLATE','PEPPERMINT','FUDGE','TAFFY','BUBBLEGUM','BUBBLE GUM','MARSHMALLOW','BUTTERSCOTCH','WHITE CHOCOLATE', 'TRUFFLE', 'TIRAMISU', 'NAPOLEON','MACARON','GUMMY BEARS', 'GIANDUJA','NUTELLA','CAJETA','DULCE DE LECHE', 'LICORICE','SOUR PATCH KIDS','TURTLE']
#Desserts / Baked goods
Desserts = ['TOFFEE','CARAMEL','BROWNIE','CAKE','COOKIE','DONUT','DOUGHNUT','WAFER','WAFFLE','WAFFLE CONE','SWIRL','FRENCH TOAST','PANCAKE','PIE','COBBLER','CHEESECAKE','SNICKERDDODLE', "S'MORES",'SMORES','SMORES', 'GRAHAM','GINGERBREAD','PRALINE','BRITTLE','CRISP','CRUMBLE','CLUSTERS','SHORTBREAD', 'TORTE', 'BLACK FOREST','TRES LECHES','FLAN','BAKLAVA','BOSTON CREAM PIE','PUDDING','ECLAIR','CREAM PUFF','CROISSANT','BREAD PUDDDING']
#Dairy products
Dairy = ['MILK','BUTTER','BROWN BUTTER','SOUR CREAM','YOGURT','CHEESE','CREAM CHEESE','RICOTTA','MASCARPONE']
#Sweeteners
Sweeteners = ['HONEY','BROWN SUGAR','MOLASSES','MAPLE','DATES','PALM SUGAR']
#Spices
Spices = ['GINGER','CARDAMOM','CINNAMON','NUTMEG','TURMERIC','ANISE','ALLSPICE','SPICE', 'PEPPER']
#Alcohol
Boozes = ['RUM','BRANDY','BOURBON','VODKA','TEQUILA','MEZCAL','IRISH CREAM','KAHLUA','WHISKEY','BEER','COGNAC','WINE','CHAMPAGNE','PROSECCO','MERLOT','MARGARITA','LIQUOR','LIQUEUR','AMARETTO','COINTREAU','AMARULA']
#Herbs
Herbs = ['BASIL','MINT','LAVENDER']
#Aromas
Aromas = ['ROSE','ELDERFLOWER','PANDAN','SAFFRON']
#Cereals
Cereals = ['FROSTED','FRUIT LOOPS','CHEERIOS','CORN','RICE','RICE KRISPIES','MALT','BARLEY']
##Beverages
Bevs = ['COFFEE','GREEN TEA','CHAI','ESPRESSO','CAPPUCCINO', 'MOCHA','COCOA','EGGNOG','EGG NOG','COLA','SPRITE','PEPSI','FANTA','SODA','POP','LEMONADE','LATTE']
Others = ['UNICORN','SPUMONI','STRACCIATELLA','RAINBOW','BACON', 'RED, WHITE AND BLUE','SUPERMAN','BLUE MOON','SPRINKLES']
Ethnic = ['ITALIAN','FRENCH','ENGLISH','BRITISH','DUTCH','DANISH','POLISH','SCOTCH','IRISH','GERMAN','GREEK','SWEDISH','INDIAN','JAPANESE','ICELANDIC','MEXICAN','SPANISH','RUSSIAN','AMERICAN','KOREAN','LATIN','ARABIC','TURKISH','PERSIAN','CHINESE','FILIPINO','THAI', 'VIETNAMESE', 'ASIAN','EUROPEAN','AFRICAN','AUSTRALIAN']
Processing = ['TOASTED','BAKED','ROASTED','FROSTED','SHAVED','CRUSHED','SLICED','WHIPPED']
pvt_slc_flavors = [] # YUe
Flavors_combo = list(set(pvt_slc_flavors + Fruits + Nuts + Candy + Desserts + Dairy + Sweeteners + Spices + Boozes + Herbs + Cereals + Aromas + Bevs +Others + Ethnic + Processing))


In [15]:
ic_gtins = f_read(path_atb, 'ice_cream_items_org.csv', encoding= 'ISO-8859-1', sep = '|')

file ice_cream_items_org.csv shape (12386, 12) 
 columns: Index(['GTIN_NO', 'ECOMMERCE_DESCRIPTION', 'RECEIPT_DESCRIPTION',
       'TAG_DESCRIPTION', 'PRODUCT_DESCRIPTION/MKT_MSG', 'KROGER_OWNED_FLAVOR',
       'FLV_DSC', 'PID_COM_DSC', 'PID_SUBCOM_DSC', 'CFIC_DEPT_NAM',
       'CFIC_COM_NAM', 'CFIC_SUBCOM_NAM'],
      dtype='object')
dtypes: 
 GTIN_NO                        object
ECOMMERCE_DESCRIPTION          object
RECEIPT_DESCRIPTION            object
TAG_DESCRIPTION                object
PRODUCT_DESCRIPTION/MKT_MSG    object
KROGER_OWNED_FLAVOR            object
FLV_DSC                        object
PID_COM_DSC                    object
PID_SUBCOM_DSC                 object
CFIC_DEPT_NAM                  object
CFIC_COM_NAM                   object
CFIC_SUBCOM_NAM                object
dtype: object
head: 
           GTIN_NO ECOMMERCE_DESCRIPTION RECEIPT_DESCRIPTION  \
0  00000000001175        Frys Snow Cone        SNOW CONE LG   
1  00000000013772                (null)          

In [16]:
ic_gtins.replace('(null)', None, inplace = True)
ic_gtins.isnull().sum()                  

GTIN_NO                            0
ECOMMERCE_DESCRIPTION           4409
RECEIPT_DESCRIPTION             5852
TAG_DESCRIPTION                 4970
PRODUCT_DESCRIPTION/MKT_MSG     8159
KROGER_OWNED_FLAVOR            12386
FLV_DSC                        11369
PID_COM_DSC                       59
PID_SUBCOM_DSC                    59
CFIC_DEPT_NAM                   5679
CFIC_COM_NAM                    5679
CFIC_SUBCOM_NAM                 5679
dtype: int64

In [17]:
df_oth = f_read(path_atb, 'ICECREAM_OTHER_FLAVORS_101323.csv')

file ICECREAM_OTHER_FLAVORS_101323.csv shape (3417, 4) 
 columns: Index(['GTIN_NO', 'VND_ECOM_DSC', 'Original_Flavor', 'Detected_Flavor(s)'], dtype='object')
dtypes: 
 GTIN_NO               object
VND_ECOM_DSC          object
Original_Flavor       object
Detected_Flavor(s)    object
dtype: object
head: 
   GTIN_NO        VND_ECOM_DSC Original_Flavor Detected_Flavor(s)
0   22231   DELI GELATO SMALL           OTHER              OTHER
1   22248  DELI GELATO MEDIUM           OTHER              OTHER
2   22255   DELI GELATO LARGE           OTHER              OTHER
3   22262    DELI GELATO PINT           OTHER              OTHER
4   22279   DELI GELATO QUART           OTHER              OTHER


In [18]:
df_oth = df_oth[df_oth['Detected_Flavor(s)'] == 'OTHER' ]

In [19]:
df_oth = df_oth.dropna(subset=['VND_ECOM_DSC'], how = 'all')
df_oth.shape

(1837, 4)

In [20]:
ic_df = pd.read_csv(path_atb + 'ICECREAM_FLAVORS_101223_pipe_delim.csv', sep = '|')
ic_df = ic_df[ic_df.Flavor == 'OTHER']
ic_df['VND_ECOM_DSC'] = ic_df.VND_ECOM_DSC.fillna('')
ic_df.shape

(3417, 3)

In [21]:
bnjry = proc_bnj(df_oth)

proc_bnj shape before drop '' = (59, 5)
proc_bnj shape after drop '' = (48, 5)
head: 
           GTIN_NO                                    VND_ECOM_DSC  \
1555  76840000722  Ben & Jerry's Ice Cream Chillin The Roast 1 PT   
1556  76840001002      Ben & Jerry's Ice Cream Tonight Dough 3 PC   
1559  76840001750   Ben & Jerry's Ice Cream Gimme Some Sugar 1 PT   
1560  76840001767        Ben & Jerry's Ice Cream Wake N Bake 1 PT   
1562  76840002351           Ben & Jerry's Gimme S'more! Ice Cream   

     Original_Flavor Detected_Flavor(s) New_Detected_Flavor(s)  
1555           OTHER              OTHER     Chillin' the Roast  
1556           OTHER              OTHER         TONIGHT DOUGH   
1559           OTHER              OTHER      GIMME SOME SUGAR   
1560           OTHER              OTHER           WAKE N BAKE   
1562           OTHER              OTHER           Gimme S'more  
value_count: 
 Americone Dream                                            3
 LIGHTSCARAMELACTION            

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bnjry['Possible_Flavors'] = bnjry.VND_ECOM_DSC.\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bnjry['New_Detected_Flavor(s)'] = bnjry.Possible_Flavors.apply(lambda x: x[0][0] if len(x) >0 else None)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bnjry['New_Detected_Flavor(s)'] = bnjry.VND_ECOM_DS

In [22]:
blue_bunny = proc_blue_bunny(df_oth)

proc_blue_bunny shape before drop '' = (43, 5)
proc_blue_bunny shape after drop '' = (40, 5)
blue_ bunny head: 
          GTIN_NO                                       VND_ECOM_DSC  \
894  70640001767     Blue Bunny Birthday Party Ice Cream Sandwiches   
896  70640003112        Blue Bunny Premium Birthday Party Ice Cream   
897  70640003808          Blue Bunny Premium Bunny Tracks Ice Cream   
899  70640003938  Blue Bunny No Sugar Added Reduced Fat Bunny Tr...   
903  70640004508  ELLIPTICAL 5.5 FLUID OUNCE BLUE BUNNY PERSONAL...   

    Original_Flavor Detected_Flavor(s)          New_Detected_Flavor(s)  
894           OTHER              OTHER              BIRTHDAY PARTY  ES  
896           OTHER              OTHER                 BIRTHDAY PARTY   
897           OTHER              OTHER                   BUNNY TRACKS   
899           OTHER              OTHER   NO SUGAR ADDED  BUNNY TRACKS   
903           OTHER              OTHER                     ELLIPTICAL   
value_count: 
 ELLIPTI

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  blue_bunny['New_Detected_Flavor(s)'] = blue_bunny.VND_ECOM_DSC.str.upper()\


In [23]:
df_oth_res = proc_oth_res(df_oth)

TypeError: 'str' object is not callable

In [38]:
df_oth_res1, df_oth_res2 = proc_oth_res1(df_oth, scorer = fuzz.WRatio)

pr =   df_oth_resoc_oth_res head: 
 Empty DataFrame
Columns: [GTIN_NO, VND_ECOM_DSC, Original_Flavor, Detected_Flavor(s), New_Detected_Flavor(s), cutooff]
Index: []
value_count: 
 Series([], Name: New_Detected_Flavor(s), dtype: int64)


In [None]:
df_oth_res2 = proc_oth_res2(df_oth_res2, cutoff = 50, scorer = fuzz.WRatio )

In [None]:
df_ic_flavor_new_detect = pd.concat([bnjry, blue_bunny, df_oth_res1, df_oth_res2]) 
df_ic_flavor_new_detect.shape

In [None]:
df_ic_flavor_new_detect['New_Detected_Flavor(s)'].value_counts(dropna = False )

In [None]:
from datetime import date
from pandas import ExcelWriter
from pandas import ExcelFile

dte = date.today().strftime('%m%d%y')
excel_file = path_atb + "CECREAM_OTHER_NEW_DETECTED_FLAVORS_"+dte + '.xlsx'
writer = pd.ExcelWriter(excel_file)
df_ic_flavor_new_detect.to_excel(writer,'New_Detected_Flavor', index = False)
     
writer.close()

In [None]:
stop

In [None]:
len(ic_flavor_new_detect[ic_flavor_new_detect['New_Detected_Flavor(s)'] == ''])

In [None]:
df_oth_det = df_oth [df_oth['Detected_Flavor(s)'] == 'OTHER' ]
df_oth_det.shape

In [None]:
df_oth_det['GTIN_NO'] = df_oth_det.GTIN_NO.apply(lambda x: int(x)).astype(str).apply(lambda x: (14- len(x))*'0'+x)
df_oth_det.head()

In [None]:
keep_col = [ 'GTIN_NO', 'RECEIPT_DESCRIPTION','TAG_DESCRIPTION', 'PRODUCT_DESCRIPTION/MKT_MSG']
df_oth_det_mrg = pd.merge(df_oth_det , ic_gtins[keep_col], on = 'GTIN_NO', how ='left')
df_oth_det_mrg 

In [None]:
df_oth_det_mrg_bkup = df_oth_det_mrg.copy()  

In [None]:
df_oth_det_mrg = df_oth_det_mrg_bkup.copy()
df_oth_det_mrg.head() 

In [None]:
# not inlucde PRODUCT_DESCRIPTION/MKT_MSG Which has NaN. After combine, the col 'dsc' will be NaN '
# Use ' ' as the seperator between columns

df_oth_det_mrg['desc'] =  df_oth_det_mrg ['VND_ECOM_DSC'] + ' ' +  df_oth_det_mrg ['RECEIPT_DESCRIPTION'] \
                               + ' ' + df_oth_det_mrg ['TAG_DESCRIPTION']
df_oth_det_mrg.head() 

In [None]:
df_oth_det_mrg['desc'][0:2] 

In [None]:
df_oth_det_mrg ['Dected_Flavor_Cutoff98'] =  df_oth_det_mrg ['desc'].str.upper().apply(lambda x:flavor_match(x, flavor_list = IC_flavors_new, cutoff =98))
df_oth_det_mrg ['Dected_Flavor_Cutoff90'] =  df_oth_det_mrg ['desc'].str.upper().apply(lambda x:flavor_match(x, flavor_list = IC_flavors_new, cutoff= 90))
df_oth_det_mrg ['Dected_Flavor_Cutoff85'] =  df_oth_det_mrg ['desc'].str.upper().apply(lambda x:flavor_match(x, flavor_list = IC_flavors_new, cutoff= 85))
df_oth_det_mrg ['Dected_Flavor_Cutoff80'] =  df_oth_det_mrg ['desc'].str.upper().apply(lambda x:flavor_match(x, flavor_list = IC_flavors_new, cutoff= 80))
df_oth_det_mrg ['Dected_Flavor_Cutoff75'] =  df_oth_det_mrg ['desc'].str.upper().apply(lambda x:flavor_match(x, flavor_list = IC_flavors_new, cutoff= 75))

In [None]:
df_oth_det_mrg [''Dected_Flavor_Cutoff98'] =  df_oth_det_mrg ['desc'].str.upper().apply(lambda x:flavor_match(x, flavor_list = IC_flavors_new))
df_oth_det_mrg.head() .value_counts(dropna = False)

In [None]:
df_oth_det_mrg ['Dected_Flavor_Cutoff75'].value_counts(dropna = False) 

In [None]:
#df['period'] = df[['Year', 'quarter', ...]].agg('-'.join, axis=1)
df_oth_det_mrg ['desc']  = df_oth_det_mrg [['VND_ECOM_DSC' ,'RECEIPT_DESCRIPTION','TAG_DESCRIPTION', 'PRODUCT_DESCRIPTION/MKT_MSG']].str().agg('-'.join, axis=1)

In [None]:
df_oth_det_mrg ['desc']  = df_oth_det_mrg [['VND_ECOM_DSC' ,'RECEIPT_DESCRIPTION','TAG_DESCRIPTION']].agg('-'.join, axis=1)

In [None]:
df_oth_det_mrg ['desc']  = df_oth_det_mrg [['VND_ECOM_DSC' ,'RECEIPT_DESCRIPTION','TAG_DESCRIPTION']].agg('-'.join, axis=1)

In [None]:
df_oth_det_mrg ['desc']  = df_oth_det_mrg [['VND_ECOM_DSC' ,'RECEIPT_DESCRIPTION','TAG_DESCRIPTION', 'PRODUCT_DESCRIPTION/MKT_MSG']].str().agg('-'.join, axis=1)

In [None]:
df_oth_det_mrg.head() 

In [None]:
from datetime import date
dte = date.today().strftime('%m%d%y')
excel_file = path_atb_yue + "df_oth_det_mrg_"+dte + '.xlsx'
df_oth_det_mrg.to_excel(excel_file,  index=False )

In [None]:
len(blue_bunny["New_Detected_Flavor(s)"] == '')

In [25]:

def flavor_matchx(item, flavor_list, cutoff = 98, score_name = fuzz.token_set_ratio):
    matched = process.extract(item, flavor_list, score_cutoff= cutoff, scorer = score_name, processor=utils.default_process)
    print(f' matched1 = {matched}' )
    flavor_shortlist = remove_substrings([i[0] for i in matched])
    print(f' flavor_shortlist = {flavor_shortlist}' )
    matched = [i for i in matched if i[0] in flavor_shortlist]
    print(f' matched2 = {matched}' )
    if len( flavor_shortlist) >0:
        max_score = matched[0][1]
        print(f' max_score = {max_score}' )
        final_match = [ i for i in matched if i[1]== max_score]
        print(f' final_match  = {final_match}' )
        match_list = [i[0] for i in final_match]
        print(f' match_list = {match_list}' )
        '''
        match_list = [flavor_maps[i] if i in flavor_maps.keys() else i for i in match_list]
        if set(match_list) == {'STRAWBERRY','FRUIT'}: 
            match_list = ['FRUIT']#Drop Strawberry if fruit is the only other flavor
        if  set(match_list) != {'FRUIT'} : match_list = [ i for i in match_list if i != 'FRUIT']#Drop Fruit if it appears with something else
 
        if matched in multiflavor_dict.keys(): #Mapping multi-flavors to intended labels
            matched = multiflavor_dict[matched]
        '''
        matched = ','.join(sorted(list(set(pd.Series(match_list)))))   
    else: matched = 'OTHER'
    return matched

In [26]:
item = "Simply Organic Daily Grind Black Peppercorns - Organic - Grinder - 3 oz"

mth = flavor_matchx(item, Flavors_combo, cutoff = 50,  score_name = fuzz.token_set_ratio)

 matched1 = [('BLACK SESAME', 58.8235294117647, 14), ('BLACK FOREST', 58.8235294117647, 165), ('BLACK WALNUT', 58.8235294117647, 179)]
 flavor_shortlist = ['BLACK SESAME', 'BLACK FOREST', 'BLACK WALNUT']
 matched2 = [('BLACK SESAME', 58.8235294117647, 14), ('BLACK FOREST', 58.8235294117647, 165), ('BLACK WALNUT', 58.8235294117647, 179)]
 max_score = 58.8235294117647
 final_match  = [('BLACK SESAME', 58.8235294117647, 14), ('BLACK FOREST', 58.8235294117647, 165), ('BLACK WALNUT', 58.8235294117647, 179)]
 match_list = ['BLACK SESAME', 'BLACK FOREST', 'BLACK WALNUT']


In [27]:
mth = flavor_matchx(item, Flavors_combo, cutoff = 50,  score_name = fuzz.WRatio)

 matched1 = [('BLACK SESAME', 85.5, 14), ('BLACK FOREST', 85.5, 165), ('BLACK WALNUT', 85.5, 179), ('PEPPERMINT', 63.0, 1), ('PEPPER', 60.0, 59)]
 flavor_shortlist = ['PEPPERMINT', 'PEPPER', 'BLACK SESAME', 'BLACK FOREST', 'BLACK WALNUT']
 matched2 = [('BLACK SESAME', 85.5, 14), ('BLACK FOREST', 85.5, 165), ('BLACK WALNUT', 85.5, 179), ('PEPPERMINT', 63.0, 1), ('PEPPER', 60.0, 59)]
 max_score = 85.5
 final_match  = [('BLACK SESAME', 85.5, 14), ('BLACK FOREST', 85.5, 165), ('BLACK WALNUT', 85.5, 179)]
 match_list = ['BLACK SESAME', 'BLACK FOREST', 'BLACK WALNUT']


In [35]:
score_name = fuzz.WRatio
mth = flavor_matchx(item, Flavors_combo, cutoff = 50,  score_name = score_name)

 matched1 = [('BLACK SESAME', 85.5, 14), ('BLACK FOREST', 85.5, 165), ('BLACK WALNUT', 85.5, 179), ('PEPPERMINT', 63.0, 1), ('PEPPER', 60.0, 59)]
 flavor_shortlist = ['PEPPERMINT', 'PEPPER', 'BLACK SESAME', 'BLACK FOREST', 'BLACK WALNUT']
 matched2 = [('BLACK SESAME', 85.5, 14), ('BLACK FOREST', 85.5, 165), ('BLACK WALNUT', 85.5, 179), ('PEPPERMINT', 63.0, 1), ('PEPPER', 60.0, 59)]
 max_score = 85.5
 final_match  = [('BLACK SESAME', 85.5, 14), ('BLACK FOREST', 85.5, 165), ('BLACK WALNUT', 85.5, 179)]
 match_list = ['BLACK SESAME', 'BLACK FOREST', 'BLACK WALNUT']


In [None]:

print(process.extract(item, Flavors_combo, scorer=fuzz.ratio))

In [None]:
df_oth_res2.desc.str.upper()

In [None]:
print(process.extract(item, Flavors_combo, scorer=fuzz.token_set_ratio))

In [None]:
 print( process.extract(item, Flavors_combo, score_cutoff= 98, scorer =fuzz.token_set_ratio, processor=utils.default_process))

In [None]:
 print( process.extract(item, Flavors_combo,  scorer =fuzz.token_set_ratio, processor=utils.default_process))

In [None]:
 print( process.extract(item, Flavors_combo,  scorer =fuzz.ratio, processor=utils.default_process))

In [None]:
 print( process.extract(item, Flavors_combo, score_cutoff= 0, scorer =fuzz.token_set_ratio, processor=utils.default_process))

In [None]:
 print( process.extract(item, Flavors_combo, score_cutoff= 50, scorer =fuzz.token_set_ratio, processor=utils.default_process))

In [None]:
item = "Simply Organic Daily Grind Black Peppercorns - Organic - Grinder - 3 oz"


In [None]:
 print( process.extract(item, Flavors_combo, score_cutoff= 50, scorer =fuzz.WRatio, processor=utils.default_process))

In [None]:

df_oth_res1x, df_oth_res2x = proc_oth_res1(df_oth, scorer = fuzz.WRatio)
df_oth_res2x = proc_oth_res2(df_oth_res2x, cutoff = 50, scorer = fuzz.WRatio )
df_ic_flavor_new_detectx = pd.concat([bnjry, blue_bunny, df_oth_res1x, df_oth_res2x]) 
df_ic_flavor_new_detectx.to_excel(path_atb_yue + "ic_flavor_new_detect_yue", index = False)

In [40]:
df_oth_res1x, df_oth_res2x = proc_oth_res1(df_oth, scorer = fuzz.WRatio, cutoff=50)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  str_df.Drop.loc[j] = 'Yes'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  str_df.Drop.loc[j] = 'Yes'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  str_df.Drop.loc[j] = 'Yes'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  str_df.Drop.loc[j] = 'Yes'
A value is trying to be set on a copy of a slice from a Data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  str_df.Drop.loc[j] = 'Yes'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  str_df.Drop.loc[j] = 'Yes'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  str_df.Drop.loc[j] = 'Yes'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  str_df.Drop.loc[j] = 'Yes'
A value is trying to be set on a copy of a slice from a Data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  str_df.Drop.loc[j] = 'Yes'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  str_df.Drop.loc[j] = 'Yes'


pr =   df_oth_resoc_oth_res head: 
           GTIN_NO        VND_ECOM_DSC Original_Flavor Detected_Flavor(s)  \
0  00000000022231   DELI GELATO SMALL           OTHER              OTHER   
1  00000000022248  DELI GELATO MEDIUM           OTHER              OTHER   
2  00000000022255   DELI GELATO LARGE           OTHER              OTHER   
3  00000000022262    DELI GELATO PINT           OTHER              OTHER   
4  00000000022279   DELI GELATO QUART           OTHER              OTHER   

  New_Detected_Flavor(s)  cutooff  
0                S'MORES       50  
1               OAT MILK       50  
2               LAVENDER       50  
3   ECLAIR,PEANUT,SPRITE       50  
4                 CARROT       50  
value_count: 
 BOSTON CREAM PIE,CREAM CHEESE,CREAM PUFF,IRISH CREAM,SOUR CREAM    292
BOSTON CREAM PIE,CREAM CHEESE,IRISH CREAM                           68
POP                                                                 44
WAFFLE CONE                                                    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_oth_res1['cutooff'] = cutoff


In [None]:
df_oth.shape

In [32]:
df_oth_res2x = proc_oth_res2(df_res2x, scorer = fuzz.WRatio)

pr =   df_oth_resoc_oth_res head: 
 Empty DataFrame
Columns: [GTIN_NO, VND_ECOM_DSC, Original_Flavor, Detected_Flavor(s), New_Detected_Flavor(s), cutooff]
Index: []
value_count: 
 Series([], Name: New_Detected_Flavor(s), dtype: int64)


In [34]:
flavor_match(item, flavor_list, cutoff = 98, scorer = fuzz.token_set_ratio)

(1749, 9)

In [39]:
df_oth_res1x, df_oth_res2x = proc_oth_res1(df_oth, scorer = fuzz.token_set_ratio)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  str_df.Drop.loc[j] = 'Yes'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  str_df.Drop.loc[j] = 'Yes'


pr =   df_oth_resoc_oth_res head: 
             GTIN_NO                                       VND_ECOM_DSC  \
104  00009073102130                     Sharons Wildberry Sorbet 16 Oz   
121  00011110084095  DELUXE CHURNED CELEBRATION FAMILY SIZE - KROGE...   
124  00011110085283  DELUXE CHURNED CELEBRATION FAMILY SIZE NATURAL...   
127  00011110096531                       Kroger® Jumbo Ice Cream Cups   
136  00011110502438               Kroger Deluxe Bunch Crunch Ice Cream   

    Original_Flavor Detected_Flavor(s) New_Detected_Flavor(s)  cutooff  
104           OTHER              OTHER                  BERRY       98  
121           OTHER              OTHER                   CAKE       98  
124           OTHER              OTHER                   CAKE       98  
127           OTHER              OTHER                   CAKE       98  
136           OTHER              OTHER                   MILK       98  
value_count: 
 POP               22
CHOCOLATE         10
CAKE              10
YOG

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_oth_res1['cutooff'] = cutoff


In [42]:
df_oth_res2.to_excel(path_atb_yue + 'oth_res2_Wratio.xlsx', index = False)