In [1]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import numpy as np
import re

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch

embedding_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
embedding_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
embedding_model = AutoModel.from_pretrained(embedding_model_name)

In [None]:
# save the models to model/
embedding_tokenizer.save_pretrained('model/tokenizer/')
embedding_model.save_pretrained('model/actual_model/')

In [3]:
brand_df = pd.read_csv('data/brand_category.csv')
brand_df.dropna(inplace=True)

# drop duplicate brand names. keep the brand with highest number of receipts
brand_df = brand_df.sort_values(by=['BRAND', 'RECEIPTS'], ascending=False)
brand_df.drop_duplicates(subset=['BRAND'], keep='first', inplace=True)

brand_df.shape

(8521, 3)

In [20]:
brand_df.columns

Index(['BRAND', 'BRAND_BELONGS_TO_CATEGORY', 'RECEIPTS', 'BRAND_NEW',
       'embeddings', 'similarity'],
      dtype='object')

In [21]:
brand_df[['BRAND', 'BRAND_BELONGS_TO_CATEGORY']].head(10)

Unnamed: 0,BRAND,BRAND_BELONGS_TO_CATEGORY
3369,breath savers,Candy
5678,barkThins,Candy
3703,Zagnut,Candy
1138,ZYWIEC,Beer
9463,ZYRTEC,Medicines & Treatments
7307,ZWACK SLIVOVITZ,Spirits
5621,ZUZI,Spirits
4541,ZUMBIDA,Malt Beverages
1811,ZUMBA PICA,Cooking & Baking
9387,ZULKA,Cooking & Baking


In [4]:
categories = pd.read_csv('data/categories.csv')
categories.drop(columns=['CATEGORY_ID'], inplace=True)
categories

Unnamed: 0,PRODUCT_CATEGORY,IS_CHILD_CATEGORY_TO
0,Red Pasta Sauce,Pasta Sauce
1,Alfredo & White Pasta Sauce,Pasta Sauce
2,Cooking & Baking,Pantry
3,Packaged Seafood,Pantry
4,Feminine Hygeine,Health & Wellness
...,...,...
113,Frozen Turkey,Frozen Meat
114,Frozen Chicken,Frozen Meat
115,Frozen Beef,Frozen Meat
116,Frozen Seafood,Frozen Meat


In [5]:
offers = pd.read_csv('data/offer_retailer.csv')
offers

Unnamed: 0,OFFER,RETAILER,BRAND
0,Spend $50 on a Full-Priced new Club Membership,SAMS CLUB,SAMS CLUB
1,"Beyond Meat® Plant-Based products, spend $25",,BEYOND MEAT
2,Good Humor Viennetta Frozen Vanilla Cake,,GOOD HUMOR
3,"Butterball, select varieties, spend $10 at Dillons Food Store",DILLONS FOOD STORE,BUTTERBALL
4,"GATORADE® Fast Twitch®, 12-ounce 12 pack, at Amazon Storefront*",AMAZON,GATORADE
...,...,...,...
379,Spend $10 at KFC,KFC,KFC
380,Sargento Product,,SARGENTO
381,Thomas'® Bagel Thins,,THOMAS
382,Spend $270 at Pavilions,PAVILIONS,PAVILIONS


In [6]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

def get_embeddings(df, text_col, model, tokenizer):
    model.eval()
    encoded_input = tokenizer(list(df[text_col]), padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
    return df.assign(embeddings=sentence_embeddings.tolist())

In [7]:
offers = offers.merge(brand_df, on='BRAND', how='left').merge(categories, left_on='BRAND_BELONGS_TO_CATEGORY', right_on='PRODUCT_CATEGORY', how='left')
offers.replace(np.nan, '',inplace=True)
offers['offer_brand_category'] = offers['OFFER'] + ' ' + offers['BRAND']+ ' ' + offers['PRODUCT_CATEGORY']
offers = get_embeddings(offers, 'offer_brand_category', embedding_model, embedding_tokenizer)
categories = get_embeddings(categories, 'PRODUCT_CATEGORY', embedding_model, embedding_tokenizer)
brand_df['BRAND_NEW'] = brand_df['BRAND'] + ' ' + brand_df['BRAND_BELONGS_TO_CATEGORY']
brand_df = get_embeddings(brand_df, 'BRAND_NEW', embedding_model, embedding_tokenizer)

In [8]:
def get_topn_similar(text, df):
    text_embedding = get_embeddings(pd.DataFrame({'OFFER': [text]}), 'OFFER', embedding_model, embedding_tokenizer)['embeddings'].values[0]
    df['similarity'] = df['embeddings'].apply(lambda x: np.dot(x, text_embedding))
    return df.sort_values(by='similarity', ascending=False).head(10)

In [9]:
shit = get_topn_similar('window curtains', offers)[['OFFER','RETAILER','BRAND','similarity']]

In [14]:
if len(shit)>0:
    print('shit')

shit


In [16]:

get_topn_similar('diapers', categories)[['PRODUCT_CATEGORY','similarity']]

Unnamed: 0,PRODUCT_CATEGORY,similarity
37,Diapering,0.803998
102,Adult Incontinence,0.489961
22,Baby Bathing,0.487685
79,Laundry Supplies,0.470841
73,Eggs,0.463667
97,Fruit Juices,0.454788
49,Milk,0.445963
46,Puffed Snacks,0.441153
101,Baby Health,0.438396
27,Potty Training,0.431855


In [17]:
get_topn_similar('diapers', brand_df)[['BRAND','BRAND_BELONGS_TO_CATEGORY', 'RECEIPTS','similarity']]

Unnamed: 0,BRAND,BRAND_BELONGS_TO_CATEGORY,RECEIPTS,similarity
568,PAMPERS,Diapering,3869,0.8264
7907,NEST,Diapering,21,0.652806
9099,ONE BY POISE,Adult Incontinence,14,0.528623
112,WET ONES,Bath & Body,19489,0.506636
937,POISE,Adult Incontinence,2008,0.504477
2620,BABYGANICS,Household Supplies,308,0.50042
2006,JOHNSONS,Baby Bathing,519,0.465809
6005,DR BROWN'S,Baby Bathing,44,0.458393
2828,NANIT,Baby Bathing,262,0.456417
2467,POPPERS,Puffed Snacks,347,0.454431


In [13]:
offers.to_csv('data/offers_embeddings.csv', index=False)
categories.to_csv('data/categories_embeddings.csv', index=False)
brand_df.to_csv('data/brand_embeddings.csv', index=False)

In [27]:
categories[categories['PRODUCT_CATEGORY']=='Diapering']['IS_CHILD_CATEGORY_TO']

37    Baby & Toddler
Name: IS_CHILD_CATEGORY_TO, dtype: object

In [28]:
offers[offers['IS_CHILD_CATEGORY_TO']=='Baby & Toddler']['OFFER']

Series([], Name: OFFER, dtype: object)

In [72]:
brand_df[brand_df['BRAND_BELONGS_TO_CATEGORY']=='Diapering']['BRAND']

568         PAMPERS
1585        HUGGIES
1913         HONEST
3868           LUVS
5191    HELLO BELLO
7907           NEST
Name: BRAND, dtype: object

In [19]:
def has_unique_digits(num):
    str_num = str(num)
    return len(str_num) == len(set(str_num))

def count_unique_numbers(n, m):
    return sum(1 for i in range(n, m+1) if has_unique_digits(i))

def countNumbers(arr):
   return [count_unique_numbers(i[0],i[1]) for i in arr]

In [20]:
countSum([[1,20],[9,19]])

[19, 10]

In [18]:
for i in [[1,20],[9,19]]:
    print(*i)

1 20
9 19
