In [27]:
import pandas as pd
import pickle
import csv
from sentence_transformers import SentenceTransformer, models, InputExample, losses, evaluation, util


In [2]:
# Read the three raw CSV tables used to build the embedding text.
offer_df = pd.read_csv("../../../data/raw/offer_retailer.csv")
cat_df = pd.read_csv("../../../data/raw/categories.csv")
# Brand/category rows containing any missing value are discarded up front.
brand_df = pd.read_csv("../../../data/raw/brand_category.csv").dropna()

In [4]:
# One row per parent category: the parent name plus a comma-separated list
# of every product category that has that parent.
sibling_rows = [
    [parent, ", ".join(children["PRODUCT_CATEGORY"])]
    for parent, children in cat_df.groupby("IS_CHILD_CATEGORY_TO")
]

new_categories = pd.DataFrame(sibling_rows, columns=["product_category", "RELATED_CAT"])
new_categories.head()

Unnamed: 0,product_category,RELATED_CAT
0,Alcohol,"Beer, Malt Beverages, Hard Seltzers, Sodas, Wa..."
1,Animals & Pet Supplies,Dog Supplies
2,Baby & Toddler,"Baby Bathing, Potty Training, Baby Safety, Dia..."
3,Beauty,"Cosmetic Tools, Makeup, Nail Care, Body Fragra..."
4,Beverages,"Coffee, Meal Replacement Beverages, Vegetable ..."


In [6]:
# Give every category the RELATED_CAT list of its parent, then drop the
# id column and the duplicated join key.
cat_related_df = (
    cat_df
    .merge(
        new_categories,
        how="left",
        left_on="IS_CHILD_CATEGORY_TO",
        right_on="product_category",
    )
    .drop(columns=["CATEGORY_ID", "product_category"])
)
cat_related_df.head()

Unnamed: 0,PRODUCT_CATEGORY,IS_CHILD_CATEGORY_TO,RELATED_CAT
0,Red Pasta Sauce,Pasta Sauce,"Red Pasta Sauce, Alfredo & White Pasta Sauce, ..."
1,Alfredo & White Pasta Sauce,Pasta Sauce,"Red Pasta Sauce, Alfredo & White Pasta Sauce, ..."
2,Cooking & Baking,Pantry,"Cooking & Baking, Packaged Seafood, Nut Butter..."
3,Packaged Seafood,Pantry,"Cooking & Baking, Packaged Seafood, Nut Butter..."
4,Feminine Hygeine,Health & Wellness,"Feminine Hygeine, Sexual Health, Foot Care, De..."


In [7]:
# Left join keeps every offer, including those whose brand has no category row.
brand_offer_df = offer_df.merge(brand_df, how="left", on="BRAND")
brand_offer_df.head()

Unnamed: 0,OFFER,RETAILER,BRAND,BRAND_BELONGS_TO_CATEGORY,RECEIPTS
0,Spend $50 on a Full-Priced new Club Membership,SAMS CLUB,SAMS CLUB,,
1,"Beyond Meat® Plant-Based products, spend $25",,BEYOND MEAT,Plant-Based Meat,1584.0
2,"Beyond Meat® Plant-Based products, spend $25",,BEYOND MEAT,Frozen Plant-Based Meat,313.0
3,"Beyond Meat® Plant-Based products, spend $25",,BEYOND MEAT,Packaged Meat,30.0
4,Good Humor Viennetta Frozen Vanilla Cake,,GOOD HUMOR,Frozen Desserts,1052.0


In [18]:
# Attach category / parent / related-category text to each offer-brand row.
# Missing values are replaced by a single space, which embd_txt treats as
# "field absent"; the brand-side join key is dropped once the join is done.
brand_offer_category_df = (
    brand_offer_df
    .merge(
        cat_related_df,
        how="left",
        left_on="BRAND_BELONGS_TO_CATEGORY",
        right_on="PRODUCT_CATEGORY",
    )
    .fillna(" ")
    .drop(columns=["BRAND_BELONGS_TO_CATEGORY"])
)
brand_offer_category_df.head()

Unnamed: 0,OFFER,RETAILER,BRAND,RECEIPTS,PRODUCT_CATEGORY,IS_CHILD_CATEGORY_TO,RELATED_CAT
0,Spend $50 on a Full-Priced new Club Membership,SAMS CLUB,SAMS CLUB,,,,
1,"Beyond Meat® Plant-Based products, spend $25",,BEYOND MEAT,1584.0,Plant-Based Meat,Meat & Seafood,Plant-Based Meat
2,"Beyond Meat® Plant-Based products, spend $25",,BEYOND MEAT,313.0,Frozen Plant-Based Meat,Frozen,"Frozen Fruits, Frozen Desserts, Frozen Sides, ..."
3,"Beyond Meat® Plant-Based products, spend $25",,BEYOND MEAT,30.0,Packaged Meat,Pantry,"Cooking & Baking, Packaged Seafood, Nut Butter..."
4,Good Humor Viennetta Frozen Vanilla Cake,,GOOD HUMOR,1052.0,Frozen Desserts,Frozen,"Frozen Fruits, Frozen Desserts, Frozen Sides, ..."


In [10]:
def clean_text(text):
    """Normalise a value into lower-case text made of letters, digits,
    square brackets and whitespace.

    The value is converted with ``str()`` and lower-cased, every run of
    hyphens becomes a single space, and any remaining character that is not
    an ASCII letter, a digit, ``[``, ``]`` or whitespace is removed.
    Leading and trailing whitespace is left as it is.

    Parameters
    ----------
    text : any
        Value to clean; non-string values are stringified first.

    Returns
    -------
    str
        The cleaned text.
    """
    # Local import: the notebook's import cell does not bring in `re`.
    import re

    text = str(text).lower()
    # Hyphens act as word separators ("plant-based" -> "plant based").
    text = re.sub(r'[-]+', ' ', text)
    # Raw string: in a plain literal, \[ \] and \s are invalid escape
    # sequences and trigger a warning on current Python versions.
    text = re.sub(r'[^A-Za-z0-9\[\]\s]+', '', text)
    return text

In [16]:
def embd_txt(row):
    """Build the tagged text that gets embedded for one offer row.

    Every populated field that has a marker token is cleaned with
    ``clean_text`` and appended as ``<marker><cleaned value>``, in the order
    the fields appear in ``row``.

    Parameters
    ----------
    row : mapping-like
        Anything exposing ``.items()`` that yields ``(column, value)`` pairs,
        e.g. the Series passed by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    str
        The concatenated tagged text; an empty string (after printing
        ``"no offer"``) when no field contributed.
    """
    token_map = {
        "OFFER": "[OFF] ",
        "BRAND": " [BN] ",
        "RETAILER": " [RN] ",
        "PRODUCT_CATEGORY": " [CN] ",
        "IS_CHILD_CATEGORY_TO": " [PCN] ",
        "RELATED_CAT": " [RCN] ",
    }

    pieces = []
    for col, val in row.items():
        # Columns without a marker token (RECEIPTS, or derived columns such
        # as SIMPLE_EMBD when the apply cell is re-run) are not part of the
        # text. Looking them up used to raise KeyError.
        if col not in token_map:
            continue
        # A single space is the placeholder for a missing value.
        if val == " ":
            continue
        pieces.append(token_map[col] + clean_text(val))

    ret_string = "".join(pieces)
    if ret_string == "":
        print("no offer")

    return ret_string

In [19]:
# Build the tagged text for every row. Any SIMPLE_EMBD column left over from
# a previous run of this cell is excluded first, so re-running the cell does
# not feed the derived column back into embd_txt.
brand_offer_category_df["SIMPLE_EMBD"] = (
    brand_offer_category_df
    .drop(columns=["SIMPLE_EMBD"], errors="ignore")
    .apply(embd_txt, axis=1)
)

In [21]:
# Independent copy holding only the original offer text and its tagged form.
simple_df = brand_offer_category_df.loc[:, ["OFFER", "SIMPLE_EMBD"]].copy()
simple_df.head()

Unnamed: 0,OFFER,SIMPLE_EMBD
0,Spend $50 on a Full-Priced new Club Membership,[OFF] spend 50 on a full priced new club membe...
1,"Beyond Meat® Plant-Based products, spend $25",[OFF] beyond meat plant based products spend 2...
2,"Beyond Meat® Plant-Based products, spend $25",[OFF] beyond meat plant based products spend 2...
3,"Beyond Meat® Plant-Based products, spend $25",[OFF] beyond meat plant based products spend 2...
4,Good Humor Viennetta Frozen Vanilla Cake,[OFF] good humor viennetta frozen vanilla cake...


In [28]:
# Load the sentence-embedding model from the local ./models/ directory
# (the path is relative to the notebook's working directory).
# Model names noted here as alternatives; they are not loaded by this cell:
#   sentence-transformers/multi-qa-MiniLM-L6-cos-v1
#   msmarco-distilbert-base-v4
sbert_model = SentenceTransformer("./models/")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [24]:
# Encode every tagged text in one call; the result holds one embedding per input.
embd_inputs = simple_df["SIMPLE_EMBD"].tolist()
simple_embd = sbert_model.encode(embd_inputs, show_progress_bar=True)

Batches:   0%|          | 0/26 [00:00<?, ?it/s]

In [26]:
# Store each row's embedding next to its text and persist the frame.
# The Series is built on simple_df's own index so the vectors are attached
# positionally; a default-indexed Series would be aligned by label and could
# silently produce NaN or shifted rows if simple_df's index is not 0..n-1.
assert len(simple_embd) == len(simple_df), "expected exactly one embedding per row"
simple_df["VECTOR_EMBD"] = pd.Series(list(simple_embd), index=simple_df.index)
simple_df.to_pickle("../../../data/processed/embeddings/simple_df_with_embeddings_new_model.pkl")

In [None]:
## Embeddings generated and stored for current data