## Imports

In [2]:
from txtai import Embeddings
import pandas as pd

## Create Merged Dataset

In [3]:
def preprocess_column(column):
    column = column.astype(str).str.lower()
    column = column.str.replace(r'-', ' ', regex=True)
    column = column.str.replace(r'[^a-z0-9\s]', '', regex=True)
    column = column.str.replace(r'\s+', ' ', regex=True)
    column = column.str.strip()
    return column

In [5]:
df_categories = pd.read_csv('DATA/categories.csv')
df_brands = pd.read_csv('DATA/brand_category.csv')
df_offers = pd.read_csv('DATA/offer_retailer.csv')

In [6]:
df_categories.drop('CATEGORY_ID', axis=1, inplace=True)
df_categories = df_categories.rename(columns={'PRODUCT_CATEGORY': 'CATEGORY', 'IS_CHILD_CATEGORY_TO':'PARENT_CATEGORY'})

In [7]:
df_categories['CATEGORY'] = preprocess_column(df_categories['CATEGORY'])
df_categories['PARENT_CATEGORY'] = preprocess_column(df_categories['PARENT_CATEGORY'])

In [8]:
df_brands.drop('RECEIPTS', axis=1, inplace=True)
df_brands = df_brands.rename(columns={'BRAND_BELONGS_TO_CATEGORY':'CATEGORY'})

In [9]:
df_brands['BRAND'] = preprocess_column(df_brands['BRAND'])
df_brands['CATEGORY'] = preprocess_column(df_brands['CATEGORY'])

In [10]:
df_offers['BRAND'] = preprocess_column(df_offers['BRAND'])
df_offers['RETAILER'] = preprocess_column(df_offers['RETAILER'])
df_offers['OFFER_PREPROCESSED'] = preprocess_column(df_offers['OFFER'])

In [11]:
dataset = pd.merge(df_categories, df_brands, on='CATEGORY', how='outer')

In [12]:
dataset = pd.merge(dataset, df_offers, on='BRAND', how='outer')

In [13]:
dataset = dataset.drop_duplicates().reset_index(drop=True)

In [14]:
dataset = dataset.dropna(subset=['OFFER'])

In [15]:
dataset = dataset.reset_index(drop=True)

In [16]:
dataset

Unnamed: 0,CATEGORY,PARENT_CATEGORY,BRAND,OFFER,RETAILER,OFFER_PREPROCESSED
0,red pasta sauce,pasta sauce,barilla,Barilla® Pesto Sauce,,barilla pesto sauce
1,red pasta sauce,pasta sauce,barilla,"Barilla® Pasta, select varieties, buy 2",,barilla pasta select varieties buy 2
2,red pasta sauce,pasta sauce,barilla,"Barilla® pasta, select varieties, buy 3",,barilla pasta select varieties buy 3
3,red pasta sauce,pasta sauce,barilla,"Barilla® pasta, select varieties, buy 4",,barilla pasta select varieties buy 4
4,red pasta sauce,pasta sauce,barilla,"Barilla® pasta, select varieties, buy 2",,barilla pasta select varieties buy 2
...,...,...,...,...,...,...
839,,,dickeys barbecue pit,Spend $40 at Dickey's Barbecue Pit,dickeys barbecue pit,spend 40 at dickeys barbecue pit
840,,,oxiclean,"OxiClean™ Laundry Stain Removers, select varie...",walmart,oxiclean laundry stain removers select varieti...
841,,,squirrel,"Squirrel, The Bedside Perch, online at Amazon",amazon,squirrel the bedside perch online at amazon
842,,,persil,"Persil® ProClean®, select varieties, at Walmart",walmart,persil proclean select varieties at walmart


In [17]:
dataset.to_csv('dataset.csv')

## Convert Merged Dataset to Embeddings

In [18]:
dataset = pd.read_csv('dataset.csv', index_col=0)

In [20]:
# dataset

In [21]:
dataset_concatenated = []

# Iterate through each row in the DataFrame
for index, row in dataset.iterrows():
    # Create a list of non-NaN elements in the row
    # non_nan_elements = [str(element) for element in row if not pd.isna(element)]
    non_nan_elements = [str(element) for column, element in row.items() if column != 'OFFER' and not pd.isna(element)]


    # Concatenate the non-NaN elements in the row to create a single string
    concatenated_row = ' '.join(non_nan_elements)

    # Append the concatenated row to the list
    dataset_concatenated.append(concatenated_row)

In [22]:
len(dataset_concatenated)

844

In [24]:
dataset.iloc[343]

CATEGORY                       cooking baking
PARENT_CATEGORY                        pantry
BRAND                               pavilions
OFFER                 Spend $270 at Pavilions
RETAILER                            pavilions
OFFER_PREPROCESSED     spend 270 at pavilions
Name: 343, dtype: object

In [23]:
dataset_concatenated[343]

'cooking baking pantry pavilions pavilions spend 270 at pavilions'

In [25]:
# Create an embeddings
embeddings = Embeddings(hybrid=True, path="sentence-transformers/nli-mpnet-base-v2")

Downloading (…)lve/main/config.json: 100%|██████████| 587/587 [00:00<00:00, 235kB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [00:39<00:00, 11.2MB/s] 
Downloading (…)okenizer_config.json: 100%|██████████| 1.19k/1.19k [00:00<00:00, 680kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 3.24MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 12.1MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 135kB/s]


In [26]:
embeddings.index(dataset_concatenated)

In [101]:
embeddings.save("dataset_index")

In [27]:
embeddings = Embeddings()
embeddings.load("dataset_index")

In [46]:
search_data = embeddings.search("whole foods mart", 30)

In [47]:
search_results = pd.DataFrame(columns=['OFFER', 'SCORE'])

In [48]:
for index, score in search_data:
    # search_results.append( [dataset.iloc[index]['OFFER'], score] )
    # search_results.append({'OFFER': dataset.iloc[index]['OFFER'], 'SCORE':score })
    search_results = pd.concat([search_results, pd.DataFrame.from_records([{'OFFER': dataset.iloc[index]['OFFER'], 'SCORE':score }])], ignore_index=True)

  search_results = pd.concat([search_results, pd.DataFrame.from_records([{'OFFER': dataset.iloc[index]['OFFER'], 'SCORE':score }])], ignore_index=True)


In [49]:
search_results

Unnamed: 0,OFFER,SCORE
0,"EVOLVE® Plant-Based Protein Shake, 4 count, se...",0.49485
1,"EVOLVE® Plant-Based Protein Shake, 4 count, se...",0.489589
2,"Gorton's Air Fried Butterfly Shrimp, at Walmart",0.302412
3,"Tyson Products, select varieties, spend $15 at...",0.29923
4,Gorton's at select retailers,0.293307
5,Back to the Roots Microgreens Grow Kit OR Seed...,0.290968
6,"Back to the Roots Soils, select varieties and ...",0.290918
7,Back to the Roots Grow Hydroponic Grow Kit OR ...,0.289889
8,"General Mills™ products, select brands, spend ...",0.284443
9,"Sara Lee® bread, select varieties, buy 2 at Wa...",0.284406
