# Classifying input into one of Category, Brand or Retailer
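A pretrained BERT model is fine-tuned on a custom labeled dataset of categories, brands and retailers to classify an input string into one of those three classes. The dataset is extremely imbalanced (brands vastly outnumber categories and retailers), which results in poor classification of the minority classes.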

In [None]:
# !pip install transformers

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import random
import json

## Classify input into one of category, brand or retailer

### Creating classification dataset

In [None]:
# Load the three raw CSV exports used to build the classification data
df_categories, df_brands, df_offers = (
    pd.read_csv(csv_path)
    for csv_path in ('categories.csv', 'brand_category.csv', 'offer_retailer.csv')
)

In [None]:
def preprocess_column(column):
    """Normalise a text column for use as classifier training data.

    Each value is lower-cased, every character outside ``[a-z0-9]`` and
    whitespace (hyphens included) is replaced by a space, runs of whitespace
    are collapsed to a single space, and the ends are stripped.

    Missing values stay missing. Casting them with ``astype(str)`` alone
    would turn them into the literal text "nan"/"none", which would then be
    treated as a real category/brand name and could not be removed by a
    later ``dropna``.

    Args:
        column: pandas Series of raw values.

    Returns:
        pandas Series of the same length with normalised strings, and NaN
        wherever the input value was missing.
    """
    missing = column.isna()
    cleaned = (
        column.astype(str)
        .str.lower()
        # Hyphens are outside [a-z0-9\s], so this also turns "-" into a space.
        .str.replace(r'[^a-z0-9\s]', ' ', regex=True)
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )
    # Restore missing markers that astype(str) converted into text.
    return cleaned.mask(missing)

In [None]:
# Normalise the text columns of each source frame in place
for _col in ('PRODUCT_CATEGORY', 'IS_CHILD_CATEGORY_TO'):
    df_categories[_col] = preprocess_column(df_categories[_col])

for _col in ('BRAND', 'BRAND_BELONGS_TO_CATEGORY'):
    df_brands[_col] = preprocess_column(df_brands[_col])

# The raw OFFER text is kept; its normalised form goes into a separate column
df_offers['OFFER_PREPROCESSED'] = preprocess_column(df_offers['OFFER'])
for _col in ('RETAILER', 'BRAND'):
    df_offers[_col] = preprocess_column(df_offers[_col])

In [None]:
# Unique category names from all three category columns, in first-seen order.
# drop_duplicates is used instead of set() because set iteration order for
# strings varies between interpreter sessions, which would make the seeded
# shuffle applied later non-reproducible.
category_list = pd.concat(
    [
        df_categories['PRODUCT_CATEGORY'],
        df_categories['IS_CHILD_CATEGORY_TO'],
        df_brands['BRAND_BELONGS_TO_CATEGORY'],
    ],
    ignore_index=True,
).drop_duplicates().tolist()

In [None]:
# Unique brand names from the brand table and the offers table, in first-seen
# order (deterministic, unlike iterating a set of strings).
brand_list = pd.concat(
    [df_brands['BRAND'], df_offers['BRAND']],
    ignore_index=True,
).drop_duplicates().tolist()

In [None]:
# Unique retailer names in first-seen order (deterministic, unlike iterating a
# set of strings).
retailer_list = df_offers['RETAILER'].drop_duplicates().tolist()

In [None]:
# Pair every element with the name of the list it came from
_labelled_groups = (
    ('category', category_list),
    ('brand', brand_list),
    ('retailer', retailer_list),
)
combined_data = [
    (item, label)
    for label, items in _labelled_groups
    for item in items
]

# Two-column frame: the text and its class label
df = pd.DataFrame(combined_data, columns=['Element', 'Label'])

In [None]:
# Remove rows whose Element value is missing
df = df.dropna(subset=['Element'])

In [None]:
# Shuffle all rows with a fixed seed, then renumber the index from zero
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Persist the labelled dataset; the index is written as the first CSV column
df.to_csv('classification.csv')

In [None]:
# Reload the labelled dataset (first CSV column is the index) and discard
# rows whose Element is missing after parsing
df = pd.read_csv('classification.csv', index_col=0).dropna(subset=['Element'])

In [None]:
df['Label'].value_counts()

brand       8464
category     132
retailer      61
Name: Label, dtype: int64

In [None]:
df

Unnamed: 0,Element,Label
0,seventh generation,brand
1,the nutmeg spice co,brand
2,orin swift,brand
3,brick west westbound train juicy ipa,brand
4,southern living,brand
...,...,...
8654,bauhaus brew labs,brand
8655,crooked palm distillery,brand
8656,greenbush brewing co,brand
8657,appleton estate,brand


### Model Training

In [None]:
# Inverse-frequency class weights (total samples / samples in the class),
# computed from the data instead of hard-coded counts so they stay correct
# when the dataset changes.
_label_counts = df['Label'].value_counts()
_total_samples = int(_label_counts.sum())
brand_weight = _total_samples / int(_label_counts['brand'])
category_weight = _total_samples / int(_label_counts['category'])
retailer_weight = _total_samples / int(_label_counts['retailer'])
# Order must follow the encoded label ids; LabelEncoder sorts its classes, so
# that is brand, category, retailer.
class_weights = [brand_weight, category_weight, retailer_weight]

In [None]:
class_weights

tensor([  1.0228,  65.5833, 141.9180], device='cuda:0')

In [None]:
# df

In [None]:
# Run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Weighted cross-entropy: the per-class weights live on the same device as the model
class_weights = torch.tensor(class_weights, device=device)
custom_loss = CrossEntropyLoss(weight=class_weights)

# Map the string labels to integer ids and store them in a new column
label_encoder = LabelEncoder().fit(df['Label'])
df['encoded_label'] = label_encoder.transform(df['Label'])

# Define a custom dataset class
class CustomDataset(Dataset):
    """Torch dataset that tokenizes one DataFrame row per item.

    Args:
        dataframe: frame with an ``Element`` column (text) and an
            ``encoded_label`` column (integer class id).
        tokenizer: callable invoked as ``tokenizer(text, truncation=...,
            padding=..., max_length=..., return_tensors='pt')`` and returning
            a mapping with ``input_ids`` and ``attention_mask``.
        max_length: every item is truncated/padded to this many tokens.
    """

    def __init__(self, dataframe, tokenizer, max_length=10):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized text and label for the row at position ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (the tokenizer
            output with its leading batch axis removed) and ``labels`` (a
            scalar int64 tensor).
        """
        row = self.data.iloc[idx]
        encoding = self.tokenizer(
            row['Element'],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            # squeeze(0) removes only the batch axis; a bare squeeze() would
            # also drop the sequence axis when max_length == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            # Explicit int64: the dtype CrossEntropyLoss expects for class ids.
            'labels': torch.tensor(int(row['encoded_label']), dtype=torch.long)
        }



# Initialize the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(df['encoded_label'].unique()))
model.to(device)

# Split the data into train and validation sets with a seed value
train_df, val_df = train_test_split(df, test_size=0.25, random_state=42)

# Create training and validation data loaders
train_dataset = CustomDataset(train_df, tokenizer)
val_dataset = CustomDataset(val_df, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)

# Define the optimizer and training parameters
optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 5

#  Training loop with progress bar
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    tqdm_loader = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}')
    for batch in tqdm_loader:
        input_ids = batch['input_ids'].to(device)  # Move input data to GPU
        attention_mask = batch['attention_mask'].to(device)  # Move attention mask to GPU
        labels = batch['labels'].to(device)  # Move labels to GPU

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs['logits'].to(device)
        # loss = outputs.loss
        loss = custom_loss(logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        tqdm_loader.set_postfix({'Loss': loss.item()})

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/5: 100%|██████████| 1623/1623 [02:06<00:00, 12.80it/s, Loss=0.0307]


Epoch 1/5, Average Loss: 0.2510


Epoch 2/5: 100%|██████████| 1623/1623 [02:03<00:00, 13.16it/s, Loss=0.184]


Epoch 2/5, Average Loss: 0.1462


Epoch 3/5: 100%|██████████| 1623/1623 [02:03<00:00, 13.18it/s, Loss=0.00385]


Epoch 3/5, Average Loss: 0.0891


Epoch 4/5: 100%|██████████| 1623/1623 [02:08<00:00, 12.67it/s, Loss=0.00073]


Epoch 4/5, Average Loss: 0.0498


Epoch 5/5: 100%|██████████| 1623/1623 [02:03<00:00, 13.15it/s, Loss=0.000781]

Epoch 5/5, Average Loss: 0.0486





In [None]:
# Evaluation loop: collect argmax predictions over the validation loader
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        predicted_labels = torch.argmax(outputs.logits, dim=1)

        predictions.extend(predicted_labels.tolist())
        # Labels are only compared on the host, so they need not be moved to the GPU
        true_labels.extend(batch['labels'].tolist())

# Generate the classification report.
# `labels` pins every encoded class id to its name, so the report still works
# (and stays correctly aligned) if a rare class is absent from both the
# validation labels and the predictions; zero_division=0 reports 0 for such
# classes instead of emitting warnings.
target_names = label_encoder.classes_
classification_rep = classification_report(
    true_labels,
    predictions,
    labels=list(range(len(target_names))),
    target_names=target_names,
    zero_division=0,
)

# Print the classification report
print(classification_rep)

              precision    recall  f1-score   support

       brand       0.99      0.98      0.99      2115
    category       0.69      0.81      0.74        36
    retailer       0.10      0.21      0.14        14

    accuracy                           0.97      2165
   macro avg       0.59      0.67      0.62      2165
weighted avg       0.98      0.97      0.98      2165



Saving model to disk for future use

In [None]:
root = '/content/drive/MyDrive/Fetch_technical_assessment/'

In [None]:
import json

# Save the model weights/configuration and the tokenizer vocabulary side by side
model_dir = root + "bert_classification_model"
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)

# Metadata needed at inference time. The values are read from the objects that
# were actually used for training rather than repeated as literals, so the
# saved file cannot drift from the real configuration.
model_info = {
    "label_encoder": label_encoder.classes_.tolist(),
    "max_length": train_dataset.max_length,
    "num_epochs": num_epochs,
    "learning_rate": optimizer.defaults["lr"],
}
with open(model_dir + "/model_info.json", "w") as info_file:
    json.dump(model_info, info_file)

### Inference

In [None]:
root = '/content/drive/MyDrive/Fetch_technical_assessment/'

In [None]:
# Load the tokenizer and model
load_tokenizer = BertTokenizer.from_pretrained(root + "bert_classification_model")
load_model = BertForSequenceClassification.from_pretrained(root + "bert_classification_model")

# Load the metadata saved with the model (class names, training max_length)
with open(root + "bert_classification_model/model_info.json", "r") as info_file:
    model_info = json.load(info_file)

# Ensure the model is in evaluation mode
load_model.eval()

# Perform inference on a new text.
# Truncate to the max_length used during training (when it was saved) so that
# inference sees inputs shaped the same way as the training examples.
text_to_classify = "pasta noodles"
inputs = load_tokenizer(
    text_to_classify,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=model_info.get("max_length"),
)
with torch.no_grad():
    outputs = load_model(**inputs)

# Extract predicted class probabilities and the most likely class id
logits = outputs.logits
predicted_probabilities = torch.softmax(logits, dim=1)
predicted_labels = torch.argmax(logits, dim=1)

# Map the predicted id back to the original class name
predicted_class = model_info["label_encoder"][predicted_labels.item()]

print(f"Predicted Class: {predicted_class}")
print(f"Predicted Probabilities: {predicted_probabilities.tolist()[0]}")


Predicted Class: category
Predicted Probabilities: [0.002923752414062619, 0.9926780462265015, 0.00439823605120182]


## Search

In [None]:
USER_INPUT = 'equate'

In [None]:
ROOT = '/content/drive/MyDrive/Fetch_technical_assessment/'

In [None]:
def classify_input(input, root=ROOT):
    """Classify a text with the fine-tuned BERT model saved under ``root``.

    Args:
        input: the text to classify. (The name shadows the ``input`` builtin
            inside this function; it is kept for backward compatibility.)
        root: directory prefix that contains ``bert_classification_model/``.

    Returns:
        The predicted class name, taken from the ``label_encoder`` list in
        the saved ``model_info.json``.

    Note:
        The tokenizer and model are loaded from disk on every call.
    """
    model_dir = root + "bert_classification_model"

    # Load the tokenizer and model
    load_tokenizer = BertTokenizer.from_pretrained(model_dir)
    load_model = BertForSequenceClassification.from_pretrained(model_dir)

    # Load the metadata saved with the model (class names, training max_length)
    with open(model_dir + "/model_info.json", "r") as info_file:
        model_info = json.load(info_file)

    # Ensure the model is in evaluation mode
    load_model.eval()

    # Classify the caller's text (previously a hard-coded sample string was
    # classified and the argument was ignored). Truncate to the training
    # max_length when it was saved.
    inputs = load_tokenizer(
        input,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=model_info.get("max_length"),
    )
    with torch.no_grad():
        outputs = load_model(**inputs)

    # Most likely class id -> class name
    predicted_labels = torch.argmax(outputs.logits, dim=1)
    return model_info["label_encoder"][predicted_labels.item()]


In [None]:
classify_input(USER_INPUT)

'retailer'

In [None]:
def brand_search(brand):
    """Return the distinct raw offers whose BRAND equals ``brand``.

    Args:
        brand: brand name, compared for exact equality against the BRAND
            column of ``df_offers``.

    Returns:
        List of unique OFFER values in the order they first appear in
        ``df_offers`` (deterministic, unlike iterating a set of strings);
        empty when no row matches.
    """
    matching_offers = df_offers.loc[df_offers['BRAND'] == brand, 'OFFER']
    return matching_offers.drop_duplicates().tolist()


In [None]:
brand_search('good humor')

['Good Humor Viennetta Frozen Vanilla Cake',
 "Reese's Peanut Butter Bar, 6 count, at GIANT OR MARTIN’S"]

In [None]:
len(list(set(df_offers['BRAND'])))

144

In [None]:
len(list(set(df_brands['BRAND'])))

8396

In [None]:
# Outer join on BRAND: keeps brands without offers and offers without a known brand
merged_df = df_brands.merge(df_offers, on='BRAND', how='outer')

In [None]:
# Shuffle all rows. NOTE(review): random.seed() returns None, so random_state
# is None here and the shuffle is NOT seeded/reproducible; the call also
# reseeds Python's global `random` generator as a side effect. Pass an integer
# (e.g. random_state=42) if reproducibility is intended — confirm intent.
merged_df = merged_df.sample(frac=1, random_state=random.seed())

In [None]:
merged_df

Unnamed: 0,BRAND,BRAND_BELONGS_TO_CATEGORY,RECEIPTS,OFFER,RETAILER,OFFER_PREPROCESSED
10366,matahiwi estate,wine,11.0,,,
8173,menage a trois,wine,25.0,,,
7196,devils backbone crab cakes and football sess,beer,38.0,,,
3349,wright bacon,chips,410.0,,,
6733,happy dad,malt beverages,48.0,,,
...,...,...,...,...,...,...
914,red stripe,beer,6887.0,,,
2974,olde english 800,malt beverages,599.0,,,
2537,powerade zero ion4,sports drinks,940.0,,,
10098,phase three,beer,12.0,,,


In [None]:
(merged_df.notna().all(axis=1)).sum()

837

## TEST

In [None]:
# !pip install transformers

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

### Creating merged dataset with all categories, brands, and retailers mapped to relevant offers.

In [None]:
def preprocess_column(column):
    """Normalise a text column for matching.

    Values are cast to ``str`` and lower-cased; hyphens become spaces, every
    other character outside ``[a-z0-9]`` and whitespace is removed, runs of
    whitespace collapse to one space, and the ends are stripped.

    Args:
        column: pandas Series of raw values.

    Returns:
        pandas Series of normalised strings.
    """
    normalized = column.astype(str).str.lower()
    substitutions = (
        (r'-', ' '),
        (r'[^a-z0-9\s]', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        normalized = normalized.str.replace(pattern, replacement, regex=True)
    return normalized.str.strip()

In [None]:
# Load the three raw CSV exports
df_categories, df_brands, df_offers = (
    pd.read_csv(csv_path)
    for csv_path in ('categories.csv', 'brand_category.csv', 'offer_retailer.csv')
)

In [None]:
# Drop the id column and give the category columns shorter names
df_categories = (
    df_categories
    .drop(columns='CATEGORY_ID')
    .rename(columns={'PRODUCT_CATEGORY': 'CATEGORY', 'IS_CHILD_CATEGORY_TO':'PARENT_CATEGORY'})
)

In [None]:
# Normalise both category name columns
for _col in ('CATEGORY', 'PARENT_CATEGORY'):
    df_categories[_col] = preprocess_column(df_categories[_col])

In [None]:
# Drop the receipt counts and rename the category column so it can be joined
# with df_categories on CATEGORY
df_brands = (
    df_brands
    .drop(columns='RECEIPTS')
    .rename(columns={'BRAND_BELONGS_TO_CATEGORY':'CATEGORY'})
)

In [None]:
# Normalise the brand and category names
for _col in ('BRAND', 'CATEGORY'):
    df_brands[_col] = preprocess_column(df_brands[_col])

In [None]:
# Normalise brand and retailer names in place; the raw OFFER text is kept and
# its normalised form is stored in a separate column
for _col in ('BRAND', 'RETAILER'):
    df_offers[_col] = preprocess_column(df_offers[_col])
df_offers['OFFER_PREPROCESSED'] = preprocess_column(df_offers['OFFER'])

In [None]:
# Outer join on CATEGORY: keeps categories without brands and brands whose category is not listed
dataset = pd.merge(df_categories, df_brands, on='CATEGORY', how='outer')

In [None]:
# Outer join on BRAND: attaches offers to their brand rows and keeps rows from both sides that have no match
dataset = pd.merge(dataset, df_offers, on='BRAND', how='outer')

In [None]:
# Remove fully identical rows and renumber the index from zero
dataset = dataset.drop_duplicates().reset_index(drop=True)

In [None]:
# Keep only rows that carry an offer (raw OFFER is missing for rows without a matching offer)
dataset = dataset.dropna(subset=['OFFER'])

In [None]:
dataset

In [None]:
# Persist the merged dataset; the index is written as the first CSV column
dataset.to_csv('dataset.csv')

### Loading custom merged dataset

In [None]:
# Reload the merged dataset, using the first CSV column as the index
dataset = pd.read_csv('dataset.csv', index_col=0)

### User Search

In [None]:
USER_INPUT = "SCHWEPPES"

In [None]:
def preprocess_text(text):
    """Normalise a user query the same way the dataset columns are normalised.

    The text is lower-cased, hyphens become spaces, every other character
    outside ``[a-z0-9]`` and whitespace is removed, and whitespace is
    collapsed and stripped. The same character class as ``preprocess_column``
    is used so that query tokens and dataset tokens are comparable;
    ``str.isalnum`` would keep non-ASCII letters and digits that the column
    normalisation strips.

    Args:
        text: raw query string.

    Returns:
        The normalised query string.
    """
    import re  # local import: used only by this helper

    text = text.lower().replace('-', ' ')
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return ' '.join(text.split())

In [None]:
USER_INPUT = preprocess_text(USER_INPUT)

In [None]:
USER_INPUT

'schweppes'

#### Filtering based on input keyword

In [None]:
# Define a search function
def search_string(s, search):
    """Return the whitespace-delimited tokens common to ``search`` and ``s``.

    Both arguments are converted with ``str`` first, so non-string cell
    values can be passed. An empty set means no token matched.
    """
    cell_tokens = set(str(s).split())
    query_tokens = set(str(search).split())
    return query_tokens & cell_tokens

    # return (search in str(s).lower()) or (str(s).lower() in search)

# Element-wise test: True where a cell shares at least one token with the query.
# DataFrame.applymap was renamed to DataFrame.map in newer pandas releases, so
# use whichever of the two this pandas version provides.
_elementwise = dataset.map if hasattr(dataset, 'map') else dataset.applymap
mask = _elementwise(lambda x: bool(search_string(x, USER_INPUT)))

# Keep rows with a match in any column. The explicit copy makes the result an
# independent frame, so adding columns to it later does not raise pandas'
# SettingWithCopyWarning.
filtered_dataset = dataset.loc[mask.any(axis=1)].copy()

In [None]:
filtered_dataset

#### Reordering filtered dataset using cosine similarity (BERT)

In [None]:
# Pretrained checkpoint used for embeddings; tokenizer and model are loaded
# once at module level and reused by cosine_similarity_bert
MODEL_NAME="bert-base-uncased"
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
MODEL = AutoModel.from_pretrained(MODEL_NAME)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
def cosine_similarity_bert(text1, text2=USER_INPUT):
    """Return the cosine similarity between the BERT embeddings of two texts.

    Each text is tokenized with ``TOKENIZER`` and passed through ``MODEL``;
    its embedding is the mean of the last hidden state over the token axis.

    Args:
        text1: first text.
        text2: second text. The default is the value ``USER_INPUT`` had when
            this function was defined.

    Returns:
        The similarity score as a scalar.
    """
    pooled = []
    with torch.no_grad():
        for text in (text1, text2):
            encoded = TOKENIZER(text, return_tensors='pt', padding=True, truncation=True)
            hidden_states = MODEL(**encoded).last_hidden_state
            # Mean pooling over tokens, then hand over to numpy for sklearn
            pooled.append(hidden_states.mean(dim=1).cpu().numpy())

    first_embedding, second_embedding = pooled
    return cosine_similarity(first_embedding, second_embedding)[0][0]

In [None]:
filtered_dataset

In [None]:
def sort_by_cosine_similarity(row, user_input):
    """Return the BERT cosine similarity between a row's normalised offer text and ``user_input``."""
    offer_text = row['OFFER_PREPROCESSED']
    return cosine_similarity_bert(offer_text, user_input)

In [None]:
# Work on an independent copy so the column assignment below does not raise
# pandas' SettingWithCopyWarning when filtered_dataset is a slice of dataset.
filtered_dataset = filtered_dataset.copy()
# One similarity score per row; building a plain list yields a single column
# regardless of how many rows matched (including none).
filtered_dataset['cosine_similarity'] = [
    sort_by_cosine_similarity(row, USER_INPUT)
    for _, row in filtered_dataset.iterrows()
]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_dataset['cosine_similarity'] = filtered_dataset.apply(lambda row: sort_by_cosine_similarity(row, USER_INPUT), axis=1)


In [None]:
# sort_values returns a new frame, so assign it back to keep the ranking
# (most similar offers first), then display the result.
filtered_dataset = filtered_dataset.sort_values(by='cosine_similarity', ascending=False)
filtered_dataset

In [None]:
filtered_dataset

Unnamed: 0,CATEGORY,PARENT_CATEGORY,BRAND,OFFER,RETAILER,OFFER_PREPROCESSED


In [None]:
def match_score(row, user_input=USER_INPUT):
    """Count the distinct query tokens that occur anywhere in a row.

    The row's category, parent category, brand, retailer and normalised offer
    text are joined with spaces and split into whitespace-delimited tokens.

    Args:
        row: mapping/Series with the keys CATEGORY, PARENT_CATEGORY, BRAND,
            RETAILER and OFFER_PREPROCESSED.
        user_input: query string. The default is the value ``USER_INPUT`` had
            when this function was defined.

    Returns:
        Number of distinct tokens shared by the row text and the query.
    """
    fields = ('CATEGORY', 'PARENT_CATEGORY', 'BRAND', 'RETAILER', 'OFFER_PREPROCESSED')
    row_tokens = set(' '.join(str(row[field]) for field in fields).split())
    query_tokens = set(user_input.split())
    return len(row_tokens & query_tokens)

In [None]:
# Work on an independent copy so the column assignment below does not raise
# pandas' SettingWithCopyWarning when filtered_dataset is a slice of dataset.
filtered_dataset = filtered_dataset.copy()
# One token-overlap score per row, built as a plain list so the result is a
# single column regardless of how many rows matched (including none).
filtered_dataset['SCORE'] = [
    match_score(row, USER_INPUT)
    for _, row in filtered_dataset.iterrows()
]

In [None]:
filtered_dataset

Unnamed: 0,CATEGORY,PARENT_CATEGORY,BRAND,OFFER,RETAILER,OFFER_PREPROCESSED,SCORE


In [None]:
list(set(dataset['OFFER']))