<a href="https://colab.research.google.com/github/sthitaprajnadas/gcp-aiml-works/blob/main/transaction_bert_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets torch




In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import  get_scheduler
from transformers import DistilBertTokenizer, DistilBertModel
from torch.optim import AdamW

# Load the pretrained DistilBERT tokenizer and encoder.
# Both are created from the same checkpoint name so they stay in sync.
PRETRAINED_CHECKPOINT = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
bert_model = DistilBertModel.from_pretrained(PRETRAINED_CHECKPOINT)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Read the transaction sheet into a DataFrame and preview the first rows.
transactions_file = "test_transactions0.1.xls"
df = pd.read_excel(transactions_file)
df.head()  # head() shows 5 rows by default

Unnamed: 0,Date,Amount,Merchant,Category,Description
0,2025-03-16,80814.7,UNICEF,Charity,Donated INR 80814.7 via UNICEF on 16-Mar-2025...
1,2025-03-16,61816.17,Bharat Petroleum,Fuel,Filled up at Bharat Petroleum on 16-Mar-2025 i...
2,2025-03-16,110825.48,HDFC ATM,Cash Withdrawal,"Withdrawn INR 110825.48 from HDFC ATM ATM, Hy..."
3,2025-03-16,87042.87,Nature’s Basket,Groceries,A substantial shopping haul from Nature’s Ba...
4,2025-03-16,16158.42,GenericMerchant,Miscellaneous,Paid INR 16158.42 for a variety of small misc...


In [None]:
# Preprocessing
# Normalise the column names to lower case first.
df.columns = df.columns.str.lower()


def _normalize_text(column):
    """Trim, lower-case and drop every character that is not a letter, digit or space."""
    trimmed = column.str.strip()
    lowered = trimmed.str.lower()
    return lowered.str.replace(r'[^A-Za-z0-9 ]+', '', regex=True)


# The cleaning is applied to EVERY object-dtype column (not only the description),
# so whichever of the other columns are object-dtype are normalised the same way.
str_cols = df.select_dtypes(include='object').columns
df[str_cols] = df[str_cols].apply(_normalize_text)



In [None]:
# Label encode target
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["category"])

# Scale numeric column. The double brackets keep the input 2-D, so the result is (n_rows, 1).
# NOTE(review): the scaler and the merchant encoder are fitted on all rows, before the
# train/test split performed later in the notebook - confirm this leakage is acceptable.
scaler = StandardScaler()
amount_scaled = scaler.fit_transform(df[["amount"]])

merchant_encoder = LabelEncoder()
merchant_encoder.fit(df["merchant"])

# Index len(classes_) is reserved for merchants that were not seen when fitting the encoder.
unknown_idx = len(merchant_encoder.classes_)
num_merchants = len(merchant_encoder.classes_) + 1  # +1 for unknown

# Map to indices with unknown handling.
# A dict lookup yields the same indices as LabelEncoder.transform (position in classes_)
# without calling transform() and scanning classes_ once per row.
merchant_to_idx = {name: idx for idx, name in enumerate(merchant_encoder.classes_)}
merchant_idx_all = (
    df["merchant"]
    .map(merchant_to_idx)      # merchants missing from the dict become NaN
    .fillna(unknown_idx)
    .astype("int64")
    .to_numpy()
)

# Tokenize text (BERT). Every row is padded/truncated to max_len tokens.
max_len = 64
encodings = tokenizer(
    # Missing descriptions are replaced by empty strings; the tokenizer only accepts str.
    df["description"].fillna("").astype(str).tolist(),
    truncation=True,
    padding="max_length",
    max_length=max_len,
    return_tensors="pt"
)

In [None]:
# Dataset class

class TransDataset(Dataset):
    """Map-style dataset that bundles tokenizer output with the tabular features.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a per-row indexable
            container - either a tensor or a nested list.
        amount: Per-row indexable container of amount values.
        merchant_idx: Per-row indexable container of integer merchant indices.
        labels: Per-row indexable container of integer class labels; its length
            defines the dataset length.
    """

    def __init__(self, encodings, amount, merchant_idx, labels):
        self.encodings = encodings
        self.amount = amount
        self.merchant_idx = merchant_idx
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors.

        The dict holds every key of ``encodings`` plus ``amount`` (float32),
        ``merchant`` (long) and ``labels`` (long).
        """
        item = {}
        for key, val in self.encodings.items():
            row = val[idx]
            if isinstance(row, torch.Tensor):
                # torch.tensor(<tensor>) emits a copy-construct UserWarning;
                # clone().detach() is the recommended way to take the same copy.
                item[key] = row.clone().detach()
            else:
                item[key] = torch.tensor(row)
        item['amount'] = torch.tensor(self.amount[idx], dtype=torch.float32)
        item['merchant'] = torch.tensor(self.merchant_idx[idx], dtype=torch.long)
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item



In [None]:
class TransClassifier(nn.Module):
    """Classifier that fuses a text encoder with merchant and amount features.

    The first-token hidden state of the encoder, a learned merchant embedding and a
    32-dimensional projection of the amount are concatenated and passed through a
    256-unit hidden layer followed by the output layer.

    Args:
        bert_model: Encoder module called as ``bert_model(input_ids=..., attention_mask=...)``
            whose result exposes ``last_hidden_state``.
        num_merchants: Number of rows in the merchant embedding table.
        num_classes: Number of output logits.
        emb_dim: Width of the merchant embedding.
    """

    def __init__(self, bert_model, num_merchants, num_classes, emb_dim=16):
        super().__init__()
        self.bert = bert_model
        self.merchant_emb = nn.Embedding(num_merchants, emb_dim)
        self.amount_fc = nn.Linear(1, 32)

        # Take the encoder width from its config when available instead of assuming 768,
        # so encoders of a different size also work; 768 remains the fallback.
        hidden_size = getattr(getattr(bert_model, "config", None), "hidden_size", 768)

        self.fc = nn.Linear(hidden_size + emb_dim + 32, 256)
        self.out = nn.Linear(256, num_classes)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask, amount, merchant):
        """Return the class logits, one row per sample.

        ``amount`` may be a scalar tensor, (B,), (B, 1) or (B, 1, 1); ``merchant`` holds
        one integer index per sample.
        """
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = bert_outputs.last_hidden_state[:, 0, :]  # CLS token

        merchant_vec = self.merchant_emb(merchant.reshape(-1))  # [batch, emb_dim]

        # --- Amount block ---
        # Collapse every accepted layout to (B, 1). A 0-d tensor becomes (1, 1); only
        # unsqueezing it to (1,) would make the concatenation below fail.
        amount = amount.reshape(-1, 1).to(self.amount_fc.weight.dtype)
        amount_vec = self.relu(self.amount_fc(amount))  # (B, 32)

        x = torch.cat([pooled_output, merchant_vec, amount_vec], dim=1)
        x = self.relu(self.fc(x))
        x = self.dropout(x)
        x = self.out(x)
        return x


In [None]:
# Train/test split, data loaders and model setup

from sklearn.model_selection import train_test_split

# Split row indices 80/20, stratified on the encoded category so both parts keep the class mix.
train_idx, test_idx = train_test_split(np.arange(len(df)), test_size=0.2, stratify=y, random_state=42)

train_dataset = TransDataset(
    {k: v[train_idx] for k, v in encodings.items()},
    amount_scaled[train_idx],
    merchant_idx_all[train_idx],
    y[train_idx]
)

test_dataset = TransDataset(
    {k: v[test_idx] for k, v in encodings.items()},
    amount_scaled[test_idx],
    merchant_idx_all[test_idx],
    y[test_idx]
)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

num_classes = len(label_encoder.classes_)
# One embedding row more than the number of known merchants: index len(classes_) is the
# slot for unknown merchants. Sizing the table with len(classes_) alone would make that
# index out of range for nn.Embedding.
num_merchants = len(merchant_encoder.classes_) + 1

# Seed torch so the initialisation of the new layers is repeatable across runs.
torch.manual_seed(42)

model = TransClassifier(bert_model, num_merchants, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 10


In [None]:
# Train on the GPU when torch can see one, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Fields of each batch that are moved to the device, in the order they are unpacked below.
batch_fields = ('input_ids', 'attention_mask', 'amount', 'merchant', 'labels')

for epoch in range(num_epochs):
    model.train()
    batch_losses = []
    for batch in train_loader:
        input_ids, attention_mask, amount, merchant, labels = (
            batch[field].to(device) for field in batch_fields
        )

        # Standard step: clear old gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, amount, merchant)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    # Report the mean loss over the batches of this epoch.
    total_loss = sum(batch_losses)
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1, Loss: 0.9301
Epoch 2, Loss: 0.0992
Epoch 3, Loss: 0.0413
Epoch 4, Loss: 0.0227


In [None]:
# Measure accuracy on the held-out loader with dropout disabled and no gradient tracking.
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        amount = batch['amount'].to(device)
        merchant = batch['merchant'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask, amount, merchant)
        # The predicted class is the index of the largest logit in each row.
        preds = outputs.argmax(dim=1)
        hits = preds.eq(labels)
        correct = correct + hits.sum().item()
        total = total + labels.shape[0]

print(f"Test Accuracy: {correct/total:.4f}")


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Test Accuracy: 1.0000


In [None]:
# --- Prepare input ---
# A single raw transaction, written the way it would appear in the source sheet.
text = "Donated INR 80814.7 via UNICEF..."
new_merchant = "UNICEF"
raw_amount = 80814.7


def _clean_like_training(value):
    """Apply the same strip / lower-case / special-character removal used on the training columns."""
    cleaned = pd.Series([value]).str.strip().str.lower().str.replace(r'[^A-Za-z0-9 ]+', '', regex=True)
    return cleaned.iloc[0]


# The model was trained on cleaned strings and on the standardised amount, so the raw
# values must pass through the same transformations before they reach the model.
text_clean = _clean_like_training(text)
merchant_clean = _clean_like_training(new_merchant)

# A DataFrame with the fitted column name is passed so the scaler receives the same
# feature layout it was fitted on; the result has shape (1, 1).
amount_value = scaler.transform(pd.DataFrame({"amount": [raw_amount]})).astype(np.float32)

# Merchant index: position in merchant_encoder.classes_ (use merchant_encoder, not label_encoder).
# Unseen merchants go to the index just past the known ones, clamped to the model's embedding
# table so the lookup can never be out of range. Falling back to len(classes_) - 1 unconditionally
# would silently reuse a real merchant's embedding.
merchant_lookup = {name: idx for idx, name in enumerate(merchant_encoder.classes_)}
fallback_idx = min(len(merchant_encoder.classes_), model.merchant_emb.num_embeddings - 1)
merchant_value = merchant_lookup.get(merchant_clean, fallback_idx)

# Tokenize text
single_encoding = tokenizer(
    text_clean,
    truncation=True,
    padding='max_length',
    max_length=max_len,
    return_tensors='pt'
)

# Make tensors with batch dim
amount_tensor = torch.tensor(amount_value, dtype=torch.float32)            # (1,1)
merchant_tensor = torch.tensor([merchant_value], dtype=torch.long)         # (1,)

# --- Predict ---
model.eval()
with torch.no_grad():
    logits = model(
        single_encoding['input_ids'].to(device),
        single_encoding['attention_mask'].to(device),
        amount_tensor.to(device),
        merchant_tensor.to(device)
    )

# Translate the winning logit index back to its category name.
pred_idx = torch.argmax(logits, dim=1).cpu().numpy()
pred_category = label_encoder.inverse_transform(pred_idx)
print("Predicted Category:", pred_category)
