### Goals
1. Generate sequences from transaction data
2. Create random masks for MLM
3. Create Dataloader
4. Encoder style transformer
5. Training Loop
6. Downstream adaptation for other tasks

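### Note: this notebook mixes two example pipelines (a DataFrame-based one with sequences of 12 transactions and a simulated-data one with sequences of 128), so not every cell is meant to run with every other cell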

### Prep Data

In [None]:
# Let's say your DataFrame looks like this.
# The `[...]` lists are placeholders to be replaced with real column values,
# and `pd` (pandas) must already be imported before this cell runs.
df = pd.DataFrame({
    "user_id": [...],
    "timestamp": [...],  # Ensure it's sorted
    "merchant_id": [...],
    "txn_type": [...],
    "country": [...],
    "amount": [...]
})

1. Group by user_id
2. Sort by timestamp
3. Chunk into sequences of max length 12
4. Pad shorter sequences with -999
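5. Optionally mask random positions for MLM: masked inputs are replaced with MASK_TOKEN, and labels are set to -100 at every position that should not contribute to the loss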

In [None]:
import pandas as pd
import numpy as np

SEQ_LEN = 12
PAD_TOKEN = -999
MASK_TOKEN = -100  # if needed for MLM


def create_sequences(df, categorical_cols, numerical_cols, seq_len=None, pad_token=None):
    """Chunk each user's transactions into fixed-length, padded sequences.

    Rows are sorted by ``user_id`` and ``timestamp``; every user's rows are
    then cut into consecutive chunks of ``seq_len`` rows, and the last chunk
    of each user is right-padded with ``pad_token``.

    Args:
        df: DataFrame with ``user_id`` and ``timestamp`` columns plus every
            column named in ``categorical_cols`` and ``numerical_cols``.
        categorical_cols: Names of the categorical feature columns.
        numerical_cols: Names of the numerical feature columns.
        seq_len: Length of each output sequence. Defaults to the module-level
            ``SEQ_LEN``.
        pad_token: Value appended to short chunks (used for categorical and
            numerical columns alike). Defaults to the module-level
            ``PAD_TOKEN``.

    Returns:
        A DataFrame with one row per sequence and one column per feature;
        each cell holds a list of exactly ``seq_len`` values. An empty input
        yields an empty frame that still carries the feature columns.

    Raises:
        ValueError: If ``seq_len`` is not positive.
    """
    # Resolve the defaults at call time so later changes to the module
    # constants are still honoured.
    if seq_len is None:
        seq_len = SEQ_LEN
    if pad_token is None:
        pad_token = PAD_TOKEN
    if seq_len <= 0:
        raise ValueError(f"seq_len must be positive, got {seq_len}")

    # De-duplicate while keeping order, so a column listed in both groups
    # yields a single output column (as it would with plain dict keys).
    columns = list(dict.fromkeys(list(categorical_cols) + list(numerical_cols)))

    ordered = df.sort_values(["user_id", "timestamp"])

    sequences = []
    for _, user_df in ordered.groupby("user_id"):
        for start in range(0, len(user_df), seq_len):
            chunk = user_df.iloc[start:start + seq_len]
            padding = [pad_token] * (seq_len - len(chunk))
            sequences.append({col: chunk[col].tolist() + padding for col in columns})

    # Passing `columns` keeps the schema stable even when no sequence was built.
    return pd.DataFrame(sequences, columns=columns)


In [None]:
# Feature columns: categorical ones are later converted to integer tensors,
# numerical ones to float tensors.
categorical_cols = ["merchant_id", "txn_type", "country"]
numerical_cols = ["amount"]

# Build the padded per-user sequences from the raw transaction frame `df`.
seq_df = create_sequences(df, categorical_cols, numerical_cols)

# You get a DataFrame where each row = one sequence of 12 txns

In [None]:
# Convert to Tensor Dataset
import torch

def to_tensor_dataset(seq_df, categorical_cols, numerical_cols):
    """Turn the list-valued columns of ``seq_df`` into a dict of tensors.

    Categorical columns become ``torch.long`` tensors and numerical columns
    become ``torch.float`` tensors. The returned dict is keyed by column name.
    """
    # Pair every requested column with its target dtype, categorical first.
    column_dtypes = [(name, torch.long) for name in categorical_cols]
    column_dtypes += [(name, torch.float) for name in numerical_cols]

    return {
        name: torch.tensor(seq_df[name].tolist(), dtype=dtype)
        for name, dtype in column_dtypes
    }


In [None]:
# Optional: Add Masked Positions for MLM
def add_masking(input_tensor, mask_prob=0.15, mask_token=None, pad_token=None):
    """Randomly mask tokens for masked-language-model style training.

    Each non-padding position is selected independently with probability
    ``mask_prob``. Selected positions are replaced by ``mask_token`` in the
    returned inputs and keep their original value in the labels; all other
    label positions are set to -100 so the loss can ignore them.

    The input tensor is left untouched; a new masked tensor is returned.

    Args:
        input_tensor: Integer tensor of token ids.
        mask_prob: Per-position probability of being masked.
        mask_token: Id written into masked input positions. Defaults to the
            module-level ``MASK_TOKEN``.
        pad_token: Id that marks padding; such positions are never masked and
            never become targets. Defaults to the module-level ``PAD_TOKEN``.

    Returns:
        Tuple ``(masked_inputs, labels)``, both with the shape of
        ``input_tensor``.
    """
    # Resolve defaults at call time so the current module constants are used.
    if mask_token is None:
        mask_token = MASK_TOKEN
    if pad_token is None:
        pad_token = PAD_TOKEN

    selected = torch.rand(input_tensor.shape, device=input_tensor.device) < mask_prob
    # Padding carries no information to predict, so it must not become a target.
    selected &= input_tensor != pad_token

    labels = input_tensor.masked_fill(~selected, -100)  # no loss on unmasked tokens
    # masked_fill returns a copy, so the caller's tensor is not modified.
    masked_inputs = input_tensor.masked_fill(selected, mask_token)
    return masked_inputs, labels


In [None]:
# Final output looks like this
# (illustrative only: `...` stands for elided values, and the trailing -999
# entries are padding)
{
    "merchant_id": [12, 45, 3, ..., -999, -999],
    "txn_type":    [1, 0, 2, ..., -999, -999],
    "country":     [5, 5, 3, ..., -999, -999],
    "amount":      [100.0, 59.0, 22.5, ..., -999.0, -999.0]
}


### Create Dataset and Dataloader

In [None]:
# STEP 1: Convert Sample Data to Tensor Dataset
# Column groups consumed by TxnDataset: categorical columns are masked and get
# labels, numerical columns are passed through as float inputs.
categorical_cols = ["merchant_id", "txn_type", "country"]
numerical_cols = ["amount"]


import torch
from torch.utils.data import Dataset

class TxnDataset(Dataset):
    """Dataset of fixed-length transaction sequences with MLM masking.

    Every categorical column is converted to a ``torch.long`` tensor and
    masked once at construction time; every numerical column is converted to
    a ``torch.float`` tensor and passed through unmasked (it gets no labels).

    Args:
        df: DataFrame whose cells hold equal-length lists, one row per
            sequence.
        categorical_cols: Columns to mask and predict.
        numerical_cols: Columns provided as float inputs only.
        mask_prob: Per-position probability of masking a categorical token.
        pad_token: Id that marks padding; padded positions are never masked
            and never become prediction targets.
        mask_token: Id written into masked input positions.
    """

    def __init__(self, df, categorical_cols, numerical_cols, mask_prob=0.15,
                 pad_token=-999, mask_token=-100):
        self.categorical_cols = categorical_cols
        self.numerical_cols = numerical_cols
        self.mask_prob = mask_prob
        self.pad_token = pad_token
        self.mask_token = mask_token
        self.inputs = {}
        self.labels = {}

        for col in categorical_cols:
            ids = torch.tensor(df[col].tolist(), dtype=torch.long)
            self.inputs[col], self.labels[col] = self.add_masking(ids)

        for col in numerical_cols:
            self.inputs[col] = torch.tensor(df[col].tolist(), dtype=torch.float)

    def __len__(self):
        """Return the number of sequences (0 when no column was provided)."""
        first = next(iter(self.inputs.values()), None)
        return 0 if first is None else len(first)

    def __getitem__(self, idx):
        """Return ``(inputs, labels)`` dicts for the sequence at ``idx``."""
        return (
            {k: v[idx] for k, v in self.inputs.items()},
            {k: self.labels[k][idx] for k in self.labels}
        )

    def add_masking(self, x):
        """Mask random non-padding positions of ``x``.

        Returns:
            Tuple ``(masked, labels)``: ``masked`` has ``mask_token`` at the
            selected positions; ``labels`` keeps the original ids there and
            is -100 everywhere else (no loss where not masked).
        """
        selected = torch.rand(x.shape, device=x.device) < self.mask_prob
        # A padded position must not become a target: its id is not a valid
        # class index for the loss.
        selected &= x != self.pad_token
        labels = x.masked_fill(~selected, -100)
        masked = x.masked_fill(selected, self.mask_token)
        return masked, labels


In [None]:
# STEP 2: Create DataLoader
# Masking happens inside TxnDataset's constructor, so the same masked
# positions are served on every pass over `dataset`.
dataset = TxnDataset(seq_df, categorical_cols, numerical_cols)
# Each batch is an (inputs, labels) pair of dicts keyed by column name.
dataloader = torch.utils.data.DataLoader(dataset, batch_size=16, shuffle=True)

### Simulated Data

In [None]:
import numpy as np
import torch

# Vocabulary sizes of the simulated categorical fields
# (merchant_id, txn_type, country).
num_merchants, num_txn_types, num_countries = 500, 5, 20

# The amount is simulated in bucketed (integer) form.
num_amount_buckets = 100

# 10k simulated users with 128 transactions each.
num_sequences, seq_len = 10000, 128


def simulate_transaction_sequence():
    """Draw one random sequence of ``seq_len`` ids for every field.

    Returns a dict mapping each field name to a NumPy integer array whose
    values are uniform in ``[0, field_size)``.
    """
    field_sizes = (
        ('merchant_id', num_merchants),
        ('txn_type', num_txn_types),
        ('country', num_countries),
        ('amount_bucket', num_amount_buckets),
    )
    return {
        field: np.random.randint(0, size, seq_len)
        for field, size in field_sizes
    }


data = [simulate_transaction_sequence() for _ in range(num_sequences)]


In [None]:
# Dataset class with masking
from torch.utils.data import Dataset

MASK_TOKEN = -1  # Special ID to represent [MASK]


class BankTxnDataset(Dataset):
    """Dataset that applies fresh random MLM masking on every item access.

    Args:
        data: Sequence of dicts mapping a field name to a 1-D sequence of
            integer ids.
        mask_prob: Per-position probability of masking a token.
        mask_token: Id written into masked input positions. Either a single
            int used for every field, or a dict mapping field name to that
            field's mask id (e.g. a dedicated index past the field's
            vocabulary). Defaults to the module-level ``MASK_TOKEN``; note
            that a negative id cannot be looked up by ``nn.Embedding``.
    """

    def __init__(self, data, mask_prob=0.15, mask_token=None):
        self.data = data
        self.mask_prob = mask_prob
        self.mask_token = mask_token

    def __len__(self):
        """Return the number of sequences."""
        return len(self.data)

    def _mask_id(self, field):
        """Return the id used to mask inputs of ``field``."""
        if self.mask_token is None:
            return MASK_TOKEN
        if isinstance(self.mask_token, dict):
            # Fields without an explicit entry fall back to the global id.
            return self.mask_token.get(field, MASK_TOKEN)
        return self.mask_token

    def __getitem__(self, idx):
        """Return ``(inputs, labels)`` dicts of ``torch.long`` tensors.

        Labels hold the original id at masked positions and -100 elsewhere
        (for loss calculation); inputs hold the mask id at masked positions
        and the original id elsewhere.
        """
        item = self.data[idx]
        inputs = {}
        labels = {}

        for field, values in item.items():
            values = np.asarray(values)
            # The mask is re-drawn on every access, independently per field.
            mask = np.random.rand(len(values)) < self.mask_prob

            labels[field] = torch.tensor(np.where(mask, values, -100), dtype=torch.long)
            inputs[field] = torch.tensor(
                np.where(mask, self._mask_id(field), values), dtype=torch.long
            )

        return inputs, labels


### Model

In [None]:
# Define model
import torch.nn as nn

class TransactionBERT(nn.Module):
    """Encoder-only transformer over multi-field transaction sequences.

    Each field has its own embedding table; the field embeddings of a
    position are summed together with a learned positional embedding, passed
    through a transformer encoder, and projected by one linear head per field.

    Args:
        config: Dict with keys ``d_model``, ``nhead``, ``ff_dim``,
            ``num_layers``, ``max_seq_len``, ``num_merchants``,
            ``num_txn_types``, ``num_countries`` and ``num_amount_buckets``.
            Every vocabulary is allocated two extra rows beyond its size.
    """

    def __init__(self, config):
        super().__init__()
        d_model = config['d_model']

        self.embeddings = nn.ModuleDict({
            'merchant_id': nn.Embedding(config['num_merchants'] + 2, d_model),
            'txn_type': nn.Embedding(config['num_txn_types'] + 2, d_model),
            'country': nn.Embedding(config['num_countries'] + 2, d_model),
            'amount_bucket': nn.Embedding(config['num_amount_buckets'] + 2, d_model),
        })

        self.pos_emb = nn.Embedding(config['max_seq_len'], d_model)

        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=config['nhead'],
                dim_feedforward=config['ff_dim'],
                dropout=0.1,
                # forward() builds (batch, seq, d_model) tensors; without this
                # flag attention would run across the batch dimension.
                batch_first=True,
            ),
            num_layers=config['num_layers']
        )

        # Size each head from its embedding table rather than from a config
        # key derived from the field name ('merchant_id' and 'country' do not
        # pluralise into the keys the config actually uses).
        self.output_heads = nn.ModuleDict({
            k: nn.Linear(d_model, emb.num_embeddings)
            for k, emb in self.embeddings.items()
        })

    def forward(self, x):
        """Encode a batch and return per-field logits.

        Args:
            x: Dict mapping field name to a ``(batch, seq_len)`` integer
                tensor. Negative ids (mask/pad sentinels) are looked up in
                the last reserved row of the field's embedding table.

        Returns:
            Dict mapping each field name to logits of shape
            ``(batch, seq_len, vocab_size + 2)``.
        """
        first = next(iter(x.values()))
        batch_size, seq_len = first.shape

        # Positional encoding
        pos = torch.arange(seq_len, device=first.device).unsqueeze(0)
        pos_emb = self.pos_emb(pos)

        # Sum field-wise embeddings
        field_embs = []
        for k, v in x.items():
            table = self.embeddings[k]
            # nn.Embedding cannot index negative ids, so sentinels such as -1
            # are redirected to the last of the two extra rows.
            # NOTE(review): all negative sentinels share that row; confirm
            # whether mask and padding should get separate rows.
            ids = v.masked_fill(v < 0, table.num_embeddings - 1)
            field_embs.append(table(ids))
        x_emb = sum(field_embs) + pos_emb

        # Transformer encoding
        encoded = self.encoder(x_emb)

        # Output heads for each field
        return {k: head(encoded) for k, head in self.output_heads.items()}



In [None]:
## Training Loop
from torch.utils.data import DataLoader
import torch.nn.functional as F
from tqdm import tqdm

def _masked_cross_entropy(logits, targets):
    """Mean cross-entropy over targets that are not -100 (0 if there are none)."""
    flat_logits = logits.reshape(-1, logits.size(-1))
    flat_targets = targets.reshape(-1)
    summed = F.cross_entropy(
        flat_logits, flat_targets, ignore_index=-100, reduction='sum'
    )
    # The built-in mean reduction yields NaN when every target is ignored;
    # dividing by at least 1 gives a zero loss for such a field instead.
    num_valid = (flat_targets != -100).sum().clamp(min=1)
    return summed / num_valid


def train(model, dataloader, optimizer, device, epochs=3):
    """Train ``model`` on masked-token prediction.

    For every batch the per-field cross-entropy losses are summed, and one
    optimizer step is taken on the total.

    Args:
        model: Module mapping a dict of input tensors to a dict of logits.
        dataloader: Iterable of ``(inputs, labels)`` dict pairs; labels use
            -100 at positions that must not contribute to the loss.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device the model and every batch are moved to.
        epochs: Number of passes over ``dataloader``.

    Returns:
        List with the mean batch loss of each epoch (also printed).
    """
    model.train()
    model.to(device)

    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for inputs, labels in tqdm(dataloader):
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = {k: v.to(device) for k, v in labels.items()}

            outputs = model(inputs)
            loss = sum(_masked_cross_entropy(outputs[k], labels[k]) for k in outputs)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

        # Guard the average so an empty dataloader does not divide by zero.
        mean_loss = total_loss / max(num_batches, 1)
        epoch_losses.append(mean_loss)
        print(f"Epoch {epoch+1}: Loss = {mean_loss:.4f}")

    return epoch_losses


In [None]:
### Training Setup
# Hyper-parameters and vocabulary sizes read by TransactionBERT.
config = {
    'd_model': 64,
    'nhead': 4,
    'ff_dim': 128,
    'num_layers': 3,
    # Sizes the positional embedding table, so it must be at least as large
    # as the length of the sequences fed to the model.
    'max_seq_len': 128,
    'num_merchants': num_merchants,
    'num_txn_types': num_txn_types,
    'num_countries': num_countries,
    'num_amount_buckets': num_amount_buckets
}

# The simulated sequences are masked on the fly by BankTxnDataset.
dataset = BankTxnDataset(data)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

model = TransactionBERT(config)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(model, dataloader, optimizer, device)


In [None]:
# STEP 3: Instantiate Model
# Configuration for the 12-step DataFrame pipeline. The key names must be the
# ones TransactionBERT.__init__ reads ('nhead', 'ff_dim', 'num_merchants',
# 'num_txn_types', 'num_countries', 'num_amount_buckets'); other names raise
# KeyError when the model is built.
# NOTE(review): 'ff_dim' and 'num_amount_buckets' had no value in this cell;
# the values below mirror the simulated-data configuration -- confirm.
# NOTE(review): TxnDataset yields a float "amount" input, but the model only
# has an 'amount_bucket' embedding -- confirm how amounts should be fed.
config = {
    "d_model": 64,
    "nhead": 4,
    "ff_dim": 128,
    "num_layers": 2,
    "max_seq_len": 12,
    "num_merchants": 200,
    "num_txn_types": 10,
    "num_countries": 20,
    "num_amount_buckets": 100,
}
model = TransactionBERT(config)


In [None]:
# STEP 4: Training Loop
import torch.nn as nn
import torch.optim as optim

optimizer = optim.Adam(model.parameters(), lr=1e-3)
# Label positions equal to -100 are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=-100)

# Batches have to live on the same device as the model's parameters.
model_device = next(model.parameters()).device

model.train()

for epoch in range(3):
    total_loss = 0.0
    for batch_x, batch_y in dataloader:
        batch_x = {k: v.to(model_device) for k, v in batch_x.items()}
        batch_y = {k: v.to(model_device) for k, v in batch_y.items()}

        optimizer.zero_grad()
        outputs = model(batch_x)  # outputs is a dict

        field_losses = []
        for field, targets in batch_y.items():
            tgt = targets.reshape(-1)
            # With every target ignored the mean-reduced loss is NaN, which
            # would poison the weights on backward; skip such a field.
            if not (tgt != -100).any():
                continue
            out = outputs[field].reshape(-1, outputs[field].shape[-1])
            field_losses.append(criterion(out, tgt))

        # Nothing was masked anywhere in this batch: no gradient to apply.
        if not field_losses:
            continue

        loss = sum(field_losses)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Reports the summed (not averaged) batch loss of the epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


In [None]:
# STEP 5: Inspect Output
# Switch to evaluation mode so dropout is disabled for the prediction.
model.eval()
# Each batch is an (inputs, labels) pair; [0] keeps only the inputs.
sample = next(iter(dataloader))[0]
# No gradients are needed for inference.
with torch.no_grad():
    output = model(sample)

# argmax over the last (class) dimension gives the predicted id per position.
print("Predicted merchant_id:", output["merchant_id"].argmax(-1))


* Make sure any unknown or padded tokens in your vocab are mapped to index 0 or PAD_ID consistently.

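* Use -100 only in the labels, where it tells the loss to ignore a position. Masked inputs need their own valid, non-negative embedding index (e.g., MASK_ID = vocab_size + 1) if you want to train a true MLM like BERT, because an embedding table cannot look up a negative id.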