In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

In [4]:
# Environment Setup
# -----------------------------
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Seed both RNGs used below so data generation, weight init and shuffling repeat.
torch.manual_seed(42)
np.random.seed(42)

# -----------------------------
# Preprocessing
# -----------------------------
# Generate random data (e.g., 1000 samples, sequence length 10, feature dimension 8)
# NOTE: the labels are drawn independently of X, so there is no signal to learn;
# test accuracy close to 1 / num_classes is the expected outcome.
num_samples = 1000
sequence_length = 10
feature_dim = 8
num_classes = 3

X = np.random.rand(num_samples, sequence_length, feature_dim).astype(np.float32)
y = np.random.randint(0, num_classes, size=(num_samples,)).astype(np.int64)

# -----------------------------
# Train-Test Split
# -----------------------------
# Split BEFORE fitting the scaler so that test-set statistics never leak into
# the preprocessing applied to the training data.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize data: per-feature statistics are estimated on the training rows only
# (all time steps flattened into rows), then applied unchanged to the test rows.
scaler = StandardScaler()
scaler.fit(X_train_raw.reshape(-1, feature_dim))

# The explicit float32 cast keeps the arrays compatible with the float32 model
# weights regardless of what dtype the scaler returns.
X_train = (
    scaler.transform(X_train_raw.reshape(-1, feature_dim))
    .reshape(X_train_raw.shape)
    .astype(np.float32, copy=False)
)
X_test = (
    scaler.transform(X_test_raw.reshape(-1, feature_dim))
    .reshape(X_test_raw.shape)
    .astype(np.float32, copy=False)
)

# Kept for any later cell that still refers to the full scaled array; it now
# uses the training-set statistics as well.
X_reshaped = X.reshape(-1, feature_dim)
X_scaled = (
    scaler.transform(X_reshaped)
    .reshape(num_samples, sequence_length, feature_dim)
    .astype(np.float32, copy=False)
)

train_dataset = TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
test_dataset = TensorDataset(torch.tensor(X_test), torch.tensor(y_test))

# Only the training loader shuffles; evaluation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [5]:
# Train Base Model (Standard Transformer)
# -----------------------------
class TransformerClassifier(nn.Module):
    """Sequence classifier built on a stock Transformer encoder.

    Pipeline: per-timestep linear embedding -> ``nn.TransformerEncoder`` ->
    mean over the sequence axis -> linear classification head.

    No positional encoding is added anywhere in this module.

    Args:
        feature_dim: Size of each input timestep vector.
        num_classes: Number of output logits.
        num_heads: Attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        hidden_dim: Model width (``d_model``) used by the encoder.
    """

    def __init__(self, feature_dim, num_classes, num_heads=2, num_layers=2, hidden_dim=64):
        super().__init__()
        # Projects raw features into the encoder's working dimension.
        self.embedding = nn.Linear(in_features=feature_dim, out_features=hidden_dim)
        # batch_first=True: tensors are laid out as (batch, sequence, feature).
        layer_template = nn.TransformerEncoderLayer(
            d_model=hidden_dim,
            nhead=num_heads,
            batch_first=True,
        )
        self.transformer_encoder = nn.TransformerEncoder(layer_template, num_layers=num_layers)
        # Maps the pooled representation to class logits.
        self.fc = nn.Linear(in_features=hidden_dim, out_features=num_classes)

    def forward(self, x):
        """Return class logits for a batch of sequences.

        Args:
            x: Input tensor whose last dimension is ``feature_dim`` and whose
                dimension 1 is the sequence axis.

        Returns:
            Tensor of logits with ``num_classes`` entries per sample.
        """
        tokens = self.embedding(x)
        encoded = self.transformer_encoder(tokens)
        # Collapse the sequence axis by averaging all timesteps.
        pooled = encoded.mean(dim=1)
        logits = self.fc(pooled)
        return logits

# Build the classifier, place it on the selected device, and pair it with a
# cross-entropy objective and an Adam optimizer.
model = TransformerClassifier(feature_dim=feature_dim, num_classes=num_classes)
model = model.to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Training loop
def train(model, loader, criterion, optimizer):
    """Run one training epoch over ``loader`` and report the mean loss.

    The model is switched to training mode, and each batch is moved to the
    device the model's parameters live on before the forward/backward pass.

    Args:
        model: Module to optimize.
        loader: Iterable yielding ``(data, labels)`` tensor pairs.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer holding ``model``'s parameters.

    Returns:
        float: Batch losses averaged with batch-size weights. This equals the
        per-sample mean when ``criterion`` averages over the batch, as the
        default ``nn.CrossEntropyLoss`` does. ``nan`` if ``loader`` yields no
        batches.
    """
    model.train()

    # Follow the model's own placement instead of relying on a notebook global.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    running_loss, seen = 0.0, 0
    for data, labels in loader:
        data, labels = data.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch does not skew the mean.
        batch_size = labels.size(0)
        running_loss += loss.item() * batch_size
        seen += batch_size

    return running_loss / seen if seen else float("nan")

# Base training: five full passes over the training set (deliberately short).
for epoch in range(5):
    train(model=model, loader=train_loader, criterion=criterion, optimizer=optimizer)

In [6]:
# Planning (Simulated Experience Using Attention Weights)
# -----------------------------
# Access attention weights by calling the first encoder layer's self-attention module directly (for demonstration)
def get_attention_weights(model, sample_input):
    """Return the first encoder layer's self-attention weights for an input.

    The input is embedded with ``model.embedding`` and fed as query, key and
    value to ``model.transformer_encoder.layers[0].self_attn``. The attention
    module is put in eval mode for the call, because in training mode its
    dropout is applied to the attention weights and the returned rows would
    be noisy and would not sum to 1. Its previous mode is restored afterwards.

    Args:
        model: Module exposing ``embedding`` and ``transformer_encoder.layers``.
        sample_input: Tensor accepted by ``model.embedding``, already on the
            model's device.

    Returns:
        numpy.ndarray: Attention weights as returned by the attention module
        (averaged over heads by PyTorch's default), copied to the CPU.
    """
    # Access the multi-head attention module directly
    attn_layer = model.transformer_encoder.layers[0].self_attn

    was_training = attn_layer.training
    attn_layer.eval()  # disable attention dropout for the inspection
    try:
        with torch.no_grad():
            embedded = model.embedding(sample_input)
            _, attn_weights = attn_layer(embedded, embedded, embedded, need_weights=True)
    finally:
        # Leave the module in whatever mode the caller had it in.
        attn_layer.train(was_training)

    return attn_weights.cpu().numpy()

# Inspect the attention pattern for the first held-out sequence.
sample_input = torch.tensor(X_test[:1], device=DEVICE)
attention_weights = get_attention_weights(model, sample_input)
print("Attention Weights Shape:", attention_weights.shape)

# -----------------------------
# Fine-Tune Model
# -----------------------------
# A fresh Adam instance with a smaller step size; three more passes over the
# same training loader.
optimizer = optim.Adam(params=model.parameters(), lr=5e-4)
for epoch in range(3):
    train(model=model, loader=train_loader, criterion=criterion, optimizer=optimizer)

# -----------------------------
# Evaluate
# -----------------------------
def evaluate(model, loader):
    """Compute classification accuracy of ``model`` over ``loader``.

    The model is switched to eval mode (and left there), gradients are
    disabled, and each batch is moved to the device the model's parameters
    live on. The predicted class is the argmax over dimension 1 of the
    model output.

    Args:
        model: Module producing per-class scores for each sample.
        loader: Iterable yielding ``(data, labels)`` tensor pairs.

    Returns:
        float: Fraction of correctly classified samples, or ``nan`` when
        ``loader`` yields no samples (instead of dividing by zero).
    """
    model.eval()

    # Follow the model's own placement instead of relying on a notebook global.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    correct, total = 0, 0
    with torch.no_grad():
        for data, labels in loader:
            data, labels = data.to(device), labels.to(device)
            outputs = model(data)
            predictions = outputs.argmax(dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    if total == 0:
        # Accuracy is undefined without samples.
        return float("nan")
    return correct / total

# Accuracy of the fine-tuned model on the held-out split.
accuracy = evaluate(model=model, loader=test_loader)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# -----------------------------
# Deploy Policy
# -----------------------------
# Persist only the learned parameters (the state dict), written to a path
# relative to the current working directory.
model_path = "standard_transformer_model.pth"
torch.save(obj=model.state_dict(), f=model_path)
print(f"Model saved to {model_path}")

Attention Weights Shape: (1, 10, 10)
Test Accuracy: 33.00%
Model saved to standard_transformer_model.pth
