In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

In [47]:
# def generate_synthetic_data(n_samples=1000, features=5):
#     # generate features
#     X = torch.randn(n_samples, features)
#     # define an equation with variable level of importance
#     w0= 3
#     w1 = 2
#     w2 = 0.5
#     # Target: y = 3*x_0 + 2*x_1 + 0.5*x_2 + noise (features 3 and 4 are less important)
#     y = w0 * X[:, 0] + w1 * X[:, 1] + w2 * X[:, 2] + 0.1 * np.random.randn(n_samples)
#     X = X.to(torch.float32)
#     y = y.to(torch.float32)
#     y = y.reshape(-1, 1)
#     return X,y

def generate_synthetic_data(n_samples=1000, n_features=5):
    # Generate random features
    X = np.random.randn(n_samples, n_features)
    # Target: y = 3*x_0 + 2*x_1 + 0.5*x_2 + noise (features 3 and 4 are less important)
    y = 3 * X[:, 0] + 2 * X[:, 1] + 0.5 * X[:, 2] + 0.1 * np.random.randn(n_samples)
    return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32).reshape(-1, 1)

X, y= generate_synthetic_data(n_samples=1000, n_features=5)

# define a self attention layer

In [42]:
# define a self attention layer
class SelfAttention(nn.Module):
    def __init__(self, input_dim, attention_dim):
        super(SelfAttention, self).__init__()
        self.query = nn.Linear(in_features=input_dim, out_features=attention_dim)
        self.key = nn.Linear(in_features=input_dim, out_features=attention_dim)
        self.value = nn.Linear(in_features=input_dim, out_features=input_dim)
        self.scale = torch.sqrt(torch.tensor(attention_dim, dtype=torch.float32))

    def forward(self, x):
        # x: (batch_size, input_dim)
        queries = self.query(x)  # (batch_size, attention_dim)
        keys = self.key(x)       # (batch_size, attention_dim)
        values = self.value(x)   # (batch_size, input_dim)

        # compute attention score
        scores = torch.matmul(queries, keys.T)/self.scale # batch_size, batche_size
        attention_weights = torch.softmax(scores, dim=1) # batch_size, batche_size

        # Apply attention to values
        attended = torch.matmul(attention_weights, values)    # (batch_size, input_dim)
        return attended, attention_weights

# Define Neural Network with Attention

In [43]:
class AttentionNet(nn.Module):
    def __init__(self, input_dim, hidden_dim=16):
        super(AttentionNet, self).__init__()
        self.attention = SelfAttention(input_dim, attention_dim=8)
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=1)

    def forward(self, x):
        # apply attention
        attended, attention_weights = self.attention(x)
        # Combine attended features with original input (residual connection)
        x = x + attended
        # Feedforward layers
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)

        return x, attention_weights

# Training Function

In [44]:
def train_model(model,X, y, epochs=1000, lr = 0.001):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        Y_pred, _attention_weights = model(X)
        loss = criterion(Y_pred, y)
        loss.backward()
        optimizer.step()
        if epoch % 50 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

# main execution

In [49]:
def main():
    X, y= generate_synthetic_data(n_samples=1000, n_features=5)

    # Initialize model
    model = AttentionNet(input_dim=5)
    # Train model
    train_model(model=model, X=X, y=y, epochs=100)

    # evaluate and extract attention weight
    model.eval()
    with torch.no_grad():
        _Y_pred, attention_weights = model(X)
    print(f"Shape of attention weights: {attention_weights.shape}")
    # Compute average attention weights for each feature
    avg_attention = attention_weights.mean(dim=0).numpy()
    print(f"Shape of average attention weights: {avg_attention.shape}")
    print("\nAverage Attention Weights for Each Feature:")
    for i, weight in enumerate(avg_attention[:5]):  # Limit to first 5 for clarity
        print(f"Feature {i}: {weight:.4f}")

main()




Epoch [1/100], Loss: 14.6100
Epoch [51/100], Loss: 12.8845
Shape of attention weights: torch.Size([1000, 1000])
Shape of average attention weights: (1000,)

Average Attention Weights for Each Feature:
Feature 0: 0.0008
Feature 1: 0.0008
Feature 2: 0.0009
Feature 3: 0.0009
Feature 4: 0.0009
