p=97, wd=0.0, dp=0.0, bs=512: validation accuracy (va) showed no improvement (没有起色)

In [9]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader, random_split
import os
import time
# Report whether PyTorch can see a CUDA device (informational only).
print(torch.cuda.is_available())
# Tell Intel OpenMP to tolerate a duplicate runtime being loaded instead of
# aborting the process.
# NOTE(review): this is set after torch/numpy were imported - confirm it
# still takes effect in the target environment.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

# 1. Data Generation
class ModularAdditionDataset(Dataset):
    """All ``p * p`` problems ``x + y (mod p)`` as one-hot token sequences.

    Every sample is the 4-token sequence ``[x, '+', y, '=']``.  Each token is
    a one-hot row of width ``embed_dim``: the numbers use indices
    ``0 .. p - 1``, ``'+'`` uses index ``p`` and ``'='`` uses index ``p + 1``.
    The target is the integer ``(x + y) % p``; it is *not* part of the input.

    Args:
        p: Modulus; operands range over ``0 .. p - 1``.
        embed_dim: Width of every one-hot row. Must be at least ``p + 2`` so
            that the two operator tokens fit.

    Raises:
        ValueError: If ``embed_dim`` is smaller than ``p + 2``.
    """

    def __init__(self, p, embed_dim):
        # Indices run up to p + 1, so p + 2 slots are both necessary and
        # sufficient. Raise explicitly: an ``assert`` disappears under
        # ``python -O`` and one_hot would then fail with an obscure error.
        if embed_dim < p + 2:
            raise ValueError(
                f"Embedding dimension must be at least p + 2 = {p + 2}, "
                f"got {embed_dim}"
            )
        self.p = p
        self.embed_dim = embed_dim

        # Enumerate every (x, y) pair; with 'ij' indexing x is the slow axis,
        # so after flattening x == idx // p and y == idx % p.
        values = torch.arange(p)
        x_idx, y_idx = torch.meshgrid(values, values, indexing='ij')
        x_idx = x_idx.reshape(-1)
        y_idx = y_idx.reshape(-1)
        self.result = (x_idx + y_idx) % p

        # One-hot encodings of the operands (integer dtype, shape (p*p, embed_dim)).
        self.x = nn.functional.one_hot(x_idx, embed_dim)
        self.y = nn.functional.one_hot(y_idx, embed_dim)

        # One-hot encodings of '+' and '=' (integer dtype, shape (1, embed_dim)).
        self.plus = nn.functional.one_hot(torch.tensor([p]), embed_dim)
        self.equals = nn.functional.one_hot(torch.tensor([p + 1]), embed_dim)

        # The operator rows are identical for every sample: convert them to
        # float once instead of on every __getitem__ call.
        self._plus_row = self.plus.squeeze(0).float()
        self._equals_row = self.equals.squeeze(0).float()

    def __len__(self):
        return len(self.result)

    def __getitem__(self, idx):
        """Return ``(tokens, target)`` for sample ``idx``.

        ``tokens`` is a float tensor of shape ``(4, embed_dim)`` holding the
        rows ``[x, '+', y, '=']``; ``target`` is the scalar ``(x + y) % p``.
        """
        input_vector = torch.stack([
            self.x[idx].float(),
            self._plus_row,
            self.y[idx].float(),
            self._equals_row,
        ])
        return input_vector, self.result[idx]

# 2. Transformer Model
class DecoderBlock(nn.Module):
    """Post-norm transformer block: causal self-attention, then an MLP.

    Both sub-layers are wrapped as ``LayerNorm(residual + sublayer(residual))``.
    The input is sequence-first, ``(seq_len, batch, dim_model)``, which is
    the default layout of ``nn.MultiheadAttention``.
    """

    def __init__(self, dim_model: int, n_heads: int):
        super().__init__()

        inner_dim = dim_model * 4

        self.self_attn = nn.MultiheadAttention(dim_model, n_heads)
        self.self_attn_norm = nn.LayerNorm(dim_model)
        self.ffn = nn.Sequential(
            nn.Linear(dim_model, inner_dim),
            nn.GELU(),
            nn.Linear(inner_dim, dim_model),
            nn.Dropout(0.0),  # dropout hook; p=0.0 makes it a no-op
        )
        self.ffn_norm = nn.LayerNorm(dim_model)

    def forward(self, x):
        """Apply the block to ``x`` and return a tensor of the same shape."""
        seq_len = x.size(0)

        # Additive causal mask: -inf strictly above the diagonal (future
        # positions), 0 on and below it.
        causal_mask = torch.full(
            (seq_len, seq_len), float("-inf"), device=x.device, dtype=x.dtype
        ).triu(diagonal=1)

        attended, _ = self.self_attn(x, x, x, attn_mask=causal_mask)
        hidden = self.self_attn_norm(x + attended)

        return self.ffn_norm(hidden + self.ffn(hidden))

class SimpleTransformer(nn.Module):
    """Stack of ``DecoderBlock`` layers with a LayerNorm + linear read-out.

    The model takes already-embedded tokens of shape
    ``(batch, seq_len, dim_model)``, adds a sinusoidal positional encoding
    and returns the logits of the last sequence position,
    shape ``(batch, num_tokens)``.
    """

    def __init__(self, num_layers: int, dim_model: int, num_heads: int, num_tokens: int):
        super().__init__()

        blocks = [DecoderBlock(dim_model, num_heads) for _ in range(num_layers)]
        self.model = nn.Sequential(
            *blocks,
            nn.LayerNorm(dim_model),
            nn.Linear(dim_model, num_tokens),
        )

    def _position_encoding(self, seq_len, dim_model):
        """Return the sinusoidal table as a float32 tensor ``(1, seq_len, dim_model)``."""
        positions = np.arange(seq_len).reshape(-1, 1)
        channels = np.arange(dim_model).reshape(1, -1)

        # Channels 2k and 2k+1 share the frequency 1 / 10000^(2k / dim_model).
        exponents = (2 * (channels // 2)) / dim_model
        table = positions * (1 / np.power(10000, exponents))

        # Even channels carry the sine, odd channels the cosine.
        table[:, 0::2] = np.sin(table[:, 0::2])
        table[:, 1::2] = np.cos(table[:, 1::2])

        return torch.tensor(table[np.newaxis, ...], dtype=torch.float32)

    def forward(self, x):
        # x is already embedded; only the positional embedding has to be added.
        positional = self._position_encoding(x.size(1), x.size(2)).to(x.device)

        # The decoder blocks expect (seq_len, batch_size, dim_model).
        seq_first = (x + positional).permute(1, 0, 2)

        # Keep only the last position: (batch_size, num_tokens).
        return self.model(seq_first)[-1]
    

# 3. Training Loop
def _evaluate_accuracy(model, data_loader, device):
    """Return the fraction of samples in ``data_loader`` the model classifies correctly.

    The model is put in eval mode for the duration of the call and its
    previous train/eval mode is restored afterwards. Returns ``nan`` when the
    loader yields no samples, instead of dividing by zero.
    """
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    try:
        with torch.no_grad():
            for inputs, targets in data_loader:
                inputs = inputs.to(device)
                targets = targets.to(device)
                predictions = model(inputs).argmax(dim=1)
                correct += (predictions == targets).sum().item()
                total += targets.size(0)
    finally:
        model.train(was_training)
    return correct / total if total else float("nan")


def train_and_evaluate(model, train_loader, val_loader, optimizer, criterion, num_epochs, device, save_path):
    """Train ``model`` and record metrics every 100 optimisation steps.

    Args:
        model: Module whose ``model[-1]`` is the final linear layer; its
            weight matrix is tracked for the "feature change" and
            "layer norm" metrics.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of ``(inputs, labels)`` validation batches.
        optimizer: Optimizer over the model parameters.
        criterion: Loss called as ``criterion(outputs, labels)``.
        num_epochs: Number of passes over ``train_loader``.
        device: Device the batches are moved to.
        save_path: Directory for checkpoints, written every 10 epochs
            (created on demand).

    Returns:
        ``(steps, train_accs, val_accs, feature_changes, layer_norms)``, one
        entry per logged step. ``train_accs`` is the accuracy on the current
        training batch (from the forward pass made before the optimizer
        step). ``feature_changes`` holds the relative change of the final
        layer's weights since the previous log, with a leading ``0`` for the
        first one. ``layer_norms`` holds the norm of those weights.
    """
    train_accs = []
    val_accs = []
    feature_changes = [0]  # placeholder for the first log (no previous weights)
    layer_norms = []  # norm of the final layer's weights at each log
    steps = []
    current_step = 0
    feature_change = 0.0
    prev_weights = None

    # Start the timer once. Resetting it at every epoch would make the
    # reported "time per 100 steps" cover only the tail of the current epoch.
    start_time = time.time()

    for epoch in range(num_epochs):
        model.train()
        for x, labels in train_loader:
            x = x.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(x)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            if current_step % 100 == 0:
                with torch.no_grad():
                    train_acc = (outputs.argmax(dim=1) == labels).float().mean().item()
                    val_acc = _evaluate_accuracy(model, val_loader, device)

                    train_accs.append(train_acc)
                    val_accs.append(val_acc)

                    head_weight = model.model[-1].weight

                    # Relative change of the final layer since the last log.
                    if prev_weights is not None:
                        feature_change = (
                            torch.norm(head_weight - prev_weights) / torch.norm(prev_weights)
                        ).item()
                        feature_changes.append(feature_change)
                    prev_weights = head_weight.detach().clone()

                    # Norm of the final layer's weights.
                    layer_norm = torch.norm(head_weight).item()
                    layer_norms.append(layer_norm)

                    steps.append(current_step)
                    end_time = time.time()
                    step100_duration = end_time - start_time
                    start_time = end_time

                    print(f'Step {current_step}, Train Acc: {train_acc:.3f}, Val Acc: {val_acc:.3f}, Feature Change: {feature_change:.3f}, Layer Norm: {layer_norm:.3f}, Time per 100 steps: {step100_duration:.2f}s')

            current_step += 1

        if (epoch + 1) % 10 == 0:
            # Make sure the target directory exists so a long run is not
            # lost to a missing folder at the first checkpoint.
            os.makedirs(save_path, exist_ok=True)
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'train_accs': train_accs,
                'val_accs': val_accs,
                'feature_changes': feature_changes,
                'layer_norms': layer_norms,
                'steps': steps
            }, os.path.join(save_path, f'checkpoint_epoch_{epoch+1}.pt'))

    return steps, train_accs, val_accs, feature_changes, layer_norms

# 修改 run_experiment 函数
def run_experiment(p=53, hidden_dim=128, num_heads=4, num_layers=2,
                  batch_size=512, lr=8e-4, weight_decay=0.0, training_fraction=0.3,
                  num_epochs=10000, device='cpu', save_path='checkpoints', seed=42):
    """Train a small transformer on addition modulo ``p`` and return its metrics.

    Args:
        p: Modulus of the addition task.
        hidden_dim: Model width; also the width of the one-hot input tokens.
        num_heads: Attention heads per decoder block.
        num_layers: Number of decoder blocks.
        batch_size: Batch size for both the train and validation loaders.
        lr: AdamW learning rate.
        weight_decay: AdamW weight decay.
        training_fraction: Fraction of all ``p * p`` problems used for
            training; the remainder is the validation set.
        num_epochs: Number of training epochs.
        device: Device the model and the batches are placed on.
        save_path: Directory for checkpoints (created if missing).
        seed: Seed of the generator used for the train/validation split only.

    Returns:
        ``(steps, train_accs, val_accs, feature_changes, layer_norms)`` as
        produced by ``train_and_evaluate``.

    Raises:
        ValueError: If ``training_fraction`` yields an empty or oversized
            training split.
    """
    os.makedirs(save_path, exist_ok=True)

    dataset = ModularAdditionDataset(p, hidden_dim)
    train_size = int(training_fraction * len(dataset))
    # Fail early with a clear message instead of an obscure sampler or
    # random_split error further down.
    if not 1 <= train_size <= len(dataset):
        raise ValueError(
            f"training_fraction={training_fraction} gives {train_size} training "
            f"samples out of {len(dataset)}; need between 1 and {len(dataset)}"
        )
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(
        dataset, [train_size, val_size], generator=torch.Generator().manual_seed(seed)
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # num_tokens = p + 2: one logit per token index (numbers, '+', '=').
    model = SimpleTransformer(num_layers=num_layers, dim_model=hidden_dim, num_heads=num_heads, num_tokens=p+2).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Hand the recorded curves back to the caller so they can be plotted or
    # inspected, rather than dropping them.
    return train_and_evaluate(model, train_loader, val_loader, optimizer, criterion, num_epochs, device, save_path)

 


False


In [10]:

# Entry point: run one experiment with the default hyper-parameters.
if __name__ == "__main__":
    run_experiment()

Step 0, Train Acc: 0.020, Val Acc: 0.016, Feature Change: 0.000, Layer Norm: 4.302, Time per 100 steps: 0.28s
