In [None]:
!pip install dadaptation

Collecting dadaptation
  Downloading dadaptation-3.2.tar.gz (13 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: dadaptation
  Building wheel for dadaptation (pyproject.toml) ... [?25l[?25hdone
  Created wheel for dadaptation: filename=dadaptation-3.2-py3-none-any.whl size=23208 sha256=a4b2dff9699f130e05c1577cdbab0eff6ae3f4932508581acf244c90188e1177
  Stored in directory: /root/.cache/pip/wheels/d0/03/6d/feba04df15ef39d9ac4e3504058ac2a88fb2ef9183ba92b111
Successfully built dadaptation
Installing collected packages: dadaptation
Successfully installed dadaptation-3.2


In [None]:
from dadaptation.dadapt_adagrad import DAdaptAdaGrad
from dadaptation.dadapt_adam import DAdaptAdam
from dadaptation.dadapt_sgd import DAdaptSGD
from dadaptation.dadapt_adan import DAdaptAdan
from dadaptation.dadapt_lion import DAdaptLion

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
import matplotlib.pyplot as plt



# Wide Residual Block definition
class WideResNetBlock(nn.Module):
    """Residual unit made of two (BatchNorm -> ReLU -> 3x3 conv) stages plus a skip path.

    Args:
        in_channels: Channel count of the incoming feature map.
        out_channels: Channel count produced by both convolutions.
        stride: Stride of the first convolution and of the projection shortcut.
        drop_rate: Dropout probability applied between the two convolutions;
            dropout is skipped entirely unless this is positive.
    """

    def __init__(self, in_channels, out_channels, stride=1, drop_rate=0.0):
        super().__init__()
        conv3x3 = dict(kernel_size=3, padding=1, bias=False)

        self.bn1 = nn.BatchNorm2d(in_channels)
        self.conv1 = nn.Conv2d(in_channels, out_channels, stride=stride, **conv3x3)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, stride=1, **conv3x3)
        self.drop_rate = drop_rate

        # A 1x1 projection is only needed when the skip path would otherwise
        # disagree with the main path in resolution or channel count.
        shapes_match = stride == 1 and in_channels == out_channels
        self.shortcut = (
            nn.Identity()
            if shapes_match
            else nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, padding=0, bias=False)
        )

    def forward(self, x):
        """Return the main-path output summed with the (possibly projected) input."""
        hidden = self.conv1(F.relu(self.bn1(x)))
        if self.drop_rate > 0:
            hidden = F.dropout(hidden, p=self.drop_rate, training=self.training)
        residual = self.conv2(F.relu(self.bn2(hidden)))
        return residual + self.shortcut(x)

# WideResNet model definition
class WideResNet(nn.Module):
    """Wide residual network: a 3x3 stem convolution, three stages of residual blocks, and a linear classifier.

    Args:
        depth: Total depth; must have the form 6n+4, which yields n blocks per stage.
        widen_factor: Multiplier k; the three stages are 16k, 32k and 64k channels wide.
        num_classes: Number of outputs of the final linear layer.
        drop_rate: Dropout probability forwarded to every residual block.

    Raises:
        AssertionError: If ``depth`` is not of the form 6n+4.
    """

    def __init__(self, depth, widen_factor, num_classes, drop_rate=0.0):
        super(WideResNet, self).__init__()
        assert ((depth - 4) % 6 == 0), "Depth should be 6n+4"
        n = (depth - 4) // 6
        k = widen_factor

        # Initial convolution (expects a 3-channel input)
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)

        # WideResNet blocks; the second and third stages halve the spatial resolution
        self.block1 = self._make_layer(16, 16 * k, n, stride=1, drop_rate=drop_rate)
        self.block2 = self._make_layer(16 * k, 32 * k, n, stride=2, drop_rate=drop_rate)
        self.block3 = self._make_layer(32 * k, 64 * k, n, stride=2, drop_rate=drop_rate)

        # Final batch normalization and linear classifier
        self.bn1 = nn.BatchNorm2d(64 * k)
        self.fc = nn.Linear(64 * k, num_classes)

        # Initialize weights
        self._initialize_weights()

    def _make_layer(self, in_channels, out_channels, num_blocks, stride, drop_rate):
        """Stack ``num_blocks`` residual blocks; only the first changes width and stride."""
        layers = []
        for i in range(num_blocks):
            layers.append(WideResNetBlock(
                in_channels=in_channels if i == 0 else out_channels,
                out_channels=out_channels,
                stride=stride if i == 0 else 1,
                drop_rate=drop_rate
            ))
        return nn.Sequential(*layers)

    def _initialize_weights(self):
        """Re-initialize every conv, batch-norm and linear layer, including those inside the blocks."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Map a batch of images to class logits of shape (batch, num_classes)."""
        out = self.conv1(x)
        out = self.block1(out)
        out = self.block2(out)
        out = self.block3(out)
        out = F.relu(self.bn1(out))
        # Global average pooling. Adaptive pooling collapses whatever spatial size
        # is left (8x8 for 32x32 inputs), so the classifier also works for other
        # input resolutions instead of requiring exactly an 8x8 final map.
        out = F.adaptive_avg_pool2d(out, 1)
        out = out.view(out.size(0), -1)
        return self.fc(out)



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import matplotlib.pyplot as plt
import math

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Prepare the CIFAR-10 dataset and DataLoader.
# Training images get random flip/crop augmentation; test images are only converted to tensors.
transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
])
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transforms.ToTensor())

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

# Initialize model and criterion
model = WideResNet(depth=10, widen_factor=4, num_classes=10).to(device)  # WRN-10-4 for CIFAR-10
criterion = nn.CrossEntropyLoss()

# Save the initial model state for reinitialization.
# state_dict() hands back references to the live parameter/buffer tensors, so each
# tensor is cloned here; without the copy the "snapshot" would follow training and
# load_state_dict(initial_state_dict) would restore nothing.
initial_state_dict = {
    name: tensor.detach().clone()
    for name, tensor in model.state_dict().items()
}



# Function to train with specific d0 values
def train_with_d0_cifar(d0, epochs=200):
    """Train the shared model with DAdaptSGD, starting from the saved initial weights.

    Reads the module-level ``model``, ``initial_state_dict``, ``train_loader``,
    ``criterion`` and ``device``; the model is modified in place.

    Args:
        d0: Value forwarded to ``DAdaptSGD`` as its ``d0`` argument.
        epochs: Number of passes over ``train_loader``.

    Returns:
        A ``(train_losses, train_accs)`` pair of lists with one entry per epoch:
        the mean of the per-batch losses, and the training accuracy in percent.
    """
    # Reinitialize the model to start fresh for each d0
    model.load_state_dict(initial_state_dict)  # Reset to the initial saved state
    # NOTE(review): D-Adaptation's README suggests leaving lr at 1.0 because it
    # rescales the adapted step size; confirm that 0.01 is intentional here.
    optimizer = DAdaptSGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4, d0=d0)

    train_losses = []
    train_accs = []

    # Mixed precision is only enabled on CUDA; on CPU the scaler and the autocast
    # context are both disabled and act as pass-throughs.
    use_amp = device.type == 'cuda'
    scaler = torch.amp.GradScaler('cuda', enabled=use_amp)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0

        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()

            with torch.amp.autocast(device_type=device.type, enabled=use_amp):
                outputs = model(images)
                loss = criterion(outputs, labels)

            # Scale the loss before backward; the scaler unscales the gradients
            # again before stepping the optimizer.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        avg_loss = total_loss / len(train_loader)
        train_acc = 100 * correct / total

        train_losses.append(avg_loss)
        train_accs.append(train_acc)

        # Print training progress
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}, Training Accuracy: {train_acc:.2f}%')

    return train_losses, train_accs

# Sweep over the d0 settings, keeping each run's per-epoch curves keyed by its d0.
d0_values = [1.0, 100]
all_train_losses, all_train_accs = {}, {}

for d0 in d0_values:
    print(f'\nTraining with d0 = {d0}')
    train_losses, train_accs = train_with_d0_cifar(d0)
    all_train_losses.update({d0: train_losses})
    all_train_accs.update({d0: train_accs})




Files already downloaded and verified
Files already downloaded and verified

Training with d0 = 1.0


  scaler = torch.cuda.amp.GradScaler()  # Mixed precision scaling
  with torch.cuda.amp.autocast():


Epoch [1/200], Loss: 2.1871, Training Accuracy: 28.08%
Epoch [2/200], Loss: 1.7167, Training Accuracy: 37.98%
Epoch [3/200], Loss: 1.6067, Training Accuracy: 41.85%
Epoch [4/200], Loss: 1.5303, Training Accuracy: 45.13%
Epoch [5/200], Loss: 1.4669, Training Accuracy: 47.13%
Epoch [6/200], Loss: 1.4137, Training Accuracy: 49.29%
Epoch [7/200], Loss: 1.3676, Training Accuracy: 50.99%
Epoch [8/200], Loss: 1.3306, Training Accuracy: 52.27%
Epoch [9/200], Loss: 1.2957, Training Accuracy: 53.71%
Epoch [10/200], Loss: 1.2623, Training Accuracy: 55.15%
Epoch [11/200], Loss: 1.2374, Training Accuracy: 55.97%
Epoch [12/200], Loss: 1.2116, Training Accuracy: 57.09%
Epoch [13/200], Loss: 1.1838, Training Accuracy: 58.13%
Epoch [14/200], Loss: 1.1619, Training Accuracy: 58.81%
Epoch [15/200], Loss: 1.1367, Training Accuracy: 59.79%
Epoch [16/200], Loss: 1.1116, Training Accuracy: 60.88%
Epoch [17/200], Loss: 1.0911, Training Accuracy: 61.61%
Epoch [18/200], Loss: 1.0761, Training Accuracy: 62.29%
E

In [None]:
# Reinitialize model to initial state before training with second optimizer
model.load_state_dict(initial_state_dict)

# train_with_d0_cifar keeps its epoch count and GradScaler as locals, so this
# cell defines its own. 200 mirrors that function's default so both optimizers
# are trained for the same number of epochs.
epochs = 200
use_amp = device.type == 'cuda'
scaler = torch.amp.GradScaler('cuda', enabled=use_amp)

# Define the second optimizer with CyclicLR
base_lr = 0.01
max_lr = 0.1
optimizer_cyclic = optim.SGD(model.parameters(), lr=base_lr, momentum=0.9, weight_decay=5e-4)
# CyclicLR measures step_size_up in scheduler.step() calls, and the scheduler is
# stepped once per batch below, so the ramp is expressed as 5 epochs of batches
# (a literal 5 would complete a whole cycle every 10 batches).
step_size_up = 5 * len(train_loader)
scheduler = optim.lr_scheduler.CyclicLR(optimizer_cyclic, base_lr=base_lr, max_lr=max_lr, step_size_up=step_size_up, mode='triangular')

# Train with SGD + CyclicLR using mixed precision
train_losses_cyclic, train_accs_cyclic = [], []
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct = 0
    total = 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer_cyclic.zero_grad()

        # Mixed precision is enabled only when running on CUDA
        with torch.amp.autocast(device_type=device.type, enabled=use_amp):
            outputs = model(images)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()  # Scaled backward pass
        scaler.step(optimizer_cyclic)  # Update model parameters
        scaler.update()  # Update the scale factor
        scheduler.step()  # Update the learning rate scheduler (once per batch)

        total_loss += loss.item()
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    avg_loss = total_loss / len(train_loader)
    train_acc = 100 * correct / total
    train_losses_cyclic.append(avg_loss)
    train_accs_cyclic.append(train_acc)

    print(f'Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}, Training Accuracy: {train_acc:.2f}%')

In [None]:

# Plotting Loss and Accuracy Comparison
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(14, 6))

# The DAdaptSGD results are stored per d0 in all_train_losses / all_train_accs,
# so one curve is drawn for each d0 run next to the SGD + CyclicLR baseline.
dadapt_colors = ['blue', 'green', 'red', 'purple']

# Loss Comparison
for idx, (d0, losses) in enumerate(all_train_losses.items()):
    loss_ax.plot(losses, label=f'DAdaptSGD (d0={d0})', color=dadapt_colors[idx % len(dadapt_colors)])
loss_ax.plot(train_losses_cyclic, label='SGD + CyclicLR', color='orange')
loss_ax.set_title('Training Loss Comparison (CIFAR-10)', fontsize=16)
loss_ax.set_xlabel('Epochs', fontsize=14)
loss_ax.set_ylabel('Loss', fontsize=14)
loss_ax.legend(fontsize=12)

# Accuracy Comparison
for idx, (d0, accs) in enumerate(all_train_accs.items()):
    acc_ax.plot(accs, label=f'DAdaptSGD (d0={d0})', color=dadapt_colors[idx % len(dadapt_colors)])
acc_ax.plot(train_accs_cyclic, label='SGD + CyclicLR', color='orange')
acc_ax.set_title('Training Accuracy Comparison (CIFAR-10)', fontsize=16)
acc_ax.set_xlabel('Epochs', fontsize=14)
acc_ax.set_ylabel('Accuracy (%)', fontsize=14)
acc_ax.legend(fontsize=12)

plt.tight_layout()
plt.show()