In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.models import resnet50
import time
import wandb
#
from leaf._core import LeafConfig, LeafTrainer

Initializing _core module...
_core module initialization complete!


In [2]:
config = LeafConfig()

In [3]:
config.add_server(
    server_name="gpu-server-1",
    username="root",
    hostname="174.93.255.152",
    port=52395,  
)

gRPC verification failed


ssh: connect to host 174.93.255.152 port 35372: Connection refused
SSH connection test failed. Please verify:
1. SSH port 35372 is correct
2. Username root is correct
3. SSH key is properly set up


In [4]:
config.add_server(
    server_name="gpu-server-2",
    username="root",
    hostname="184.146.3.80",
    port=43636,  
)

gRPC verification failed


ssh: connect to host 174.93.255.152 port 27382: Connection refused
SSH connection test failed. Please verify:
1. SSH port 27382 is correct
2. Username root is correct
3. SSH key is properly set up


In [5]:
config.print_all_resources()


=== Available Servers and Resources ===

Server: gpu-server-1
Status: Not connected
Type: Remote server
Username: root
Hostname: 174.93.255.152
Port: 35372

No computational resources found

--------------------------------------------------

Server: gpu-server-2
Status: Not connected
Type: Remote server
Username: root
Hostname: 174.93.255.152
Port: 27382

No computational resources found

--------------------------------------------------

Server: localhost
Status: Connected
Type: Local machine

Available Resources (1):
--------------------------------------------------

Resource 1:
  Name: Apple M1
  Type: CPU
  Properties:

--------------------------------------------------

=== End of Server List ===



In [6]:
leaf_trainer = LeafTrainer(config)

In [7]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters
num_epochs = 50
batch_size = 128
learning_rate = 0.001

# Data preprocessing
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

In [8]:
# Load CIFAR-10 dataset
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                      download=True, transform=transform_train)
trainloader = DataLoader(trainset, batch_size=batch_size,
                        shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                     download=True, transform=transform_test)
testloader = DataLoader(testset, batch_size=batch_size,
                       shuffle=False, num_workers=2)

In [9]:
# Load pretrained ResNet-50 and modify for CIFAR-10
model = resnet50(pretrained=True)
model.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
model.maxpool = nn.Identity()  # Remove maxpool as CIFAR-10 images are small
model.fc = nn.Linear(model.fc.in_features, 10)  # Change output to 10 classes
model = model.to(device)



In [10]:
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3)

In [11]:
leaf_trainer.train(
    model,
    optimizer,
    trainloader,
    num_epochs,
    criterion
)

=== Testing get_gradients_from_server function ===

Testing server: gpu-server-1
Server type: Remote
Connection status: Not connected
Skipping server gpu-server-1 - not connected

Testing server: gpu-server-2
Server type: Remote
Connection status: Not connected
Skipping server gpu-server-2 - not connected

Testing server: localhost
Server type: Local
Connection status: Connected
  Item type: <class 'str'>
  Item repr: conv1.weight
✗ Error testing server localhost: AttributeError: 'str' object has no attribute 'cpu'

Gradient testing completed!


{'server_results': [{'server_name': 'gpu-server-1',
   'is_local': False,
   'is_connected': False,
   'success': False,
   'error': 'Server not connected'},
  {'server_name': 'gpu-server-2',
   'is_local': False,
   'is_connected': False,
   'success': False,
   'error': 'Server not connected'},
  {'server_name': 'localhost',
   'is_local': True,
   'is_connected': True,
   'success': False,
   'error': "AttributeError: 'str' object has no attribute 'cpu'"}],
 'total_servers': 3}

In [12]:
asdf

NameError: name 'asdf' is not defined

In [None]:
run = wandb.init(
    project="big-model-example"
)

In [12]:
# Training loop
def train():
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    
    for batch_idx, (inputs, targets) in enumerate(trainloader):
        inputs, targets = inputs.to(device), targets.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
        
        if (batch_idx + 1) % 100 == 0:
            print(f'Batch: {batch_idx + 1} | Loss: {running_loss/(batch_idx + 1):.3f} | '
                  f'Acc: {100.*correct/total:.2f}%')
    
    return running_loss/len(trainloader), 100.*correct/total

In [13]:
# Testing loop
def test():
    model.eval()
    test_loss = 0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for inputs, targets in testloader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            
            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
    
    return test_loss/len(testloader), 100.*correct/total

In [None]:
# Main training loop
print('Starting training...')
best_acc = 0
for epoch in range(num_epochs):
    start_time = time.time()
    
    train_loss, train_acc = train()
    test_loss, test_acc = test()
    
    scheduler.step(test_loss)
    
    print(f'\nEpoch: {epoch + 1}/{num_epochs}')
    print(f'Time: {time.time() - start_time:.2f}s')
    print(f'Train Loss: {train_loss:.3f} | Train Acc: {train_acc:.2f}%')
    print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc:.2f}%')
    
    # Save best model
    if test_acc > best_acc:
        print('Saving best model...')
        state = {
            'model': model.state_dict(),
            'acc': test_acc,
            'epoch': epoch,
        }
        torch.save(state, 'best_model.pth')
        best_acc = test_acc
print('Training completed!')