In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.models import resnet50
import time
import wandb
#
from leaf._core import LeafConfig, LeafTrainer

Initializing _core module...
_core module initialization complete!


In [2]:
# Create the LeafConfig instance; the following cells register servers on it
# and later hand it to LeafTrainer.
config = LeafConfig()

In [3]:
# Register the remote host "gpu-server-2" with the Leaf config.
# Per the recorded output below, this call tests SSH, verifies Docker on the
# host, copies the Docker files over and starts building the server image.
config.add_server(server_name="gpu-server-2", username="root",
                  hostname="174.93.255.152", port=35730)

Adding server: gpu-server-2 with hostname: 174.93.255.152 and port: 35730
SSH connection test successful
174.93.255.152 : ssh connection successful
Checking for Docker installation...
174.93.255.152 :docker verification successful
174.93.255.152 : docker daemon verification successful
Docker files copied successfully
174.93.255.152 : docker files copied successful
leaf-grpc-server
Building Docker image...
#0 building with "default" instance using docker driver

#1 [internal] load build definition from Dockerfile
#1 transferring dockerfile: 1.12kB done
#1 DONE 0.0s

#2 [internal] load metadata for docker.io/library/ubuntu:22.04
#2 DONE 0.2s

#3 [internal] load .dockerignore
#3 transferring context: 2B done
#3 DONE 0.0s

#4 [builder 1/8] FROM docker.io/library/ubuntu:22.04@sha256:01a3ee0b5e413cefaaffc6abe68c9c37879ae3cced56a8e088b1649e5b269eee
#4 DONE 0.0s

#5 [internal] load build context
#5 transferring context: 5.42kB done
#5 DONE 0.0s

#6 [builder 6/8] COPY server_communication.proto

In [4]:
# Register the remote host "gpu-server-1" with the Leaf config.
# Per the recorded output below, this call tests SSH, verifies Docker on the
# host, copies the Docker files over and starts building the server image.
config.add_server(server_name="gpu-server-1", username="root",
                  hostname="77.104.167.149", port=55427)

Adding server: gpu-server-1 with hostname: 77.104.167.149 and port: 55427
SSH connection test successful
77.104.167.149 : ssh connection successful
Checking for Docker installation...
77.104.167.149 :docker verification successful
77.104.167.149 : docker daemon verification successful
Docker files copied successfully
77.104.167.149 : docker files copied successful
leaf-grpc-server
Building Docker image...
#0 building with "default" instance using docker driver

#1 [internal] load build definition from Dockerfile
#1 transferring dockerfile: 1.12kB done
#1 DONE 0.0s

#2 [internal] load metadata for docker.io/library/ubuntu:22.04
#2 DONE 0.8s

#3 [internal] load .dockerignore
#3 transferring context: 2B done
#3 DONE 0.0s

#4 [builder 1/8] FROM docker.io/library/ubuntu:22.04@sha256:01a3ee0b5e413cefaaffc6abe68c9c37879ae3cced56a8e088b1649e5b269eee
#4 DONE 0.0s

#5 [internal] load build context
#5 transferring context: 5.42kB done
#5 DONE 0.0s

#6 [builder 4/8] COPY server_communication.cpp .

In [5]:
# Print every server known to the config (remote hosts plus localhost) along
# with the compute resources reported for each, as shown in the output below.
config.print_all_resources()


=== Available Servers and Resources ===

Server: gpu-server-1
Status: Connected
Type: Remote server
Username: root
Hostname: 77.104.167.149
Port: 55427

Available Resources (1):
--------------------------------------------------

Resource 1:
  Name: NVIDIA GeForce RTX 3060
  Type: GPU
  Properties:
    free_memory: 11910 MiB

    total_memory: 12288 MiB

--------------------------------------------------

Server: gpu-server-2
Status: Connected
Type: Remote server
Username: root
Hostname: 174.93.255.152
Port: 35730

Available Resources (1):
--------------------------------------------------

Resource 1:
  Name: NVIDIA GeForce GTX 1660 SUPER
  Type: GPU
  Properties:
    free_memory: 5749 MiB

    total_memory: 6144 MiB

--------------------------------------------------

Server: localhost
Status: Connected
Type: Local machine

Available Resources (1):
--------------------------------------------------

Resource 1:
  Name: Apple M1
  Type: CPU
  Properties:

----------------------------

In [6]:
# Build the trainer from the populated config; later cells call its
# test_with_hardcoded_values(), train() and register_model() methods.
leaf_trainer = LeafTrainer(config)

In [7]:
# Smoke-test the gradient round trip against every configured server using
# built-in inputs; returns a dict with one result entry per server.
# NOTE(review): in the recorded output all three servers report loss 0.5 and
# the same gradient values on the order of 1e22-1e30 -- this looks like
# placeholder or uninitialised data rather than real gradients; confirm.
leaf_trainer.test_with_hardcoded_values()

=== Testing get_gradients_from_server with hardcoded values ===
  Input data size: 12 elements
  Model state size: 55 elements

Testing server: gpu-server-1
Debug: Server gpu-server-1 has hostname: 77.104.167.149 and port: 55427
Server type: Remote
Connection status: Connected
Creating gRPC channel for server 'gpu-server-1' at address: localhost:50055
✓ Gradient computation successful!
  Loss: 0.5
  Gradients size: 3 elements
  Sample gradients: 4.59312e+27, 4.58281e+30, 6.88852e+22

Testing server: gpu-server-2
Debug: Server gpu-server-2 has hostname: 174.93.255.152 and port: 35730
Server type: Remote
Connection status: Connected
Creating gRPC channel for server 'gpu-server-2' at address: localhost:50054
✓ Gradient computation successful!
  Loss: 0.5
  Gradients size: 3 elements
  Sample gradients: 4.59312e+27, 4.58281e+30, 6.88852e+22

Testing server: localhost
Debug: Server localhost has hostname:  and port: 0
Server type: Local
Connection status: Connected
  Computing gradients loc

{'server_results': [{'server_name': 'gpu-server-1',
   'is_local': False,
   'is_connected': True,
   'success': True,
   'loss': 0.5,
   'gradients_size': 3,
   'gradients': [4.593116492825124e+27,
    4.582813224188066e+30,
    6.888519090641375e+22]},
  {'server_name': 'gpu-server-2',
   'is_local': False,
   'is_connected': True,
   'success': True,
   'loss': 0.5,
   'gradients_size': 3,
   'gradients': [4.593116492825124e+27,
    4.582813224188066e+30,
    6.888519090641375e+22]},
  {'server_name': 'localhost',
   'is_local': True,
   'is_connected': True,
   'success': True,
   'loss': 0.5,
   'gradients_size': 3,
   'gradients': [4.593116492825124e+27,
    4.582813224188066e+30,
    6.888519090641375e+22]}],
 'total_servers': 3}

In [8]:
# Reproducibility: seed PyTorch's global RNG so that torch-driven randomness
# in the cells below starts from a fixed state on every fresh run.
# This does not by itself guarantee bit-identical results across
# hardware or library versions.
SEED = 42
torch.manual_seed(SEED)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters
num_epochs = 50
batch_size = 128
learning_rate = 0.001

# Per-channel (R, G, B) normalisation constants, defined once so the train
# and test pipelines cannot drift apart.
CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD = (0.2023, 0.1994, 0.2010)

# Data preprocessing
# Training pipeline: random crop (with 4px padding) and horizontal flip for
# augmentation, then tensor conversion and normalisation.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

# Evaluation pipeline: no augmentation, same normalisation as training.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

In [9]:
# Load CIFAR-10 dataset
# Training split: downloaded into ./data if absent, augmented via
# transform_train, and reshuffled by the loader.
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform_train,
)
trainloader = DataLoader(
    trainset, batch_size=batch_size, shuffle=True, num_workers=2,
)

# Test split: same root, evaluation transform, fixed iteration order.
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform_test,
)
testloader = DataLoader(
    testset, batch_size=batch_size, shuffle=False, num_workers=2,
)

In [None]:
# Load pretrained ResNet-50 and modify for CIFAR-10
# `pretrained=True` is deprecated in torchvision (>= 0.13) and emits a
# warning; `weights="IMAGENET1K_V1"` names the same ImageNet checkpoint that
# the legacy flag resolved to.
model = resnet50(weights="IMAGENET1K_V1")
# Swap the stem for a 3x3, stride-1 convolution. This layer is newly
# constructed, so it does not carry the pretrained conv1 weights.
model.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
model.maxpool = nn.Identity()  # Remove maxpool as CIFAR-10 images are small
# Replace the classifier head with a newly initialised 10-way linear layer.
model.fc = nn.Linear(model.fc.in_features, 10)  # Change output to 10 classes
model = model.to(device)

In [11]:
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Plateau scheduler in 'min' mode: it is stepped with a loss value in the
# main training loop and waits 3 non-improving steps before acting.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    patience=3,
)

In [None]:
# Hand the model, optimizer, training loader, epoch count and loss to the
# Leaf trainer. Arguments stay positional and in the original order because
# the signature of LeafTrainer.train is not visible from this notebook.
leaf_trainer.train(model, optimizer, trainloader, num_epochs, criterion)

In [None]:
# Start a Weights & Biases run under the "big-model-example" project and keep
# the handle in `run`.
run = wandb.init(project="big-model-example")

In [None]:
# Register the model object with the Leaf trainer.
# NOTE(review): in cell order this runs after leaf_trainer.train(...);
# confirm whether registration is expected to happen before training.
leaf_trainer.register_model(model)

In [12]:
# Training loop
def train():
    """Run one optimisation pass over ``trainloader``.

    Uses the notebook-level ``model``, ``trainloader``, ``device``,
    ``optimizer`` and ``criterion``. Prints running statistics every
    100 batches.

    Returns:
        tuple: (loss averaged over batches, accuracy in percent).
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for step, (images, labels) in enumerate(trainloader, start=1):
        images = images.to(device)
        labels = labels.to(device)

        # Standard step: clear grads, forward, backward, update.
        optimizer.zero_grad()
        logits = model(images)
        batch_loss = criterion(logits, labels)
        batch_loss.backward()
        optimizer.step()

        # Accumulate running statistics for this epoch.
        loss_sum += batch_loss.item()
        preds = logits.max(1)[1]  # index of the largest logit per sample
        n_seen += labels.size(0)
        n_correct += (preds == labels).sum().item()

        if step % 100 == 0:
            print(f'Batch: {step} | Loss: {loss_sum/step:.3f} | '
                  f'Acc: {100.*n_correct/n_seen:.2f}%')

    return loss_sum/len(trainloader), 100.*n_correct/n_seen

In [13]:
# Testing loop
def test():
    """Evaluate ``model`` on ``testloader`` with gradients disabled.

    Uses the notebook-level ``model``, ``testloader``, ``device`` and
    ``criterion``.

    Returns:
        tuple: (loss averaged over batches, accuracy in percent).
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for images, labels in testloader:
            images = images.to(device)
            labels = labels.to(device)

            logits = model(images)
            loss_sum += criterion(logits, labels).item()

            preds = logits.max(1)[1]  # index of the largest logit per sample
            n_seen += labels.size(0)
            n_correct += (preds == labels).sum().item()

    return loss_sum/len(testloader), 100.*n_correct/n_seen

In [None]:
# Main training loop
# Each epoch: train, evaluate, step the plateau scheduler on the evaluation
# loss, report metrics, and checkpoint whenever evaluation accuracy improves.
# NOTE(review): the checkpoint is selected on the test split; confirm that
# using it for model selection is intended.
print('Starting training...')
best_acc = 0
for epoch in range(num_epochs):
    start_time = time.time()

    train_loss, train_acc = train()
    test_loss, test_acc = test()

    scheduler.step(test_loss)

    elapsed = time.time() - start_time
    report = (
        f'\nEpoch: {epoch + 1}/{num_epochs}',
        f'Time: {elapsed:.2f}s',
        f'Train Loss: {train_loss:.3f} | Train Acc: {train_acc:.2f}%',
        f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc:.2f}%',
    )
    print(*report, sep='\n')

    # Save best model
    if test_acc > best_acc:
        print('Saving best model...')
        # `epoch` is stored zero-based, exactly as the loop counts it.
        state = dict(model=model.state_dict(), acc=test_acc, epoch=epoch)
        torch.save(state, 'best_model.pth')
        best_acc = test_acc
print('Training completed!')