# Examples of using STOILO for Distributed Deep Learning

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Compute device: CUDA when available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
num_epochs, batch_size, learning_rate = 10, 128, 0.001

# Mean / std tuples handed to Normalize in both the training and the test pipeline
channel_mean = (0.4914, 0.4822, 0.4465)
channel_std = (0.247, 0.243, 0.261)

# Training pipeline: random crop with padding and random horizontal flip,
# followed by tensor conversion and normalization
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])
# Test pipeline: tensor conversion and normalization only
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

# Load the CIFAR-10 dataset (downloaded into ./data when missing)
cifar_kwargs = dict(root='./data', download=True)
train_dataset = torchvision.datasets.CIFAR10(train=True, transform=transform_train, **cifar_kwargs)
test_dataset = torchvision.datasets.CIFAR10(train=False, transform=transform_test, **cifar_kwargs)

# Batch loaders: the training data is reshuffled, the test data keeps its order
loader_kwargs = dict(batch_size=batch_size, num_workers=2)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

# Model: ResNet18 without pretrained weights, adapted to 10 classes.
# The constructor is called with no arguments on purpose: the `pretrained`
# keyword is deprecated in torchvision >= 0.13 (replaced by `weights`), while
# the argument-free call yields a randomly initialized network on both old
# and new torchvision releases.
model = models.resnet18()
# Replace the final fully connected layer with a 10-output classifier head
model.fc = nn.Linear(model.fc.in_features, 10)
model = model.to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, (images, labels) in enumerate(train_loader):
        # Move the current batch to the selected device
        images = images.to(device)
        labels = labels.to(device)

        # Clear gradients left over from the previous step, then run the
        # forward pass and compute the loss
        optimizer.zero_grad()
        loss = criterion(model(images), labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        if (i + 1) % 100 != 0:
            continue

        # Every 100 steps: report the mean loss of those steps and restart the sum
        print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {running_loss/100:.4f}')
        running_loss = 0.0

# Evaluate the model on the test set
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():  # gradients are not needed for inference
    for images, labels in test_loader:
        logits = model(images.to(device))
        # torch.max over dim 1 returns (values, indices); the indices are the predicted classes
        preds = torch.max(logits, 1)[1]
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.numpy())

# Compute the metrics; precision, recall and F1 use macro averaging
macro = {'average': 'macro'}
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, **macro)
recall = recall_score(all_labels, all_preds, **macro)
f1 = f1_score(all_labels, all_preds, **macro)

# Print one metric per line, rounded to four decimals
for report_line in (
    f'Accuracy: {accuracy:.4f}',
    f'Precision: {precision:.4f}',
    f'Recall: {recall:.4f}',
    f'F1 Score: {f1:.4f}',
):
    print(report_line)


100.0%


Epoch [1/10], Step [100/391], Loss: 1.8323
Epoch [1/10], Step [200/391], Loss: 1.5452
Epoch [1/10], Step [300/391], Loss: 1.4370
Epoch [2/10], Step [100/391], Loss: 1.2670
Epoch [2/10], Step [200/391], Loss: 1.2132
Epoch [2/10], Step [300/391], Loss: 1.1414
Epoch [3/10], Step [100/391], Loss: 1.0639
Epoch [3/10], Step [200/391], Loss: 1.0542
Epoch [3/10], Step [300/391], Loss: 1.0162
Epoch [4/10], Step [100/391], Loss: 0.9350
Epoch [4/10], Step [200/391], Loss: 0.9221
Epoch [4/10], Step [300/391], Loss: 0.9298
Epoch [5/10], Step [100/391], Loss: 0.8617
Epoch [5/10], Step [200/391], Loss: 0.8503
Epoch [5/10], Step [300/391], Loss: 0.8433
Epoch [6/10], Step [100/391], Loss: 0.7843
Epoch [6/10], Step [200/391], Loss: 0.8050
Epoch [6/10], Step [300/391], Loss: 0.7685
Epoch [7/10], Step [100/391], Loss: 0.7379
Epoch [7/10], Step [200/391], Loss: 0.7447
Epoch [7/10], Step [300/391], Loss: 0.7404
Epoch [8/10], Step [100/391], Loss: 0.6993
Epoch [8/10], Step [200/391], Loss: 0.7080
Epoch [8/10

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
from torchvision import models
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import stoilo
from stoilo.ddl import DPBGDTrainer


# Device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
num_epochs = 10
batch_size = 8192
learning_rate = 0.001

# Transforms for training (with augmentation) and for testing
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261))
])
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261))
])

# Load the CIFAR-10 dataset
train_dataset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
test_dataset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)

# Training batches: drop_last=True discards the trailing incomplete batch,
# so every training batch holds exactly batch_size samples.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2, drop_last=True)
# Evaluation batches: the final partial batch must be kept (drop_last=False).
# With drop_last=True every test sample beyond the last full batch would be
# silently excluded, and the reported metrics would cover only part of the test set.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2, drop_last=False)

# for batch in train_loader:
#     import cloudpickle
#     serial = cloudpickle.dumps(batch)
#     from pympler import asizeof
#     print(asizeof.asizeof(serial))

In [2]:
# Модель: ResNet18 без предварительной тренировки, адаптированная под 10 классов
model = models.resnet18(pretrained=False)
model.fc = nn.Linear(model.fc.in_features, 10)
model = model.to(device)

# Функция потерь и оптимизатор
criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=learning_rate)

conn = await stoilo.connect('localhost:57010')

trainer = DPBGDTrainer(
    conn,
    model=model,
    loss_fn=criterion,
    optimizer_class=optim.Adam,
    optimizer_kwargs={'lr': learning_rate},
)

# Цикл обучения
loss_array = []
for epoch in range(num_epochs):
    model, loss = await trainer.epoch(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, loss={loss}")
    loss_array.append(loss)
print(*loss_array, sep='\n')

# Evaluate the trained model on the test set
model.eval()
all_preds = list()
all_labels = list()
with torch.no_grad():  # inference only, no gradient tracking
    for images, labels in test_loader:
        batch_on_device = images.to(device)
        scores = model(batch_on_device)
        # Second element of torch.max(..., 1) holds the index of the top-scoring class
        preds = torch.max(scores, 1)[1]
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.numpy())

# Compute the metrics: plain accuracy, then macro-averaged precision, recall and F1
accuracy = accuracy_score(all_labels, all_preds)
precision, recall, f1 = (
    score_fn(all_labels, all_preds, average='macro')
    for score_fn in (precision_score, recall_score, f1_score)
)

# Report each metric on its own line with four decimals
print(
    f'Accuracy: {accuracy:.4f}',
    f'Precision: {precision:.4f}',
    f'Recall: {recall:.4f}',
    f'F1 Score: {f1:.4f}',
    sep='\n',
)


I0000 00:00:1747128993.441949   29553 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers
I0000 00:00:1747128993.461144   29553 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


Created 6 tasks
update norm: [('conv1.weight', 0.09699437767267227), ('bn1.weight', 0.008000008761882782), ('bn1.bias', 0.00799956452101469), ('layer1.0.conv1.weight', 0.1919887214899063), ('layer1.0.bn1.weight', 0.007999800145626068), ('layer1.0.bn1.bias', 0.00799973402172327), ('layer1.0.conv2.weight', 0.19198742508888245), ('layer1.0.bn2.weight', 0.007999852299690247), ('layer1.0.bn2.bias', 0.007998902350664139), ('layer1.1.conv1.weight', 0.19198395311832428), ('layer1.1.bn1.weight', 0.007999480701982975), ('layer1.1.bn1.bias', 0.007999789901077747), ('layer1.1.conv2.weight', 0.19198077917099), ('layer1.1.bn2.weight', 0.007999792695045471), ('layer1.1.bn2.bias', 0.007999636232852936), ('layer2.0.conv1.weight', 0.2714993357658386), ('layer2.0.bn1.weight', 0.011313462629914284), ('layer2.0.bn1.bias', 0.011313029564917088), ('layer2.0.conv2.weight', 0.38397738337516785), ('layer2.0.bn2.weight', 0.0113131208345294), ('layer2.0.bn2.bias', 0.011313123628497124), ('layer2.0.downsample.0.we

I0000 00:00:1747129725.788867   29553 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers
I0000 00:00:1747129725.923126   29553 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


Created 6 tasks
update norm: [('conv1.weight', 0.06628871709108353), ('bn1.weight', 0.005991214420646429), ('bn1.bias', 0.005488849710673094), ('layer1.0.conv1.weight', 0.1314864605665207), ('layer1.0.bn1.weight', 0.005558502394706011), ('layer1.0.bn1.bias', 0.006079147569835186), ('layer1.0.conv2.weight', 0.1359984427690506), ('layer1.0.bn2.weight', 0.005466504488140345), ('layer1.0.bn2.bias', 0.005737242288887501), ('layer1.1.conv1.weight', 0.13436788320541382), ('layer1.1.bn1.weight', 0.005838105920702219), ('layer1.1.bn1.bias', 0.0058059729635715485), ('layer1.1.conv2.weight', 0.13594752550125122), ('layer1.1.bn2.weight', 0.005742629989981651), ('layer1.1.bn2.bias', 0.005714145489037037), ('layer2.0.conv1.weight', 0.192329540848732), ('layer2.0.bn1.weight', 0.007882745005190372), ('layer2.0.bn1.bias', 0.008112040348351002), ('layer2.0.conv2.weight', 0.2707218527793884), ('layer2.0.bn2.weight', 0.008122853934764862), ('layer2.0.bn2.bias', 0.008411369286477566), ('layer2.0.downsample

I0000 00:00:1747130396.750633   29553 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers
I0000 00:00:1747130396.903415   29553 fork_posix.cc:75] Other threads are currently calling into gRPC, skipping fork() handlers


Created 6 tasks


AioRpcError: <AioRpcError of RPC that terminated with:
	status = StatusCode.UNAVAILABLE
	details = "Socket closed"
	debug_error_string = "UNKNOWN:Error received from peer  {created_time:"2025-05-13T13:00:41.452363239+03:00", grpc_status:14, grpc_message:"Socket closed"}"
>