# PyTorch Distributed Training

In [12]:
from datetime import timedelta

import torch
import torch.distributed as dist

## 判断当前环境是否支持分布式模块

In [2]:
# Ask torch.distributed whether it is available, then report the answer.
dist_supported = dist.is_available()
print('Support Dist: ', dist_supported)

Support Dist:  True


## 初始化进程组

我们有多种方式来创建一个进程组

`init_method`用于指定master和worker之间的通信的方法：

1. 通过一个显式的`tcp://ip:port`的形式
2. 通过共享文件系统上的一个文件，即`file://path`的形式
3. 通过环境变量`MASTER_ADDR`,`MASTER_PORT`来获取，本质上和1是一样的。

In [None]:
def init_process_group(backend,
                       init_method="env://",
                       timeout=timedelta(minutes=30),
                       **kwargs):
    """
    Reference stub showing the signature and documentation of process-group
    initialization. It performs no work and returns ``None``; the notebook
    calls the real ``dist.init_process_group`` further down.

    Initializes the default distributed process group, and this will also
    initialize the distributed package.

    Arguments:
        backend (str or Backend): The backend to use. Depending on
            build-time configurations, valid values include ``mpi``, ``gloo``,
            and ``nccl``. This field should be given as a lowercase string
            (e.g., ``"gloo"``), which can also be accessed via
            :class:`Backend` attributes (e.g., ``Backend.GLOO``).
        init_method (str, optional): URL specifying how to initialize the
            process group. Defaults to ``"env://"``.
        timeout (timedelta, optional): Timeout for operations executed against
            the process group. Default value equals 30 minutes.
            This is only applicable for the ``gloo`` backend.
        **kwargs: Remaining optional settings, accepted by keyword:
            world_size (int, optional): Number of processes participating in
                the job.
            rank (int, optional): Rank of the current process.
            group_name (str, optional, deprecated): Group name.

    Returns:
        None. This stub only documents the interface.

    To enable ``backend == Backend.MPI``, PyTorch needs to be built from
    source on a system that supports MPI. The same applies to NCCL as well.

    """
    # Intentionally empty: the cell exists for documentation purposes only.
    pass

In [3]:
# Initialize the default process group for a single-process job
# (rank 0 of world size 1) that rendezvouses over a local TCP endpoint.

dist_backend = 'nccl'
dist_url = 'tcp://127.0.0.1:23456'
rank = 0
world_size = 1

# Initializing the default group a second time raises, so skip the call when
# this cell is re-run in a kernel where the group already exists.
if not dist.is_initialized():
    dist.init_process_group(backend=dist_backend, init_method=dist_url, rank=rank, world_size=world_size)

## 判断是否初始化过

In [5]:
# Query the initialization state of the default process group and report it.
group_ready = dist.is_initialized()
print('dist is initialized:', group_ready)

dist is initialized: True


## 定义模型

In [17]:
import torchvision.models as models

# Gather every lowercase, non-dunder, callable attribute exposed by
# torchvision.models, then sort the names alphabetically.
model_names = [
    attr_name
    for attr_name, attr in vars(models).items()
    if attr_name.islower()
    and not attr_name.startswith("__")
    and callable(attr)
]
model_names.sort()

print(model_names)

['alexnet', 'densenet121', 'densenet161', 'densenet169', 'densenet201', 'inception_v3', 'resnet101', 'resnet152', 'resnet18', 'resnet34', 'resnet50', 'squeezenet1_0', 'squeezenet1_1', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', 'vgg19', 'vgg19_bn']


In [None]:
# Build a ResNet-18 without requesting pretrained weights, then move it to CUDA.
model = models.resnet18(pretrained=False)
model = model.cuda()

### 如果是单机多卡

```python
torch.distributed.init_process_group(backend="nccl")
model = DistributedDataParallel(model) # device_ids will include all GPU devices by default
```

## Make model DistributedDataParallel

In [13]:
# Relevant DistributedDataParallel arguments:
#   device_ids (list of int or torch.device): CUDA devices (default: all devices)
#   output_device (int or torch.device): device location of output (default: device_ids[0])
#
# Variant that pins the replica to explicit devices:
#   model = torch.nn.parallel.DistributedDataParallel(model, device_ids=dp_device_ids, output_device=local_rank)
#
# Here the wrapper is created with its default device settings.
model = torch.nn.parallel.DistributedDataParallel(module=model)

## 设置优化器

In [14]:
# SGD over the wrapped model's parameters: initial learning rate 0.1,
# momentum 0.9 and weight decay 1e-4.
starting_lr = 0.1
optimizer = torch.optim.SGD(
    params=model.parameters(),
    lr=starting_lr,
    momentum=0.9,
    weight_decay=1e-4,
)

In [24]:
# Print the byte representation of one character under UTF-8 and UTF-32,
# one encoding per output line.
s = '严'
print(s.encode('utf-8'), s.encode('utf-32'), sep='\n')

b'\xe4\xb8\xa5'
b'\xff\xfe\x00\x00%N\x00\x00'
