In [1]:
import gc
import datetime
import pynvml
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import numpy as np

In [None]:
nb_data_root = '/data/nb_data/datasets'
device = torch.device("cuda")
epoch = 1
batch_size = 32
lr = 0.01
momentum = 0.9

In [None]:
torch.cuda.empty_cache()

print("memory_allocated:%f Mb" % float(torch.cuda.memory_allocated() / 1024 ** 2))
print("max_memory_allocated:%f Mb" % float(torch.cuda.max_memory_allocated() / 1024 ** 2))
print("memory_cached:%f Mb" % float(torch.cuda.memory_cached() / 1024 ** 2))

In [None]:
class LeNet(nn.Module):
    def __init__(self):
        super(LeNet, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=1,out_channels=6,kernel_size=5,stride=1,padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2,stride=2)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=6,out_channels=16,kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2,stride=2)
        )
        self.fc1 = nn.Sequential(
            nn.Linear(16*5*5, 120),
            nn.ReLU(),
            nn.LeakyReLU()
        )
        self.fc2 = nn.Sequential(
            nn.Linear(120, 84),
            nn.ReLU()
        )
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        #print("1", x.size())
        x = x.view(x.size()[0], -1)
        #print("2", x.size())
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.fc3(x)
        return x

In [None]:
transform = transforms.ToTensor()

train_dataset = datasets.MNIST(
    root = nb_data_root,
    train = True,
    download = False,
    transform = transform
)

test_dataset = datasets.MNIST(
    root= nb_data_root,
    train = False,
    download = False,
    transform = transform
)

trainloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size = batch_size,
    shuffle = True
)

testloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size = batch_size,
    shuffle = False
)

In [None]:
def main():
    net = LeNet().to(device)
    criterion = nn.CrossEntropyLoss()
    optimzizer = optim.SGD(net.parameters(), lr=lr, momentum=momentum)
    for e in range(epoch):
        sum_loss = 0.0

        for i, data in enumerate(trainloader):
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            optimzizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimzizer.step()

            #print loss on 100th epoch
            sum_loss += loss.item()
            if i % 100 == 99:
                print("memory_allocated:%f Mb" % float(torch.cuda.memory_allocated() / 1024 ** 2))
                print("max_memory_allocated:%f Mb" % float(torch.cuda.max_memory_allocated() / 1024 ** 2))
                print("memory_cached:%f Mb" % float(torch.cuda.memory_cached() / 1024 ** 2))
                print("max_memory_cached:%f Mb" % float(torch.cuda.max_memory_cached() / 1024 ** 2))
                print("\n\n")
                # print('[%d, %d] loss: %.03f'
                #       % (e + 1, i + 1, sum_loss / 100))
                sum_loss = 0.0
        
        with torch.no_grad():
            correct = 0
            total = 0
            for data in testloader:
                images, labels = data
                images, labels = images.to(device), labels.to(device)
            outputs = net(images)
            print("memory_allocated:%f Mb" % float(torch.cuda.memory_allocated()/1024**2))
            print("max_memory_allocated:%f Mb" % float(torch.cuda.max_memory_allocated() / 1024 ** 2))
            print("memory_cached:%f Mb" % float(torch.cuda.memory_cached()/1024**2))

            _, pre = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (pre == labels).sum()
            # print('第%d个epoch的识别准确率为：%d%%' % (e + 1, (100 * correct / total)))

In [None]:
main()

In [2]:
def _get_tensors():
    for obj in gc.get_objects():
        if torch.is_tensor(obj):
            tensor = obj
        else:
            continue
        if tensor.is_cuda:
            yield tensor

def mem_log():
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)

    ts_list = [tensor.size() for tensor in _get_tensors()]
    new_tensor_sizes = {(type(x), 
                         tuple(x.size()), 
                         ts_list.count(x.size()), 
                         np.prod(np.array(x.size()))*4/1024**2)
                         for x in _get_tensors()}
    for t, s, n, m in new_tensor_sizes:
        print('[tensor: %s * Size:%s | Memory: %s M | %s]' %(str(n), str(s), str(m*n)[:6], str(t)))

    print("memory_allocated:%f Mb" % float(torch.cuda.memory_allocated()/1024**2))
    print("max_memory_allocated:%f Mb" % float(torch.cuda.max_memory_allocated()/1024**2))
    print("memory_cached:%f Mb" % float(torch.cuda.memory_cached()/1024**2))
    print("max_memory_cached:%f Mb" % float(torch.cuda.max_memory_cached()/1024**2))
    print("Used Memory:%f Mb" % float(meminfo.used/1024**2))
    print("Free Memory:%f Mb" % float(meminfo.free/1024**2))
    print("Total Memory:%f Mb" % float(meminfo.total/1024**2))

    pynvml.nvmlShutdown()

In [3]:
dtype = torch.float

N, D_in, H, D_out = 64, 1000, 100, 10

device = torch.device("cuda")
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
mem_log()
for t in range(500):
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    loss = (y_pred - y).pow(2).sum()
    #print(t, loss.item())
    loss.backward()
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        w1.grad.zero_()
        w2.grad.zero_()

mem_log()



[tensor: 1 * Size:(64, 10) | Memory: 0.0024 M | <class 'torch.Tensor'>]
[tensor: 1 * Size:(1000, 100) | Memory: 0.3814 M | <class 'torch.Tensor'>]
[tensor: 1 * Size:(100, 10) | Memory: 0.0038 M | <class 'torch.Tensor'>]
[tensor: 1 * Size:(64, 1000) | Memory: 0.2441 M | <class 'torch.Tensor'>]
memory_allocated:0.632324 Mb
max_memory_allocated:0.632324 Mb
memory_cached:2.000000 Mb
max_memory_cached:2.000000 Mb
Used Memory:589.000000 Mb
Free Memory:22330.062500 Mb
Total Memory:22919.062500 Mb
[tensor: 1 * Size:() | Memory: 3.8146 M | <class 'torch.Tensor'>]
[tensor: 1 * Size:(1000, 100) | Memory: 0.3814 M | <class 'torch.Tensor'>]
[tensor: 1 * Size:(100, 10) | Memory: 0.0038 M | <class 'torch.Tensor'>]
[tensor: 2 * Size:(64, 10) | Memory: 0.0048 M | <class 'torch.Tensor'>]
[tensor: 1 * Size:(64, 1000) | Memory: 0.2441 M | <class 'torch.Tensor'>]
memory_allocated:1.020996 Mb
max_memory_allocated:1.427734 Mb
memory_cached:2.000000 Mb
max_memory_cached:2.000000 Mb
Used Memory:591.000000 Mb
F