In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Saving and loading the model

- 主要涉及3个core function：
   1. <font color=green>**torch.save()**</font>: 存serialized object to disk.用Python’s pickle utility来实现serialization. 可以用于：Models, tensors, and dictionaries of all kinds of objects
   2. <font color=green>**torch.load()**</font>: 用pickle’s unpickling facilities来deserialize pickled object files到memory.
   3. <font color=green>**torch.nn.Module.load_state_dict()**</font>: Loads a model’s state dictionary using a deserialized state_dict.

In [2]:
## 定义一个net
net = nn.Sequential(
    nn.Linear(3, 5),
    nn.ReLU(),
    nn.Linear(5, 2), 
)

torch.manual_seed(7)
x = torch.randn((10, 3), dtype=torch.float)
y = torch.empty(10, dtype=torch.long).random_(2)

learning_rate = 1e-4
loss_fn = torch.nn.CrossEntropyLoss()
opt = torch.optim.SGD(net.parameters(), lr=learning_rate)

for ind in range(2000):
    y_pred = net(x)
    loss = loss_fn(y_pred, y)
    opt.zero_grad()
    loss.backward()
    opt.step()
    
# 训练后得到的参数
# for p in net.named_parameters():
#     print(p)

## save the module
torch.save(net.state_dict(), 'net.pt')

In [3]:
## load the module
#  1. 新建一个结构相同的module
new_net = nn.Sequential(
    nn.Linear(3, 5),
    nn.ReLU(),
    nn.Linear(5, 2), 
)

# 此时new_net的参数是没有训练过的初始化值
# for p in new_net.named_parameters():
#     print(p)

#  2. load state
new_net.load_state_dict(torch.load('net.pt', weights_only=True))

## load之后，new_net的参数已经更新成了net训练完的参数
# for p in new_net.named_parameters():
#     print(p)

<All keys matched successfully>

## 1. 理解module state和state_dict
### 1.1 module的state有以下几种类型：
  - <font color=blue>**parameters**</font>: learnable aspects of computation。
  - <font color=blue>**buffers**</font>: non-learnable aspects of computation。虽然non-learnable，但仍会影响computation。有两种buffers：
    - Persistent buffers: 存在state_dict中。在torch.save和load的时候会被serialized。<font color=orange>比如batchnorm中的running mean和var。</font>
    - non-Persistent buffers: 不存在state_dict中。在torch.save和load的时候不会被serialized。
- <font color=norange>其中，parameters和persistent buffers存在state_dict中。</font>如果state被存为state_dict的一部分，那么loading a serialized form of the module的时候，它就能被restore。 

In [4]:
## 用register_buffer()将running mean的当前值存到state_dict

class RunningMean(nn.Module):
    def __init__(self, num_features, momentum=0.9):
        super().__init__()
        self.momentum = momentum
        self.register_buffer('mean', torch.zeros(num_features))
        # 此时，self.mean会被存到state_dict中
    
    def forward(self, x):
        # 每次迭代时更新running mean的值
        # 作为state_dict的一部分，当loading module的时候会被restore
        self.mean = self.momentum * self.mean + (1.0 - self.momentum) * x
        return self.mean

torch.manual_seed(0)
m = RunningMean(4)
for _ in range(10):
    input = torch.randn(4)
    m(input)

print(m.state_dict())

# Serialized form will contain the 'mean' tensor
torch.save(m.state_dict(), 'mean.pt')

m_loaded = RunningMean(4)
m_loaded.load_state_dict(torch.load('mean.pt', weights_only=False))
# 安全考虑，weights_only参数应该设为true，这里要load非weights，所以设False

torch.all(m.mean == m_loaded.mean)

OrderedDict([('mean', tensor([-0.1494,  0.1179, -0.3679, -0.1974]))])


tensor(True)

In [5]:
## 将running mean存为non-Persistent buffers

class RunningMean(nn.Module):
    def __init__(self, num_features, momentum=0.9):
        super().__init__()
        self.momentum = momentum
        self.register_buffer('mean', torch.zeros(num_features), persistent=False)
        # 此时，self.mean不会被存到state_dict中
    
    def forward(self, x):
        self.mean = self.momentum * self.mean + (1.0 - self.momentum) * x
        return self.mean

torch.manual_seed(0)
m2 = RunningMean(4)
for _ in range(10):
    input = torch.randn(4)
    m2(input)

print(m2.state_dict()) # 此时输出的state_dict是空的

torch.save(m2.state_dict(), 'mean.pt')
m2_loaded = RunningMean(4)
m2_loaded.load_state_dict(torch.load('mean.pt', weights_only=True))
print(torch.all(m2.mean == m2_loaded.mean)) # 输出False

OrderedDict()
tensor(False)


#### 一个module的buffers可以用buffers()和named_buffers()来遍历

In [6]:
for buffer in m2.named_buffers():
    print(buffer)

('mean', tensor([-0.1494,  0.1179, -0.3679, -0.1974]))


#### 两种buffers都受model-wide device/type changes所使用的.to() method影响


In [7]:
m.to(device='cuda', dtype=torch.float64 )

RunningMean()

In [8]:
## 一个综合例子
class StatefulModule(nn.Module):
    def __init__(self):
        super().__init__()
        # 用nn.Parameter实例化的参数会自动将tensor register为module parameter
        self.param1 = nn.Parameter(torch.randn(2))

        # 另一种将tensor register为module parameter的方式：用register_parameter() method
        self.register_parameter('param2', nn.Parameter(torch.randn(3)))

        # 将attribute： "param3" 定义为一个parameter，但不做初始化。
        # 它的值'None'不会出现在state_dict中    
        self.register_parameter('param3', None)

        # Registers a list of parameters：没有name
        self.param_list = nn.ParameterList([nn.Parameter(torch.randn(2)) for i in range(3)])

        # Registers a dictionary of parameters：有name
        self.param_dict = nn.ParameterDict({
            'foo': nn.Parameter(torch.randn(3)),
            'bar': nn.Parameter(torch.randn(4))
        })

        # Registers a persistent buffer
        self.register_buffer('buffer1', torch.randn(4), persistent=True)

        # Registers a non-persistent buffer
        self.register_buffer('buffer2', torch.randn(5), persistent=False)

        # 将attribute："buffer3" 定义为一个buffer，但不做初始化
        # 它的值'None'也不会出现在state_dict中    
        self.register_buffer('buffer3', None)

        # 添加一个submodule就会将其parameters自动register为module的parameters
        self.linear = nn.Linear(2, 3)

m = StatefulModule()

# Save and load state_dict.
torch.save(m.state_dict(), 'state.pt')
m_loaded = StatefulModule()
m_loaded.load_state_dict(torch.load('state.pt', weights_only=True))

# state_dict中没有non-persistent buffer和reserved attributes "param3"与"buffer3"
for p,v in m_loaded.state_dict().items():
    print('name:', p, ' -- value:', v)

name: param1  -- value: tensor([-0.0404,  0.2881])
name: param2  -- value: tensor([-0.0075, -0.9145, -1.0886])
name: buffer1  -- value: tensor([ 1.3232,  0.0371, -0.2849, -0.1334])
name: param_list.0  -- value: tensor([-0.2666,  0.1894])
name: param_list.1  -- value: tensor([-0.2190,  2.0576])
name: param_list.2  -- value: tensor([-0.0354,  0.0627])
name: param_dict.bar  -- value: tensor([ 0.1753, -0.9315, -1.5055, -0.6610])
name: param_dict.foo  -- value: tensor([-0.7663,  1.0993,  2.7565])
name: linear.weight  -- value: tensor([[ 0.0197, -0.0610],
        [ 0.1431,  0.4496],
        [ 0.6698,  0.4491]])
name: linear.bias  -- value: tensor([ 0.6713, -0.0511, -0.6352])


### 1.2 state_dict
1. 所有nn.Module中都定义了**state_dict**。它是一个Python dictionary对象，maps each layer to its parameter tensor
2. 只有两类module中的state_dict不为空
   1. <font color=blue>**layer modules**</font> with learnable parameters和persistent buffers。
      - 比如：
        - learnable parameters：conv layer
        - persistent buffers：batchnorm中的running mean
   2. <font color=blue>**optimizer**</font>中的state_dict存放optimizer的state和超参数

In [9]:
## 例
# Define model
class TheModelClass(nn.Module):
    def __init__(self):
        super(TheModelClass, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Initialize model
model = TheModelClass()

# Initialize optimizer
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [10]:
# model's state_dict：包括weights和bias
print("Model's state_dict:")
for param_tensor in model.state_dict():
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())

# 分隔线
print('-' * 50)

# optimizer's state_dict：包括state和超参数
print("Optimizer's state_dict:")
for var_name in optimizer.state_dict():
    print(var_name, "\t", optimizer.state_dict()[var_name])

Model's state_dict:
conv1.weight 	 torch.Size([6, 3, 5, 5])
conv1.bias 	 torch.Size([6])
conv2.weight 	 torch.Size([16, 6, 5, 5])
conv2.bias 	 torch.Size([16])
fc1.weight 	 torch.Size([120, 400])
fc1.bias 	 torch.Size([120])
fc2.weight 	 torch.Size([84, 120])
fc2.bias 	 torch.Size([84])
fc3.weight 	 torch.Size([10, 84])
fc3.bias 	 torch.Size([10])
--------------------------------------------------
Optimizer's state_dict:
state 	 {}
param_groups 	 [{'lr': 0.001, 'momentum': 0.9, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'maximize': False, 'foreach': None, 'differentiable': False, 'fused': None, 'params': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}]


## 2. saving & loading Model for Inference
### 2.1 save/load state_dict
· 建议用这种方式

In [11]:
# save
PATH = 'rk_models/savedClassModelState.pt'   # 路径的文件名后缀一般取pt或者pth
torch.save(model.state_dict(), PATH)

# load
model = TheModelClass()
model.load_state_dict(torch.load(PATH)) # 先用torch.load(PATH)是load整个model
model.eval()                            # 一定要切换到evaluation mode

  model.load_state_dict(torch.load(PATH)) # 先用torch.load(PATH)是load整个model


TheModelClass(
  (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)

### 2.2 save/load entire model
最好不用这种方式，这种方式的缺点：\
the serialized data is bound to the specific classes and the exact directory structure used when the model is saved. The reason for this is because pickle does not save the model class itself. Rather, it saves a path to the file containing the class, which is used during load time. Because of this, your code can break in various ways when used in other projects or after refactors.

In [12]:
# save
PATH = 'rk_models/savedClassModel.pt'   # 路径的文件名后缀一般取pt或者pth
torch.save(model, PATH)

# load
model = torch.load(PATH)
model.eval()

  model = torch.load(PATH)


TheModelClass(
  (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)

### 2.3 export/load model in transcript format
规模化的推理和部署建议用这种方式。因为，此时model可以在python和高性能的c++环境中运行。 you will be able to load the exported model and run inference without defining the model class.

In [13]:
# export:
model_scripted = torch.jit.script(model) # Export to TorchScript
model_scripted.save('model_scripted.pt') # Save

# load:
model = torch.jit.load('model_scripted.pt')
model.eval()

RecursiveScriptModule(
  original_name=TheModelClass
  (conv1): RecursiveScriptModule(original_name=Conv2d)
  (pool): RecursiveScriptModule(original_name=MaxPool2d)
  (conv2): RecursiveScriptModule(original_name=Conv2d)
  (fc1): RecursiveScriptModule(original_name=Linear)
  (fc2): RecursiveScriptModule(original_name=Linear)
  (fc3): RecursiveScriptModule(original_name=Linear)
)

## 3. saving & loading checkpoint for Inference/Resuming trainning
1. 用途，存储阶段性训练信息，用于inference或者以此为起点resume training。此时要保存的内容包括:
   1. model的state_dict
   2. optimizer的state_dict，因为它包括了buffers and parameters that are updated as the model trains.
   3. 当前epoch
   4. 最近的training loss
   5. 外部的torch.nn.Embedding layers，等等
2. 由于保存的内容多，所以存checkpoint的大小比只存state_dict更大，一般2-3倍
3. 存储的时候，将这些内容用dictionary的结构存储，一般存为后缀.tar的文件名中
4. load的时候，先初始化model和optimizer，然后从dictionary中load需要的信息item

In [14]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

net = Net()
print(net)


Net(
  (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [15]:
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
EPOCH = 5
PATH = "model.pt"
LOSS = 0.4

In [16]:
# save
torch.save({
            'epoch': EPOCH,
            'model_state_dict': net.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': LOSS,
            }, PATH)

In [17]:
# init新的model结构
model = Net()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# load
checkpoint = torch.load(PATH)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

# 根据后续用途来设置model所处的mode
model.eval()
# - or -
model.train()

  checkpoint = torch.load(PATH)


Net(
  (conv1): Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=400, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)

## 4. saving multiple models in one file
和存checkpoint相似

In [20]:
# save
torch.save({
            'modelA_state_dict': modelA.state_dict(),
            'modelB_state_dict': modelB.state_dict(),
            'optimizerA_state_dict': optimizerA.state_dict(),
            'optimizerB_state_dict': optimizerB.state_dict(),
            ...
            }, PATH)

# load
modelA = TheModelAClass(*args, **kwargs)
modelB = TheModelBClass(*args, **kwargs)
optimizerA = TheOptimizerAClass(*args, **kwargs)
optimizerB = TheOptimizerBClass(*args, **kwargs)

checkpoint = torch.load(PATH)
modelA.load_state_dict(checkpoint['modelA_state_dict'])
modelB.load_state_dict(checkpoint['modelB_state_dict'])
optimizerA.load_state_dict(checkpoint['optimizerA_state_dict'])
optimizerB.load_state_dict(checkpoint['optimizerB_state_dict'])

modelA.eval()
modelB.eval()
# - or -
modelA.train()
modelB.train()

SyntaxError: ':' expected after dictionary key (1202435455.py, line 7)

## 5. warmstarting model using parameters from another model

In [None]:
# save
torch.save(modelA.state_dict(), PATH)
# load
modelB = TheModelBClass(*args, **kwargs)
modelB.load_state_dict(torch.load(PATH), strict=False)

## 6. saving & loading model across devices
1. transfer learning中用得多。可以是loading from a partial state_dict, which is missing some keys,或者loading a state_dict with more keys than the model that you are loading into。这两种情况下都可以设置'strict =False'来ignore non-matching keys.

2. 如果想要load parameters from one layer to another, 但有的keys不match, 只要改变被loading的state_dict中的parameter的key name，使他们与model that you are loading into中的key name相match就行

### 6.1 save on GPU, load on CPU

In [None]:
# save
torch.save(model.state_dict(), PATH)
# load
device = torch.device('cpu')
model = TheModelClass(*args, **kwargs)

model.load_state_dict(torch.load(PATH, map_location=device))

### 6.2 save on GPU, load on GPU

In [None]:
# save
torch.save(model.state_dict(), PATH)
# load
device = torch.device("cuda")
model = TheModelClass(*args, **kwargs)

model.load_state_dict(torch.load(PATH))
model.to(device)
# Make sure to call input = input.to(device) on any input tensors that you feed to the model

### 6.3 save on CPU, load on GPU

In [None]:
# save
torch.save(model.state_dict(), PATH)
# load
device = torch.device("cuda")
model = TheModelClass(*args, **kwargs)

model.load_state_dict(torch.load(PATH, map_location="cuda:0"))
model.to(device)
# Make sure to call input = input.to(device) on any input tensors that you feed to the model

### 6.4 saving torch.nn.DataParallel Models

In [None]:
# save
torch.save(model.module.state_dict(), PATH)
# load
# Load to whatever device you want