In [15]:
import torch
import torch.nn as nn
from torch import optim
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision import datasets
from tqdm import tqdm
from torch import nn

import torch.nn.functional as F

import numpy as np

from torch.utils.data import Dataset,DataLoader,TensorDataset

In [9]:
batch_size = 256        # 批的大小
learning_rate = 1e-3    # 学习率
num_epoches = 10       # 遍历训练集的次数

In [10]:
train_dataset = datasets.CIFAR10('./data', train=True, transform=transforms.ToTensor(), download=True)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataset = datasets.CIFAR10('./data', train=False, transform=transforms.ToTensor(), download=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

Files already downloaded and verified
Files already downloaded and verified


In [11]:
class VGG16(nn.Module):
    def __init__(self, num_classes=10):
        super(VGG16, self).__init__()
        self.features = nn.Sequential(
            #1
            nn.Conv2d(3,64,kernel_size=3,padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(True),
            #2
            nn.Conv2d(64,64,kernel_size=3,padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=2,stride=2),
            #3
            nn.Conv2d(64,128,kernel_size=3,padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(True),
            #4
            nn.Conv2d(128,128,kernel_size=3,padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=2,stride=2),
            #5
            nn.Conv2d(128,256,kernel_size=3,padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(True),
            #6
            nn.Conv2d(256,256,kernel_size=3,padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(True),
            #7
            nn.Conv2d(256,256,kernel_size=3,padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=2,stride=2),
            #8
            nn.Conv2d(256,512,kernel_size=3,padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(True),
            #9
            nn.Conv2d(512,512,kernel_size=3,padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(True),
            #10
            nn.Conv2d(512,512,kernel_size=3,padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=2,stride=2),
            #11
            nn.Conv2d(512,512,kernel_size=3,padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(True),
            #12
            nn.Conv2d(512,512,kernel_size=3,padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(True),
            #13
            nn.Conv2d(512,512,kernel_size=3,padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=2,stride=2),
            nn.AvgPool2d(kernel_size=1,stride=1),
            )
        self.classifier = nn.Sequential(
            #14
            nn.Linear(512,4096),
            nn.ReLU(True),
            nn.Dropout(),
            #15
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            #16
            nn.Linear(4096,num_classes),
            )
        #self.classifier = nn.Linear(512, 10)

    def forward(self, x):
        out = self.features(x)
        #        print(out.shape)
        out = out.view(out.size(0), -1)
        #        print(out.shape)
        out = self.classifier(out)
        #        print(out.shape)
        return out


In [12]:
model = VGG16()
use_gpu = torch.cuda.is_available()  # 判断是否有GPU加速
if use_gpu:
    model = model.cuda()

'''定义loss和optimizer'''
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)


In [13]:
for epoch in range(num_epoches):
    print('*' * 25, 'epoch {}'.format(epoch + 1), '*' * 25)  # .format为输出格式，formet括号里的即为左边花括号的输出
    running_loss = 0.0
    running_acc = 0.0
    for i, data in tqdm(enumerate(train_loader, 1)):

        img, label = data
        # cuda
        if use_gpu:
            img = img.cuda()
            label = label.cuda()
        img = Variable(img)
        label = Variable(label)
        # 向前传播
        out = model(img)
        loss = criterion(out, label)
        running_loss += loss.item() * label.size(0)
        _, pred = torch.max(out, 1)  # 预测最大值所在的位置标签
        num_correct = (pred == label).sum()
        accuracy = (pred == label).float().mean()
        running_acc += num_correct.item()
        # 向后传播
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print('Finish {} epoch, Loss: {:.6f}, Acc: {:.6f}'.format(
        epoch + 1, running_loss / (len(train_dataset)), running_acc / (len(train_dataset))))

    model.eval()  # 模型评估
    eval_loss = 0
    eval_acc = 0
    for data in test_loader:  # 测试模型
        img, label = data
        if use_gpu:
            img = Variable(img, volatile=True).cuda()
            label = Variable(label, volatile=True).cuda()
        else:
            img = Variable(img, volatile=True)
            label = Variable(label, volatile=True)
        out = model(img)
        loss = criterion(out, label)
        eval_loss += loss.item() * label.size(0)
        _, pred = torch.max(out, 1)
        num_correct = (pred == label).sum()
        eval_acc += num_correct.item()
    print('Test Loss: {:.6f}, Acc: {:.6f}'.format(eval_loss / (len(
        test_dataset)), eval_acc / (len(test_dataset))))
    print()


1it [00:00,  6.47it/s]

************************* epoch 1 *************************


782it [09:41,  1.34it/s]
  img = Variable(img, volatile=True).cuda()


Finish 1 epoch, Loss: 2.026781, Acc: 0.230200


  label = Variable(label, volatile=True).cuda()
0it [00:00, ?it/s]

Test Loss: 1.668141, Acc: 0.370300

************************* epoch 2 *************************


782it [10:33,  1.23it/s]


Finish 2 epoch, Loss: 1.563775, Acc: 0.424680


0it [00:00, ?it/s]

Test Loss: 1.693176, Acc: 0.388300

************************* epoch 3 *************************


782it [10:08,  1.28it/s]


Finish 3 epoch, Loss: 1.306501, Acc: 0.523720


0it [00:00, ?it/s]

Test Loss: 1.411429, Acc: 0.499800

************************* epoch 4 *************************


782it [10:29,  1.24it/s]


Finish 4 epoch, Loss: 1.147672, Acc: 0.585320


0it [00:00, ?it/s]

Test Loss: 1.192472, Acc: 0.570300

************************* epoch 5 *************************


782it [09:53,  1.32it/s]


Finish 5 epoch, Loss: 1.016525, Acc: 0.637060


0it [00:00, ?it/s]

Test Loss: 1.514249, Acc: 0.523400

************************* epoch 6 *************************


782it [09:53,  1.32it/s]


Finish 6 epoch, Loss: 0.896041, Acc: 0.681060


0it [00:00, ?it/s]

Test Loss: 1.060386, Acc: 0.626000

************************* epoch 7 *************************


782it [09:54,  1.32it/s]


Finish 7 epoch, Loss: 0.788433, Acc: 0.720340


0it [00:00, ?it/s]

Test Loss: 1.135527, Acc: 0.618100

************************* epoch 8 *************************


782it [09:53,  1.32it/s]


Finish 8 epoch, Loss: 0.694712, Acc: 0.753720


0it [00:00, ?it/s]

Test Loss: 2.312493, Acc: 0.443200

************************* epoch 9 *************************


782it [09:53,  1.32it/s]


Finish 9 epoch, Loss: 0.605033, Acc: 0.789200


0it [00:00, ?it/s]

Test Loss: 1.005858, Acc: 0.665900

************************* epoch 10 *************************


782it [09:54,  1.32it/s]


Finish 10 epoch, Loss: 0.512608, Acc: 0.821800
Test Loss: 1.403495, Acc: 0.595200



In [14]:
torch.save(model.state_dict(), './cnn.pth')

In [8]:
train_dataset = datasets.CIFAR10('./cifar10_data/',train=True,download=True,transform=transforms.Compose([transforms.ToTensor()]))


Files already downloaded and verified


In [21]:
train_loader = DataLoader(dataset = train_dataset, batch_size = 500, shuffle = True)

In [32]:
class Block(nn.Module):
    '''Depthwise conv + Pointwise conv'''
    def __init__(self, in_planes, out_planes, stride=1):
        super(Block, self).__init__()
        self.conv1 = nn.Conv2d(in_planes, in_planes, kernel_size=3, stride=stride, padding=1, groups=in_planes, bias=False)
        self.bn1 = nn.BatchNorm2d(in_planes)
        self.conv2 = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(out_planes)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = F.relu(self.bn2(self.conv2(out)))
        return out


class MobileNet(nn.Module):
    # (128,2) means conv planes=128, conv stride=2, by default conv stride=1
    cfg = [64, (128,2), 128, (256,2), 256, (512,2), 512, 512, 512, 512, 512, (1024,2), 1024]

    def __init__(self, num_classes=10):
        super(MobileNet, self).__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(32)
        self.layers = self._make_layers(in_planes=32)
        self.linear = nn.Linear(1024, num_classes)

#         self.model = nn.Sequential(self.conv1, self.bn1, self.layers, self.linear)
#         self._initialize_weights()

    def _make_layers(self, in_planes):
        layers = []
        for x in self.cfg:
            out_planes = x if isinstance(x, int) else x[0]
            stride = 1 if isinstance(x, int) else x[1]
            layers.append(Block(in_planes, out_planes, stride))
            in_planes = out_planes
        return nn.Sequential(*layers)

    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layers(out)
        out = F.avg_pool2d(out, 2)
        out = out.view(out.size(0), -1)
        out = self.linear(out)
        return out


In [54]:
epoch=5
net=MobileNet()
cost=torch.nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(net.parameters(),lr=0.0005)

In [59]:
for k in range(epoch):
    sum_loss = 0.0
    train_correct = 0
    for i,data in enumerate(train_loader,0):
        inputs,labels = data
        optimizer.zero_grad()
        outputs = net(inputs)
        
        
        loss = cost(outputs,labels)
        loss.backward()
        optimizer.step()
        
        print(loss)
        _,id = torch.max(outputs.data,1)
        sum_loss+=loss.data
        train_correct += torch.sum(id == labels.data)
        print('[%d,%d] loss:%.03f'%(k+1,k,sum_loss/len(train_loader)))

print('correct:%.03f%%'%(100*train_correct/len(train_dataset)))

tensor(2.2323, grad_fn=<NllLossBackward>)
[1,0] loss:0.022
tensor(2.2562, grad_fn=<NllLossBackward>)
[1,0] loss:0.045
tensor(2.2259, grad_fn=<NllLossBackward>)
[1,0] loss:0.067
tensor(2.2152, grad_fn=<NllLossBackward>)
[1,0] loss:0.089
tensor(2.2356, grad_fn=<NllLossBackward>)
[1,0] loss:0.112
tensor(2.1922, grad_fn=<NllLossBackward>)
[1,0] loss:0.134
tensor(2.2051, grad_fn=<NllLossBackward>)
[1,0] loss:0.156
tensor(2.2179, grad_fn=<NllLossBackward>)
[1,0] loss:0.178
tensor(2.1495, grad_fn=<NllLossBackward>)
[1,0] loss:0.199
tensor(2.1512, grad_fn=<NllLossBackward>)
[1,0] loss:0.221
tensor(2.1843, grad_fn=<NllLossBackward>)
[1,0] loss:0.243
tensor(2.1182, grad_fn=<NllLossBackward>)
[1,0] loss:0.264
tensor(2.0814, grad_fn=<NllLossBackward>)
[1,0] loss:0.285
tensor(2.0294, grad_fn=<NllLossBackward>)
[1,0] loss:0.305
tensor(2.0764, grad_fn=<NllLossBackward>)
[1,0] loss:0.326
tensor(2.0531, grad_fn=<NllLossBackward>)
[1,0] loss:0.346
tensor(1.9775, grad_fn=<NllLossBackward>)
[1,0] loss:0.3

tensor(1.5644, grad_fn=<NllLossBackward>)
[2,1] loss:0.608
tensor(1.5232, grad_fn=<NllLossBackward>)
[2,1] loss:0.623
tensor(1.5217, grad_fn=<NllLossBackward>)
[2,1] loss:0.638
tensor(1.4950, grad_fn=<NllLossBackward>)
[2,1] loss:0.653
tensor(1.4869, grad_fn=<NllLossBackward>)
[2,1] loss:0.668
tensor(1.5626, grad_fn=<NllLossBackward>)
[2,1] loss:0.684
tensor(1.4862, grad_fn=<NllLossBackward>)
[2,1] loss:0.699
tensor(1.5019, grad_fn=<NllLossBackward>)
[2,1] loss:0.714
tensor(1.4924, grad_fn=<NllLossBackward>)
[2,1] loss:0.729
tensor(1.5535, grad_fn=<NllLossBackward>)
[2,1] loss:0.744
tensor(1.6130, grad_fn=<NllLossBackward>)
[2,1] loss:0.760
tensor(1.4699, grad_fn=<NllLossBackward>)
[2,1] loss:0.775
tensor(1.5503, grad_fn=<NllLossBackward>)
[2,1] loss:0.791
tensor(1.5070, grad_fn=<NllLossBackward>)
[2,1] loss:0.806
tensor(1.5415, grad_fn=<NllLossBackward>)
[2,1] loss:0.821
tensor(1.5127, grad_fn=<NllLossBackward>)
[2,1] loss:0.836
tensor(1.5165, grad_fn=<NllLossBackward>)
[2,1] loss:0.8

tensor(1.4092, grad_fn=<NllLossBackward>)
[3,2] loss:1.023
tensor(1.2454, grad_fn=<NllLossBackward>)
[3,2] loss:1.036
tensor(1.3402, grad_fn=<NllLossBackward>)
[3,2] loss:1.049
tensor(1.3616, grad_fn=<NllLossBackward>)
[3,2] loss:1.063
tensor(1.3166, grad_fn=<NllLossBackward>)
[3,2] loss:1.076
tensor(1.3154, grad_fn=<NllLossBackward>)
[3,2] loss:1.089
tensor(1.3211, grad_fn=<NllLossBackward>)
[3,2] loss:1.102
tensor(1.3668, grad_fn=<NllLossBackward>)
[3,2] loss:1.116
tensor(1.3220, grad_fn=<NllLossBackward>)
[3,2] loss:1.129
tensor(1.3131, grad_fn=<NllLossBackward>)
[3,2] loss:1.142
tensor(1.3128, grad_fn=<NllLossBackward>)
[3,2] loss:1.155
tensor(1.3595, grad_fn=<NllLossBackward>)
[3,2] loss:1.169
tensor(1.2586, grad_fn=<NllLossBackward>)
[3,2] loss:1.182
tensor(1.2826, grad_fn=<NllLossBackward>)
[3,2] loss:1.194
tensor(1.4015, grad_fn=<NllLossBackward>)
[3,2] loss:1.208
tensor(1.3405, grad_fn=<NllLossBackward>)
[3,2] loss:1.222
tensor(1.2437, grad_fn=<NllLossBackward>)
[3,2] loss:1.2

tensor(0.8479, grad_fn=<NllLossBackward>)
[5,4] loss:0.161
tensor(0.8603, grad_fn=<NllLossBackward>)
[5,4] loss:0.169
tensor(0.8769, grad_fn=<NllLossBackward>)
[5,4] loss:0.178
tensor(0.8301, grad_fn=<NllLossBackward>)
[5,4] loss:0.187
tensor(0.8369, grad_fn=<NllLossBackward>)
[5,4] loss:0.195
tensor(0.8551, grad_fn=<NllLossBackward>)
[5,4] loss:0.203
tensor(0.8931, grad_fn=<NllLossBackward>)
[5,4] loss:0.212
tensor(0.8157, grad_fn=<NllLossBackward>)
[5,4] loss:0.221
tensor(0.8259, grad_fn=<NllLossBackward>)
[5,4] loss:0.229
tensor(0.9069, grad_fn=<NllLossBackward>)
[5,4] loss:0.238
tensor(0.8658, grad_fn=<NllLossBackward>)
[5,4] loss:0.247
tensor(0.8193, grad_fn=<NllLossBackward>)
[5,4] loss:0.255
tensor(0.9229, grad_fn=<NllLossBackward>)
[5,4] loss:0.264
tensor(0.9003, grad_fn=<NllLossBackward>)
[5,4] loss:0.273
tensor(0.8297, grad_fn=<NllLossBackward>)
[5,4] loss:0.281
tensor(0.8719, grad_fn=<NllLossBackward>)
[5,4] loss:0.290
tensor(0.9715, grad_fn=<NllLossBackward>)
[5,4] loss:0.3

In [61]:
torch.save(net.state_dict(), './net.pth')

In [3]:
def train_student(model, device, train_loader, optimizer, epoch):
    model.train()
    trained_samples = 0
#     for batch_idx, (data, target) in enumerate(train_loader):
#         data, target = data.to(device), target.to(device)
#         optimizer.zero_grad()
#         output = model(data)
    for i, data in enumerate(train_loader):
        inputs, labels = data
        optimizer.zero_grad()
        outputs = model(inputs)

        loss = F.cross_entropy(outputs, labels)
        loss.backward()
        optimizer.step()

        trained_samples += len(data)
#         progress = math.ceil(i / len(train_loader) * 50)
        print("\rTrain epoch %d: %d/%d" %
              (epoch, trained_samples, len(train_loader.dataset)), end='')

In [4]:
def test_student(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
#         for data, target in test_loader:
#             data, target = data.to(device), target.to(device)
#             output = model(data)
#             test_loss += F.cross_entropy(output, target, reduction='sum').item()  # sum up batch loss
        for i, data in test_loader:
            inputs, labels = data
            outputs = model(inputs)
            test_loss += F.cross_entropy(outputs, labels, reduction='sum').item()  # sum up batch loss
            pred = outputs.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(labels.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest: average loss: {:.4f}, accuracy: {}/{} ({:.0f}%)'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    return test_loss, correct / len(test_loader.dataset)

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
import time
import os


In [17]:
transform = transforms.Compose(
    [
     transforms.RandomHorizontalFlip(),
     transforms.RandomGrayscale(),
     transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

transform1 = transforms.Compose(
    [
     transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
train_loader = torch.utils.data.DataLoader(trainset, batch_size=100,
                                          shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform1)
test_loader = torch.utils.data.DataLoader(testset, batch_size=50,
                                         shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Files already downloaded and verified
Files already downloaded and verified


In [23]:
def train_student(model, device, train_loader, optimizer, epoch):
    model.train()
    trained_samples = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()

        trained_samples += len(data)
#         progress = math.ceil(batch_idx / len(train_loader) * 50)
        print("\rTrain epoch %d: %d/%d" %
              (epoch, trained_samples, len(train_loader.dataset)), end='')


In [17]:
def test_student(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.cross_entropy(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest: average loss: {:.4f}, accuracy: {}/{} ({:.0f}%)'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    return test_loss, correct / len(test_loader.dataset)

In [18]:
def student_main():
    epochs = 10
    batch_size = 64
    torch.manual_seed(0)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#     train_loader = torch.utils.data.DataLoader(
#         datasets.MNIST('../data/MNIST', train=True, download=True,
#                        transform=transforms.Compose([
#                            transforms.ToTensor(),
#                            transforms.Normalize((0.1307,), (0.3081,))
#                        ])),
#         batch_size=batch_size, shuffle=True)
#     test_loader = torch.utils.data.DataLoader(
#         datasets.MNIST('../data/MNIST', train=False, download=True, transform=transforms.Compose([
#             transforms.ToTensor(),
#             transforms.Normalize((0.1307,), (0.3081,))
#         ])),
#         batch_size=1000, shuffle=True)

    model = MobileNet().to(device)
    optimizer = torch.optim.Adadelta(model.parameters())
    
    student_history = []
    
    for epoch in range(1, epochs + 1):
        train_student(model, device, train_loader, optimizer, epoch)
        loss, acc = test_student(model, device, test_loader)
        student_history.append((loss, acc))

    torch.save(model.state_dict(), "student.pt")
    return model, student_history

In [24]:
student_simple_model, student_simple_history = student_main()

Train epoch 1: 50000/50000
Test: average loss: 1.3215, accuracy: 5365/10000 (54%)
Train epoch 2: 50000/50000
Test: average loss: 0.9672, accuracy: 6680/10000 (67%)
Train epoch 3: 50000/50000
Test: average loss: 0.7808, accuracy: 7272/10000 (73%)
Train epoch 4: 50000/50000
Test: average loss: 0.7796, accuracy: 7416/10000 (74%)
Train epoch 5: 50000/50000
Test: average loss: 0.6480, accuracy: 7750/10000 (78%)
Train epoch 6: 50000/50000
Test: average loss: 0.6812, accuracy: 7708/10000 (77%)
Train epoch 7: 50000/50000
Test: average loss: 0.6059, accuracy: 8042/10000 (80%)
Train epoch 8: 50000/50000
Test: average loss: 0.6395, accuracy: 7975/10000 (80%)
Train epoch 9: 50000/50000
Test: average loss: 0.6851, accuracy: 7933/10000 (79%)
Train epoch 10: 50000/50000
Test: average loss: 0.6507, accuracy: 7973/10000 (80%)


In [13]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
import torch.utils.data


torch.manual_seed(0)
torch.cuda.manual_seed(0)

In [6]:
class VGG16(nn.Module):
    def __init__(self, num_classes=10):
        super(VGG16, self).__init__()
        self.features = nn.Sequential(
            #1
            nn.Conv2d(3,64,kernel_size=3,padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(True),
            #2
            nn.Conv2d(64,64,kernel_size=3,padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=2,stride=2),
            #3
            nn.Conv2d(64,128,kernel_size=3,padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(True),
            #4
            nn.Conv2d(128,128,kernel_size=3,padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=2,stride=2),
            #5
            nn.Conv2d(128,256,kernel_size=3,padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(True),
            #6
            nn.Conv2d(256,256,kernel_size=3,padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(True),
            #7
            nn.Conv2d(256,256,kernel_size=3,padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=2,stride=2),
            #8
            nn.Conv2d(256,512,kernel_size=3,padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(True),
            #9
            nn.Conv2d(512,512,kernel_size=3,padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(True),
            #10
            nn.Conv2d(512,512,kernel_size=3,padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=2,stride=2),
            #11
            nn.Conv2d(512,512,kernel_size=3,padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(True),
            #12
            nn.Conv2d(512,512,kernel_size=3,padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(True),
            #13
            nn.Conv2d(512,512,kernel_size=3,padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(True),
            nn.MaxPool2d(kernel_size=2,stride=2),
            nn.AvgPool2d(kernel_size=1,stride=1),
            )
        self.classifier = nn.Sequential(
            #14
            nn.Linear(512,4096),
            nn.ReLU(True),
            nn.Dropout(),
            #15
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            #16
            nn.Linear(4096,num_classes),
            )
        #self.classifier = nn.Linear(512, 10)

    def forward(self, x):
        out = self.features(x)
        #        print(out.shape)
        out = out.view(out.size(0), -1)
        #        print(out.shape)
        out = self.classifier(out)
        #        print(out.shape)
        return out

In [22]:
def train_teacher(model, device, train_loader, optimizer, epoch):
    model.train()
    trained_samples = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()

        trained_samples += len(data)
        progress = math.ceil(batch_idx / len(train_loader) * 50)
        print("\rTrain epoch %d: %d/%d, [%-51s] %d%%" %
              (epoch, trained_samples, len(train_loader.dataset),
               '-' * progress + '>', progress * 2), end='')


In [23]:
def test_teacher(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.cross_entropy(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest: average loss: {:.4f}, accuracy: {}/{} ({:.0f}%)'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    return test_loss, correct / len(test_loader.dataset)

In [25]:
def teacher_main():
    epochs = 50
    batch_size = 64
    torch.manual_seed(0)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#     train_loader = torch.utils.data.DataLoader(
#         datasets.MNIST('../data/MNIST', train=True, download=True,
#                        transform=transforms.Compose([
#                            transforms.ToTensor(),
#                            transforms.Normalize((0.1307,), (0.3081,))
#                        ])),
#         batch_size=batch_size, shuffle=True)
#     test_loader = torch.utils.data.DataLoader(
#         datasets.MNIST('../data/MNIST', train=False, download=True, transform=transforms.Compose([
#             transforms.ToTensor(),
#             transforms.Normalize((0.1307,), (0.3081,))
#         ])),
#         batch_size=1000, shuffle=True)

    model = VGG16().to(device)
    optimizer = torch.optim.Adadelta(model.parameters())
    
    teacher_history = []

    for epoch in range(1, epochs + 1):
        train_teacher(model, device, train_loader, optimizer, epoch)
        loss, acc = test_teacher(model, device, test_loader)
        
        teacher_history.append((loss, acc))

    torch.save(model.state_dict(), "teacher.pt")
    return model, teacher_history

In [25]:
teacher_model, teacher_history = teacher_main()

Train epoch 1: 50000/50000, [-------------------------------------------------->] 100%
Test: average loss: 1.8880, accuracy: 2846/10000 (28%)
Train epoch 2: 50000/50000, [-------------------------------------------------->] 100%
Test: average loss: 1.4881, accuracy: 3977/10000 (40%)
Train epoch 3: 50000/50000, [-------------------------------------------------->] 100%
Test: average loss: 1.2899, accuracy: 5523/10000 (55%)
Train epoch 4: 50000/50000, [-------------------------------------------------->] 100%
Test: average loss: 1.1057, accuracy: 6213/10000 (62%)
Train epoch 5: 50000/50000, [-------------------------------------------------->] 100%
Test: average loss: 0.9722, accuracy: 6767/10000 (68%)
Train epoch 6: 50000/50000, [-------------------------------------------------->] 100%
Test: average loss: 0.7948, accuracy: 7416/10000 (74%)
Train epoch 7: 50000/50000, [-------------------------------------------------->] 100%
Test: average loss: 1.0998, accuracy: 6904/10000 (69%)
Train 

In [18]:
import numpy as np
from matplotlib import pyplot as plt

def softmax_t(x, t):
    x_exp = np.exp(x / t)
    return x_exp / np.sum(x_exp)

# test_loader_bs1 = torch.utils.data.DataLoader(
#     datasets.MNIST('../data/MNIST', train=False, download=True, transform=transforms.Compose([
#         transforms.ToTensor(),
#         transforms.Normalize((0.1307,), (0.3081,))
#     ])),
#     batch_size=1, shuffle=True)
trainset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                        download=True, transform=transform)
test_loader_bs1 = torch.utils.data.DataLoader(trainset, batch_size=100,
                                          shuffle=True, num_workers=2)

Files already downloaded and verified


In [20]:
teacher_model.eval()
with torch.no_grad():
    data, target = next(iter(test_loader_bs1))
    data, target = data.to('cuda'), target.to('cuda')
    output = teacher_model(data)

test_x = data.cpu().numpy()
y_out = output.cpu().numpy()
y_out = y_out[0, ::]
print('Output (NO softmax):', y_out)



plt.subplot(3, 1, 1)
plt.imshow(test_x[0, 0, ::])

plt.subplot(3, 1, 2)
plt.bar(list(range(10)), softmax_t(y_out, 1), width=0.3)

plt.subplot(3, 1, 3)
plt.bar(list(range(10)), softmax_t(y_out, 10), width=0.3)
plt.show()

NameError: name 'teacher_model' is not defined

In [27]:
 teacherNet = VGG16()

In [28]:
 teacherNet.load_state_dict(torch.load("./teacher.pt"))

<All keys matched successfully>

In [29]:
teacherNet.eval()

VGG16(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU(inplace=True)
    (10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU(inplace=True)
    (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (14): Conv2d(128, 2

In [35]:
teacherNet.train(mode=False)
teacherNet = teacherNet.to(device)


In [None]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
with torch.no_grad():
    data, target = next(iter(test_loader_bs1))
    data, target = data.to('cuda'), target.to('cuda')
    output = teacherNet(data)

test_x = data.cpu().numpy()
y_out = output.cpu().numpy()
y_out = y_out[0, ::]
print('Output (NO softmax):', y_out)



plt.subplot(3, 1, 1)
plt.imshow(test_x[0, 0, ::])

plt.subplot(3, 1, 2)
plt.bar(list(range(10)), softmax_t(y_out, 1), width=0.3)

plt.subplot(3, 1, 3)
plt.bar(list(range(10)), softmax_t(y_out, 10), width=0.3)
plt.show()

Output (NO softmax): [-20.48947  -19.996756 -24.105139 -18.923903 -21.519308 -21.83338
 -20.308414 -19.067097 -18.018135  -6.518038]


In [31]:
def distillation(y, labels, teacher_scores, temp, alpha):
    return nn.KLDivLoss()(F.log_softmax(y / temp, dim=1), F.softmax(teacher_scores / temp, dim=1)) * (
            temp * temp * 2.0 * alpha) + F.cross_entropy(y, labels) * (1. - alpha)

In [34]:
def train_student_kd(model, device, train_loader, optimizer, epoch):
    model.train()
    trained_samples = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        teacher_output = teacherNet(data)
        teacher_output = teacherNet.detach()  # 切断老师网络的反向传播，感谢B站“淡淡的落”的提醒
        loss = distillation(output, target, teacher_output, temp=5.0, alpha=0.7)
        loss.backward()
        optimizer.step()

        trained_samples += len(data)
        progress = math.ceil(batch_idx / len(train_loader) * 50)
        print("\rTrain epoch %d: %d/%d, [%-51s] %d%%" %
              (epoch, trained_samples, len(train_loader.dataset),
               '-' * progress + '>', progress * 2), end='')

In [None]:
def test_student_kd(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.cross_entropy(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest: average loss: {:.4f}, accuracy: {}/{} ({:.0f}%)'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    return test_loss, correct / len(test_loader.dataset)