In [2]:
%matplotlib inline
import torch
from torch.utils.data import TensorDataset, DataLoader  
from torch.nn import init
import torch.optim as optim 
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
from collections import OrderedDict
import numpy as np
import time 
from IPython import display
from matplotlib import pyplot as plt

In [4]:
def dropout(X, drop_prob):
    """Apply inverted dropout to ``X``.

    Every element is zeroed independently with probability ``drop_prob``;
    the surviving elements are divided by ``1 - drop_prob`` so that the
    expected value of the output matches the input.

    Args:
        X: Input tensor of any shape. It is converted to float32 first.
        drop_prob: Probability of zeroing an element, within ``[0, 1]``.

    Returns:
        A float32 tensor with the same shape as ``X``.

    Raises:
        AssertionError: If ``drop_prob`` lies outside ``[0, 1]``.
    """
    X = X.float()
    assert 0 <= drop_prob <= 1
    keep_prob = 1 - drop_prob
    # Everything is dropped: return zeros directly instead of dividing by 0.
    if keep_prob == 0:
        return torch.zeros_like(X)
    # rand_like allocates the random mask on the same device as X, so the
    # multiplication below also works for inputs that do not live on the CPU.
    mask = (torch.rand_like(X) < keep_prob).float()
    return mask * X / keep_prob

In [6]:
# Toy 2x8 tensor holding the integers 0..15.
X = torch.arange(16).reshape(2, 8)

In [14]:
# Layer widths: 784 input features, two hidden layers of 256 units, 10 outputs.
num_inputs = 784
num_outputs = 10
num_hiddens1 = 256
num_hiddens2 = 256

# Weights are sampled with NumPy from a normal distribution (mean 0, standard
# deviation 0.01) and stored as float32 tensors that track gradients.
# Biases start at zero.
w1 = torch.tensor(
    np.random.normal(loc=0, scale=0.01, size=(num_inputs, num_hiddens1)),
    dtype=torch.float,
    requires_grad=True,
)
b1 = torch.zeros(num_hiddens1, requires_grad=True)
w2 = torch.tensor(
    np.random.normal(loc=0, scale=0.01, size=(num_hiddens1, num_hiddens2)),
    dtype=torch.float,
    requires_grad=True,
)
b2 = torch.zeros(num_hiddens2, requires_grad=True)
w3 = torch.tensor(
    np.random.normal(loc=0, scale=0.01, size=(num_hiddens2, num_outputs)),
    dtype=torch.float,
    requires_grad=True,
)
b3 = torch.zeros(num_outputs, requires_grad=True)

# Order matters: consumers unpack this list as (w1, b1, w2, b2, w3, b3).
params = [w1, b1, w2, b2, w3, b3]

In [8]:
# Data split: build the training and test loaders.
def load_data_fashion_mnist(batch_size, root='./Dataset/FashionMNIST', num_workers=4):
    """Create Fashion-MNIST training and test ``DataLoader`` objects.

    The dataset is downloaded into ``root`` if it is not already there, and
    every image goes through ``transforms.ToTensor()``.

    Args:
        batch_size: Number of samples per mini-batch for both loaders.
        root: Directory where the dataset is stored / downloaded.
        num_workers: Number of worker processes used by each loader.

    Returns:
        A ``(train_iter, test_iter)`` tuple of data loaders.
    """
    to_tensor = transforms.ToTensor()
    mnist_train = torchvision.datasets.FashionMNIST(root=root, train=True,
                                                    download=True, transform=to_tensor)
    mnist_test = torchvision.datasets.FashionMNIST(root=root, train=False,
                                                   download=True, transform=to_tensor)
    train_iter = DataLoader(mnist_train, batch_size=batch_size, shuffle=True,
                            num_workers=num_workers)
    # Evaluation gains nothing from shuffling; a fixed order keeps metrics that
    # are averaged per batch reproducible from one run to the next.
    test_iter = DataLoader(mnist_test, batch_size=batch_size, shuffle=False,
                           num_workers=num_workers)
    return train_iter, test_iter

# Build both loaders with mini-batches of 256 samples.
train_iter, test_iter = load_data_fashion_mnist(256)

In [9]:
def network(X, params, drop_prob1, drop_prob2, is_training=True):
    """Forward pass of a two-hidden-layer MLP with dropout.

    Args:
        X: Input batch; it is flattened to ``(-1, w1.shape[0])``.
        params: Sequence ``(w1, b1, w2, b2, w3, b3)`` of weights and biases.
        drop_prob1: Dropout probability applied after the first hidden layer.
        drop_prob2: Dropout probability applied after the second hidden layer.
        is_training: When true, dropout is applied to both hidden
            activations; when false, dropout is skipped entirely.

    Returns:
        The raw output scores ``h2 @ w3 + b3`` (no softmax applied).
    """
    w1, b1, w2, b2, w3, b3 = params
    # Take the flatten width from the weight matrix that is actually used,
    # rather than from a module-level constant, so the function only depends
    # on its arguments. reshape also accepts non-contiguous inputs.
    X = X.reshape(-1, w1.shape[0])
    h1 = (torch.matmul(X, w1) + b1).relu()
    if is_training:
        h1 = dropout(h1, drop_prob1)
    h2 = (torch.matmul(h1, w2) + b2).relu()
    if is_training:
        h2 = dropout(h2, drop_prob2)
    return torch.matmul(h2, w3) + b3

In [28]:
def softmax(y):
    """Row-wise softmax over dimension 1.

    Args:
        y: Tensor of scores with at least two dimensions; normalisation is
            carried out along ``dim=1``.

    Returns:
        A tensor shaped like ``y`` whose entries along ``dim=1`` sum to 1.
    """
    # Softmax is invariant to adding a constant to every score in a row.
    # Subtracting the row maximum keeps exp() from overflowing to inf, which
    # would otherwise turn the division below into inf / inf = NaN.
    row_max = y.max(dim=1, keepdim=True)[0].detach()
    y_exp = (y - row_max).exp()
    partition = y_exp.sum(dim=1, keepdim=True)
    return y_exp / partition

In [20]:
def sgd(params, lr):
    """Perform one in-place gradient-descent step: ``param -= lr * grad``.

    Args:
        params: Iterable of tensors to update.
        lr: Learning rate multiplying each gradient.

    Parameters whose ``.grad`` is ``None`` (no backward pass has reached them
    yet) are left untouched instead of raising.
    """
    # no_grad lets us modify leaf tensors in place without the update being
    # recorded by autograd, and without going through the legacy `.data` alias.
    with torch.no_grad():
        for param in params:
            if param.grad is None:
                continue
            param -= lr * param.grad

In [21]:
def crossentrpyloss(y_hat, y):
    """Mean negative log-likelihood of the true classes.

    Args:
        y_hat: Tensor of per-class probabilities, indexed along ``dim=1``.
        y: Integer class indices, one per row of ``y_hat``; reshaped to a
            column so it can be used with ``gather``.

    Returns:
        A scalar tensor: ``-mean(log(y_hat[i, y[i]]))``.
    """
    picked = y_hat.gather(1, y.view(-1, 1))
    # A probability that underflowed to exactly 0 would give log(0) = -inf and
    # NaN gradients. Flooring at the smallest normal float keeps the loss
    # finite and leaves every ordinary probability unchanged.
    if picked.is_floating_point():
        picked = picked.clamp_min(torch.finfo(picked.dtype).tiny)
    return -(torch.log(picked).mean())

In [35]:
def evaluate_accuracy(data_iter, params, drop_prob1, drop_prob2, network):
    """Compute classification accuracy of ``network`` over ``data_iter``.

    Args:
        data_iter: Iterable yielding ``(X, y)`` batches.
        params: Parameters forwarded to ``network`` when it is a plain
            function accepting an ``is_training`` argument.
        drop_prob1: Forwarded to ``network`` in that same case.
        drop_prob2: Forwarded to ``network`` in that same case.
        network: Either an ``nn.Module`` (called as ``network(X)`` in eval
            mode), a callable with an ``is_training`` parameter (called with
            ``is_training=False``), or any other callable taking ``X`` only.

    Returns:
        Fraction of samples whose arg-max prediction equals the label.

    Raises:
        ZeroDivisionError: If ``data_iter`` yields no samples.
    """
    import inspect  # local import: only needed for the signature check below

    is_module = isinstance(network, nn.Module)

    # Decide once how to call the model. Inspect the real parameters instead of
    # `__code__.co_varnames`, which also lists local variables and is missing
    # on callables that are not plain functions.
    takes_flag = False
    if not is_module:
        try:
            takes_flag = 'is_training' in inspect.signature(network).parameters
        except (TypeError, ValueError):
            takes_flag = False

    # Remember the current mode so the model is handed back as it was found.
    was_training = network.training if is_module else False
    if is_module:
        network.eval()

    acc_num, n = 0, 0
    try:
        # Evaluation needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            for X, y in data_iter:
                if takes_flag:
                    y_hat = network(X, params, drop_prob1, drop_prob2, is_training=False)
                else:
                    y_hat = network(X)
                acc_num += (y_hat.argmax(dim=1, keepdim=True) == y.view(-1, 1)).float().sum().item()
                n += len(y)
    finally:
        if is_module:
            network.train(was_training)
    return acc_num / n

In [45]:
def train_loop_dropout(n_epoch, drop_prob1, drop_prob2,
                       train_iter, test_iter,
                       model_fn, loss_fn,
                       params=None, lr=None, optimizer=None):
    """Train ``model_fn`` with dropout and report metrics every fifth epoch.

    Args:
        n_epoch: Number of passes over ``train_iter``.
        drop_prob1: Dropout probability forwarded to ``model_fn``.
        drop_prob2: Dropout probability forwarded to ``model_fn``.
        train_iter: Iterable of ``(X, y)`` training batches.
        test_iter: Iterable of ``(X, y)`` evaluation batches.
        model_fn: Called as ``model_fn(X, params, drop_prob1, drop_prob2)``
            for training and with ``is_training=False`` for evaluation; its
            output is passed through ``softmax`` before the loss.
        loss_fn: Called as ``loss_fn(probabilities, labels)``.
        params: Parameters handed to ``model_fn`` and, when no optimizer is
            given, updated by ``sgd``.
        lr: Learning rate for ``sgd``; used only when ``optimizer`` is None.
        optimizer: Optional optimizer object; when given, its ``zero_grad``
            and ``step`` replace the manual gradient reset and ``sgd`` call.

    Returns:
        The ``params`` object that was passed in.
    """
    for epoch in range(n_epoch):
        train_loss, train_acc_num, n, i = 0.0, 0.0, 0, 0
        for train_X, train_y in train_iter:
            y_hat = softmax(model_fn(train_X, params, drop_prob1, drop_prob2))
            loss = loss_fn(y_hat, train_y)

            # Reset gradients left over from the previous step; backward()
            # accumulates into .grad rather than overwriting it.
            if optimizer is not None:
                optimizer.zero_grad()
            elif params is not None:
                for param in params:
                    # Check each parameter individually: some may not have
                    # received a gradient yet even if the first one has.
                    if param.grad is not None:
                        param.grad.zero_()

            loss.backward()

            if optimizer is not None:
                optimizer.step()
            else:
                sgd(params, lr)

            train_loss += loss.item()
            train_acc_num += (y_hat.argmax(dim=1) == train_y).float().sum().item()
            n += len(train_y)
            i += 1

        # Report on epochs 0, 5, 10, ...
        if epoch % 5 == 0:
            test_loss, j = 0.0, 0
            # Pure evaluation: no gradients are needed, so do not record an
            # autograd graph for the forward passes over the test set.
            with torch.no_grad():
                acc_test = evaluate_accuracy(test_iter, params, drop_prob1, drop_prob2, model_fn)
                for test_X, test_y in test_iter:
                    j += 1
                    test_probs = softmax(model_fn(test_X, params, drop_prob1, drop_prob2, is_training=False))
                    test_loss += loss_fn(test_probs, test_y).item()
            acc_train = train_acc_num / n
            # Both losses are averages of the per-batch loss values.
            print('epoch: %d, acc_train: %.2f, acc_test: %.2f, train_loss: %.2f, test_loss, %.2f'% (epoch, acc_train, acc_test, train_loss/i, test_loss/j))
    return params

In [46]:
# Train for 20 epochs using the hand-written SGD update (no optimizer object),
# with dropout probabilities 0.2 and 0.5 for the two hidden layers.
train_loop_dropout(
    n_epoch=20,
    drop_prob1=0.2,
    drop_prob2=0.5,
    train_iter=train_iter,
    test_iter=test_iter,
    model_fn=network,
    loss_fn=crossentrpyloss,
    params=params,
    lr=0.1,
    optimizer=None,
)

epoch: 0, acc_train: 0.89, acc_test: 0.87, train_loss: 0.31, test_loss, 0.36
epoch: 5, acc_train: 0.89, acc_test: 0.87, train_loss: 0.29, test_loss, 0.37
epoch: 10, acc_train: 0.90, acc_test: 0.88, train_loss: 0.28, test_loss, 0.33
epoch: 15, acc_train: 0.90, acc_test: 0.88, train_loss: 0.26, test_loss, 0.33


[tensor([[-0.0018, -0.0108,  0.0001,  ...,  0.0147, -0.0201, -0.0013],
         [ 0.0044, -0.0094,  0.0123,  ...,  0.0019, -0.0214, -0.0026],
         [-0.0109,  0.0055, -0.0287,  ..., -0.0040,  0.0047,  0.0048],
         ...,
         [-0.0208, -0.0056,  0.0045,  ...,  0.0038,  0.0127,  0.0063],
         [-0.0134, -0.0008,  0.0065,  ...,  0.0019,  0.0055, -0.0002],
         [-0.0061, -0.0039, -0.0010,  ...,  0.0008,  0.0105,  0.0035]],
        requires_grad=True),
 tensor([-4.4109e-02,  2.1628e-02, -2.5281e-03,  5.1956e-03,  3.4606e-03,
          1.7530e-01,  5.2560e-03, -8.0741e-02, -1.2479e-01,  1.0875e-03,
          5.8505e-02, -5.6759e-02,  8.8973e-03,  9.6511e-02,  1.2680e-01,
          2.0120e-01,  7.6371e-02,  1.3513e-01, -1.1088e-01,  2.7709e-02,
          5.4412e-04,  3.8776e-02,  1.7626e-01,  1.6261e-02,  9.2947e-02,
         -2.2806e-01,  7.3626e-02, -3.5554e-02,  1.6702e-01,  1.0270e-03,
          5.4554e-02, -4.9989e-02,  1.4277e-01,  1.9133e-01,  1.5309e-03,
          7.