In [1]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

In [2]:
train=datasets.MNIST('data/MNIST_data/', train=True, download=True,
                     transform=transforms.Compose([transforms.ToTensor()]))
test=datasets.MNIST('data/MNIST_data/',
                    train=False,
                    download=True,
                    transform=transforms.Compose([transforms.ToTensor()]))

In [3]:
x=train.data.float()/255
y=train.targets

x=x.view(x.size(0), -1)
print(x.size(), y.size())

input_size=x.size(-1)
output_size=int(max(y))+1
print(input_size, output_size)

torch.Size([60000, 784]) torch.Size([60000])
784 10


In [4]:
ratios=[0.8, 0.2]
train_cnt = int(x.size(0) * ratios[0])
valid_cnt = int(x.size(0) * ratios[1])
test_cnt=len(test.data)
cnts=[train_cnt, valid_cnt]
print(cnts)

[48000, 12000]


In [5]:
indices=torch.randperm(x.size(0))
print(indices)
x=torch.index_select(x, dim=0, index=indices)
y=torch.index_select(y, dim=0, index=indices)

x=list(x.split(cnts, dim=0))
y=list(y.split(cnts, dim=0))

tensor([10621, 43798, 56664,  ...,  8893, 54656, 56144])


In [6]:
print(y[0].size(), y[1].size())

torch.Size([48000]) torch.Size([12000])


In [7]:
x+=[(test.data.float()/255).view(test_cnt, -1)]
y+=[test.targets]

for x_i, y_i in zip(x, y):
    print(x_i.size(), y_i.size())

torch.Size([48000, 784]) torch.Size([48000])
torch.Size([12000, 784]) torch.Size([12000])
torch.Size([10000, 784]) torch.Size([10000])


In [8]:
class Block(nn.Module):
    def __init__(self,
                 input_size,
                 output_size,
                 use_batch_norm=True,
                 dropout_p=0.4):
        self.input_size=input_size
        self.output_size=output_size
        self.use_batch_norm = use_batch_norm
        self.dropout_p=dropout_p

        super().__init__()

        def get_regulaizer(use_batch_norm, size):
            return nn.BatchNorm1d(size) if use_batch_norm else nn.Dropout(dropout_p)
        
        self.block=nn.Sequential(
            nn.Linear(input_size, output_size),
            nn.LeakyReLU(),
            get_regulaizer(use_batch_norm, output_size)
        )
    def forward(self, x):
        y=self.block(x)
        return y

In [9]:
class MyModel(nn.Module):
    def __init__(self, input_size, output_size, use_batch_norm=True, dropout_p=0.4 ) :
        super().__init__()

        self.layers=nn.Sequential(
            Block(input_size, 500, use_batch_norm, dropout_p),
            Block(500, 400, use_batch_norm, dropout_p),
            Block(400, 300, use_batch_norm, dropout_p),
            Block(300, 200, use_batch_norm, dropout_p),
            Block(200, 100, use_batch_norm, dropout_p),
            nn.Linear(100, output_size),
            nn.LogSoftmax(dim=-1)
        )
    
    def forward(self, x) :
        y= self.layers(x)
        return y

In [10]:
model=MyModel(input_size, output_size, use_batch_norm=True)

In [11]:
crit = nn.NLLLoss()
optimizer = optim.Adam(model.parameters())

In [12]:
device=torch.device('cpu')
if torch.cuda.is_available():
    device=torch.device('cuda')

In [13]:
model=model.to(device)
x=[x_i.to(device) for x_i in x]
y=[y_i.to(device) for y_i in y]

In [14]:
# train
n_epochs=10000
batch_size=256
print_interval=10

In [16]:
from copy import deepcopy

lowest_loss=np.inf
best_model=None
early_stop=100
lowest_epoch=np.inf

In [19]:
train_history, valid_history = [], []

for i in range(n_epochs):
    model.train()
    indices=torch.randperm(x[0].size(0)).to(device)
    x_=torch.index_select(x[0], dim=0, index=indices)
    y_=torch.index_select(y[0], dim=0, index=indices)
    x_=x_.split(batch_size, dim=0)
    y_=y_.split(batch_size, dim=0)

    train_loss, valid_loss=0,0

    y_hat = []

    for x_i, y_i in zip(x_, y_):
        y_hat_i = model(x_i)
        loss=crit(y_hat_i, y_i.squeeze())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss+=float(loss)

    train_loss=train_loss/len(x_)

    model.eval()
    with torch.no_grad():
        x_=x[1].split(batch_size, dim=0)
        y_=y[1].split(batch_size, dim=0)

        valid_loss=0

        for x_i, y_i in zip(x_, y_):
            y_hat_i=model(x_i)
            loss=crit(y_hat_i, y_i.squeeze())

            valid_loss+=float(loss)
            y_hat += [y_hat_i]

    valid_loss=valid_loss/len(x_i)

    train_history+=[train_loss]
    valid_history+=[valid_loss]

    if(i+1) % print_interval==0: 
        print('Epoch : ',(i+1), 'train loss : ',train_loss, 'valid_loss : ', valid_loss, 'lowest_loss : ', lowest_loss)

    if valid_loss <= lowest_loss :
        lowest_loss = valid_loss
        lowest_epoch=i

        best_model=deepcopy(model.state_dict())
    else:
        if early_stop > 0 and lowest_epoch + early_stop < i+1:
            print("There is no improvement duraing last %d epoch"%early_stop)
            break
    print("best validation loss from epoch %d : %4e" %(lowest_epoch, lowest_epoch))

    model.load_state_dict(best_model)

best validation loss from epoch 0 : 0.000000e+00
best validation loss from epoch 0 : 0.000000e+00
best validation loss from epoch 0 : 0.000000e+00
best validation loss from epoch 0 : 0.000000e+00
best validation loss from epoch 0 : 0.000000e+00
best validation loss from epoch 0 : 0.000000e+00
best validation loss from epoch 0 : 0.000000e+00
best validation loss from epoch 0 : 0.000000e+00
best validation loss from epoch 0 : 0.000000e+00
Epoch :  10 train loss :  0.004153110943826369 valid_loss :  0.018450313059604793 lowest_loss :  0.016080166508800176
best validation loss from epoch 0 : 0.000000e+00
best validation loss from epoch 0 : 0.000000e+00
best validation loss from epoch 0 : 0.000000e+00
best validation loss from epoch 0 : 0.000000e+00
best validation loss from epoch 0 : 0.000000e+00
best validation loss from epoch 0 : 0.000000e+00
best validation loss from epoch 0 : 0.000000e+00
best validation loss from epoch 0 : 0.000000e+00
best validation loss from epoch 0 : 0.000000e+00
