## Lab 2
### Part 2: Dealing with overfitting

Today we work with [Fashion-MNIST dataset](https://github.com/zalandoresearch/fashion-mnist) (*hint: it is available in `torchvision`*).

Your goal for today:
1. Train a FC (fully-connected) network that achieves >= 0.885 test accuracy.
2. Cause considerable overfitting by modifying the network (e.g. increasing the number of network parameters and/or layers) and demonstrate in in the appropriate way (e.g. plot loss and accurasy on train and validation set w.r.t. network complexity).
3. Try to deal with overfitting (at least partially) by using regularization techniques (Dropout/Batchnorm/...) and demonstrate the results.

__Please, write a small report describing your ideas, tries and achieved results in the end of this file.__

*Note*: Tasks 2 and 3 are interrelated, in task 3 your goal is to make the network from task 2 less prone to overfitting. Task 1 is independent from 2 and 3.

*Note 2*: We recomment to use Google Colab or other machine with GPU acceleration.

In [1]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import torchsummary
from IPython.display import clear_output
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure
import numpy as np
import os


device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [2]:
# Technical function
def mkdir(path):
    if not os.path.exists(root_path):
        os.mkdir(root_path)
        print('Directory', path, 'is created!')
    else:
        print('Directory', path, 'already exists!')
        
root_path = 'fmnist'
mkdir(root_path)

Directory fmnist already exists!


In [3]:
download = True
train_transform = transforms.ToTensor()
test_transform = transforms.ToTensor()
transforms.Compose((transforms.ToTensor()))


fmnist_dataset_train = torchvision.datasets.FashionMNIST(root_path, 
                                                        train=True, 
                                                        transform=train_transform,
                                                        target_transform=None,
                                                        download=download)
fmnist_dataset_test = torchvision.datasets.FashionMNIST(root_path, 
                                                       train=False, 
                                                       transform=test_transform,
                                                       target_transform=None,
                                                       download=download)

In [4]:
train_loader = torch.utils.data.DataLoader(fmnist_dataset_train, batch_size=128, shuffle=True, num_workers=2)
test_loader = torch.utils.data.DataLoader(fmnist_dataset_test, batch_size=256, shuffle=False, num_workers=2)

In [5]:
len(fmnist_dataset_test)

10000

In [6]:
for img, label in train_loader:
    print(img.shape)
    #print(img)
    print(label.shape)
    print(label.size(0))
    break

torch.Size([128, 1, 28, 28])
torch.Size([128])
128


### Task 1
Train a network that achieves $\geq 0.885$ test accuracy. It's fine to use only Linear (`nn.Linear`) layers and activations/dropout/batchnorm. Convolutional layers might be a great use, but we will meet them a bit later.

In [7]:
class TinyNeuralNetwork(nn.Module):
    def __init__(self, input_shape=28*28, num_classes=10, input_channels=1):
        super(self.__class__, self).__init__()
        self.model = nn.Sequential(
            nn.Flatten(), # This layer converts image into a vector to use Linear layers afterwards
            nn.Linear(input_shape, 256),
            nn.Tanh(),
            nn.Linear(256, 512),
            nn.Tanh(),
            nn.Linear(512, num_classes),
        )
        
    def forward(self, inp):       
        out = self.model(inp)
        return out

In [8]:
torchsummary.summary(TinyNeuralNetwork().to(device), (28*28,))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1                  [-1, 784]               0
            Linear-2                  [-1, 256]         200,960
              Tanh-3                  [-1, 256]               0
            Linear-4                  [-1, 512]         131,584
              Tanh-5                  [-1, 512]               0
            Linear-6                   [-1, 10]           5,130
Total params: 337,674
Trainable params: 337,674
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.02
Params size (MB): 1.29
Estimated Total Size (MB): 1.31
----------------------------------------------------------------


Your experiments come here:

In [9]:
def train_epoch(model, device, train_loader, test_loader, criterion, optimizer): 
    torch.manual_seed(5)

    model.train()

    current_test_accuracy = 0
    running_loss = 0
    pred = 0
    correct = 0
    for data, label in train_loader:
        data, label = data.to(device), label.to(device)

        optimizer.zero_grad()
        output = model(data)

        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(label.view_as(pred)).sum().item()
    
    current_loss = running_loss / len(train_loader.dataset)
    print(f'Current train loss = {current_loss}')

    current_train_accuracy = correct / len(train_loader.dataset)
    print(f'Current train accuracy = {current_train_accuracy}')

    model.eval()

    current_test_loss = 0
    current_test_accuracy = 0 

    test_loss = 0
    correct = 0   
    running_loss = 0
    with torch.no_grad():
        for data, label in test_loader:
            data, label = data.to(device), label.to(device)
            output = model(data)

            test_loss = criterion(output, label)
            running_loss += test_loss.item()

            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(label.view_as(pred)).sum().item()

    current_test_loss = running_loss / len(test_loader.dataset) 
    print(f'Current test loss = {current_test_loss}')

    current_test_accuracy = correct / len(test_loader.dataset)
    print(f'Current test accuracy = {current_test_accuracy}')

    return current_loss, current_train_accuracy, current_test_loss, current_test_accuracy



In [10]:
def train_model(epochs, model, device, train_loader, test_loader, loss_func, opt):
    test_loss_history = list()
    train_loss_history = list()
    test_ac_history = list()
    train_ac_history = list()

    train_loss = 0
    test_loss = 0
    test_ac = 0
    train_ac = 0
    for epoch in range(1, epochs+1):
        
        print(f'Training Epoch {epoch}')
        train_loss, train_ac, test_loss, test_ac = train_epoch(model, device, train_loader, test_loader, loss_func, opt)
        

        train_loss_history.append(train_loss)
        test_loss_history.append(test_loss)
        test_ac_history.append(test_ac)
        train_ac_history.append(train_ac)

    return train_loss_history, test_loss_history, train_ac_history, test_ac_history

In [11]:
model = TinyNeuralNetwork().to(device)
opt = torch.optim.Adam(model.parameters(), lr=0.0003) # YOUR CODE HERE
loss_func = nn.CrossEntropyLoss() # YOUR CODE HERE
epochs = 30

In [12]:
train_loss_history, test_loss_history, train_ac_history, test_ac_history = train_model(epochs, model, device, train_loader, test_loader, loss_func, opt)

Training Epoch 1
Current train loss = 0.004723517412443956
Current train accuracy = 0.79045
Current test loss = 0.0019249307334423065
Current test accuracy = 0.8235
Training Epoch 2
Current train loss = 0.003233499656120936
Current train accuracy = 0.8511333333333333
Current test loss = 0.001774142238497734
Current test accuracy = 0.838
Training Epoch 3
Current train loss = 0.0029774306101103625
Current train accuracy = 0.8627833333333333
Current test loss = 0.001687738673388958
Current test accuracy = 0.8433
Training Epoch 4
Current train loss = 0.002810122769822677
Current train accuracy = 0.8703833333333333
Current test loss = 0.0016249188616871833
Current test accuracy = 0.8505
Training Epoch 5
Current train loss = 0.0026813768977920214
Current train accuracy = 0.8756666666666667
Current test loss = 0.0015755114004015922
Current test accuracy = 0.8545
Training Epoch 6
Current train loss = 0.0025752032237748306
Current train accuracy = 0.8796166666666667
Current test loss = 0.001536

In [None]:
grid = np.arange(1, epochs + 1)

plt.plot(grid, test_ac_history, label='test accuracy')
plt.plot(grid, train_ac_history, label='train accuracy')
plt.xlabel("epoch")
plt.title("accuracy")
plt.show()

plt.plot(grid, train_loss_history, label='train loss')
plt.plot(grid, test_loss_history, label='test loss')
plt.xlabel("epoch")
plt.legend()
plt.show()


In [None]:
print(f'Test accuracy = {test_ac_history[-1]}')

### Task 2: Overfit it.
Build a network that will overfit to this dataset. Demonstrate the overfitting in the appropriate way (e.g. plot loss and accurasy on train and test set w.r.t. network complexity).

*Note:* you also might decrease the size of `train` dataset to enforce the overfitting and speed up the computations.

In [None]:
class OverfittingNeuralNetwork(nn.Module):
    def __init__(self, input_shape=28*28, num_classes=10, input_channels=1):
        super(self.__class__, self).__init__()
        self.model = nn.Sequential(
            nn.Flatten(), # This layer converts image into a vector to use Linear layers afterwards
            # Your network structure comes here
            nn.Linear(input_shape, 28*28*2),
            nn.ReLU(),
            nn.Linear(28*28*2, 512),
            nn.Tanh(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 100),
            nn.ReLU(),
            nn.Linear(100, num_classes)
        )
        
    def forward(self, inp):       
        out = self.model(inp)
        return out

In [None]:
torchsummary.summary(OverfittingNeuralNetwork().to(device), (28*28,))

In [None]:
model_over = OverfittingNeuralNetwork().to(device)
opt = torch.optim.Adam(model_over.parameters(), lr=0.0003) # YOUR CODE HERE
loss_func = nn.CrossEntropyLoss() # YOUR CODE HERE

# Your experiments, come here
epochs = 50

train_loss_history, test_loss_history, train_ac_history, test_ac_history = train_model(epochs, model_over, device, train_loader, test_loader, loss_func, opt)

grid = np.arange(1, epochs + 1)

plt.plot(grid, test_ac_history, label='test accuracy')
plt.plot(grid, train_ac_history, label='train accuracy')
plt.xlabel("epoch")
plt.title("accuracy")
plt.show()

plt.plot(grid, train_loss_history, label='train loss')
plt.plot(grid, test_loss_history, label='test loss')
plt.xlabel("epoch")
plt.legend()
plt.show()


На графиках видно переобучение модели, график train loss уменьшается, а test loss начинает быстро расти

### Task 3: Fix it.
Fix the overfitted network from the previous step (at least partially) by using regularization techniques (Dropout/Batchnorm/...) and demonstrate the results. 

In [None]:
class FixedNeuralNetwork(nn.Module):
    def __init__(self, input_shape=28*28, num_classes=10, input_channels=1):
        super(self.__class__, self).__init__()
        self.model = nn.Sequential(
            nn.Flatten(), # This layer converts image into a vector to use Linear layers afterwards
            # Your network structure comes here
            nn.Linear(input_shape, 28*28*2),
            nn.BatchNorm1d(28*28*2),
            nn.Dropout(0.8),
            nn.ReLU(),
            nn.Linear(28*28*2, 512),
            nn.Dropout(0.8),
            nn.Tanh(),
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.8),
            nn.Linear(256, 100),
            nn.Dropout(0.8),
            nn.ReLU(),
            nn.Linear(100, num_classes)
        )
        
    def forward(self, inp):       
        out = self.model(inp)
        return out

In [None]:
torchsummary.summary(FixedNeuralNetwork().to(device), (28*28,))

In [None]:
model_fix = FixedNeuralNetwork().to(device)
opt = torch.optim.Adam(model_fix.parameters(), lr=0.0003, weight_decay=0.008) # YOUR CODE HERE
loss_func = nn.CrossEntropyLoss() # YOUR CODE HERE# YOUR CODE HERE

# Your experiments, come here
epochs = 20

train_loss_history, test_loss_history, train_ac_history, test_ac_history = train_model(epochs, model_fix, device, train_loader, test_loader, loss_func, opt)

grid = np.arange(1, epochs + 1)

plt.plot(grid, test_ac_history, label='test accuracy')
plt.plot(grid, train_ac_history, label='train accuracy')
plt.xlabel("epoch")
plt.title("accuracy")
plt.show()

plt.plot(grid, train_loss_history, label='train loss')
plt.plot(grid, test_loss_history, label='test loss')
plt.xlabel("epoch")
plt.legend()
plt.show()

### Conclusions:
_Write down small report with your conclusions and your ideas._

Для борьбы с переобучением использовались Batch normalization и Dropout, но они не помогали. Дополнение L2 регулярицацией помогло.
