# Solution for Overfitting
- More data
- Reduce # of features
- Regularization
- Dropout

In [1]:
import torch
import torchvision.datasets as dsets
import torchvision.transforms as transforms
import random

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
learning_rate = 1e-3
training_epochs = 20
batch_size = 100
drop_prob = 0.3

In [3]:
mnist_train = dsets.MNIST(root='MNIST_data', train=True, transform=transforms.ToTensor())
mnist_test = dsets.MNIST(root='MNIST_data', train=False, transform=transforms.ToTensor())

data_loader = torch.utils.data.DataLoader(dataset=mnist_train, batch_size=batch_size, shuffle=True, drop_last=True)

In [4]:
linear1 = torch.nn.Linear(784, 512, bias=True)
linear2 = torch.nn.Linear(512, 512, bias=True)
linear3 = torch.nn.Linear(512, 512, bias=True)
linear4 = torch.nn.Linear(512, 512, bias=True)
linear5 = torch.nn.Linear(512, 10, bias=True)
relu = torch.nn.ReLU()
dropout = torch.nn.Dropout(p=drop_prob)

torch.nn.init.xavier_uniform_(linear1.weight)
torch.nn.init.xavier_uniform_(linear2.weight)
torch.nn.init.xavier_uniform_(linear3.weight)
torch.nn.init.xavier_uniform_(linear4.weight)
torch.nn.init.xavier_uniform_(linear5.weight)

Parameter containing:
tensor([[-0.0357,  0.0963, -0.0819,  ..., -0.1016,  0.0583, -0.0413],
        [ 0.0895,  0.0749,  0.0632,  ..., -0.0441, -0.0761,  0.0131],
        [-0.0832, -0.0944, -0.0843,  ..., -0.0613, -0.0363, -0.0687],
        ...,
        [ 0.0169, -0.0403,  0.0759,  ..., -0.0437,  0.0595, -0.0811],
        [-0.0350,  0.0225,  0.0283,  ...,  0.0437,  0.0443,  0.0868],
        [ 0.0080, -0.0915, -0.0996,  ...,  0.0929,  0.0489,  0.0404]],
       requires_grad=True)

In [5]:
model = torch.nn.Sequential(linear1, relu, dropout,
                            linear2, relu, dropout,
                            linear3, relu, dropout,
                            linear4, relu, dropout,
                            linear5).to(device)
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [6]:
total_batch = len(data_loader)
model.train()
for epoch in range(training_epochs):
    avg_cost =0
    for X,Y in data_loader:
        X = X.view(-1,28*28).to(device)
        Y = Y.to(device)
        
        optimizer.zero_grad()
        hypothesis = model(X)
        cost = criterion(hypothesis, Y)
        cost.backward()
        optimizer.step()
        
        avg_cost += cost/total_batch        
    print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.9f}'.format(avg_cost))

Epoch: 0001 cost = 0.312456757
Epoch: 0002 cost = 0.141547784
Epoch: 0003 cost = 0.112969115
Epoch: 0004 cost = 0.095832258
Epoch: 0005 cost = 0.078776084
Epoch: 0006 cost = 0.074406125
Epoch: 0007 cost = 0.067496806
Epoch: 0008 cost = 0.064080909
Epoch: 0009 cost = 0.058568526
Epoch: 0010 cost = 0.054047011
Epoch: 0011 cost = 0.050450530
Epoch: 0012 cost = 0.050119083
Epoch: 0013 cost = 0.050072830
Epoch: 0014 cost = 0.045038395
Epoch: 0015 cost = 0.041415732
Epoch: 0016 cost = 0.044126023
Epoch: 0017 cost = 0.041531879
Epoch: 0018 cost = 0.039127368
Epoch: 0019 cost = 0.037724327
Epoch: 0020 cost = 0.038225565


In [7]:
with torch.no_grad():
    model.eval()
    
    X_test = mnist_test.test_data.view(-1, 28*28).float().to(device)
    Y_test = mnist_test.test_labels.to(device)
    
    prediction = model(X_test)
    correct_prediction = torch.argmax(prediction, 1) == Y_test
    accuracy = correct_prediction.float().mean()
    print('Accuracy:', accuracy.item())

Accuracy: 0.9837999939918518




In [8]:
print(torch.argmax(prediction, 1))
print(torch.argmax(prediction, 1).shape)
print(torch.argmax(prediction, 1) == Y_test)
print((torch.argmax(prediction, 1) == Y_test).float().mean())

tensor([7, 2, 1,  ..., 4, 5, 6], device='cuda:0')
torch.Size([10000])
tensor([True, True, True,  ..., True, True, True], device='cuda:0')
tensor(0.9838, device='cuda:0')
