In [None]:
import torch
import torch.nn as nn
from tqdm import tqdm
import matplotlib.pyplot as plt
from torchvision import datasets, transforms
import time

# Use the CUDA device when one is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Preprocessing: convert each image to a tensor, then normalize it with
# mean 0.1307 and standard deviation 0.3081.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
])

# Arguments shared by the training and test splits.
mnist_kwargs = dict(root='./data', download=True, transform=transform)
train_dataset = datasets.MNIST(train=True, **mnist_kwargs)
test_dataset = datasets.MNIST(train=False, **mnist_kwargs)

# Both loaders use the same batch size and reshuffle on every pass.
# NOTE(review): 1028 may be a typo for 1024 -- confirm before changing.
loader_kwargs = dict(batch_size=1028, shuffle=True)
train_loader = torch.utils.data.DataLoader(train_dataset, **loader_kwargs)
test_loader = torch.utils.data.DataLoader(test_dataset, **loader_kwargs)

In [None]:
class Adam:
    """Minimal re-implementation of the Adam optimizer for a ``torch.nn.Module``.

    One first-moment buffer (``mt``) and one second-moment buffer (``vt``) is
    kept per model parameter, in the order returned by ``model.parameters()``.

    Args:
        model: Module whose parameters are optimized.
        alpha: Step size (learning rate).
        beta1: Exponential decay rate of the first-moment estimate.
        beta2: Exponential decay rate of the second-moment estimate.
        epsilon: Constant added to the denominator for numerical stability.
    """

    def __init__(self, model, alpha=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.model = model
        self.mt = [torch.zeros_like(p) for p in model.parameters()]
        self.vt = [torch.zeros_like(p) for p in model.parameters()]
        # Number of calls to ``step``.
        self.t = 0
        # Number of updates actually applied to each parameter. This is what
        # drives the bias correction, so a parameter that was skipped because
        # it had no gradient is not over-corrected later.
        self.steps = [0] * len(self.mt)
        self.beta1 = beta1
        self.beta2 = beta2
        self.alpha = alpha
        self.epsilon = epsilon

    def zero_grad(self):
        """Replace every existing parameter gradient with a zero tensor."""
        for p in self.model.parameters():
            if p.grad is not None:
                p.grad = torch.zeros_like(p.grad)

    def step(self):
        """Apply one bias-corrected Adam update to every parameter that has a gradient.

        Parameters whose ``.grad`` is ``None`` (frozen, or not reached by the
        last backward pass) are left untouched instead of raising.
        """
        self.t += 1

        # The update itself must not be recorded by autograd.
        with torch.no_grad():
            params = self.model.parameters()
            for i, (p, m, v) in enumerate(zip(params, self.mt, self.vt)):
                grad = p.grad
                if grad is None:
                    continue

                self.steps[i] += 1
                k = self.steps[i]

                # Exponential moving averages of the gradient and its square.
                m = self.beta1 * m + (1 - self.beta1) * grad
                v = self.beta2 * v + (1 - self.beta2) * grad ** 2
                self.mt[i] = m
                self.vt[i] = v

                # Bias correction compensates for the zero initialization.
                m_hat = m / (1 - self.beta1 ** k)
                v_hat = v / (1 - self.beta2 ** k)

                p.sub_(self.alpha * m_hat / (v_hat.sqrt() + self.epsilon))

    def test(self):
        """Return the list of first-moment buffers (debugging helper)."""
        return self.mt

# model = torch.nn.Sequential(nn.Dropout(p=0.4), nn.Linear(28 * 28, 1200),
#                                 nn.Dropout(p=0.4), nn.Linear(1200, 10),
#                                 nn.LogSoftmax(dim=-1)).to(device)
# criterion = nn.NLLLoss()

# x = next(iter(train_loader))[0].view(-1, 28 * 28).to(device)
# y = next(iter(train_loader))[1].to(device)

# imp_optim = Adam(model)
# imp_optim.zero_grad()
# log_prob = model(x)
# loss = criterion(log_prob, y)
# loss.backward()
# imp_optim.step()
# imp_optim.test()[0].shape

In [None]:

def train(model, optimizer, loss_fct, nb_epochs=3, loader=None):
    """Train ``model`` and record the accuracy of every training batch.

    The accuracy of a batch is computed from the same training-mode forward
    pass that produced the loss (before the optimizer step), so it is a
    training accuracy rather than a held-out test accuracy.

    Args:
        model: Module that maps flattened ``28 * 28`` inputs to log-probabilities.
        optimizer: Object exposing ``zero_grad()`` and ``step()``.
        loss_fct: Callable ``loss_fct(log_prob, y)`` returning a scalar loss.
        nb_epochs: Number of full passes over the loader.
        loader: Iterable of ``(x, y)`` batches. Defaults to the notebook-level
            ``train_loader``.

    Returns:
        List with one accuracy value (in percent) per processed batch.
    """
    if loader is None:
        loader = train_loader

    batch_accuracies = []

    # Training mode is set once; nothing inside the loop needs eval mode.
    model.train()
    for _ in tqdm(range(nb_epochs)):
        for x, y in loader:
            x = x.view(-1, 28 * 28).to(device)
            y = y.to(device)

            log_prob = model(x)
            loss = loss_fct(log_prob, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            correct = (log_prob.argmax(-1) == y).sum().item()
            batch_accuracies.append(correct / y.shape[0] * 100)

    # Hand the model back in evaluation mode, as the original loop did.
    model.eval()
    return batch_accuracies


# Seed applied before each run. Reseeding puts the global torch RNG in the same
# state, so both optimizers start from identical initial weights and draw the
# same random stream afterwards.
SEED = 0


def build_model():
    """Create the dropout MLP used by both runs and move it to ``device``."""
    return torch.nn.Sequential(
        nn.Dropout(p=0.4), nn.Linear(28 * 28, 500),
        nn.Dropout(p=0.4), nn.Linear(500, 10),
        nn.LogSoftmax(dim=-1),
    ).to(device)


criterion = nn.NLLLoss()

# Run 1: the hand-written Adam optimizer.
torch.manual_seed(SEED)
model1 = build_model()
imp_optim = Adam(model1)

start_time = time.time()
imp_acc = train(model1, imp_optim, criterion)
end_time = time.time()
print(f"Training time for model1: {end_time - start_time} seconds\n\n\n")

#################################################################################################

# Run 2: the reference torch.optim.Adam, from the same starting point.
torch.manual_seed(SEED)
model2 = build_model()
pytorch_optim = torch.optim.Adam(model2.parameters())

start_time = time.time()
torch_acc = train(model2, pytorch_optim, criterion)
end_time = time.time()
print(f"Training time for model2: {end_time - start_time} seconds")

In [None]:
# Compare the per-batch training accuracy recorded for the two optimizers.
fig, ax = plt.subplots()
ax.plot(imp_acc, label="Implemented Adam")
ax.plot(torch_acc, label="PyTorch Adam")
ax.set_xlabel("Training batch")
ax.set_ylabel("Batch accuracy (%)")
ax.set_title("Hand-written vs. PyTorch Adam on MNIST")
ax.legend()
plt.show()