In [5]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

import torch
device = torch.device("cuda")
import torch.nn.functional as F
import wandb

wandb.init(project="Thomas-Masters-Project")

learning_rate = 0.001
epochs = 5
batch_size = 256

wandb.config = {
    "learning_rate": learning_rate,
    "epochs": epochs,
    "batch_size": batch_size
}

def test(model, test_loader):
    model.eval()
    correct = 0
    with torch.no_grad(): 
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output, *_ = model(data)
            pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
            _, idx = output.max(dim=1)
            correct += (idx == target).sum().item()

    accuracy = 100. * correct / len(test_loader.dataset)
    print('Test set: Accuracy: {}/{} ({:.0f}%)\n'.format(
        correct, len(test_loader.dataset), accuracy))

    wandb.log({"accuracy": accuracy})

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▅▇▆█
loss,█▆▆▅▄▄▄▄▃▃▃▃▃▃▃▂▃▂▂▂▂▂▂▁▂▂▁▁▁▂▁▂▁▁▁▁▁▁▁▁

0,1
accuracy,1.18
loss,0.02136


In [6]:
# MNIST

from torch.utils import data
from torchvision import datasets
from torchvision import transforms

train_loader = data.DataLoader(
        datasets.MNIST('./data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=batch_size, shuffle=True, drop_last=True)

test_loader = data.DataLoader(
        datasets.MNIST('./data', train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=2048, shuffle=False, drop_last=False)

In [10]:
# MNIST

import torch.nn as nn
from torch.optim import Adam

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 32, 5, 1),
            nn.BatchNorm2d(32),
            nn.ReLU(True),
            nn.MaxPool2d(2)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(32, 64, 5, 1),
            nn.BatchNorm2d(64),
            nn.ReLU(True),
            nn.MaxPool2d(2)
        )
        
        self.fc1 = nn.Sequential(
            nn.Linear(1024, 512),
            nn.ReLU(True)
        )
        self.fc2 = nn.Linear(512, 10)

    def forward(self, x):
        conv1 = self.conv1(x)
        conv2 = self.conv2(conv1)
        flatten = conv2.view(x.shape[0], -1)        
        fc1 = self.fc1(flatten)
        fc2 = self.fc2(fc1)
        return fc2, [conv1, conv2]

In [13]:
# MNIST

import numpy as np
from large_margin import LargeMarginLoss

lm = LargeMarginLoss(
    gamma=10000,
    alpha_factor=4,
    top_k=1,
    dist_norm=np.inf
)


def train_lm(model, train_loader, optimizer, epoch, lm):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.to(device)
        one_hot = torch.zeros(len(target), 10).scatter_(1, target.unsqueeze(1), 1.).float()
        one_hot = one_hot.cuda()
        optimizer.zero_grad()
        output, feature_maps = model(data)
        print(feature_maps[0].shape)
        #loss = F.mse_loss(output, target) * 5e-4 # l2_loss_weght
        loss = lm(output, one_hot, feature_maps)
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

net = Net().to(device)
# net = nn.DataParallel(net).to(device)
optim = Adam(net.parameters())
for i in range(0, epochs):
    train_lm(net, train_loader, optim, i, lm)
    test(net, test_loader)

torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size([256, 32, 12, 12])
torch.Size

KeyboardInterrupt: 

In [2]:
from torch.utils import data
from torchvision import datasets
from torchvision import transforms

train_loader = data.DataLoader(
        datasets.CIFAR100('./data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229, 0.224, 0.225])
                       ])),
        batch_size=batch_size, shuffle=True, drop_last=True)

test_loader = data.DataLoader(
        datasets.CIFAR100('./data', train=False,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize(mean=[0.485,0.456,0.406], std=[0.229, 0.224, 0.225])
                       ])),
        batch_size=2048, shuffle=False, drop_last=False)

Files already downloaded and verified


In [3]:
import torch.nn as nn
from torch.optim import Adam, SGD, RMSprop

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 32, 5, 1),
            nn.BatchNorm2d(32),
            nn.ReLU(True),
            nn.MaxPool2d(2)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(32, 64, 5, 1),
            nn.BatchNorm2d(64),
            nn.ReLU(True),
            nn.MaxPool2d(2)
        )
        
        self.fc1 = nn.Sequential(
            nn.Linear(1600, 1024),
            nn.ReLU(True)
        )

        self.fc2 = nn.Sequential(
            nn.Linear(1024, 512),
            nn.ReLU(True)
        )

        self.fc3 = nn.Linear(512, 100)

    def forward(self, x):
        conv1 = self.conv1(x)
        conv2 = self.conv2(conv1)
        flatten = conv2.view(x.shape[0], -1)       
        fc1 = self.fc1(flatten)
        fc2 = self.fc2(fc1)
        fc3 = self.fc3(fc2)
        return fc3, [conv1, conv2]

In [4]:
import numpy as np
from large_margin import LargeMarginLoss


lm = LargeMarginLoss(
    gamma=10000,
    alpha_factor=4,
    top_k=1,
    dist_norm=np.inf
)

def train_lm(model, train_loader, optimizer, epoch, lm):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.to(device)
        one_hot = torch.zeros(len(target), 100).scatter_(1, target.unsqueeze(1), 1.).float()
        one_hot = one_hot.cuda()
        optimizer.zero_grad()
        output, feature_maps = model(data)
        print(feature_maps.shape)
        #loss = F.mse_loss(output, target) * 5e-4 # l2_loss_weght
        loss = lm(output, one_hot, feature_maps)
        
        wandb.log({"loss": loss})
        # optional
        wandb.watch(model)
        
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

net = Net().to(device)
# net = nn.DataParallel(net).to(device)
optim = SGD(net.parameters(), lr=learning_rate, momentum=0) #Adam(net.parameters())
for i in range(0, epochs):
    train_lm(net, train_loader, optim, i, lm)
    test(net, test_loader)

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Test set: Accuracy: 90/10000 (1%)

Test set: Accuracy: 104/10000 (1%)

Test set: Accuracy: 113/10000 (1%)

Test set: Accuracy: 110/10000 (1%)

Test set: Accuracy: 118/10000 (1%)



In [None]:
def train_ce(model, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output, _ = model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))

net = Net().to(device)
# net = nn.DataParallel(net).to(device)
optim = Adam(net.parameters())
for i in range(0, epochs):    
    train_ce(net, train_loader, optim, i)
    test(net, test_loader)