In [None]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np

from torch.optim.optimizer import Optimizer, required
from torch import Tensor
from torch.nn import Parameter

from tqdm import tqdm

In [5]:
import torchvision
from torch.utils.data import DataLoader,Dataset
from torchvision import datasets, transforms, models

In [2]:
def l2normalize(v, eps=1e-12):
    return v / (v.norm() + eps)


class SpectralNorm(nn.Module):
    def __init__(self, module, name='weight', power_iterations=1):
        super(SpectralNorm, self).__init__()
        self.module = module
        self.name = name
        self.power_iterations = power_iterations
        if not self._made_params():
            self._make_params()

    def _update_u_v(self):
        u = getattr(self.module, self.name + "_u")
        v = getattr(self.module, self.name + "_v")
        w = getattr(self.module, self.name + "_bar")

        height = w.data.shape[0]
        for _ in range(self.power_iterations):
            v.data = l2normalize(torch.mv(torch.t(w.view(height,-1).data), u.data))
            u.data = l2normalize(torch.mv(w.view(height,-1).data, v.data))

        # sigma = torch.dot(u.data, torch.mv(w.view(height,-1).data, v.data))
        sigma = u.dot(w.view(height, -1).mv(v))
        setattr(self.module, self.name, w / sigma.expand_as(w))

    def _made_params(self):
        try:
            u = getattr(self.module, self.name + "_u")
            v = getattr(self.module, self.name + "_v")
            w = getattr(self.module, self.name + "_bar")
            return True
        except AttributeError:
            return False


    def _make_params(self):
        w = getattr(self.module, self.name)

        height = w.data.shape[0]
        width = w.view(height, -1).data.shape[1]

        u = Parameter(w.data.new(height).normal_(0, 1), requires_grad=False)
        v = Parameter(w.data.new(width).normal_(0, 1), requires_grad=False)
        u.data = l2normalize(u.data)
        v.data = l2normalize(v.data)
        w_bar = Parameter(w.data)

        del self.module._parameters[self.name]

        self.module.register_parameter(self.name + "_u", u)
        self.module.register_parameter(self.name + "_v", v)
        self.module.register_parameter(self.name + "_bar", w_bar)


    def forward(self, *args):
        self._update_u_v()
        return self.module.forward(*args)

In [3]:
class Self_Attn(nn.Module):
    """ Self attention Layer"""
    def __init__(self,in_dim,activation):
        super(Self_Attn,self).__init__()
        self.chanel_in = in_dim
        self.activation = activation
        
        self.query_conv = nn.Conv2d(in_channels = in_dim , out_channels = in_dim//8 , kernel_size= 1)
        self.key_conv = nn.Conv2d(in_channels = in_dim , out_channels = in_dim//8 , kernel_size= 1)
        self.value_conv = nn.Conv2d(in_channels = in_dim , out_channels = in_dim , kernel_size= 1)
        self.gamma = nn.Parameter(torch.zeros(1))

        self.softmax  = nn.Softmax(dim=-1) #
    def forward(self,x):
        """
            inputs :
                x : input feature maps( B X C X W X H)
            returns :
                out : self attention value + input feature 
                attention: B X N X N (N is Width*Height)
        """
        m_batchsize,C,width ,height = x.size()
        proj_query  = self.query_conv(x).view(m_batchsize,-1,width*height).permute(0,2,1) # B X CX(N)
        proj_key =  self.key_conv(x).view(m_batchsize,-1,width*height) # B X C x (*W*H)
        energy =  torch.bmm(proj_query,proj_key) # transpose check
        attention = self.softmax(energy) # BX (N) X (N) 
        proj_value = self.value_conv(x).view(m_batchsize,-1,width*height) # B X C X N

        out = torch.bmm(proj_value,attention.permute(0,2,1) )
        out = out.view(m_batchsize,C,width,height)
        
        out = self.gamma*out + x
        return out,attention


class SelfAttention(nn.Module):
    """Discriminator, Auxiliary Classifier."""

    def __init__(self, batch_size=64, image_size=64, conv_dim=64):
        super(SelfAttention, self).__init__()
        self.imsize = image_size
        layer1 = []
        layer2 = []
        layer3 = []
        last = []

        layer1.append(SpectralNorm(nn.Conv2d(3, conv_dim, 4, 2, 1)))
        layer1.append(nn.LeakyReLU(0.1))

        curr_dim = conv_dim

        layer2.append(SpectralNorm(nn.Conv2d(curr_dim, curr_dim * 2, 4, 2, 1)))
        layer2.append(nn.LeakyReLU(0.1))
        curr_dim = curr_dim * 2

        layer3.append(SpectralNorm(nn.Conv2d(curr_dim, curr_dim * 2, 4, 2, 1)))
        layer3.append(nn.LeakyReLU(0.1))
        curr_dim = curr_dim * 2

        if self.imsize == 64:
            layer4 = []
            layer4.append(SpectralNorm(nn.Conv2d(curr_dim, curr_dim * 2, 4, 2, 1)))
            layer4.append(nn.LeakyReLU(0.1))
            self.l4 = nn.Sequential(*layer4)
            curr_dim = curr_dim*2
        self.l1 = nn.Sequential(*layer1)
        self.l2 = nn.Sequential(*layer2)
        self.l3 = nn.Sequential(*layer3)

        last.append(nn.Conv2d(curr_dim, 1, 4))
        last.append(nn.Flatten())
        last.append(nn.Linear(13*13,1))
        last.append(nn.Sigmoid())
        self.last = nn.Sequential(*last)

        self.attn1 = Self_Attn(256, 'relu')
        self.attn2 = Self_Attn(512, 'relu')

    def forward(self, x):
        out = self.l1(x)
        out = self.l2(out)
        out = self.l3(out)
        out,p1 = self.attn1(out)
        out=self.l4(out)
        out,p2 = self.attn2(out)
        out=self.last(out)
        # return out
        return out.squeeze(), p1, p2

In [12]:
# https://discuss.pytorch.org/t/balanced-sampling-between-classes-with-torchvision-dataloader/2703/3
def make_weights_for_balanced_classes(images, nclasses):                        
    count = [0] * nclasses                                                      
    for item in images:                                                         
        count[item[1]] += 1                                                     
    weight_per_class = [0.] * nclasses                                      
    N = float(sum(count))  
    print(count)
    for i in range(nclasses):                                                   
        weight_per_class[i] = N/float(count[i])   
    print(weight_per_class)
    weight = [0] * len(images)                                              
    for idx, val in enumerate(images):                                          
        weight[idx] = weight_per_class[val[1]]                                  
    return weight   


In [4]:
model = SelfAttention()

In [8]:
model = model.cuda()

In [10]:
train_transforms = transforms.Compose([transforms.Resize(256),
                                       transforms.RandomHorizontalFlip(p=0.5),
                                       transforms.RandomApply([
                                           transforms.RandomRotation(5),
                                           transforms.RandomAffine(degrees=5,scale=(0.95,1.05))
                                           ], p=0.5),
                                       transforms.ToTensor(),
                                       transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225])
                                       
                                       ])
train_data = datasets.ImageFolder('/data/tam/kaggle/extract_raw_img',       
                    transform=train_transforms)



In [13]:
weights = make_weights_for_balanced_classes(train_data.imgs, len(train_data.classes))                                                                
weights = torch.DoubleTensor(weights)                                       
sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights)) 

[594342, 548450]
[1.9227851977480979, 2.0836758136566687]


In [14]:
trainloader = torch.utils.data.DataLoader(train_data, batch_size=16,                             
                    sampler = sampler, num_workers=1, pin_memory=True)

In [15]:
test_data = datasets.ImageFolder('/data/tam/kaggle/extract_raw_img_test',       
                    transform=train_transforms)

testloader = torch.utils.data.DataLoader(test_data, batch_size=16,num_workers=1, pin_memory=True)

In [17]:
criterion = nn.BCELoss().cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [23]:
epochs = 1
steps = 0
running_loss = 0
print_every = 100
train_losses, test_losses = [], []
for epoch in range(epochs):
    for inputs, labels in tqdm(trainloader):
#     for inputs, labels in tqdm(testloader):
        model.train()
        steps += 1
#         labels = np.array([labels])
        inputs, labels = inputs.to(device), labels.float().to(device)
#         inputs, labels = inputs.to(device), labels[1].float().to(device)

        optimizer.zero_grad()
        logps,_,_ = model.forward(inputs)
        loss = criterion(logps, labels)
#         loss = F.binary_cross_entropy_with_logits(logps, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        
        if steps % print_every == 0:
            print(loss)
            test_loss = 0
            accuracy = 0
            model.eval()
            with torch.no_grad():
                for inputs, labels in testloader:
                    inputs, labels = inputs.to(device),labels.float().to(device)
                    logps,_,_ = model.forward(inputs)
                    batch_loss = criterion(logps, labels)
                    test_loss += batch_loss.item()
                    equals = labels == (logps >0.5)
                    accuracy += torch.mean(equals.type(torch.FloatTensor)).item()
            print(f"Epoch {epoch+1}/{epochs}.. "
                  f"Train loss: {running_loss/print_every:.3f}.. "
                  f"Test loss: {test_loss/len(testloader):.3f}.. "
                  f"Test accuracy: {accuracy/len(testloader):.3f}")
            running_loss = 0
            model.train()




  0%|          | 0/71425 [00:00<?, ?it/s][A[A[A


  0%|          | 1/71425 [00:00<8:40:03,  2.29it/s][A[A[A


  0%|          | 4/71425 [00:00<6:19:11,  3.14it/s][A[A[A


  0%|          | 7/71425 [00:00<4:39:23,  4.26it/s][A[A[A


  0%|          | 10/71425 [00:00<3:28:58,  5.70it/s][A[A[A


  0%|          | 13/71425 [00:00<2:39:55,  7.44it/s][A[A[A


  0%|          | 16/71425 [00:01<2:06:36,  9.40it/s][A[A[A


  0%|          | 19/71425 [00:01<1:42:19, 11.63it/s][A[A[A


  0%|          | 22/71425 [00:01<1:25:20, 13.94it/s][A[A[A


  0%|          | 25/71425 [00:01<1:13:30, 16.19it/s][A[A[A


  0%|          | 28/71425 [00:01<1:04:51, 18.35it/s][A[A[A


  0%|          | 31/71425 [00:01<59:44, 19.92it/s]  [A[A[A


  0%|          | 34/71425 [00:01<55:59, 21.25it/s][A[A[A


  0%|          | 37/71425 [00:01<53:10, 22.37it/s][A[A[A


  0%|          | 40/71425 [00:01<51:20, 23.17it/s][A[A[A


  0%|          | 43/71425 [00:02<49:56, 23.82it/s][A[A

tensor(0.7338, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)





  0%|          | 97/71425 [00:19<46:10, 25.75it/s][A[A[A


  0%|          | 100/71425 [00:32<56:41:08,  2.86s/it][A[A[A


  0%|          | 103/71425 [00:32<39:54:54,  2.01s/it][A[A[A

Epoch 1/1.. Train loss: 0.694.. Test loss: 0.717.. Test accuracy: 0.359





  0%|          | 106/71425 [00:32<28:10:23,  1.42s/it][A[A[A


  0%|          | 109/71425 [00:32<19:56:57,  1.01s/it][A[A[A


  0%|          | 112/71425 [00:33<14:11:38,  1.40it/s][A[A[A


  0%|          | 115/71425 [00:33<10:09:58,  1.95it/s][A[A[A


  0%|          | 118/71425 [00:33<7:20:34,  2.70it/s] [A[A[A


  0%|          | 121/71425 [00:33<5:22:25,  3.69it/s][A[A[A


  0%|          | 124/71425 [00:33<3:59:35,  4.96it/s][A[A[A


  0%|          | 127/71425 [00:33<3:02:15,  6.52it/s][A[A[A


  0%|          | 130/71425 [00:33<2:21:32,  8.40it/s][A[A[A


  0%|          | 133/71425 [00:33<1:53:38, 10.46it/s][A[A[A


  0%|          | 136/71425 [00:33<1:33:07, 12.76it/s][A[A[A


  0%|          | 139/71425 [00:34<1:19:29, 14.95it/s][A[A[A


  0%|          | 142/71425 [00:34<1:09:20, 17.13it/s][A[A[A


  0%|          | 145/71425 [00:34<1:03:03, 18.84it/s][A[A[A


  0%|          | 148/71425 [00:34<57:53, 20.52it/s]  [A[A[A


  0%|          | 

tensor(0.6482, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)





  0%|          | 199/71425 [00:49<47:40, 24.90it/s][A[A[A


  0%|          | 200/71425 [01:04<168:48:34,  8.53s/it][A[A[A


  0%|          | 203/71425 [01:04<118:24:16,  5.98s/it][A[A[A

Epoch 1/1.. Train loss: 0.686.. Test loss: 0.871.. Test accuracy: 0.347





  0%|          | 206/71425 [01:05<83:06:43,  4.20s/it] [A[A[A


  0%|          | 209/71425 [01:05<58:24:18,  2.95s/it][A[A[A


  0%|          | 212/71425 [01:05<41:06:45,  2.08s/it][A[A[A


  0%|          | 215/71425 [01:05<29:00:36,  1.47s/it][A[A[A


  0%|          | 218/71425 [01:05<20:32:29,  1.04s/it][A[A[A


  0%|          | 221/71425 [01:05<14:36:29,  1.35it/s][A[A[A


  0%|          | 224/71425 [01:05<10:27:27,  1.89it/s][A[A[A


  0%|          | 227/71425 [01:05<7:32:52,  2.62it/s] [A[A[A


  0%|          | 230/71425 [01:05<5:30:39,  3.59it/s][A[A[A


  0%|          | 233/71425 [01:06<4:05:04,  4.84it/s][A[A[A


  0%|          | 236/71425 [01:06<3:05:15,  6.40it/s][A[A[A


  0%|          | 239/71425 [01:06<2:23:18,  8.28it/s][A[A[A


  0%|          | 242/71425 [01:06<1:53:57, 10.41it/s][A[A[A


  0%|          | 245/71425 [01:06<1:33:29, 12.69it/s][A[A[A


  0%|          | 248/71425 [01:06<1:19:12, 14.98it/s][A[A[A


  0%|        

tensor(0.6206, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)





  0%|          | 299/71425 [01:19<46:39, 25.41it/s][A[A[A


  0%|          | 300/71425 [01:36<167:50:55,  8.50s/it][A[A[A


  0%|          | 303/71425 [01:37<117:43:43,  5.96s/it][A[A[A

Epoch 1/1.. Train loss: 0.680.. Test loss: 0.833.. Test accuracy: 0.198





  0%|          | 306/71425 [01:37<82:38:52,  4.18s/it] [A[A[A


  0%|          | 309/71425 [01:37<58:05:32,  2.94s/it][A[A[A


  0%|          | 312/71425 [01:37<40:54:02,  2.07s/it][A[A[A


  0%|          | 315/71425 [01:37<28:52:27,  1.46s/it][A[A[A


  0%|          | 318/71425 [01:37<20:27:23,  1.04s/it][A[A[A


  0%|          | 321/71425 [01:37<14:33:08,  1.36it/s][A[A[A


  0%|          | 324/71425 [01:37<10:26:08,  1.89it/s][A[A[A


  0%|          | 327/71425 [01:38<7:32:17,  2.62it/s] [A[A[A


  0%|          | 330/71425 [01:38<5:31:24,  3.58it/s][A[A[A


  0%|          | 333/71425 [01:38<4:06:22,  4.81it/s][A[A[A


  0%|          | 336/71425 [01:38<3:07:04,  6.33it/s][A[A[A


  0%|          | 339/71425 [01:38<2:25:27,  8.14it/s][A[A[A


  0%|          | 342/71425 [01:38<1:56:22, 10.18it/s][A[A[A


  0%|          | 345/71425 [01:38<1:35:50, 12.36it/s][A[A[A


  0%|          | 348/71425 [01:38<1:21:34, 14.52it/s][A[A[A


  0%|        

tensor(0.6914, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)





  1%|          | 399/71425 [01:59<46:03, 25.70it/s][A[A[A


  1%|          | 400/71425 [02:09<167:39:59,  8.50s/it][A[A[A


  1%|          | 403/71425 [02:09<117:35:43,  5.96s/it][A[A[A

Epoch 1/1.. Train loss: 0.698.. Test loss: 0.687.. Test accuracy: 0.538





  1%|          | 406/71425 [02:09<82:33:09,  4.18s/it] [A[A[A


  1%|          | 409/71425 [02:09<58:01:13,  2.94s/it][A[A[A


  1%|          | 412/71425 [02:09<40:51:29,  2.07s/it][A[A[A


  1%|          | 415/71425 [02:09<28:51:15,  1.46s/it][A[A[A


  1%|          | 418/71425 [02:09<20:25:58,  1.04s/it][A[A[A


  1%|          | 421/71425 [02:09<14:32:46,  1.36it/s][A[A[A


  1%|          | 424/71425 [02:10<10:26:00,  1.89it/s][A[A[A


  1%|          | 427/71425 [02:10<7:32:50,  2.61it/s] [A[A[A


  1%|          | 430/71425 [02:10<5:31:35,  3.57it/s][A[A[A


  1%|          | 433/71425 [02:10<4:07:09,  4.79it/s][A[A[A


  1%|          | 436/71425 [02:10<3:07:23,  6.31it/s][A[A[A


  1%|          | 439/71425 [02:10<2:26:06,  8.10it/s][A[A[A


  1%|          | 442/71425 [02:10<1:56:56, 10.12it/s][A[A[A


  1%|          | 445/71425 [02:10<1:36:36, 12.24it/s][A[A[A


  1%|          | 448/71425 [02:11<1:22:04, 14.41it/s][A[A[A


  1%|        

tensor(0.7042, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)





  1%|          | 499/71425 [02:29<54:09, 21.83it/s][A[A[A


  1%|          | 500/71425 [02:41<166:19:45,  8.44s/it][A[A[A


  1%|          | 503/71425 [02:41<116:39:57,  5.92s/it][A[A[A

Epoch 1/1.. Train loss: 0.685.. Test loss: 0.727.. Test accuracy: 0.473





  1%|          | 506/71425 [02:41<81:54:29,  4.16s/it] [A[A[A


  1%|          | 509/71425 [02:41<57:34:48,  2.92s/it][A[A[A


  1%|          | 512/71425 [02:41<40:33:08,  2.06s/it][A[A[A


  1%|          | 515/71425 [02:41<28:38:01,  1.45s/it][A[A[A


  1%|          | 518/71425 [02:42<20:17:07,  1.03s/it][A[A[A


  1%|          | 521/71425 [02:42<14:26:53,  1.36it/s][A[A[A


  1%|          | 524/71425 [02:42<10:22:01,  1.90it/s][A[A[A


  1%|          | 527/71425 [02:42<7:29:59,  2.63it/s] [A[A[A


  1%|          | 530/71425 [02:42<5:29:51,  3.58it/s][A[A[A


  1%|          | 533/71425 [02:42<4:05:09,  4.82it/s][A[A[A


  1%|          | 536/71425 [02:42<3:06:38,  6.33it/s][A[A[A


  1%|          | 539/71425 [02:42<2:25:07,  8.14it/s][A[A[A


  1%|          | 542/71425 [02:43<1:56:33, 10.14it/s][A[A[A


  1%|          | 545/71425 [02:43<1:36:00, 12.31it/s][A[A[A


  1%|          | 548/71425 [02:43<1:21:23, 14.51it/s][A[A[A


  1%|        

tensor(0.6046, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)





  1%|          | 599/71425 [02:59<49:41, 23.75it/s][A[A[A


  1%|          | 600/71425 [03:13<167:28:10,  8.51s/it][A[A[A


  1%|          | 603/71425 [03:13<117:28:38,  5.97s/it][A[A[A

Epoch 1/1.. Train loss: 0.684.. Test loss: 0.703.. Test accuracy: 0.664





  1%|          | 606/71425 [03:13<82:28:38,  4.19s/it] [A[A[A


  1%|          | 609/71425 [03:14<57:58:27,  2.95s/it][A[A[A


  1%|          | 612/71425 [03:14<40:49:40,  2.08s/it][A[A[A


  1%|          | 615/71425 [03:14<28:50:19,  1.47s/it][A[A[A


  1%|          | 618/71425 [03:14<20:25:37,  1.04s/it][A[A[A


  1%|          | 621/71425 [03:14<14:32:56,  1.35it/s][A[A[A


  1%|          | 624/71425 [03:14<10:25:59,  1.89it/s][A[A[A


  1%|          | 627/71425 [03:14<7:32:11,  2.61it/s] [A[A[A


  1%|          | 630/71425 [03:14<5:31:49,  3.56it/s][A[A[A


  1%|          | 633/71425 [03:15<4:06:47,  4.78it/s][A[A[A


  1%|          | 636/71425 [03:15<3:08:04,  6.27it/s][A[A[A


  1%|          | 639/71425 [03:15<2:27:05,  8.02it/s][A[A[A


  1%|          | 642/71425 [03:15<1:57:36, 10.03it/s][A[A[A


  1%|          | 645/71425 [03:15<1:38:04, 12.03it/s][A[A[A


  1%|          | 648/71425 [03:15<1:23:22, 14.15it/s][A[A[A


  1%|        

tensor(0.6711, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)





  1%|          | 699/71425 [03:29<49:00, 24.05it/s][A[A[A


  1%|          | 700/71425 [03:45<166:15:07,  8.46s/it][A[A[A


  1%|          | 703/71425 [03:46<116:36:32,  5.94s/it][A[A[A

Epoch 1/1.. Train loss: 0.679.. Test loss: 0.746.. Test accuracy: 0.314





  1%|          | 706/71425 [03:46<81:51:42,  4.17s/it] [A[A[A


  1%|          | 709/71425 [03:46<57:33:16,  2.93s/it][A[A[A


  1%|          | 712/71425 [03:46<40:31:59,  2.06s/it][A[A[A


  1%|          | 715/71425 [03:46<28:37:09,  1.46s/it][A[A[A


  1%|          | 718/71425 [03:46<20:16:33,  1.03s/it][A[A[A


  1%|          | 721/71425 [03:46<14:26:17,  1.36it/s][A[A[A


  1%|          | 724/71425 [03:46<10:20:36,  1.90it/s][A[A[A


  1%|          | 727/71425 [03:47<7:28:36,  2.63it/s] [A[A[A


  1%|          | 730/71425 [03:47<5:28:10,  3.59it/s][A[A[A


  1%|          | 733/71425 [03:47<4:04:06,  4.83it/s][A[A[A


  1%|          | 736/71425 [03:47<3:05:08,  6.36it/s][A[A[A


  1%|          | 739/71425 [03:47<2:23:43,  8.20it/s][A[A[A


  1%|          | 742/71425 [03:47<1:54:43, 10.27it/s][A[A[A


  1%|          | 745/71425 [03:47<1:34:26, 12.47it/s][A[A[A


  1%|          | 748/71425 [03:47<1:20:12, 14.69it/s][A[A[A


  1%|        

tensor(0.7042, device='cuda:0', grad_fn=<BinaryCrossEntropyBackward>)





  1%|          | 799/71425 [04:09<48:44, 24.15it/s][A[A[A


  1%|          | 800/71425 [04:17<164:54:25,  8.41s/it][A[A[A


  1%|          | 803/71425 [04:18<115:39:37,  5.90s/it][A[A[A

Epoch 1/1.. Train loss: 0.672.. Test loss: 0.777.. Test accuracy: 0.517





  1%|          | 806/71425 [04:18<81:11:59,  4.14s/it] [A[A[A

KeyboardInterrupt: 