In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import Subset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.nn import CosineSimilarity
from torchvision.transforms import ToTensor
import torchvision.models as models
from pathlib import Path
import PIL.Image
import random
import math
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import ExponentialLR

In [None]:
# Debug aid: with anomaly detection on, autograd raises as soon as a backward op
# produces NaN and reports the forward call that created it.
# NOTE(review): PyTorch documents this switch as debugging-only because it slows
# execution noticeably — consider turning it off for full training runs.
torch.autograd.set_detect_anomaly(True)

In [None]:
path = Path('../input')

In [None]:
len(list((path/'train'/'train').iterdir()))

In [None]:
tr_df = pd.read_csv(path/'train.csv')

In [None]:
tr_df.head()

In [None]:
len(tr_df)

In [None]:
class CactusDataset(Dataset):
    """Map-style dataset yielding ``(image_tensor, label)`` pairs.

    Args:
        path: Directory (a ``pathlib.Path``) that contains the image files.
        labels: Mapping from image file name to its label.
    """

    def __init__(self, path, labels):
        self.path = path
        self.labels = labels
        # Freeze the key order once so an integer index always maps to the same file.
        self.flist = list(labels.keys())
        self.len = len(self.flist)

    def __getitem__(self, index):
        """Load the ``index``-th image and return it together with its label."""
        fname = self.flist[index]
        # PIL opens files lazily; the context manager guarantees the handle is
        # released even if decoding fails part-way through.
        with PIL.Image.open(self.path/fname) as image:
            img = ToTensor()(image)
        label = self.labels[fname]
        return img, label

    def __len__(self):
        """Number of labelled files in the dataset."""
        return self.len

In [None]:
va_ratio = 0.1

In [None]:
# Split from a freshly loaded frame so this cell is idempotent: splitting `tr_df`
# in place would shrink the training set again on every re-run of the cell.
all_df = pd.read_csv(path/'train.csv')
tr_df, va_df = train_test_split(all_df, train_size=1-va_ratio, test_size=va_ratio, random_state=42, stratify=all_df['has_cactus'])

In [None]:
# Map every training image id to its has_cactus label, then wrap the image folder.
tr_labels = dict(zip(tr_df['id'], tr_df['has_cactus']))
tr_ds = CactusDataset(path/'train'/'train', tr_labels)

In [None]:
# Same construction for the validation rows; the images live in the same folder.
va_labels = dict(zip(va_df['id'], va_df['has_cactus']))
va_ds = CactusDataset(path/'train'/'train', va_labels)

In [None]:
len(tr_ds), len(va_ds)

In [None]:
bs = 64  # batch size passed to both DataLoaders
nw = 0  # num_workers passed to both DataLoaders

In [None]:
# Reshuffle every epoch so the mini-batches differ between epochs; drop_last keeps
# every training batch at exactly `bs` samples.
tr_dl = DataLoader(tr_ds, batch_size=bs, num_workers=nw, shuffle=True, drop_last=True, pin_memory=True)

In [None]:
# Validation has to score every held-out sample, so the final partial batch is kept.
va_dl = DataLoader(va_ds, batch_size=bs, num_workers=nw, drop_last=False, pin_memory=True)

In [None]:
class ArcMarginProduct(nn.Module):
    """Margin-penalised softmax loss over per-class similarity scores.

    The margin ``m`` is subtracted from the target-class score only, every score
    is then multiplied by ``s``, and the result goes through cross-entropy.

    Args:
        s: Scale applied to all scores before the softmax.
        m: Margin subtracted from the target-class score.
        easy_margin: Stored on the module; ``forward`` does not read it.
    """

    def __init__(self, s=32.0, m=0.50, easy_margin=True):
        super().__init__()
        self.s, self.m = s, m
        self.easy_margin = easy_margin

        # Trigonometric constants derived from the margin. They are kept as
        # attributes, but forward() does not use them.
        self.cos_m, self.sin_m = math.cos(m), math.sin(m)
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, cosine, label):
        """Return the scalar loss for one batch.

        Args:
            cosine: ``(batch, num_classes)`` score matrix.
            label: Integer class index for every row of ``cosine``.
        """
        # 1.0 in each row's target column, 0.0 everywhere else.
        target_mask = torch.zeros_like(cosine).scatter_(1, label.view(-1, 1), 1)
        penalised = cosine - self.m

        # Penalised score on the target column, untouched score on the rest.
        logits = (target_mask * penalised) + ((1.0 - target_mask) * cosine)
        return F.cross_entropy(logits * self.s, label)

In [None]:
##################################  Arcface head #############################################################
import math
class Arcface(nn.Module):
    """ArcFace loss: additive angular margin on the target class, then softmax CE.

    For each row the target-class score ``cos(theta)`` is replaced by
    ``cos(theta + m)``; all scores are then multiplied by ``s`` and passed to
    cross-entropy. Reference: ArcFace, https://arxiv.org/abs/1801.07698.

    Args:
        s: Scale applied to the scores (default 64, see NormFace
            https://arxiv.org/abs/1704.06369).
        m: Angular margin in radians (default 0.5).
    """

    # Lower bound for sin^2(theta) before the square root; see forward().
    _SIN2_FLOOR = 1e-12

    def __init__(self, s=64., m=0.5):
        super(Arcface, self).__init__()
        self.s = s
        self.set_m(m)

    def set_m(self, m):
        """Set the margin and refresh the constants derived from it."""
        self.m = m
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Kept as attributes; forward() does not read these two.
        self.mm = self.sin_m * m
        self.threshold = math.cos(math.pi - m)

    def forward(self, cos_theta, label):
        """Return the scalar loss for one batch.

        Args:
            cos_theta: ``(batch, num_classes)`` cosine scores. The tensor is not
                modified; values outside ``[-1, 1]`` are clamped on a copy.
            label: 1-D tensor with the integer class index of every row.
        """
        # Out-of-place clamp: the caller keeps using its tensor for metrics, and an
        # in-place clamp would also be rejected on a leaf tensor that requires grad.
        cos_theta = cos_theta.clamp(-1.0, 1.0)

        # d/dx sqrt(x) is infinite at x == 0, which turns into NaN gradients when a
        # score is exactly +/-1. Flooring sin^2 keeps the derivative finite.
        sin_theta_2 = (1.0 - cos_theta * cos_theta).clamp(min=self._SIN2_FLOOR)
        sin_theta = torch.sqrt(sin_theta_2)

        # cos(theta + m) = cos(theta) cos(m) - sin(theta) sin(m)
        cos_theta_m = cos_theta * self.cos_m - sin_theta * self.sin_m

        # Swap in the margin-penalised score on each row's target column only.
        output = cos_theta.clone()
        rows = torch.arange(0, len(label), dtype=torch.long, device=label.device)
        output[rows, label] = cos_theta_m[rows, label]
        # Scale up so the softmax can separate the classes (introduced in NormFace).
        output = output * self.s

        return F.cross_entropy(output, label)

In [None]:
class CosineEmbedding(nn.Module):
    """Bias-free class-prototype layer mapping embeddings to per-class scores.

    Args:
        embedding_dim: Size of the incoming embedding vectors.
        num_classes: Number of class prototypes (rows of ``weight``).
        normalized: When False (default) the layer is a plain linear projection
            and ``normalize()`` does nothing. When True both the inputs and the
            prototypes are L2-normalised, so the outputs are cosine similarities.
    """

    def __init__(self, embedding_dim, num_classes, normalized=False):
        super(CosineEmbedding, self).__init__()
        self.normalized = normalized
        self.weight = nn.Parameter(torch.empty(num_classes, embedding_dim))
        nn.init.xavier_uniform_(self.weight)

    def forward(self, inputs):
        """Return a ``(batch, num_classes)`` score matrix for ``inputs``."""
        if self.normalized:
            return F.linear(F.normalize(inputs), F.normalize(self.weight))
        return F.linear(inputs, self.weight)

    def normalize(self):
        """Rescale every prototype row to unit L2 norm (only in normalized mode)."""
        if not self.normalized:
            return
        with torch.no_grad():
            # The floor on the norm guards against dividing by zero for an all-zero row.
            row_norms = self.weight.norm(p=2, dim=1, keepdim=True).clamp(min=1e-12)
            self.weight.div_(row_norms)

In [None]:
model = models.resnet18(pretrained=True)

In [None]:
embedding_dim = 2
num_classes = 2

In [None]:
# Replace the network's `fc` layer with a small head:
# Linear(512 -> 512) -> ReLU -> BatchNorm1d -> Dropout -> bias-free Linear(512 -> embedding_dim).
model.fc = nn.Sequential(
    nn.Linear(in_features=512, out_features=512, bias=True),
    nn.ReLU(),
    nn.BatchNorm1d(num_features=512),
    nn.Dropout(),
#     nn.Linear(in_features=512, out_features=num_classes, bias=True))
    nn.Linear(in_features=512, out_features=embedding_dim, bias=False))
# model.fc = nn.Linear(in_features=512, out_features=embedding_dim, bias=False)
# Move the whole model to the GPU; the trailing `;` suppresses the cell's output.
model.cuda();

In [None]:
embedding = CosineEmbedding(embedding_dim, num_classes)
embedding.cuda();

In [None]:
# Train only the replacement head: parameters that belong to `model.fc` stay
# trainable, every other parameter of the network is frozen.
head_param_ids = {id(head_param) for head_param in model.fc.parameters()}
for param in model.parameters():
    param.requires_grad = id(param) in head_param_ids

In [None]:
# criterion = Arcface(s=64.0, m=0.5)# 0.5 * ((2 * math.pi) / num_classes))
# With m=0.0 and s=1.0 nothing is subtracted from the target score and the scores
# are not rescaled, so this configuration computes plain cross-entropy.
criterion = ArcMarginProduct(s=1.0, m=0.0)
# criterion = nn.CrossEntropyLoss()

In [None]:
lr = 1e-2

In [None]:
optimizer = Adam(model.fc.parameters(), lr=lr)

In [None]:
optimizer.add_param_group({'params': embedding.parameters(), 'lr': lr})

In [None]:
scheduler = ExponentialLR(optimizer=optimizer, gamma=0.95)

In [None]:
num_epochs = 200
model_dir = Path('/kaggle/working/')

In [None]:
tr_metrics = []
va_metrics = []

In [None]:
# One pass over the training loader followed by one pass over the validation
# loader per epoch. Per-step training metrics go to `tr_metrics`, per-epoch
# validation metrics go to `va_metrics`; both hold plain Python floats.
for epoch in tqdm(range(num_epochs)):
    print('Epoch: {:02d}\n'.format(epoch))

    # ---------------- training pass ----------------
    num_samples = 0
    running_loss = 0.0
    running_acc = 0.0
    running_ce = 0.0
    # NOTE(review): train() also switches any BatchNorm layers of the frozen
    # backbone to training mode, so their running statistics keep updating —
    # confirm that this is intended.
    model.train()
    for inputs, targets in tqdm(tr_dl):
        inputs = inputs.cuda()
        targets = targets.cuda()
        optimizer.zero_grad()
        outputs = model(inputs)
        outputs = embedding(outputs)
        loss = criterion(outputs, targets)
        loss.backward()
        embedding.normalize()
        optimizer.step()

        # Metrics are bookkeeping only: detach so no second autograd graph is
        # built, and convert to Python floats so the history never holds CUDA
        # tensors (which matplotlib cannot plot).
        n_batch = inputs.shape[0]
        loss = loss.item()
        with torch.no_grad():
            logits = outputs.detach()
            _, preds = torch.max(logits, 1)
            acc = (preds == targets).sum().item() / n_batch
            ce = F.cross_entropy(logits, targets).item()
        running_loss += loss * n_batch
        running_acc += acc * n_batch
        running_ce += ce * n_batch
        tr_metrics.append((loss, acc, ce))
        num_samples += n_batch

    # Sample-weighted means; max(..., 1) avoids dividing by zero on an empty loader.
    tr_loss = running_loss / max(num_samples, 1)
    tr_acc = running_acc / max(num_samples, 1)
    tr_ce = running_ce / max(num_samples, 1)
    print('Train: Loss: {:.6f} Acc: {:.6f} CE: {:.6f}'.format(tr_loss, tr_acc, tr_ce))

    # ---------------- validation pass ----------------
    num_samples = 0
    running_loss = 0.0
    running_ce = 0.0
    running_acc = 0.0
    model.eval()
    with torch.no_grad():
        for inputs, targets in tqdm(va_dl):
            inputs = inputs.cuda()
            targets = targets.cuda()
            outputs = model(inputs)
            outputs = embedding(outputs)

            n_batch = inputs.shape[0]
            loss = criterion(outputs, targets).item()
            _, preds = torch.max(outputs, 1)
            acc = (preds == targets).sum().item() / n_batch
            ce = F.cross_entropy(outputs, targets).item()
            # Weight by batch size so a smaller final batch does not skew the mean.
            running_loss += loss * n_batch
            running_acc += acc * n_batch
            running_ce += ce * n_batch
            num_samples += n_batch

    va_loss = running_loss / max(num_samples, 1)
    va_acc = running_acc / max(num_samples, 1)
    va_ce = running_ce / max(num_samples, 1)
    va_metrics.append((va_loss, va_acc, va_ce))
    print('Val: Loss: {:.6f} Acc: {:.6f} CE: {:.6f}'.format(va_loss, va_acc, va_ce))
    # Decay the learning rate once per epoch.
    scheduler.step()

In [None]:
# cosine_distance = CosineSimilarity(dim=0, eps=1e-6)

In [None]:
# cosine_distance(embedding.embedding.weight[0], embedding.embedding.weight[1]) 

In [None]:
# embedding.embedding.weight

In [None]:
# Training loss, one point per optimisation step.
plt.plot([step_metrics[0] for step_metrics in tr_metrics])

In [None]:
# Validation loss, one point per epoch.
plt.plot([epoch_metrics[0] for epoch_metrics in va_metrics])

In [None]:
# Validation accuracy, one point per epoch. The stored value may be a 0-dim CUDA
# tensor, which matplotlib cannot convert to NumPy; float() copies it to the host
# and is a no-op for values that are already Python floats.
plt.plot([float(epoch_metrics[1]) for epoch_metrics in va_metrics])

In [None]:
# Validation cross-entropy, one point per epoch.
plt.plot([epoch_metrics[2] for epoch_metrics in va_metrics])