In [0]:
from collections import defaultdict
from pathlib import Path
import pickle
import random
import shutil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch

In [0]:
# Print the NVIDIA driver/GPU status table so the accelerator assigned to this runtime is visible.
! nvidia-smi

In [0]:
# Mount Google Drive at /content/gdrive; ROOT, DATA and OUT below all live under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
try:
    from apex import amp
except:
    ! git clone https://github.com/NVIDIA/apex
    ! cd apex && pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
    from apex import amp

In [0]:
# Project locations on the mounted Drive.
ROOT = '/content/gdrive/My Drive/ML/stuff'
DATA = f'{ROOT}/data'
OUT = f'{ROOT}/out'

# Make sure both output directories exist. `exist_ok=True` keeps the cell
# re-runnable and also creates `images/` when `OUT` already exists without it.
# Parents are deliberately NOT created: if ROOT is missing (Drive not mounted)
# this raises FileNotFoundError instead of silently writing to local disk.
Path(OUT).mkdir(exist_ok=True)
Path(OUT, "images").mkdir(exist_ok=True)

In [0]:
# Load the metadata table (one row per item; columns are trimmed in the next cell).
df = pd.read_csv(f"{DATA}/data.csv")

In [0]:
# Drop leftover index columns and the fields that are not used as features.
# NOTE(review): a later cell unpacks each row positionally as
# (fav, id, rating, score, tags), so the surviving columns must be in exactly
# that order — confirm against the CSV header.
# NOTE(review): not idempotent — re-running this cell raises KeyError because
# the columns are already gone.
df = df.drop(columns=["Unnamed: 0", "Unnamed: 0.1", "author", "file_url", "sample_url"])

In [0]:
# Split the whitespace-separated tag string of each row into a set, so that
# later `tag in tags` membership tests are constant time.
# NOTE(review): not idempotent (a set has no .split) and assumes every "tags"
# value is a string — a missing value would raise here.
df["tags"] = df["tags"].apply(lambda x: set(x.split()))

In [0]:
# Number of most frequent tags kept as binary features; the network's input
# width is n_tags + 5.
n_tags = 256

In [0]:
# Tally how many rows carry each tag.
tags_count = defaultdict(int)
for row_tags in df["tags"]:
    for tag in row_tags:
        tags_count[tag] += 1

# Rank as (count, tag) pairs, highest first; ties on count fall back to the
# tag string, also in descending order.
to_sort = sorted(((count, tag) for tag, count in tags_count.items()), reverse=True)

# Vocabulary of the n_tags most frequent tags, most frequent first.
common_tags = [tag for _, tag in to_sort[:n_tags]]

In [0]:
# Keep only the rows whose score is strictly greater than 90.
upvoted = df[df["score"] > 90]


In [0]:
# Release the full table and the tag-counting intermediates; only `upvoted`
# and `common_tags` are needed from here on.
del df
del to_sort
del tags_count

In [0]:
# Replace the DataFrame by its underlying array of rows. Column labels are
# gone after this, so the next cell relies on column position.
upvoted = upvoted.values

In [0]:
# Encode every kept row as a flat float vector, keyed by its id.
# Vector layout (length len(common_tags) + 5):
#   [one 0/1 flag per entry of common_tags | rating one-hot over 's','q','e' | score | fav]
# The loop variable is called `post_id` rather than `id` so the builtin `id()`
# is not shadowed for the rest of the kernel session.
# NOTE(review): rows are unpacked by position; this assumes the column order
# fav, id, rating, score, tags — confirm against the CSV.
tensors = dict()
for fav, post_id, rating, score, tags in upvoted:
    res = [1 if t in tags else 0 for t in common_tags]
    res += [1 if r == rating else 0 for r in ['s', 'q', 'e']]
    res += [score, fav]
    tensors[post_id] = torch.Tensor(res)


In [0]:
# The raw rows are no longer needed once `tensors` has been built.
del upvoted

In [0]:
# Load the user -> list-of-ids mapping.
# NOTE: pickle.load executes arbitrary code from the file; only load files you
# produced yourself.
with open(f"{DATA}/users.p", "rb") as users_file:
    users = pickle.load(users_file)

# Remove users with fewer than 64 entries: the training loop draws two
# disjoint groups of 32 items per user and cannot stack shorter lists.
# The keys are collected first because a dict cannot be resized while it is
# being iterated.
too_few = [u for u in users if len(users[u]) < 64]
for u in too_few:
    del users[u]
del too_few

In [0]:
def save_all(model, opti, n_batch):
    """Checkpoint the model, the optimizer and the batch counter.

    The checkpoint is serialised once to a local temporary file and then
    copied to two places under ``OUT``:

    * ``all_train_data`` — the latest checkpoint, the file ``load_all`` reads;
    * ``train_<k>`` with ``k = (n_batch // 5000) % 50`` — a rotating history
      slot.

    Args:
        model: module whose ``state_dict()`` is saved.
        opti: optimizer whose ``state_dict()`` is saved.
        n_batch: number of batches processed so far; stored in the checkpoint
            and used to pick the history slot.

    Raises:
        OSError: if the temporary file or either destination cannot be
            written (the previous shell ``cp`` only printed a message).
    """
    tmp_path = "/tmp/all_train_data"
    data = [model.state_dict(), opti.state_dict(), n_batch]
    out_path_2 = f"{OUT}/train_{(n_batch // 5000) % 50}"
    # Close the file before copying so the copies are guaranteed complete.
    with open(tmp_path, "wb") as tmp_file:
        torch.save(data, tmp_file)
    shutil.copyfile(tmp_path, f"{OUT}/all_train_data")
    shutil.copyfile(tmp_path, out_path_2)
def load_all(model, opti):
    """Restore the latest checkpoint written under ``OUT``.

    Copies ``OUT/all_train_data`` to a local temporary file, loads it and
    applies the stored state dicts to the objects passed in.

    Args:
        model: module to restore in place.
        opti: optimizer to restore in place.

    Returns:
        list: ``[model, opti, n_batch]`` — the two inputs followed by every
        extra item stored in the checkpoint (currently only the batch counter).

    Raises:
        FileNotFoundError: if no checkpoint exists yet.
    """
    tmp_path = "/tmp/all_train_data"
    shutil.copyfile(f"{OUT}/all_train_data", tmp_path)
    with open(tmp_path, "rb") as tmp_file:
        # Deserialise onto the CPU so a checkpoint written on a GPU runtime
        # still loads on a CPU-only one; load_state_dict then copies the
        # values onto whatever device the live parameters are on.
        l = torch.load(tmp_file, map_location="cpu")
    model.load_state_dict(l[0])
    opti.load_state_dict(l[1])
    return [model, opti] + l[2:]

In [0]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

In [0]:
def layer(in_depth, k):
    """Build one pointwise block: 1x1 Conv1d -> BatchNorm1d -> LeakyReLU(0.2).

    Args:
        in_depth: number of input channels.
        k: number of output channels.

    Returns:
        torch.nn.Sequential mapping (batch, in_depth, length) to
        (batch, k, length). The convolution has no bias term.
    """
    pointwise = torch.nn.Conv1d(in_depth, k, kernel_size=1, stride=1, padding=0, bias=False)
    normalise = torch.nn.BatchNorm1d(k)
    activation = torch.nn.LeakyReLU(0.2)
    return torch.nn.Sequential(pointwise, normalise, activation)
# Width of the per-user fingerprint vector produced by Network.
size_bottleneck = 32
class Network(torch.nn.Module):
    """Summarise a set of items into a fingerprint and score candidates with it.

    Three stages:

    * ``conv`` — pointwise blocks applied to every item independently;
    * ``features_extract`` — MLP that maps the mean over items to a
      ``size_bottleneck``-wide fingerprint;
    * ``is_fav`` — pointwise head that scores each candidate item given the
      fingerprint, ending in a sigmoid.
    """

    def __init__(self):
        super().__init__()
        n_features = n_tags + 5

        self.conv = torch.nn.ModuleList([
            layer(n_features, 256),
            layer(256, 512),
        ])

        self.features_extract = torch.nn.ModuleList([
            torch.nn.Linear(512, 256),
            torch.nn.LeakyReLU(0.2),
            torch.nn.Linear(256, size_bottleneck),
        ])

        self.is_fav = torch.nn.ModuleList([
            layer(size_bottleneck + n_features, 256),
            layer(256, 128),
            torch.nn.Conv1d(128, 1, kernel_size=1, stride=1, padding=0),
            torch.nn.Sigmoid(),
        ])

    @staticmethod
    def _run(stages, x):
        """Feed ``x`` through every module of ``stages`` in order."""
        for stage in stages:
            x = stage(x)
        return x

    def forward(self, favs, to_identify):
        """Compute candidate scores and the fingerprint.

        Args:
            favs: (batch, n_tags + 5, n_items) items to summarise.
            to_identify: (batch, n_tags + 5, n_candidates) items to score.

        Returns:
            tuple: scores of shape (batch, n_candidates) in [0, 1], and the
            fingerprint of shape (batch, size_bottleneck).
        """
        # Average the per-item features so the summary ignores item order.
        pooled = self._run(self.conv, favs).mean(dim=2)
        fingerprint = self._run(self.features_extract, pooled)

        # Attach a copy of the fingerprint to every candidate position.
        n_candidates = to_identify.shape[-1]
        tiled = fingerprint.view(-1, size_bottleneck, 1).repeat(1, 1, n_candidates)
        scores = self._run(self.is_fav, torch.cat([to_identify, tiled], dim=1))

        # (batch, 1, n_candidates) -> (batch, n_candidates)
        return scores.transpose(2, 1).reshape(scores.shape[0], -1), fingerprint
# Instantiate the network and move its parameters to the selected device.
model = Network().to(device)



# Learning rate passed to Adam below.
lr = 0.00001

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# Binary cross-entropy on probabilities (the network already ends in a sigmoid).
loss = torch.nn.BCELoss()
# Switch for apex mixed-precision training; when False, `amp` is never used.
MP = False

if MP:
    # Let apex wrap the model/optimizer: O2 mode, batch norm kept in fp32,
    # dynamic loss scaling.
    model, optimizer = amp.initialize(
       model, optimizer, opt_level="O2", 
       keep_batchnorm_fp32=True, loss_scale="dynamic"
    )

# Global count of processed batches; it drives logging and checkpoint cadence
# in the training loop and is overwritten when a checkpoint is loaded.
n_batch = 0
#losses = []

In [0]:
# Resume from the latest checkpoint (weights, optimizer state, batch counter).
# NOTE(review): this needs a checkpoint to exist under OUT already; skip this
# cell on a first run.
model, optimizer, n_batch = load_all(model, optimizer)

In [0]:
# ---- Training loop ---------------------------------------------------------
# For every user two disjoint groups of `n_samples` favourites are drawn.
# Each group is summarised into a fingerprint, which must then recognise the
# OTHER group's favourites among an equal number of randomly drawn items.
batch_size = 64
n_samples = 32               # favourites (and random negatives) per group
n2_samples = n_samples * 2   # items needed per user: two groups

# Pool for negative sampling, built once instead of once per user.
all_ids = list(tensors)

# Labels are the same for every full batch: first the positives, then the negatives.
targets = torch.tensor([[1] * n_samples + [0] * n_samples] * batch_size).to(device).float()

model.train()
for epoch in range(1000):

    users_shuffled = list(users)
    random.shuffle(users_shuffled)

    for begin in range(0, len(users_shuffled), batch_size):
        batch_users = users_shuffled[begin:begin + batch_size]
        # Skip the trailing partial batch BEFORE touching the counter, so
        # n_batch only counts real optimisation steps and a skipped batch can
        # never swallow a log line or a checkpoint.
        if len(batch_users) != batch_size:
            continue

        n_batch += 1
        optimizer.zero_grad()

        extract1, extract2 = [], []
        inputs_test1, inputs_test2 = [], []

        for u in batch_users:
            favs = [tensors[post_id] for post_id in users[u]]
            random.shuffle(favs)
            positives1 = torch.stack(favs[:n_samples])
            positives2 = torch.stack(favs[n_samples:n2_samples])

            # NOTE(review): negatives are drawn from all items, so they may
            # occasionally include one of this user's own favourites.
            negatives = random.sample(all_ids, n2_samples)
            negatives1 = torch.stack([tensors[post_id] for post_id in negatives[:n_samples]])
            negatives2 = torch.stack([tensors[post_id] for post_id in negatives[n_samples:]])

            extract1.append(positives1)
            extract2.append(positives2)
            inputs_test1.append(torch.cat([positives1, negatives1]))
            inputs_test2.append(torch.cat([positives2, negatives2]))

        # (batch, items, features) -> (batch, features, items), then a single
        # host-to-device copy per tensor instead of one per user.
        extract1 = torch.stack(extract1).transpose(2, 1).contiguous().to(device)
        extract2 = torch.stack(extract2).transpose(2, 1).contiguous().to(device)
        inputs_test1 = torch.stack(inputs_test1).transpose(2, 1).contiguous().to(device)
        inputs_test2 = torch.stack(inputs_test2).transpose(2, 1).contiguous().to(device)

        # Cross evaluation: the fingerprint of one group scores the other group.
        res1, fingerprint1 = model(extract1, inputs_test2)
        res2, fingerprint2 = model(extract2, inputs_test1)

        loss1 = loss(res1, targets)
        # NOTE(review): loss2 is computed but is NOT part of the optimised
        # objective below. If the symmetric term was intended, add it to
        # `total_loss`; it is left out here to keep the objective unchanged.
        loss2 = loss(res2, targets)
        # Both fingerprints describe the same user and should agree.
        loss_diff = ((fingerprint1 - fingerprint2) ** 2).mean()
        # Spread across users in the batch; rewarded (negative sign) so that
        # the fingerprints do not collapse to a single point.
        variance = ((fingerprint1 - fingerprint1.mean(dim=0)) ** 2).mean() + ((fingerprint2 - fingerprint2.mean(dim=0)) ** 2).mean()
        # Magnitude penalty that keeps the variance reward bounded.
        norm_loss = (fingerprint1 ** 2).mean() + (fingerprint2 ** 2).mean()
        loss_variance = -variance
        total_loss = loss1 + loss_diff * 0.5 + loss_variance * 0.02 + 0.001 * norm_loss ** 2

        if MP:
            with amp.scale_loss(total_loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            total_loss.backward()
        optimizer.step()

        if n_batch % 100 == 0:
            # Fraction of positives / negatives scored above 0.5.
            mean_positive = (res1[:, :n_samples] > 0.5).float().mean().item()
            mean_negative = (res1[:, n_samples:] > 0.5).float().mean().item()
            print(f"{n_batch} : {total_loss.item()}, positive : {mean_positive}, negative : {mean_negative}, diff : {loss_diff.item()}, variance : {variance.item()}")
        if n_batch % 5000 == 0:
            save_all(model, optimizer, n_batch)

In [0]:
# Write a checkpoint by hand, e.g. after interrupting the training loop.
save_all(model, optimizer, n_batch)

In [0]:
# Display the individual loss terms from the last training step that ran.
loss1, loss_diff, loss_variance, norm_loss

In [0]:
def run_on_user(user):
    """Compute the fingerprint of one user with the trained model.

    At most the first 512 items listed for the user are summarised. The
    scoring head needs some input, so the user's first item is passed as a
    dummy candidate and its score is discarded.

    Args:
        user: key into the global ``users`` mapping.

    Returns:
        CPU tensor of shape (1, size_bottleneck), detached from autograd.
    """
    model.eval()
    favs = [tensors[post_id] for post_id in users[user]]
    for_representation = torch.stack(favs[:512]).transpose(1, 0).to(device)
    for_test = torch.stack(favs[:1]).transpose(1, 0).to(device)
    # Inference only: no_grad avoids building an autograd graph on each call,
    # which matters when this is run over every user.
    with torch.no_grad():
        _, fingerprint = model(for_representation.unsqueeze(0), for_test.unsqueeze(0))
    return fingerprint.cpu()
# Reference fingerprint that every other user is compared against.
# NOTE(review): "my_username" is a placeholder key; this raises KeyError
# unless that exact user survived the minimum-favourites filter.
baseline = run_on_user("my_username").data
baseline

In [0]:
# Mean squared distance between every user's fingerprint and the baseline,
# stored as (distance, user) pairs.
l = [(((run_on_user(u) - baseline) ** 2).mean().item(), u) for u in users]
import heapq
# Keep 16 pairs and list them in ascending distance order.
# NOTE(review): nlargest keeps the users FARTHEST from the baseline; if the
# goal is the most similar users, heapq.nsmallest is what is wanted — confirm.
tmp = sorted(heapq.nlargest(16, l))

In [0]:
# Fingerprint of every remaining user as a NumPy array, keyed by user.
result_dict = {u: run_on_user(u).data.numpy() for u in users}

In [0]:
# Persist the fingerprints next to the input data. The context manager closes
# (and therefore flushes) the file even if pickling fails part-way.
with open(f"{DATA}/users_preprocess.p", "wb") as out_file:
    pickle.dump(result_dict, out_file)