In [1]:
import numpy as np 
import pandas as pd 

# Read the training table from the Kaggle input directory and display it.
# Later cells read the 'u' and 'v' columns of this frame.
TRAIN_CSV_PATH = '/kaggle/input/vkcup1/train_mod_v3.csv'
df = pd.read_csv(TRAIN_CSV_PATH)
df


Unnamed: 0.1,Unnamed: 0,u,v,t,h,mask2drop
0,0,0,1,27,6,1
1,1,2,3,13,8,1
2,2,4,5,74,2,1
3,3,6,7,36,9,1
4,4,8,9,52,2,1
...,...,...,...,...,...,...
16745086,17414504,2369189,2300645,6,7,1
16745087,17414505,62448,88256,60,0,1
16745088,17414506,919913,1183054,71,0,1
16745089,17414507,1774,881,67,0,1


In [2]:
from sklearn.model_selection import train_test_split

# No hold-out split is made here: df_train is just a second name for the
# same DataFrame object as df (an alias, not a copy).
df_train = df


In [3]:
# One more than the largest id found in either the 'u' or the 'v' column,
# so every id in the table is a valid index into a length-N_TOTAL container.
N_TOTAL = df[['u', 'v']].max().max() + 1
df


Unnamed: 0.1,Unnamed: 0,u,v,t,h,mask2drop
0,0,0,1,27,6,1
1,1,2,3,13,8,1
2,2,4,5,74,2,1
3,3,6,7,36,9,1
4,4,8,9,52,2,1
...,...,...,...,...,...,...
16745086,17414504,2369189,2300645,6,7,1
16745087,17414505,62448,88256,60,0,1
16745088,17414506,919913,1183054,71,0,1
16745089,17414507,1774,881,67,0,1


In [4]:
# Show the id upper bound (max id over columns 'u' and 'v', plus one).
N_TOTAL

2522474

In [5]:
def apk(pred, target, k):
    """Average precision at ``k`` for a single ranked prediction list.

    Args:
        pred: ranked sequence of predicted items (best first). Only the
            first ``k`` entries are scored; repeated predictions of the same
            item are counted once (the first occurrence).
        target: collection of relevant items; anything supporting ``in`` and
            ``len`` works.
        k: cutoff rank.

    Returns:
        Sum of precision-at-hit over the scored prefix, divided by
        ``min(len(target), k)``. Returns ``0.0`` when that normaliser is not
        positive (empty ``target`` or ``k <= 0``) instead of dividing by zero.
    """
    tot = min(len(target), k)
    if tot <= 0:
        # Nothing can be retrieved, so the score is defined as zero.
        return 0.0

    if len(pred) >= k:
        pred = pred[:k]

    ans, cnt = 0, 0
    seen = set()
    for rank, item in enumerate(pred, start=1):
        # A hit only counts the first time an item is predicted.
        if item in target and item not in seen:
            cnt += 1
            ans += cnt / rank
            seen.add(item)
    return ans / tot


def mapk(pred, target, k):
    """Mean of ``apk`` over paired prediction/target lists.

    Args:
        pred: sequence of ranked prediction lists, one per query.
        target: sequence of relevant-item collections, aligned with ``pred``.
        k: cutoff rank forwarded to ``apk``.

    Returns:
        The arithmetic mean of the per-query ``apk`` scores, or ``0.0`` when
        there are no queries (previously this divided by zero).

    Raises:
        AssertionError: if ``pred`` and ``target`` differ in length.
    """
    assert len(pred) == len(target)
    if len(pred) == 0:
        return 0.0
    total = sum(apk(cur_pred, cur_target, k) for cur_pred, cur_target in zip(pred, target))
    return total / len(pred)


In [6]:
import torch
import torch.nn as nn
from tqdm.auto import trange


class UserDataset(torch.utils.data.Dataset):
    """Dataset whose item ``i`` is the multi-hot neighbour vector of id ``i``.

    Every row of ``df`` contributes the pair ``(u, v)`` in both directions:
    ``v`` is appended to the list of ``u`` and ``u`` to the list of ``v``.

    Args:
        df: DataFrame with integer columns ``'u'`` and ``'v'``.
        n_total: number of ids, i.e. the length of every returned vector and
            the number of items in the dataset. Defaults to the notebook-level
            ``N_TOTAL`` when omitted.
    """

    def __init__(self, df, n_total=None):
        self.n_total = int(N_TOTAL if n_total is None else n_total)
        self.g = [[] for _ in range(self.n_total)]
        # Pull both columns out as plain Python lists once; per-row
        # ``Series.iloc`` lookups are far slower than list indexing.
        us = df['u'].tolist()
        vs = df['v'].tolist()
        for i in trange(len(us)):
            u, v = us[i], vs[i]
            self.g[u].append(v)
            self.g[v].append(u)

    def __getitem__(self, index):
        """Return a float tensor of length ``n_total`` with 1 at each neighbour of ``index``."""
        answer = torch.zeros(self.n_total)
        # An explicit long dtype keeps the indexing valid for an empty neighbour list.
        neighbours = torch.as_tensor(self.g[index], dtype=torch.long)
        answer[neighbours] = 1
        return answer

    def __len__(self):
        """Number of ids (one item per id, including ids with no neighbours)."""
        return len(self.g)

In [7]:
class VAE(nn.Module):
    """Variational autoencoder with Gaussian latent and Gaussian output heads.

    The encoder maps ``input_dim -> hidden2_dim -> hidden_dim`` and then emits
    a latent mean and a latent log-sigma, both of width ``hidden_dim``. The
    decoder maps a latent sample ``hidden_dim -> hidden_dim -> hidden2_dim``
    and emits a reconstruction mean and log-sigma of width ``input_dim``.

    Note that the layers named ``*_sigma`` produce *log*-sigma values: the
    sampler exponentiates them before use.

    Args:
        hidden2_dim: width of the layer next to the input/output.
        hidden_dim: width of the latent space.
        input_dim: size of the input (and reconstructed) vectors. Defaults to
            the notebook-level ``N_TOTAL`` when omitted.
    """

    def __init__(self, hidden2_dim, hidden_dim, input_dim=None):
        super().__init__()

        if input_dim is None:
            input_dim = N_TOTAL

        # Layers are created in a fixed order so that seeded initialisation
        # and state_dict keys stay stable.
        self.encoder_common = nn.Sequential(
            nn.Linear(input_dim, hidden2_dim),
            nn.ReLU(),
            nn.Linear(hidden2_dim, hidden_dim),
            nn.ReLU()
        )
        self.encoder_mu = nn.Linear(hidden_dim, hidden_dim)
        self.encoder_sigma = nn.Linear(hidden_dim, hidden_dim)

        self.decoder_common = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden2_dim)
        )
        self.decoder_mu = nn.Linear(hidden2_dim, input_dim)
        self.decoder_sigma = nn.Linear(hidden2_dim, input_dim)

    def gaussian_sampler(self, mu, logsigma):
        """Draw ``mu + exp(logsigma) * eps`` with ``eps ~ N(0, I)`` (reparameterisation)."""
        noise = torch.randn_like(mu)
        return noise * logsigma.exp() + mu

    def encode(self, x):
        """Return ``(latent_mu, latent_logsigma)`` for input ``x``."""
        hidden = self.encoder_common(x)
        mu = self.encoder_mu(hidden)
        sigma = self.encoder_sigma(hidden)
        return mu, sigma

    def decode(self, mu, sigma):
        """Sample a latent from ``(mu, log-sigma)`` and return ``(recon_mu, recon_logsigma)``.

        A fresh noise sample is drawn on every call.
        """
        sample = self.gaussian_sampler(mu, sigma)
        hidden = self.decoder_common(sample)
        decoded_mu, decoded_sigma = self.decoder_mu(hidden), self.decoder_sigma(hidden)
        return decoded_mu, decoded_sigma

    def forward(self, x):
        """Return ``(recon_mu, recon_logsigma, latent_mu, latent_logsigma)`` for ``x``."""
        latent_mu, latent_logsigma = self.encode(x)
        reconstruction_mu, reconstruction_logsigma = self.decode(latent_mu, latent_logsigma)
        return reconstruction_mu, reconstruction_logsigma, latent_mu, latent_logsigma


In [8]:
import math

def KL_divergence(mu, logsigma):
    """KL divergence from N(mu, exp(logsigma)^2) to N(0, 1), summed over all elements.

    Args:
        mu: tensor of means.
        logsigma: tensor of log standard deviations, same shape as ``mu``.

    Returns:
        Scalar tensor ``-0.5 * sum(1 + 2*logsigma - mu^2 - sigma^2)``.
    """
    variance = torch.exp(logsigma) ** 2
    per_element = 1 + 2 * logsigma - mu ** 2 - variance
    return -0.5 * torch.sum(per_element)


def log_likelihood(x, mu, logsigma):
    """Gaussian log-density of ``x`` under N(mu, exp(logsigma)^2), summed over all elements.

    Computed directly in log space as
    ``-0.5 * (x - mu)^2 * exp(-2*logsigma) - logsigma - 0.5*log(2*pi)``.
    This is algebraically the same as taking ``log(1 / (sigma * sqrt(2*pi)))``
    but does not overflow to ``-inf`` when ``exp(logsigma)`` exceeds the float
    range.

    Args:
        x: observed values.
        mu: predicted means, broadcastable with ``x``.
        logsigma: predicted log standard deviations, broadcastable with ``x``.

    Returns:
        Scalar tensor with the summed log-likelihood.
    """
    half_log_two_pi = 0.5 * math.log(2 * math.pi)
    squared_error = (x - mu) ** 2
    per_element = -0.5 * squared_error * torch.exp(-2 * logsigma) - logsigma - half_log_two_pi
    return torch.sum(per_element)


def loss_vae(x, mu_gen, logsigma_gen, mu_z, logsigma_z):
    """Negative ELBO: reconstruction negative log-likelihood plus latent KL term.

    Args:
        x: input that was reconstructed.
        mu_gen, logsigma_gen: reconstruction mean and log-sigma.
        mu_z, logsigma_z: latent mean and log-sigma.

    Returns:
        Scalar tensor ``-log_likelihood(x, mu_gen, logsigma_gen) + KL_divergence(mu_z, logsigma_z)``.
    """
    reconstruction_nll = -log_likelihood(x, mu_gen, logsigma_gen)
    kl_term = KL_divergence(mu_z, logsigma_z)
    return reconstruction_nll + kl_term

In [9]:
# Build the per-id neighbour dataset from the training frame and wrap it in
# a loader that yields 4 items per batch.
dataset = UserDataset(df=df_train)
loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=4)

  0%|          | 0/16745091 [00:00<?, ?it/s]

In [10]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device

device(type='cuda', index=0)

In [11]:
# Model with a 16-wide layer next to the input/output and a 64-wide latent,
# moved to the selected device, plus its Adam optimiser.
vae = VAE(hidden2_dim=16, hidden_dim=64)
vae = vae.to(device)
optimizer = torch.optim.Adam(params=vae.parameters(), lr=1e-3)

# Lists that training code appends (step, loss) pairs to.
train_history = []
dev_history = []

In [12]:
# Replace the earlier loader with one that yields 32 items per batch.
loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=32)

In [13]:
from tqdm.auto import trange, tqdm
from IPython.display import clear_output
import matplotlib.pyplot as plt

EPOCHS = 1
LIVE_PLOT = False   # set True to redraw the loss curve every PLOT_EVERY steps
PLOT_EVERY = 100

for epoch in trange(EPOCHS):
    i = 0
    for batch in tqdm(loader):
        # The dataset returns bare tensors, so the loader collates each batch
        # into ONE tensor with one row per item. Indexing it with [0] would
        # keep only the first row and silently drop the rest of the batch.
        # NOTE(review): a full batch is much larger than a single row; lower
        # the loader's batch_size if this runs out of device memory.
        batch = batch.to(device)

        optimizer.zero_grad()
        reconstruction = vae(batch)
        # loss_vae sums over every element; divide by the number of rows so
        # the logged value stays on a per-item scale regardless of batch size.
        loss = loss_vae(batch, *reconstruction) / batch.shape[0]
        loss.backward()
        optimizer.step()
        train_history.append((len(train_history), loss.item()))

        if LIVE_PLOT and len(train_history) % PLOT_EVERY == 0:
            clear_output(True)
            plt.scatter(*zip(*train_history), alpha=0.1, label='train_loss')
            if len(dev_history):
                plt.plot(*zip(*dev_history), color='red', label='dev_loss')
            plt.legend(); plt.grid(); plt.show()

        i += 1
        if i % 10000 == 0:
            print(i)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/78828 [00:00<?, ?it/s]

10000
20000
30000
40000
50000
60000
70000


In [14]:
import pickle
# Move the model to the CPU (this also changes where `vae` itself lives) and
# serialise the whole module object, then pickle the recorded training losses.
# Note that 'loss_train.txt' is a binary pickle despite its extension.
cpu_model = vae.to('cpu')
torch.save(cpu_model, "recvae.pt")
with open('loss_train.txt', 'wb') as history_file:
    pickle.dump(train_history, history_file)