In [1]:
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import random
import torch.optim as optim
import pickle
import torch.utils.data
from torch.backends import cudnn
from scipy.sparse import csr_matrix
import math
import bottleneck as bn
import matplotlib.pyplot as plt
import time

### dataset

In [2]:
gpu = 0
inter = 5
train = torch.load('data/cul/train_' + str(inter) + '.pt')
val = torch.load('data/cul/val_' + str(inter) + '.pt')
test = torch.load('data/cul/test_' + str(inter) + '.pt')

train_matrix = torch.load('data/cul/train_matrix_' + str(inter) + '.pt')
train_nei = np.load('data/cul/train_nei_' + str(inter) + '.npy').item()

train_matrix_input = train_matrix.clone().type(torch.FloatTensor)
for idx, (u,i) in enumerate(val):
    train_matrix_input[u][val[idx][1]] = 0
    train_matrix_input[u][test[idx][1]] = 0

num_users = train_matrix.size()[0]
num_items = train_matrix.size()[1]

print(num_users)
print(num_items)
print(train.size()[0]+val.size()[0]*2)

# for neg_sample
matrix = train_matrix.numpy()
neg_max = num_items - min(np.sum(matrix, axis = 1))
neg_count = neg_max - np.sum(matrix, axis = 1)

i, j = np.where(matrix == 0)
user = 0
count = 0
negs = []
for index, idx in enumerate(i):
    if user < idx:
        user = idx
        neg = j[count:index].tolist()
        neg += [-1]*(int(neg_max)-len(neg))
        negs.append(neg)
        count = index 
neg = j[count:].tolist()
neg += [-1]*(int(neg_max)-len(neg))        
negs.append(neg)
negs_np = np.array(negs)

5219
25187
130799


In [3]:
# train set
class traindset(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data
        
    def __getitem__(self, idx):
        return self.data[idx]
        
    def __len__(self):
        return np.shape(self.data)[0]

val_dset = traindset(val)
test_dset = traindset(test)
train_dset = traindset(train)
num_users = train_matrix.size()[0]
num_items = train_matrix.size()[1]

# negative sampling
def get_train_instances(pairs, num_neg):  
    users = pairs[:,0] 
    pos = pairs[:,1]
    
    uss, iss, jss = [],[],[]
    for i in range(len(users)):
        neg_idx = np.random.randint(0, neg_count[users[i]], size=(num_neg, ))
        negs = negs_np[users[i], neg_idx]
        for j in range(len(negs)):
            uss.append(users[i])
            iss.append(pos[i])
            jss.append(negs[j])
            
    return torch.LongTensor(uss), torch.LongTensor(iss), torch.LongTensor(jss)

def get_bd_instances(pairs, num_neg):
    users = pairs[:,0]
    pos = pairs[:,1]
    
    weight_T = weights_T[users]
    weight_S = weights_S[users]

    idx_T = torch.multinomial(weight_T, num_neg, replacement=True)
    idx_S = torch.multinomial(weight_S, num_neg, replacement=True)

    uss = torch.reshape(torch.reshape(users, (-1, 1)).expand(len(users), num_neg), (-1, ))
    iss = torch.reshape(torch.reshape(pos, (-1, 1)).expand(len(users), num_neg), (-1, ))
    jss_T = torch.reshape(idx_T, (-1,))
    jss_S = torch.reshape(idx_S, (-1,))

    return uss, iss, jss_T, jss_S

def HR(k, eval_sort):
    eval_sort = eval_sort.cpu().numpy()
    _, idy = np.where(eval_sort == 0)
    
    return len(np.where(idy < k)[0]) / len(eval_sort)

def NDCG(k, eval_sort):
    eval_sort = eval_sort.cpu().numpy()
    _, idy = np.where(eval_sort == 0)
    rank = idy[np.where(idy < k)[0]]
    
    return np.sum(1 / np.log2(rank+2)) / len(eval_sort)

### base model - BPR

In [4]:
class BPR(nn.Module):
    def __init__(self, emb_dim, num_neg, matrix):
        super(BPR, self).__init__()
        self.num_users = matrix.size()[0]
        self.num_items = matrix.size()[1]
        self.u_emb = nn.Embedding(self.num_users, emb_dim) #max_norm = 1
        self.i_emb = nn.Embedding(self.num_items, emb_dim)
        self.emb_dim = emb_dim
    
    def forward(self, u, i, j):
        # embeddings
        user_emb = self.u_emb(u)
        pos_emb = self.i_emb(i)
        neg_emb = self.i_emb(j)
        
        pos_score = torch.sum(torch.mul(user_emb, pos_emb), dim=1)
        neg_score = torch.sum(torch.mul(user_emb, neg_emb), dim=1)

        return pos_score, neg_score
    
    def evaluation(self, pairs):
    # do not use this function
        # indices
        users = pairs[:, 0]
        pos_items = pairs[:, 1].cpu().numpy().reshape((len(users), 1))
        neg_items = np.zeros((len(users), 999))
        for i in range(len(users)):
            neg_idx = np.random.randint(0, neg_count[users[i]], size=(1, 999))
            neg_items[i] = np.array(negs_np[users[i], neg_idx])
        eval_items = torch.LongTensor(np.concatenate((pos_items, neg_items), axis=1)).cuda(gpu)
        
        # embeddings
        user_emb = self.u_emb(users)
        eval_emb = self.i_emb(eval_items)
        
        # distance
        user_emb = torch.unsqueeze(user_emb, -1)
        eval_dist = torch.sum(torch.mul(user_emb.transpose(1, 2).expand(len(users), 1000, self.emb_dim), eval_emb), dim = 2) # (N*100)
        eval_sort = torch.argsort(eval_dist, dim=1, descending=True)
        
        return HR(5, eval_sort), HR(10, eval_sort), HR(20, eval_sort), NDCG(5, eval_sort), NDCG(10, eval_sort), NDCG(20, eval_sort)


### Warm-up

In [5]:
# model
T_size = 50
S_size = 5
T = BPR(T_size, 1, train_matrix_input)
S = BPR(S_size, 1, train_matrix_input)

In [6]:
# Teacher
use_cuda = torch.cuda.is_available()
bs = 128
num_neg = 1
lr = 0.001
wd = 0.001
epochs = 1000
verbose = 100

# data
train_loader = torch.utils.data.DataLoader(dataset = train_dset, batch_size = bs, shuffle = True)
test_loader = torch.utils.data.DataLoader(dataset = test_dset, batch_size = 1024, shuffle = False)

optimizer = optim.Adam(T.parameters(), lr = lr, weight_decay=wd)
if use_cuda:
    T = T.cuda(gpu)

for epoch in range(epochs):
    T.train()
    loss_train = np.zeros(3)
    t0 = time.time()

    for batch_idx, pairs in enumerate(train_loader):
        u, i, j = get_train_instances(pairs, num_neg)
        if use_cuda:
            u, i, j = u.cuda(gpu), i.cuda(gpu), j.cuda(gpu) 

        ### train
        optimizer.zero_grad()
        pos, neg = T(u, i, j)

        loss_dist = - torch.log(torch.sigmoid(pos - neg)).sum()
        loss = loss_dist 
        loss.backward()
        optimizer.step()

        loss_train[0] += loss.cpu().tolist() 
    loss_train /= len(train_loader)

    if epoch % verbose == 0:
        print('epoch = {}, loss = {:.3f}, time = {:.4f}'.format(epoch, loss_train[0], time.time()-t0))

epoch = 0, loss = 439.152, time = 3.9740
epoch = 100, loss = 16.056, time = 3.9485
epoch = 200, loss = 15.917, time = 4.0147
epoch = 300, loss = 15.873, time = 3.8288
epoch = 400, loss = 15.860, time = 4.0124
epoch = 500, loss = 15.795, time = 3.7586
epoch = 600, loss = 15.860, time = 3.7550
epoch = 700, loss = 15.707, time = 3.7986
epoch = 800, loss = 15.877, time = 3.7538
epoch = 900, loss = 15.833, time = 3.7650


In [7]:
# Student
use_cuda = torch.cuda.is_available()
bs = 128
num_neg = 1
lr = 0.001
wd = 0
epochs = 1000
verbose = 100

# data
train_loader = torch.utils.data.DataLoader(dataset = train_dset, batch_size = bs, shuffle = True)
test_loader = torch.utils.data.DataLoader(dataset = test_dset, batch_size = 1024, shuffle = False)

optimizer = optim.Adam(S.parameters(), lr = lr, weight_decay = wd)
if use_cuda:
    S = S.cuda(gpu)

for epoch in range(epochs):
    S.train()
    loss_train = np.zeros(3)
    t0 = time.time()

    for batch_idx, pairs in enumerate(train_loader):
        u, i, j = get_train_instances(pairs, num_neg)
        if use_cuda:
            u, i, j = u.cuda(gpu), i.cuda(gpu), j.cuda(gpu) 

        ### train
        optimizer.zero_grad()
        pos, neg = S(u, i, j)

        loss_dist = - torch.log(torch.sigmoid(pos - neg)).sum()
        loss = loss_dist 
        loss.backward()
        optimizer.step()

        loss_train[0] += loss.cpu().tolist() 
    loss_train /= len(train_loader)

    if epoch % verbose == 0:
        print('epoch = {}, loss = {:.3f}, time = {:.4f}'.format(epoch, loss_train[0], time.time()-t0))

epoch = 0, loss = 174.849, time = 3.2463
epoch = 100, loss = 20.485, time = 3.2589
epoch = 200, loss = 13.370, time = 3.4075
epoch = 300, loss = 11.361, time = 3.5590
epoch = 400, loss = 10.384, time = 3.2499
epoch = 500, loss = 9.741, time = 3.2246
epoch = 600, loss = 9.612, time = 3.2309
epoch = 700, loss = 9.192, time = 3.3215
epoch = 800, loss = 9.020, time = 3.3509
epoch = 900, loss = 8.655, time = 3.2460


### Bidirectional Distillation

In [8]:
use_cuda = torch.cuda.is_available()
lamb_T = 0.5
lamb_S = 0.5
lamb_CF = 1

temp_T = 5
temp_S = 5

neg_KD = 1
eps = 1e-4
eps_t = 1e-2
wd = 0

update = 10
epochs = 100
verbose = 50

batch_size = 128
lr_T = 0.001
lr_S = 0.001

# model
print(sum(p.numel() for p in T.parameters()))
print(sum(p.numel() for p in S.parameters()))

# loss
optimizer_T = optim.Adam(T.parameters(), lr = lr_T, weight_decay=wd)
optimizer_S = optim.Adam(S.parameters(), lr = lr_S)
criterion = torch.nn.BCEWithLogitsLoss(reduction='sum') # for L_BD

if use_cuda:
    T = T.cuda(gpu)
    S = S.cuda(gpu)

for epoch in range(0, epochs+1):
    loss_train = np.zeros(3)
    t0 = time.time()

    # update rank matrix
    if (epoch % update == 0):
        with torch.no_grad():
            T.eval()
            S.eval()
            rank_matrix_T = torch.zeros_like(train_matrix_input).type(torch.LongTensor)
            rank_matrix_S = torch.zeros_like(train_matrix_input).type(torch.LongTensor)
            mask_matrix = (1-train_matrix_input).cuda(gpu).float()     

            for u in range(num_users):
                uss = (torch.ones(num_items) * u).type(torch.LongTensor).cuda(gpu)
                iss = torch.LongTensor(np.arange(num_items)).cuda(gpu)
                jss = torch.LongTensor(np.arange(num_items)).cuda(gpu)
                _, row1 = T(uss, iss, jss)
                _, row2 = S(uss, iss, jss)
                
                row_mask1 = row1.view(num_items) * mask_matrix[u]            
                rank_list1 = torch.argsort(row_mask1)
                rank_matrix_T[u] = rank_list1
                row_mask2 = row2.view(num_items) * mask_matrix[u]              
                rank_list2 = torch.argsort(row_mask2)
                rank_matrix_S[u] = rank_list2 

            ranklist_T = torch.zeros_like(rank_matrix_T)
            for i in range(len(ranklist_T)):
                row = rank_matrix_T[i]
                ranklist_T[i][row] = torch.LongTensor(np.arange(len(row))) + 1
            ranklist_S = torch.zeros_like(rank_matrix_S)
            for i in range(len(ranklist_S)):
                row = rank_matrix_S[i]
                ranklist_S[i][row] = torch.LongTensor(np.arange(len(row))) + 1
            
            rank_dif_T = ranklist_T - ranklist_S
            rank_dif_S = ranklist_S - ranklist_T

            weights_T = torch.exp(rank_dif_T.type(torch.FloatTensor) * eps).cuda(gpu)
            weights_S = torch.tanh(torch.max(rank_dif_S.type(torch.FloatTensor) * eps_t, torch.zeros_like(rank_dif_S).type(torch.FloatTensor))).cuda(gpu)

    T.train()
    S.train()
    # training
    for batch_idx, pairs in enumerate(train_loader):
        u, i, j_T, j_S = get_bd_instances(pairs, neg_KD)
        u, i, j_T, j_S = u.cuda(gpu), i.cuda(gpu), j_T.cuda(gpu), j_S.cuda(gpu)

        ### train
        optimizer_T.zero_grad()
        optimizer_S.zero_grad()

        # For BD T->S
        _, neg2T = T(u, i, j_S)
        pos2, neg2 = S(u, i, j_S)
        # For BD S->T
        _, neg1S = S(u, i, j_T)
        pos1, neg1 = T(u, i, j_T) 

        # loss for T
        loss_T_CF = - torch.log(torch.sigmoid(pos1 - neg1)).sum()
        pseudo_label = torch.sigmoid(neg1S / temp_S).detach() 
        loss_T_WS = criterion(neg1, pseudo_label)
        loss_T = loss_T_CF * lamb_CF + loss_T_WS * lamb_T
        loss_T.backward()
        optimizer_T.step()

        # loss for S
        loss_S_CF = - torch.log(torch.sigmoid(pos2 - neg2)).sum()
        pseudo_label = torch.sigmoid(neg2T / temp_T).detach() 
        loss_S_WS = criterion(neg2, pseudo_label)
        loss_S = loss_S_CF * lamb_CF + loss_S_WS * lamb_S
        loss_S.backward()
        optimizer_S.step()
        
        #loss_train[0] += loss.cpu().tolist() 
        loss_train[1] += loss_T.cpu().tolist() 
        loss_train[2] += loss_S.cpu().tolist()
    loss_train /= len(train_loader)

    if epoch % verbose == 0:
        print('epoch = {}, loss = {:.3f}+{:.3f}= {:.3f}, time = {:.4f}'.format(epoch, loss_train[1], loss_train[2], loss_train[0], time.time()-t0))
        ## full val
        rank_T = []
        for row in test:
            row = row.numpy()
            rank_T.append(num_items - np.where(rank_matrix_T[row[0]] == row[1])[0][0])
        rank_T = np.array(rank_T)
        ndcg = 1 / np.log2(rank_T + 2)

        print("{:.4f}, {:.4f}, {:.4f}, {:.4f}, {:.4f}, {:.4f}".format((rank_T < 50).mean(), (rank_T < 100).mean(), (rank_T < 200).mean(), (ndcg * (rank_T < 50)).mean(), (ndcg * (rank_T < 100)).mean(), (ndcg * (rank_T < 200)).mean()))
    
        rank_S = []
        for row in test:
            row = row.numpy()
            rank_S.append(num_items - np.where(rank_matrix_S[row[0]] == row[1])[0][0])
        rank_S = np.array(rank_S)
        ndcg = 1 / np.log2(rank_S + 2)

        print("{:.4f}, {:.4f}, {:.4f}, {:.4f}, {:.4f}, {:.4f}".format((rank_S < 50).mean(), (rank_S < 100).mean(), (rank_S < 200).mean(), (ndcg * (rank_S < 50)).mean(), (ndcg * (rank_S < 100)).mean(), (ndcg * (rank_S < 200)).mean()))                                                

1520300
152030
epoch = 0, loss = 56.920+192.609= 0.000, time = 21.2252
0.1152, 0.1774, 0.2684, 0.0286, 0.0386, 0.0513
0.0680, 0.1106, 0.1757, 0.0159, 0.0228, 0.0319
epoch = 50, loss = 48.574+89.676= 0.000, time = 21.4700
0.1422, 0.2159, 0.3114, 0.0409, 0.0528, 0.0661
0.0879, 0.1307, 0.1933, 0.0224, 0.0293, 0.0381
epoch = 100, loss = 49.312+78.750= 0.000, time = 22.0093
0.1429, 0.2219, 0.3232, 0.0412, 0.0540, 0.0681
0.0849, 0.1257, 0.1997, 0.0223, 0.0289, 0.0392
