In [1]:
import os

# Expose only the GPU with index 1 to this process. This cell (In [1]) comes
# before the cells that import torch, so the variable is already set when
# torch is loaded.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})


In [2]:
# Hyper-parameters and paths shared by the data, model and training cells.
config = dict(
    batch_size=1024,        # training mini-batch size (the test loader uses twice this)
    path='./',              # directory prefix for train.txt and test.txt
    neg_item=1,             # negatives sampled per user in every epoch
    n_embedding=64,         # width of the user / item embedding tables
    topk=10,                # K used for Hit@K and NDCG@K
    core=8,                 # number of worker processes in the multiprocessing pool
    regularization=0.005,   # weight of the L2 term added to the BPR loss
    lr=0.01,                # learning rate passed to Adam
)

In [6]:
# NOTE: this cell relies on EpinionData / EpinionTest / NGCF /
# sparse_mx_to_torch_sparse_tensor and on the torch / scipy / numpy imports,
# all of which live in the cells numbered In [3]-In [5]. Run those first.
import multiprocessing  # used below but not imported anywhere else in the notebook

# Worker pool handed to test(); sized by config['core'].
pool = multiprocessing.Pool(config['core'])

# Training split: load_data() reads train.txt and builds the normalised
# adjacency (train_data.L_c) consumed by the model.
train_data = EpinionData(config)
train_data.load_data()
model = NGCF(train_data.n_user, train_data.n_item, n_embedding=config['n_embedding']).cuda()

# Test split: the dataset yields user ids, evaluated in un-shuffled batches.
test_data = EpinionTest(config)
test_data.load_test()
test_dataloader = DataLoader(test_data, batch_size=config['batch_size'] * 2, shuffle=False)

optim = torch.optim.Adam(model.parameters(), lr=config['lr'])

# Sparse identity that the model adds to the adjacency (self-connections).
# np.float32 replaces the `np.float` alias, which was removed in NumPy 1.24;
# the converter below casts to float32 anyway, so the tensor is unchanged.
sparse_eye = sp.eye(train_data.n_user + train_data.n_item, dtype=np.float32)
sparse_eye = sparse_mx_to_torch_sparse_tensor(sparse_eye).cuda()



In [7]:
N_EPOCHS = 150
LOG_EVERY = 10  # print the loss and evaluate once every LOG_EVERY epochs

for e in range(N_EPOCHS):
    # Draw a fresh positive and negative item for every training user.
    train_data.make_batch_sampling()
    dataloader = DataLoader(train_data, batch_size=config['batch_size'], shuffle=True)
    total_loss = 0.0
    start = timer()

    # test() leaves the model in eval mode, so switch back once per epoch
    # (previously this was re-done on every batch).
    model.train()
    for user, pos, neg in dataloader:
        optim.zero_grad()
        user = torch.LongTensor(user).cuda()
        pos_item = torch.LongTensor(pos).cuda()
        neg_item = torch.LongTensor(neg).cuda()

        # Full-graph propagation, then gather the rows needed for this batch.
        user_embedding, item_embedding = model(train_data.L_c, sparse_eye)
        user_batch_embed = user_embedding[user]
        p_i_batch_embed = item_embedding[pos_item]
        n_i_batch_embed = item_embedding[neg_item]

        # Normalise the regulariser by the actual batch size: the last batch
        # of an epoch is usually smaller than config['batch_size'].
        bpr_loss, reg_loss = bpr(user.shape[0], user_batch_embed, p_i_batch_embed, n_i_batch_embed, config['regularization'])
        loss = bpr_loss + reg_loss
        loss.backward()
        optim.step()
        total_loss += loss.item()  # sum (not mean) of the batch losses

    # Evaluation results were only ever printed on reporting epochs, so the
    # evaluation itself now runs only on those epochs as well.
    if e % LOG_EVERY == 0:
        print("Epoch : {:d}, Loss : {:4f}, Time : {:4f}".format(e, total_loss, timer()-start))
        test_timer = timer()
        hit, ndcg = test(model, train_data, test_dataloader, pool, sparse_eye)
        print("Epoch : {:d}, Hit@{:d} : {:4f},NDCG@{:d} : {:4f} Time : {:4f}".format(e, config['topk'],hit, config['topk'], ndcg,timer()-test_timer))
        


Epoch : 0, Loss : 14.602896, Time : 1.084796
Epoch : 0, Hit@10 : 0.000512,NDCG@10 : 0.000604 Time : 3.520539
Epoch : 10, Loss : 3.149358, Time : 1.002570
Epoch : 10, Hit@10 : 0.002016,NDCG@10 : 0.002352 Time : 3.151343
Epoch : 20, Loss : 2.170063, Time : 1.011023
Epoch : 20, Hit@10 : 0.001505,NDCG@10 : 0.001757 Time : 3.141455
Epoch : 30, Loss : 1.899260, Time : 1.084933
Epoch : 30, Hit@10 : 0.001414,NDCG@10 : 0.001641 Time : 3.158865
Epoch : 40, Loss : 1.818218, Time : 0.839374
Epoch : 40, Hit@10 : 0.001505,NDCG@10 : 0.001781 Time : 4.342403
Epoch : 50, Loss : 1.778883, Time : 0.841014
Epoch : 50, Hit@10 : 0.001475,NDCG@10 : 0.001739 Time : 4.491719
Epoch : 60, Loss : 1.751278, Time : 0.959894
Epoch : 60, Hit@10 : 0.001354,NDCG@10 : 0.001562 Time : 4.440440
Epoch : 70, Loss : 1.722532, Time : 0.843796
Epoch : 70, Hit@10 : 0.001445,NDCG@10 : 0.001689 Time : 4.417083
Epoch : 80, Loss : 1.717094, Time : 0.837045
Epoch : 80, Hit@10 : 0.001414,NDCG@10 : 0.001598 Time : 4.547980
Epoch : 90,

In [3]:
from Data import EpinionData, EpinionTest
from torch.utils.data import DataLoader
import torch  
from Model import NGCF
from utility import bpr, test
from timeit import default_timer as timer
import torch.nn.functional as F
import random
import scipy.sparse as sp
import numpy as np
import pandas as pd

def bpr(batch_size, user, pos_item, neg_item, regularization):
    """Bayesian Personalised Ranking loss with an L2 penalty on the embeddings.

    Args:
        batch_size: divisor applied to the summed squared norms.
        user, pos_item, neg_item: embedding tensors that are multiplied
            element-wise and reduced over dim 1 to obtain one score per row.
        regularization: scalar weight of the L2 term.

    Returns:
        (bpr_loss, regularizer_loss): the negated mean log-sigmoid of the
        positive-minus-negative score gap, and the weighted L2 term.
    """
    # Dot-product score of each user with its positive / negative item.
    score_gap = (user * pos_item).sum(dim=1) - (user * neg_item).sum(dim=1)
    bpr_loss = -F.logsigmoid(score_gap).mean()

    # Half the summed squared entries of the three embedding batches.
    half_sq_norm = sum(0.5 * (emb ** 2).sum() for emb in (user, pos_item, neg_item))
    regularizer_loss = (half_sq_norm / batch_size) * regularization
    return bpr_loss, regularizer_loss


def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """Convert a scipy sparse matrix to a float32 torch sparse COO tensor.

    Args:
        sparse_mx: any scipy sparse matrix exposing ``tocoo()``.

    Returns:
        A sparse COO tensor with the same shape and non-zero entries, values
        cast to float32 and indices stored as int64.
    """
    coo = sparse_mx.tocoo().astype(np.float32)
    # Row indices stacked over column indices -> shape (2, nnz).
    indices = torch.from_numpy(np.vstack((coo.row, coo.col)).astype(np.int64))
    values = torch.from_numpy(coo.data)
    # torch.sparse_coo_tensor replaces the deprecated legacy constructor
    # torch.sparse.FloatTensor; the dtype follows `values` (float32).
    return torch.sparse_coo_tensor(indices, values, torch.Size(coo.shape))


def test(model, train_module, test_dataloader, pool, eye):
    """Evaluate `model` on the users served by `test_dataloader`.

    Args:
        model: network whose forward pass returns (user_embedding, item_embedding).
        train_module: training dataset; supplies L_c, n_item, topk and train_items.
        test_dataloader: loader yielding batches of user ids; its dataset
            supplies n_test and the per-user test items.
        pool: unused; kept so existing call sites keep working.
        eye: sparse identity forwarded to the model.

    Returns:
        (hit, ndcg): the number of test items found in the users' top-k lists
        divided by the total number of test interactions, and the summed
        per-user NDCG divided by model.n_user.
    """
    model.eval()
    test_dataset = test_dataloader.dataset
    # Read once up front; it used to be assigned inside the batch loop, which
    # left it unbound when the loader produced no batches.
    length = float(test_dataset.n_test)

    total_hit_rate = []
    total_ndcg = []
    with torch.no_grad():
        user_embedding, item_embedding = model(train_module.L_c, eye)
        # Loop-invariant: the (transposed) item table is shared by all batches.
        item_embed_t = torch.transpose(item_embedding[:train_module.n_item], 0, 1)

        for batch_test in test_dataloader:
            user_batch = batch_test.cuda()
            rate_batch = torch.matmul(user_embedding[user_batch], item_embed_t)
            # Keep items already seen during training out of the ranking.
            make_pos_minus(user_batch, rate_batch, train_module.train_items)

            topk_item = torch.topk(rate_batch, k=train_module.topk, dim=1).indices
            topk_item = topk_item.detach().cpu().numpy()

            # One device-to-host transfer instead of one .item() call per user.
            for i, user in enumerate(user_batch.tolist()):
                hit, ndcg = test_one_user(user, topk_item[i], test_dataset)
                total_hit_rate += hit  # extends with the per-item hit flags
                total_ndcg.append(ndcg)

    hit_rate = np.sum(total_hit_rate) / length if length else 0.0
    # NOTE(review): NDCG is averaged over model.n_user (all users known to the
    # model), not over the number of evaluated users - confirm this is intended.
    return hit_rate, np.sum(total_ndcg) / model.n_user



def make_pos_minus(user_batch, rate_batch, pos_train_item):
    """Overwrite, in place, the scores of each user's training items with -10000.0.

    Args:
        user_batch: 1-D tensor of user ids; row i of `rate_batch` belongs to
            user_batch[i].
        rate_batch: score tensor indexed as [row, item]; modified in place.
        pos_train_item: mapping from user id to the item ids to mask.
    """
    user_ids = (entry.item() for entry in user_batch)
    for row, uid in enumerate(user_ids):
        seen_items = pos_train_item[uid]
        rate_batch[row, seen_items] = -10000.0

def test_one_user(user, topk_item, test_dataset):
    # hit rate
    pos_test_items = test_dataset.test_items[user]
    hit_list = []
    for pos in pos_test_items:
        if pos in topk_item:
            hit_list.append(1)
        else:
            hit_list.append(0)
    # NDCG
    reverse_list = np.asfarray(sorted(hit_list, reverse=True))
    IDCG = np.sum(reverse_list / np.log2(np.arange(2, len(reverse_list)+2)))
    DCG = np.sum(hit_list / np.log2(np.arange(2, len(hit_list)+2)))
    NDCG = 0.0 if DCG == 0.0 else DCG/IDCG
        

    return hit_list, NDCG

In [4]:
import torch.nn as nn
import torch
import numpy as np

class NGCF(nn.Module):
    """Graph collaborative-filtering model over a joint user-item graph.

    Every layer propagates the stacked user/item embeddings through the sparse
    matrix ``H + I``, applies two linear maps (one to the propagated signal,
    one to its element-wise product with the current embeddings), sums them
    and applies dropout. The initial embeddings and the L2-normalised output
    of every layer are concatenated along the feature axis.

    Args:
        n_user: number of users (rows of the user embedding table).
        n_item: number of items (rows of the item embedding table).
        n_embedding: embedding width; also the in/out width of every layer.
            (The layers used to be hard-coded to 64, so any other value
            failed inside ``forward``.)
        n_layers: number of propagation layers (default 3, as before).
        dropout: dropout probability applied after every layer (default 0.5,
            as before).
    """

    def __init__(self, n_user, n_item, n_embedding, n_layers=3, dropout=0.5):
        super(NGCF, self).__init__()
        self.n_user = n_user
        self.n_item = n_item
        self.n_embedding = n_embedding
        self.n_layers = n_layers
        self.user_embedding = nn.Embedding(self.n_user, self.n_embedding)
        self.item_embedding = nn.Embedding(self.n_item, self.n_embedding)
        nn.init.xavier_uniform_(self.user_embedding.weight)
        nn.init.xavier_uniform_(self.item_embedding.weight)
        self.Front_Linear_List = nn.ModuleList()
        self.Back_Linear_List = nn.ModuleList()
        self.dropout_list = nn.ModuleList()
        for _ in range(self.n_layers):
            self.Front_Linear_List.append(nn.Linear(self.n_embedding, self.n_embedding))
            self.Back_Linear_List.append(nn.Linear(self.n_embedding, self.n_embedding))
            self.dropout_list.append(nn.Dropout(p=dropout))

    def forward(self, H, sparse_eye):
        """Propagate the embeddings and return the per-layer concatenation.

        Args:
            H: sparse square matrix over the stacked (users, then items) nodes.
            sparse_eye: sparse identity of the same shape, added to ``H``.

        Returns:
            (user_embedding, item_embedding) with ``n_user`` and ``n_item``
            rows respectively and ``n_embedding * (n_layers + 1)`` columns.
        """
        E_l_embedding = torch.cat((self.user_embedding.weight, self.item_embedding.weight), dim=0)
        all_embedding = [E_l_embedding]
        H_I = H + sparse_eye
        for i in range(self.n_layers):
            # Neighbourhood (plus self) aggregation.
            Front = torch.sparse.mm(H_I, E_l_embedding)
            Front_cal = nn.functional.leaky_relu(self.Front_Linear_List[i](Front))
            # Interaction term between a node and its aggregated neighbourhood.
            Back = torch.mul(E_l_embedding, Front)
            Back = nn.functional.leaky_relu(self.Back_Linear_List[i](Back))
            E_l_embedding = Front_cal + Back
            E_l_embedding = self.dropout_list[i](E_l_embedding)
            # Only the copy kept for the output is normalised; the next layer
            # consumes the un-normalised embeddings.
            normalize_embed = nn.functional.normalize(E_l_embedding, p=2, dim=1)
            all_embedding += [normalize_embed]

        all_embedding = torch.cat(all_embedding, dim=1)
        user_embedding, item_embedding = torch.split(all_embedding, [self.n_user, self.n_item], dim=0)
        return user_embedding, item_embedding
            


In [5]:
import pandas as pd
import pickle
import scipy.sparse as sp
import numpy as np
import random
from torch.utils.data import Dataset

class EpinionTest(Dataset):
    """Held-out interactions; iterating the dataset yields test user ids.

    ``load_test()`` reads ``<path>/test.txt``, where every line is
    ``user_id item_id item_id ...`` separated by whitespace.

    Attributes:
        n_test: total number of held-out (user, item) pairs read.
        test_items: mapping user id -> list of held-out item ids.
        test_users: LongTensor with the keys of ``test_items``.
    """

    def __init__(self, config):
        self.n_test = 0
        self.test_items = {}
        self.path = config['path']
        # Empty until load_test() runs; defined here so the attribute always exists.
        self.test_users = torch.LongTensor([])

    def __len__(self):
        return len(self.test_items)

    def __getitem__(self, index):
        return self.test_users[index]

    def load_test(self):
        """Parse test.txt into ``test_items``, ``test_users`` and ``n_test``."""
        # Start from scratch so that re-running the cell does not double-count.
        self.n_test = 0
        self.test_items = {}
        with open(self.path+'/test.txt') as f:
            for line in f:
                # split() tolerates trailing spaces and blank lines, both of
                # which used to raise ValueError on int('').
                tokens = line.split()
                if not tokens:
                    continue
                items = [int(tok) for tok in tokens[1:]]
                self.n_test += len(items)
                self.test_items[int(tokens[0])] = items
        self.test_users = torch.LongTensor(list(self.test_items.keys()))

        

class EpinionData(Dataset):
    """Training interactions plus the normalised user-item graph.

    ``load_data()`` reads ``<path>/train.txt`` (one line per user:
    ``user_id item_id item_id ...``), fills the interaction matrix ``R`` and
    builds ``L_c``, the symmetrically normalised adjacency of the bipartite
    graph, as a torch sparse tensor on the GPU.

    ``make_batch_sampling()`` then draws one positive item and
    ``n_neg_item`` negative items per entry of ``exist_users``; indexing the
    dataset returns ``(user, positive item, negative item)``.
    """

    def __init__(self, config):
        self.path = config['path']
        self.batch_size = config['batch_size']
        self.topk = config['topk']
        self.n_user = 0
        self.n_item = 0
        self.n_train = 0
        self.n_test = 0
        self.train_items = {}
        self.test_items = {}
        # Over-sized placeholder that load_data() shrinks to (n_user, n_item)
        # once the largest ids are known; ids must therefore be < 10,000,000.
        self.R = sp.dok_matrix((10000000, 10000000), dtype=np.float32)
        self.n_neg_item = config['neg_item']
        self.pos_item = []
        self.neg_item = []
        # Filled by load_data(); defined here so len() works beforehand.
        self.exist_users = []

    def __len__(self):
        return len(self.exist_users)

    def __getitem__(self, index):
        return self.exist_users[index], self.pos_item[index], self.neg_item[index]

    def load_data(self):
        """Read train.txt, fill ``R`` / ``train_items`` and build ``L_c``."""
        self.exist_users = []

        with open(self.path+'/train.txt') as f:
            for line in f:
                # split() tolerates trailing whitespace; lines without at
                # least one item are skipped (they used to crash on int('')
                # or on max() of an empty list).
                tokens = line.split()
                if len(tokens) < 2:
                    continue
                uid = int(tokens[0])
                items = [int(tok) for tok in tokens[1:]]
                self.R[uid, items] = 1.0
                self.train_items[uid] = items
                self.exist_users.append(uid)
                self.n_item = max(self.n_item, max(items))
                self.n_user = max(self.n_user, uid)
                self.n_train += len(items)

        # Ids are zero-based, so the counts are the largest id plus one.
        self.n_user += 1
        self.n_item += 1
        self.R.resize((self.n_user, self.n_item))

        self.Item_By_Item = sp.dok_matrix((self.n_item, self.n_item), dtype = np.float32)
        self.User_By_User = sp.dok_matrix((self.n_user, self.n_user), dtype = np.float32)
        self.H = sp.dok_matrix((self.n_user+self.n_item, self.n_user+self.n_item), dtype = np.float32)
        self.normalize_H()

    def normalize_H(self):
        """Assemble the bipartite adjacency ``H`` and store D^-1/2 H D^-1/2 in ``L_c``."""
        self.H = self.H.tolil()
        self.R = self.R.tolil()
        # Users occupy the first n_user rows/columns, items the remaining ones.
        self.H[:self.n_user, self.n_user:] = self.R
        self.H[self.n_user:, :self.n_user] = self.R.T
        # The user-user and item-item blocks stay empty; matrices for them
        # could be inserted here:
        # self.H[:self.n_user, :self.n_user] = User Matrix
        # self.H[self.n_user:, self.n_user:] = Item Matrix
        self.H = self.H.todok()

        # Symmetric degree normalisation; isolated nodes (degree 0) produce
        # inf, which is reset to 0.
        rowsum = np.array(self.H.sum(1))
        D_inv = np.power(rowsum, -1/2).flatten()
        D_inv[np.isinf(D_inv)] = 0.0
        D_ = sp.diags(D_inv)
        self.L_c = (D_.dot(self.H)).dot(D_)
        self.L_c = sparse_mx_to_torch_sparse_tensor(self.L_c).cuda()

    def make_batch_sampling(self):
        """Draw this epoch's positive and negative item for every user entry."""
        pos_item, neg_item = [], []
        for user in self.exist_users:
            pos_item.append(random.choice(self.train_items[user]))
            neg_item += self.neg_sampling(user)
        self.pos_item = np.asarray(pos_item)
        # Each sample is a length-1 array, so flatten (N, 1) -> (N,). reshape
        # also copes with an empty user list, unlike squeeze(1).
        # NOTE(review): with n_neg_item > 1 this holds n_neg_item entries per
        # user and no longer lines up with __getitem__'s index - confirm
        # before raising config['neg_item'].
        self.neg_item = np.asarray(neg_item).reshape(-1)

    def neg_sampling(self, user):
        """Sample ``n_neg_item`` distinct items the user has not interacted with.

        Returns:
            A list of length-1 integer arrays, one per sampled item.

        Raises:
            ValueError: if the user has fewer than ``n_neg_item`` unseen items
                (the rejection loop would otherwise never finish).
        """
        seen = set(self.train_items[user])
        if self.n_item - len(seen) < self.n_neg_item:
            raise ValueError(
                "user {} has fewer than {} unseen items".format(user, self.n_neg_item))

        neg = []
        chosen = set()
        while len(neg) < self.n_neg_item:
            rand = np.random.randint(0, self.n_item, 1)
            candidate = int(rand[0])
            # Set lookups on plain ints replace array-in-list comparisons.
            if candidate not in seen and candidate not in chosen:
                chosen.add(candidate)
                neg.append(rand)
        return neg


In [8]:
# Number of user rows read from train.txt (exist_users holds one entry per parsed line).
print("user의 수 : ",len(train_data.exist_users))


user의 수 :  27545


In [6]:
# Count the distinct items and the total number of interactions in the
# training split. The loaded training set is `train_data`; the name `data`
# used here before is not defined anywhere in this notebook.
item_set = set()
interaction = 0
for items in train_data.train_items.values():
    item_set.update(items)
    interaction += len(items)

In [7]:
# Report the number of distinct items (item_set) and the total interaction count.
print("item의 수 : ",len(item_set))
print("interaction의 수 : ", interaction)

item의 수 :  37279
interaction의 수 :  99763
