In [53]:
import os
import numpy as np
import pandas as pd
import random
from math import ceil
from tqdm import trange
from subprocess import call
from itertools import islice
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix, dok_matrix

In [54]:
os.chdir('/home2/skylee/GP')
dataset = ['epinion82', 'epinion91', 'librarything82', 'librarything91'][0]
dataset

'epinion82'

In [55]:
users_list = [] 
items_list = [] 

with open('./preprocessing_data/' + dataset + '/network.txt', 'r') as f:
    for line in f: # read rest of numbers
        users_list.append(int(line.split()[0]))
        items_list.append(list(map(int,line.split()[1:])))

print(users_list[2])
print(items_list[2])

8
[14394]


In [56]:
user_max = max(users_list)
item_max = 0

for i in range(len(items_list)):
    if max(items_list[i]) > item_max:
        item_max = max(items_list[i])

max_index = max(user_max, item_max)

print(user_max)
print(item_max)
print(max_index)

16621
16621
16621


In [57]:
matrix = pd.DataFrame(np.zeros((max_index+1, max_index+1)))
matrix.shape

(16622, 16622)

In [58]:
num_user = len(users_list)
count = 0

for i in range(num_user):
    for j in range(len(items_list[i])):
        matrix.loc[users_list[i], int(items_list[i][j])] = 1
    count += len(items_list[i])
count

47118

In [59]:
# Test for symmetry
np.allclose(matrix, matrix.T, rtol=1e-8, atol=1e-8)

True

In [23]:
X = csr_matrix(matrix)
X

<39918x39918 sparse matrix of type '<class 'numpy.float64'>'
	with 75947 stored elements in Compressed Sparse Row format>

In [11]:
class BPR:
    def __init__(self, learning_rate = 0.01, n_factors = 15, n_iters = 10,
                 batch_size = 1000, reg = 0.01, seed = 1234, verbose = True):
        self.reg = reg
        self.seed = seed
        self.verbose = verbose
        self.n_iters = n_iters
        self.n_factors = n_factors
        self.batch_size = batch_size
        self.learning_rate = learning_rate

        # to avoid re-computation
        self.prediction = None

    def fit(self, ratings):
        indptr = ratings.indptr #value counts in each row
        indices = ratings.indices
        n_users, n_items = ratings.shape

        # batch size should be smaller than number of users
        batch_size = self.batch_size
        if n_users < batch_size:
            batch_size = n_users

        batch_iters = n_users // batch_size

        # initialize random weights
        rstate = np.random.RandomState(self.seed)
        self.user_factors = rstate.normal(size = (n_users, self.n_factors))
        self.item_factors = rstate.normal(size = (n_items, self.n_factors))

        # progress bar
        loop = range(self.n_iters)
        if self.verbose:
            loop = trange(self.n_iters, desc = self.__class__.__name__)
        
        for _ in loop:
            for _ in range(batch_iters):
                sampled = self.sample(n_users, n_items, indices, indptr)
                sampled_users, sampled_pos_items, sampled_neg_items = sampled
                self.update(sampled_users, sampled_pos_items, sampled_neg_items)

        return self


    def sample(self, n_users, n_items, indices, indptr):
        """
        sample batches of random triplets u, i, j
        """
        sampled_pos_items = np.zeros(self.batch_size, dtype=np.int)
        sampled_neg_items = np.zeros(self.batch_size, dtype=np.int)
        sampled_users = random.sample(users_list, self.batch_size)
        sampled_users_list = []

        for idx, user in enumerate(sampled_users):
            sampled_users_list.append(user)

            pos_items = indices[indptr[user]:indptr[user+1]]
            pos_item = np.random.choice(pos_items)
            neg_item = np.random.choice(n_items)


            while neg_item in pos_items:
                neg_item = np.random.choice(n_items)

            sampled_pos_items[idx] = pos_item
            sampled_neg_items[idx] = neg_item

        return sampled_users_list, sampled_pos_items, sampled_neg_items
    
    def update(self, u, i, j):
        """
        update according to the bootstrapped user u,
        positive item i and negative item j
        """
        user_u = self.user_factors[u]
        item_i = self.item_factors[i]
        item_j = self.item_factors[j]

        
        r_uij = np.sum(user_u * (item_i - item_j), axis = 1)
        sigmoid = np.exp(-r_uij) / (1.0 + np.exp(-r_uij))

        # dimenstion matching
        sigmoid_tiled = np.tile(sigmoid, (self.n_factors, 1)).T

        # update using gradient descent
        grad_u = sigmoid_tiled * (item_j - item_i) + self.reg * user_u
        grad_i = sigmoid_tiled * -user_u + self.reg * item_i
        grad_j = sigmoid_tiled * user_u + self.reg * item_j
        self.user_factors[u] -= self.learning_rate * grad_u
        self.item_factors[i] -= self.learning_rate * grad_i
        self.item_factors[j] -= self.learning_rate * grad_j

        return self

    def predict(self):
        """
        Obtain the predicted ratingfs for every users and items
        """        
        
        if self.prediction is None:
            self.prediction = self.user_factors.dot(self.item_factors.T)

        return self.prediction

    def predict_user(self, user):
        """
        Returns the predicted ratings for the specified user
        """
        user_pred = self.user_factors[user].dot(self.item_factors.T)
        
        return user_pred

    def recommend(self, ratings, N = 5):
        """
        Returns the top N ranked items for given user id
        """

        n_users = ratings.shape[0]
        recommendation = np.zeros((n_users, N), dtype = np.uint32)
        for user in range(n_users):
            top_n = self.recommend_user(ratings, user, N)
            recommendation[user] = top_n

        return recommendation

    def recommend_user(self, ratings, user, N):
        """
        Top-N ranked items for specific user
        """
        scores = self.predict_user(user)

        liked = set(ratings[user].indices)
        count = N + len(liked)
        if count < scores.shape[0]:
            ids = np.argpartition(scores, -count)[-count:] # 큰 숫자 count개수 만큼 오른쪽으로 정렬, 인덱스 반환
            best_ids = np.argsort(scores[ids])[::-1] # 오름차순으로 정렬 후, 오른쪽 값 부터 반대방향으로 반환
            best = ids[best_ids]

        else:
            best = np.argsort(scores)[::-1]

        top_n = list(islice((rec for rec in best if rec not in liked), N))

        return top_n
    
    def get_similar_items(self, N=5, item_ids=None):
        """
        return the top N similar items for itemid by cosine distance
        """
        normed_factors = normalize(self.item_factors)
        knn = NearestNeighbors(n_neighbors = N + 1, metric = 'euclidean')
        knn.fit(normed_factors)

        if item_ids is not None:
            normed_factors = normed_factors[item_ids]

        _, items = knn.kneighbors(normed_factors)
        similar_items = items[:, 1:].astype(np.uint32)

        return similar_items

In [12]:
bpr_params1 = {'reg': 0.01,
              'learning_rate': 0.1,
              'n_iters': 10,
              'n_factors': 15,
              'batch_size': 100}

bpr10 = BPR(**bpr_params1)
bpr10.fit(X)

BPR: 100%|██████████| 10/10 [00:02<00:00,  4.55it/s]


<__main__.BPR at 0x7f8d15cd5630>

In [13]:
bpr_params2 = {'reg': 0.01,
              'learning_rate': 0.1,
              'n_iters': 50,
              'n_factors': 15,
              'batch_size': 100}

bpr50 = BPR(**bpr_params2)
bpr50.fit(X)

BPR: 100%|██████████| 50/50 [00:10<00:00,  4.60it/s]


<__main__.BPR at 0x7f8d15f172e8>

In [14]:
bpr_params3 = {'reg': 0.01,
              'learning_rate': 0.1,
              'n_iters': 100,
              'n_factors': 15,
              'batch_size': 100}

bpr100 = BPR(**bpr_params3)
bpr100.fit(X)

BPR: 100%|██████████| 100/100 [00:25<00:00,  3.90it/s]


<__main__.BPR at 0x7f8d15f174a8>

In [15]:
array10 = bpr10.recommend(X, N = 20)
array10

array([[ 1361, 12491,  6261, ..., 11605,  6207,  1688],
       [13858, 16614,  6597, ..., 15481, 11456, 13518],
       [ 9209,  5372,   429, ...,  8046,  7782,  8880],
       ...,
       [13027, 15701, 10520, ...,  1701,  6184,  7742],
       [10599, 13027,  4888, ...,  5565,  4167,   575],
       [ 2770,   432, 15743, ..., 12983,  2346, 11395]], dtype=uint32)

In [16]:
array50 = bpr50.recommend(X, N = 20)
array50

array([[ 1361,  6261,  3272, ..., 15277, 12491, 16252],
       [13858,  1776,  9132, ..., 11456,   610, 11580],
       [ 9819,  5106,  7121, ...,  8652,  8147,  1041],
       ...,
       [13027, 10520,  6214, ...,  6872, 14679,  4987],
       [13027,   575, 10599, ...,  4453, 12121,  9583],
       [ 1635,  2531,  2770, ...,  2793,  5119,  9347]], dtype=uint32)

In [17]:
array100 = bpr100.recommend(X, N = 20)
array100

array([[12491,  7840,  3272, ...,  1688,   173, 16320],
       [13858,  1776,  9709, ..., 12005, 16364,  5575],
       [ 8147,   659,  8249, ...,  4274, 14351,  7750],
       ...,
       [13027, 10520,  1578, ..., 14291,  4828,  8734],
       [12886, 13027,  3488, ..., 16347,  4888,  2674],
       [10598,  5119,  1635, ...,  6556, 13115,  9986]], dtype=uint32)

In [18]:
top20 = array100[users_list,:]
top20.shape


(4899, 20)

In [19]:
array100.shape

(16622, 20)

In [21]:
link_with_recom = {}
temp_list = items_list

for i in range(len(temp_list)):
    user = users_list[i] 
    temp_list[i].extend(top20[i])
    link_with_recom[user] = temp_list[i]

len(link_with_recom)

4899

In [22]:
#기존 친구가 없던 유저에게도 랜덤 초기값 바탕의 추천해주는 경우 
link_with_recom_all = {}

for i in range(len(array100)):
    if i in set(users_list):
        link_with_recom_all[i] = link_with_recom[i]
    else:
        link_with_recom_all[i] = list(array100[i])

len(link_with_recom_all)

16622

In [23]:
with open("./dataset/epinion82/social_relations.txt", "w") as f:
    for k, v in link_with_recom.items():
        string = ""
        string += str(k)
        for i in v:
            string += " "
            string += str(i)
        string += '\n'
        f.write(string)

In [21]:
'''
with open("social_relations_all.txt", "w") as f:
    for k, v in link_with_recom_all.items():
        string = ""
        string += str(k)
        for i in v:
            string += " "
            string += str(i)
        string += '\n'
        f.write(string)
'''

In [None]:
def auc_score(model, bpr_ratings):
    """
    computes area under the ROC curve.
    """
    auc = 0.0
    n_users, n_items = bpr_ratings.shape
    count = 0
    
    #for u in len(users_list):
    #    y_pred = model.predict_user(users_list[u])
    #    y_true = np.zeros(n_items)
    #    y_true[row.indices] = 1
    #    auc += roc_auc_score(y_true, y_pred)

    for user, row in enumerate(bpr_ratings):
        if len(row.indices) == 0:
            continue
        y_pred = model.predict_user(user)
        y_true = np.zeros(n_items)
        y_true[row.indices] = 1
        auc += roc_auc_score(y_true, y_pred)
        count += 1
    
    auc /= count

    return auc

In [None]:
print('Epoch 10: ', auc_score(bpr10, X))
#print(auc_score(bpr, X_train))
#print('Epoch 10: ', auc_score(bpr10, X_test))
print()

print('Epoch 50: ', auc_score(bpr50, X))
#print(auc_score(bpr, X_train))
#print('Epoch 50: ', auc_score(bpr50, X_test))
print()

print('Epoch 100: ', auc_score(bpr100, X))
#print(auc_score(bpr, X_train))
#print('Epoch 100: ', auc_score(bpr100, X_test))