In [2]:
import pandas as pd
import numpy as np
np.random.seed(1000)
import pdb
import matplotlib.pyplot as plt
import cPickle as pkl
from sklearn import preprocessing
import sys

def getuserCache(df):
    """Map every user id to the list of item ids it appears with.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction table with (at least) the columns ``uid`` and ``iid``.

    Returns
    -------
    dict
        ``{uid: [iid, ...]}``. Users are keyed in order of first appearance
        and each item list keeps the row order of ``df``.

    Notes
    -----
    The table is grouped once instead of being re-scanned for every unique
    user, which keeps the cost linear in the number of rows. Rows whose
    ``uid`` is NaN are skipped, because ``groupby`` drops NaN keys by default.
    """
    # sort=False keeps users in first-appearance order, like df.uid.unique().
    return {
        uid: items.tolist()
        for uid, items in df.groupby('uid', sort=False)['iid']
    }
    
def getitemCache(df):
    """Map every item id to the list of user ids it appears with.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction table with (at least) the columns ``uid`` and ``iid``.

    Returns
    -------
    dict
        ``{iid: [uid, ...]}``. Items are keyed in order of first appearance
        and each user list keeps the row order of ``df``.

    Notes
    -----
    The table is grouped once instead of being re-scanned for every unique
    item, which keeps the cost linear in the number of rows. Rows whose
    ``iid`` is NaN are skipped, because ``groupby`` drops NaN keys by default.
    """
    # sort=False keeps items in first-appearance order, like df.iid.unique().
    return {
        iid: users.tolist()
        for iid, users in df.groupby('iid', sort=False)['uid']
    }

def dist(a,b,norm):
    """Return the ``norm``-order norm of the difference ``a - b``.

    ``a`` and ``b`` are subtracted with normal NumPy broadcasting and the
    result is handed to ``np.linalg.norm`` with ``norm`` as its ``ord``
    argument. A 2-D difference is therefore measured with the matrix norm of
    that order, not the flattened vector norm.
    """
    difference = a - b
    return np.linalg.norm(difference, norm)

# For every dataset, measure how often the neighborhood-based translation
# (element-wise product of the user's and the item's neighbor embeddings,
# added to the user embedding) brings the user at least as close to
#   * an item the user interacted with in training ("POS"), and
#   * a randomly sampled item the user never interacted with ("NEG").
datasets = ['delicious','ciao','bookcrossing','cellphone']
for dataset in datasets:
    print("--------------------------")
    print("***[Dataset: %s]***" %dataset.title())
    root = "data/"+dataset

    # Full interaction log: sizes the neighbor tables and tells us which items
    # must be excluded when sampling negatives for a user.
    totalFile = pd.read_csv(root+'/ratings.dat',sep="\t",usecols=[0,1],names=['uid','iid'],header=0)
    total_uids = sorted(totalFile.uid.unique())
    total_iids = sorted(totalFile.iid.unique())

    # Training split (the file name suggests leave-one-out -- TODO confirm).
    trainFile = pd.read_csv(root+'/LOOTrain.dat',sep="\t",usecols=[0,1],names=['uid','iid'],header=0)
    train_uids = sorted(trainFile.uid.unique())
    train_iids = sorted(trainFile.iid.unique())

    # Built once so the evaluation loop below never re-scans a DataFrame.
    userCache = getuserCache(trainFile)
    itemCache = getitemCache(trainFile)
    totalUserCache = getuserCache(totalFile)

    # Decide data type: a third column means explicit ratings are available.
    df_data = pd.read_csv(root+'/u.data',sep="\t")
    if len(df_data.columns) == 3:
        initial_header = ['uid','iid','rating']
    else:
        initial_header = ['uid','iid']
    has_ratings = len(initial_header) == 3

    # Read data
    df_data = pd.read_csv(root+'/u.data',sep="\t",names=initial_header)
    if has_ratings:
        # Taken before any filtering, so it lists every rating value in the raw file.
        ratings_set = sorted(df_data.rating.unique().tolist())
    df_data = df_data.drop_duplicates(['uid','iid']).reset_index(drop=True)

    # Successively drop users with < 5 interactions, items with < 5
    # interactions, and finally users left with < 3 interactions.
    for column, min_count in (('uid', 5), ('iid', 5), ('uid', 3)):
        counts = df_data[column].value_counts()
        kept_ids = counts[counts >= min_count].index
        df_data = df_data.loc[df_data[column].isin(kept_ids)].reset_index(drop=True)

    # map uids and iids from index 0 (in order of first appearance)
    uid_map = {uid: idx for idx, uid in enumerate(df_data.uid.unique())}
    iid_map = {iid: idx for idx, iid in enumerate(df_data.iid.unique())}

    df_data['uid'] = df_data['uid'].map(uid_map)
    df_data['iid'] = df_data['iid'].map(iid_map)

    df_data = df_data.sort_values(['uid','iid']).reset_index(drop=True)

    if has_ratings:
        # Per-user rating lists in (uid, iid) order, grouped once up front.
        ratingsCache = {
            uid: user_ratings.values.tolist()
            for uid, user_ratings in df_data.groupby('uid')['rating']
        }

    # TransCF
    # NOTE: pickle can execute arbitrary code on load; only use trusted model files.
    # Binary mode plus `with` closes the handles and reads the pickles portably.
    with open('model/userEmbedding_TransCF_'+dataset+'.pkl', 'rb') as embedding_file:
        userEmbedding_TransCF = pkl.load(embedding_file)
    with open('model/itemEmbedding_TransCF_'+dataset+'.pkl', 'rb') as embedding_file:
        itemEmbedding_TransCF = pkl.load(embedding_file)

    # Must match the embedding width of the pickled model.
    embedding_dim = 128

    # A user's neighbor embedding is the mean embedding of its training items;
    # users without training data keep an all-zero row.
    userNeighborEmbedding_TransCF = np.zeros((len(total_uids),embedding_dim))
    for uid in train_uids:
        neighborItems = userCache[uid]
        userNeighborEmbedding_TransCF[uid,:] = np.mean(itemEmbedding_TransCF[neighborItems],axis=0)

    # An item's neighbor embedding is the mean embedding of its training users.
    itemNeighborEmbedding_TransCF = np.zeros((len(total_iids),embedding_dim))
    for iid in train_iids:
        neighborUsers = itemCache[iid]
        itemNeighborEmbedding_TransCF[iid,:] = np.mean(userEmbedding_TransCF[neighborUsers],axis=0)

    pos_cnt = 0
    neg_cnt = 0
    total_cnt = 0

    if has_ratings:
        per_rating_correct_cnt = [0] * len(ratings_set)
        per_rating_total_cnt = [0] * len(ratings_set)

    # Loop-invariant: the pool that negatives are drawn from.
    train_iid_set = set(train_iids)

    for uid in train_uids:
        iids = userCache[uid]
        tmp_iids = totalUserCache.get(uid, [])
        if has_ratings:
            # NOTE(review): dropping the last two rows presumably removes the
            # held-out validation/test interactions so the list lines up with
            # `iids`; it also assumes df_data and LOOTrain.dat share the same
            # uid/iid numbering and ordering -- confirm against the split script.
            ratings = ratingsCache.get(uid, [])[:-2]

        # One random negative per positive, drawn (with replacement) from
        # training items the user has never interacted with.
        neg_iids = list(train_iid_set.difference(set(tmp_iids)))
        neg_iids = np.random.choice(neg_iids,len(iids))

        user = userEmbedding_TransCF[uid].reshape(1,-1)
        items = itemEmbedding_TransCF[iids]
        neg_items = itemEmbedding_TransCF[neg_iids]

        neighborUserEmbedding = userNeighborEmbedding_TransCF[uid]

        # Row k holds the user translated towards the k-th positive/negative
        # item; broadcasting replaces the per-item Python loops.
        translated_users = user.ravel() + neighborUserEmbedding * itemNeighborEmbedding_TransCF[iids]
        translated_users_neg = user.ravel() + neighborUserEmbedding * itemNeighborEmbedding_TransCF[neg_iids]

        for idx, pos_item in enumerate(items):
            if has_ratings:
                rat = ratings[idx]
                # A rating of 0 is skipped everywhere except bookcrossing.
                if dataset != 'bookcrossing' and rat == 0:
                    continue
                rating_idx = ratings_set.index(rat)
                per_rating_total_cnt[rating_idx] += 1

            total_cnt+=1
            before_dist_pos = dist(user, pos_item, 2)
            after_dist_pos = dist(translated_users[idx], pos_item, 2)

            before_dist_neg = dist(user, neg_items[idx], 2)
            after_dist_neg = dist(translated_users_neg[idx], neg_items[idx], 2)

            # Counted when the translation did not push the user further away.
            if before_dist_pos >= after_dist_pos:
                pos_cnt+=1
                if has_ratings:
                    per_rating_correct_cnt[rating_idx] += 1

            if before_dist_neg >= after_dist_neg:
                neg_cnt+=1


    if has_ratings:
        print("--------------------------")
        for idx, rat in enumerate(ratings_set):
            if rat == 0:
                continue
            correct = per_rating_correct_cnt[idx]
            seen = per_rating_total_cnt[idx]
            # A rating value may vanish after filtering; report NaN, don't crash.
            acc = correct / float(seen) * 100 if seen else float('nan')
            # str.format does not treat '%' specially, so a single '%' is correct here.
            print("Rating: {} | Acc: {}% ({}/{})".format(rat, np.round(acc,2), correct, seen))

    if total_cnt:
        pos_accuracy = (pos_cnt / float(total_cnt)) * 100
        neg_accuracy = (neg_cnt / float(total_cnt)) * 100
    else:
        # Nothing was evaluated; avoid dividing by zero.
        pos_accuracy = neg_accuracy = float('nan')
    difference = pos_accuracy - neg_accuracy
    print("--------------------------")
    print("POS Accuracy: %.2f%% (%d/%d)" %(pos_accuracy, pos_cnt, total_cnt))
    print("NEG Accuracy: %.2f%% (%d/%d)" %(neg_accuracy, neg_cnt, total_cnt))
    print("--------------------------")
    print("Difference: %.2f%%\n" %(difference))
    
    


--------------------------
***[Dataset: Delicious]***
--------------------------
POS Accuracy: 64.63% (3618/5598)
NEG Accuracy: 43.75% (2449/5598)
--------------------------
Difference: 20.88%

--------------------------
***[Dataset: Ciao]***
--------------------------
Rating: 1 | Acc: 61.53%% (3991/6486)
Rating: 2 | Acc: 51.43%% (3512/6829)
Rating: 3 | Acc: 55.49%% (8414/15162)
Rating: 4 | Acc: 52.27%% (20220/38687)
Rating: 5 | Acc: 55.46%% (36774/66304)
--------------------------
POS Accuracy: 54.63% (72911/133468)
NEG Accuracy: 38.42% (51281/133468)
--------------------------
Difference: 16.21%

--------------------------
***[Dataset: Bookcrossing]***
--------------------------
Rating: 1 | Acc: 59.75%% (380/636)
Rating: 2 | Acc: 58.44%% (606/1037)
Rating: 3 | Acc: 55.19%% (1245/2256)
Rating: 4 | Acc: 53.77%% (1831/3405)
Rating: 5 | Acc: 52.73%% (10283/19500)
Rating: 6 | Acc: 55.2%% (8268/14977)
Rating: 7 | Acc: 56.16%% (18130/32283)
Rating: 8 | Acc: 57.23%% (26714/46676)
Rating: 9 |