In [1]:
import os
import sys
import random
import time

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import numpy as np
import pandas as pd
from scipy import sparse

In [2]:
import yaml

def load_config(config_file):
    with open(config_file, 'r') as stream:
        try:
            config = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
    return config

cfg = load_config('config.yaml')

In [3]:
if torch.cuda.is_available():
    print('CUDA is available')
    cfg['device'] = True

device = torch.device('cuda' if cfg['device'] else 'cpu')
device

CUDA is available


device(type='cuda')

In [4]:
random.seed(cfg['seed'])
np.random.seed(cfg['seed'])
torch.manual_seed(cfg['seed'])


<torch._C.Generator at 0x7fc8f55965f0>

In [5]:
raw_data = pd.read_csv(os.path.join(cfg['DATA_DIR'], cfg['data']), header=0, usecols=[0, 1])

In [6]:
df_user_movie = raw_data[['user', 'item']]
df_user_movie

Unnamed: 0,user,item
0,11,4643
1,11,170
2,11,531
3,11,616
4,11,2140
...,...,...
5154466,138493,44022
5154467,138493,4958
5154468,138493,68319
5154469,138493,40819


In [7]:
def get_count(tp, id):
    playcount_groupbyid = tp[[id]].groupby(id, as_index=False)
    count = playcount_groupbyid.size()
    return count

def filter_triplets(tp, min_uc=5, min_sc=0):
    if min_sc > 0:
        itemcount = get_count(tp, 'item')
        tp = tp[tp['item'].isin(itemcount[itemcount['size'] >= min_sc]['item'])]

    if min_uc > 0:
        usercount = get_count(tp, 'user')
        tp = tp[tp['user'].isin(usercount[usercount['size'] >= min_uc]['user'])]

    usercount, itemcount = get_count(tp, 'user'), get_count(tp, 'item')
    return tp, usercount, itemcount

def split_train_test_proportion(data, test_prop=0.2):
    data_grouped_by_user = data.groupby('user')
    tr_list, te_list = list(), list()

    for _, group in data_grouped_by_user:
        n_items_u = len(group)

        if n_items_u >= 5:
            idx = np.zeros(n_items_u, dtype='bool')
            idx[np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False).astype('int64')] = True

            tr_list.append(group[np.logical_not(idx)])
            te_list.append(group[idx])

        else:
            tr_list.append(group)

    data_tr = pd.concat(tr_list)
    data_te = pd.concat(te_list)

    return data_tr, data_te

def numerize(tp, user2id, item2id):
    uid = tp['user'].apply(lambda x: user2id[x])
    sid = tp['item'].apply(lambda x: item2id[x])
    return pd.DataFrame(data={'uid': uid, 'sid': sid}, columns=['uid', 'sid'])

def denumerize(tp, id2user, id2item):
    user = tp['uid'].apply(lambda x: id2user[x])
    item = tp['sid'].apply(lambda x: id2item[x])
    return pd.DataFrame(data={'user': user, 'item': item}, columns=['user', 'item'])

In [8]:
raw_data, user_activity, item_popularity = filter_triplets(raw_data, min_uc=5, min_sc=10)

print("유저별 리뷰수\n",user_activity)
print("아이템별 리뷰수\n",item_popularity)

유저별 리뷰수
          user  size
0          11   376
1          14   180
2          18    77
3          25    91
4          31   154
...       ...   ...
31355  138473    63
31356  138475   124
31357  138486   137
31358  138492    68
31359  138493   314

[31360 rows x 2 columns]
아이템별 리뷰수
         item   size
0          1  12217
1          2   3364
2          3    734
3          4     43
4          5    590
...      ...    ...
6802  118700     54
6803  118900     60
6804  118997     52
6805  119141    122
6806  119145     78

[6807 rows x 2 columns]


In [9]:
# Shuffle User Indices
unique_uid = user_activity['user'].unique()
unique_sid = item_popularity['item'].unique()
print("(BEFORE) unique_uid:",unique_uid)

np.random.seed(cfg['seed'])
idx_perm = np.random.permutation(unique_uid.size)
unique_uid = unique_uid[idx_perm]
print("(AFTER) unique_uid:",unique_uid)

n_users = unique_uid.size #31360
n_items = unique_sid.size #5000


(BEFORE) unique_uid: [    11     14     18 ... 138486 138492 138493]
(AFTER) unique_uid: [ 81259  11986  67552 ...   3671  69383 103755]


In [10]:
unique_item = raw_data['item'].unique()


In [11]:
item2id = dict((sid, i) for (i, sid) in enumerate(unique_sid))
user2id = dict((pid, i) for (i, pid) in enumerate(unique_uid))

pro_dir = os.path.join('pro_sg')

if not os.path.exists(pro_dir):
    os.makedirs(pro_dir)

with open(os.path.join(pro_dir, 'unique_sid.txt'), 'w') as f:
    for sid in unique_sid:
        f.write('%s\n' % sid)

with open(os.path.join(pro_dir, 'unique_uid.txt'), 'w') as f:
    for uid in unique_uid:
        f.write('%s\n' % uid)



In [12]:
num_heldout_users = cfg['FOLD_SIZE']

In [13]:
df_for_ease = numerize(raw_data, user2id, item2id)
df_for_ease

Unnamed: 0,uid,sid
0,13266,2505
1,13266,109
2,13266,319
3,13266,368
4,13266,1183
...,...,...
5154466,4927,4882
5154467,4927,2652
5154468,4927,5768
5154469,4927,4791


In [14]:
df_for_ease['watched'] = [0.9] * len(df_for_ease)
df_for_ease

Unnamed: 0,uid,sid,watched
0,13266,2505,0.9
1,13266,109,0.9
2,13266,319,0.9
3,13266,368,0.9
4,13266,1183,0.9
...,...,...,...
5154466,4927,4882,0.9
5154467,4927,2652,0.9
5154468,4927,5768,0.9
5154469,4927,4791,0.9


In [15]:
pivot_table = df_for_ease.pivot_table(index=["uid"], columns=["sid"], values="watched")
X = pivot_table.to_numpy()
X = np.nan_to_num(X)

In [16]:
class EASE:
    """
    Embarrassingly Shallow Autoencoders model class
    """

    def __init__(self, lambda_):
        self.B = None
        self.lambda_ = lambda_

    def train(self, interaction_matrix):
        """
        train pass
        :param interaction_matrix: interaction_matrix
        """
        G = interaction_matrix.T @ interaction_matrix
        diag = list(range(G.shape[0]))
        G[diag, diag] += self.lambda_
        P = np.linalg.inv(G)

        # B = P * (X^T * X − diagMat(γ))
        self.B = P / -np.diag(P)
        min_dim = min(*self.B.shape)
        self.B[range(min_dim), range(min_dim)] = 0

    def forward(self, user_row):
        """
        forward pass
        """
        return user_row @ self.B

In [17]:
ease = EASE(600)
ease.train(X)

In [18]:
result = ease.forward(X[:, :])
print(result)

[[ 1.44030173e-01  4.62524409e-02  1.32486402e-02 ...  1.15655143e-02
   1.52116324e-04 -9.57130184e-03]
 [ 5.40418261e-01  2.60224329e-01  1.18604298e-01 ...  4.84785817e-03
  -8.74884215e-03  6.17955959e-03]
 [ 5.78159254e-01  2.29870233e-01  3.04806435e-03 ... -7.16639468e-03
  -6.30002625e-03  5.29215595e-03]
 ...
 [ 8.25342773e-01  2.13388387e-01  1.00231686e-01 ...  8.79471015e-03
  -2.12466751e-02  1.16336975e-02]
 [ 1.90881772e-01  3.88376535e-02  1.63342187e-02 ... -2.20953935e-03
  -3.13434100e-03  2.52316346e-03]
 [ 3.41954142e-01  2.34193011e-01 -3.61963443e-02 ... -2.35777789e-03
  -4.47591240e-03 -1.26807603e-02]]


In [19]:
print(X.nonzero())


(array([    0,     0,     0, ..., 31359, 31359, 31359]), array([ 136,  153,  380, ..., 5771, 5855, 6084]))


In [20]:
result[X.nonzero()] = -np.inf
print(result)

[[ 1.44030173e-01  4.62524409e-02  1.32486402e-02 ...  1.15655143e-02
   1.52116324e-04 -9.57130184e-03]
 [ 5.40418261e-01            -inf  1.18604298e-01 ...  4.84785817e-03
  -8.74884215e-03  6.17955959e-03]
 [           -inf            -inf  3.04806435e-03 ... -7.16639468e-03
  -6.30002625e-03  5.29215595e-03]
 ...
 [           -inf  2.13388387e-01  1.00231686e-01 ...  8.79471015e-03
  -2.12466751e-02  1.16336975e-02]
 [ 1.90881772e-01  3.88376535e-02  1.63342187e-02 ... -2.20953935e-03
  -3.13434100e-03  2.52316346e-03]
 [ 3.41954142e-01  2.34193011e-01 -3.61963443e-02 ... -2.35777789e-03
  -4.47591240e-03 -1.26807603e-02]]


In [30]:
import bottleneck as bn

top_items_by_user = bn.argpartition(-result, 10, axis=1)[:, :10]
print(top_items_by_user)

[[5514 5147 5936 ... 4735 4882 4982]
 [1048  226  356 ... 4101 2672 2653]
 [ 406  894 1977 ... 2619  105    9]
 ...
 [4666 1435 2156 ...   41 1494 4101]
 [2304  178  657 ...  356   31 2670]
 [5461 3994  492 ...  237   92 4000]]


In [None]:
# user_result = []
# item_result =[]

# for id, top_k in enumerate(top_items_by_user):
#     user_result.extend([id] * 20)
#     item_result.extend(top_k)

# df_user_result = pd.DataFrame(user_result, columns=['uid'])
# df_item_result = pd.DataFrame(item_result, columns=['sid'])
# df_result = pd.concat([df_user_result, df_item_result], axis=1)

In [22]:
from copy import deepcopy
from tqdm import tqdm

In [23]:
class EASER():
    def __init__(self, threshold = 3500, lambdaBB = 500, lambdaCC = 10000, rho = 50000, epochs = 40):
        self.threshold = threshold
        self.lambdaBB = lambdaBB
        self.lambdaCC = lambdaCC
        self.rho = rho
        self.epochs = epochs

    def create_list_feature_pairs(self, XtX):
        AA = np.triu(np.abs(XtX))
        AA[ np.diag_indices(AA.shape[0]) ]=0.0
        ii_pairs = np.where((AA > self.threshold) == True)
        return ii_pairs

    def create_matrix_Z(self, ii_pairs, X):
        MM = np.zeros( (len(ii_pairs[0]), X.shape[1]),    dtype=np.float64)
        MM[np.arange(MM.shape[0]) , ii_pairs[0]   ]=1.0
        MM[np.arange(MM.shape[0]) , ii_pairs[1]   ]=1.0
        CCmask = 1.0-MM
        MM = sparse.csc_matrix(MM.T)
        Z=  X * MM
        Z= (Z == 2.0 )
        Z=Z*1.0
        return Z, CCmask

    def train_higher(self, XtX, XtXdiag, ZtZ, ZtZdiag, CCmask, ZtX):
        ii_diag=np.diag_indices(XtX.shape[0])
        XtX[ii_diag] = XtXdiag + self.lambdaBB
        PP = np.linalg.inv(XtX)
        ii_diag_ZZ=np.diag_indices(ZtZ.shape[0])
        ZtZ[ii_diag_ZZ] = ZtZdiag + self.lambdaCC + self.rho
        QQ=np.linalg.inv(ZtZ)
        CC = np.zeros( (ZtZ.shape[0], XtX.shape[0]),dtype=np.float64 )
        DD = np.zeros( (ZtZ.shape[0], XtX.shape[0]),dtype=np.float64 )
        UU = np.zeros( (ZtZ.shape[0], XtX.shape[0]),dtype=np.float64 )

        for iter in tqdm(range(self.epochs)):
            # learn BB
            XtX[ii_diag] = XtXdiag
            BB= PP.dot(XtX-ZtX.T.dot(CC))
            gamma = np.diag(BB) / np.diag(PP)
            BB-= PP * gamma
            # learn CC
            CC= QQ.dot(ZtX-ZtX.dot(BB) + self.rho * (DD-UU))
            # learn DD
            DD=  CC  * CCmask
            #DD= np.maximum(0.0, DD) # if you want to enforce non-negative parameters
            # learn UU (is Gamma in paper)
            UU+= CC-DD

        return BB, DD

    def fit(self, X):
        print(' --- init')
        XtX = (X.T @ X)
        XtXdiag = deepcopy(np.diag(XtX))
        ii_pairs = self.create_list_feature_pairs(XtX)
        Z, CCmask = self.create_matrix_Z(ii_pairs, X)

        ZtZ = (Z.transpose() @ Z)
        ZtZdiag = deepcopy(np.diag(ZtZ))

        ZtX = (Z.transpose() @ X)

        print(' --- iteration start.')
        BB, CC = self.train_higher(XtX, XtXdiag, ZtZ, ZtZdiag, CCmask, ZtX)
        print(' --- iteration end.')

        self.pred = torch.from_numpy(X.dot(BB) + Z.dot(CC))

In [24]:
easer = EASER()
easer.fit(X)

 --- init
 --- iteration start.


100%|██████████| 40/40 [07:41<00:00, 11.54s/it]


 --- iteration end.


In [25]:
X_pred = easer.pred.numpy()

In [26]:
X_pred[X.nonzero()] = -np.inf

In [31]:
top_items_by_user2 = bn.argpartition(-X_pred, 10, axis=1)[:, :10]
print(top_items_by_user2)

[[4101 5936 5514 ... 4065 5147 5737]
 [1048  226  356 ... 4101 2653  604]
 [3909 1977  105 ... 2619  353  894]
 ...
 [  41 5037 4666 ... 4101  146 1494]
 [ 178 2304  356 ...   31  890 1284]
 [ 492 5461 3240 ... 1949  237   92]]


In [71]:
top_items_by_user2

array([[4101, 5936, 5514, ..., 4065, 5147, 5737],
       [1048,  226,  356, ..., 4101, 2653,  604],
       [3909, 1977,  105, ..., 2619,  353,  894],
       ...,
       [  41, 5037, 4666, ..., 4101,  146, 1494],
       [ 178, 2304,  356, ...,   31,  890, 1284],
       [ 492, 5461, 3240, ..., 1949,  237,   92]])

In [39]:
print(top_items_by_user[110])
print(top_items_by_user2[110])

print(set(top_items_by_user2[120]) & set(top_items_by_user[120]))

[  76 1580  943  870 2320 3255 3420  705  328 1504]
[3255  870  705 2320   76  328  943 1580 3420 3200]
{3200, 352, 738, 2852, 4101, 3181, 2670, 2991, 852, 220}


In [93]:
top_item_intersection = []

for i in range(31360):
    set1 = set(top_items_by_user[i])
    set2 = set(top_items_by_user2[i])
    intersection = list(set1 & set2)
    if len(intersection) < 10:
        intersection += list(set1 - set2)[:10-len(intersection)]
    top_item_intersection.append(intersection)


In [94]:
top_item_intersection

[[4065, 4610, 4101, 5514, 5936, 4882, 5147, 6173, 4735, 4982],
 [226, 931, 644, 356, 4101, 487, 3255, 1048, 2653, 2672],
 [353, 3909, 1161, 105, 9, 2672, 406, 1977, 2619, 894],
 [5766, 3654, 5386, 593, 5042, 179, 3539, 5078, 4886, 5788],
 [161, 1666, 356, 4101, 3654, 41, 651, 2187, 3255, 671],
 [356, 6084, 549, 358, 2923, 427, 4882, 5268, 6171, 6083],
 [2659, 6276, 1095, 328, 4522, 4301, 3388, 4410, 3068, 1564],
 [2624, 356, 2344, 3916, 3181, 593, 854, 220, 287, 76],
 [358, 9, 1610, 2859, 722, 916, 3994, 862, 287, 1081],
 [352, 6084, 3654, 5771, 3181, 1743, 5233, 5461, 6171, 2812],
 [416, 2344, 650, 363, 1580, 943, 657, 178, 3679, 43],
 [352, 870, 424, 2187, 3567, 146, 3255, 2619, 1435, 92],
 [706, 354, 3053, 1134, 750, 753, 276, 729, 4635, 1756],
 [2852, 455, 712, 585, 41, 1743, 1048, 1658, 31, 1547],
 [0, 2304, 4000, 355, 356, 76, 239, 662, 854, 1977],
 [193, 644, 455, 41, 492, 593, 659, 4469, 5399, 699],
 [3361, 705, 1639, 328, 750, 657, 179, 1589, 471, 317],
 [0, 2304, 356, 3181, 7

In [95]:
user_result = []
item_result =[]

for id, top_k in enumerate(top_item_intersection):
    user_result.extend([id] * 10)
    item_result.extend(top_k)

df_user_result = pd.DataFrame(user_result, columns=['uid'])
df_item_result = pd.DataFrame(item_result, columns=['sid'])
df_result = pd.concat([df_user_result, df_item_result], axis=1)

In [96]:
df_result

Unnamed: 0,uid,sid
0,0,4065
1,0,4610
2,0,4101
3,0,5514
4,0,5936
...,...,...
313595,31359,5461
313596,31359,3420
313597,31359,3994
313598,31359,92


In [97]:
id2item = dict((i, sid) for (i, sid) in enumerate(unique_sid.squeeze()))
id2user = dict((i, pid) for (i, pid) in enumerate(unique_uid.squeeze()))

In [98]:
df_infer = denumerize(df_result, id2user, id2item)
df_infer.columns = ['user', 'item']
df_infer = df_infer.sort_values('user')
df_infer

Unnamed: 0,user,item
132668,11,7438
132669,11,2987
132667,11,4886
132666,11,40815
132665,11,7373
...,...,...
49275,138493,1270
49276,138493,551
49277,138493,2628
49279,138493,48394


In [99]:
df_infer.to_csv(os.path.join('submission_ensemble.csv'), index=False)