# SiReN on ML-1m in PyTorch

## **Step 1 - Setup the environment**

### **1.1 Install libraries**

In [2]:
# torch geometric
try: 
    import torch_geometric
except ModuleNotFoundError:
    # Installing torch geometric packages with specific CUDA+PyTorch version. 
    # See https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html for details 
    import torch
    TORCH = torch.__version__.split('+')[0]
    CUDA = 'cu' + torch.version.cuda.replace('.','')

    !pip install torch-scatter     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
    !pip install torch-sparse      -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
    !pip install torch-cluster     -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
    !pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
    !pip install torch-geometric 
    import torch_geometric
import torch_geometric.nn as geom_nn
import torch_geometric.data as geom_data

Looking in links: https://pytorch-geometric.com/whl/torch-1.10.0+cu111.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-1.10.0%2Bcu113/torch_scatter-2.0.9-cp37-cp37m-linux_x86_64.whl (7.9 MB)
[K     |████████████████████████████████| 7.9 MB 4.9 MB/s 
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.0.9
Looking in links: https://pytorch-geometric.com/whl/torch-1.10.0+cu111.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-1.10.0%2Bcu113/torch_sparse-0.6.12-cp37-cp37m-linux_x86_64.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 4.3 MB/s 
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.12
Looking in links: https://pytorch-geometric.com/whl/torch-1.10.0+cu111.html
Collecting torch-cluster
  Downloading https://data.pyg.org/whl/torch-1.10.0%2Bcu113/torch_cluster-1.5.9-cp37-cp37m-linux_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3

In [3]:
# Install the recohut helper library (layers, SiReN model, dataset transforms) from GitHub.
# NOTE(review): pip has no branch/tag flag; `-b` is presumably parsed as pip's legacy
# build-directory option, so this may install the default branch rather than v0.0.4.
# If a pinned ref is intended, the usual form is `git+https://...recohut.git@<ref>` - TODO confirm.
!pip install -q -U git+https://github.com/RecoHut-Projects/recohut.git -b v0.0.4

  Building wheel for recohut (setup.py) ... [?25l[?25hdone


### **1.2 Download datasets**

In [4]:
# Clone branch `v2` of the dataset repository into ./movielens_1m; the loader below reads its files from that folder.
!git clone -q --branch v2 https://github.com/RecoHut-Datasets/movielens_1m.git

### **1.3 Import libraries**

In [5]:
import torch
from torch import optim
from torch.utils.data import DataLoader

import pandas as pd
import numpy as np
import time
from tqdm.notebook import tqdm

import os
import pickle

import warnings
warnings.filterwarnings('ignore')

In [6]:
# layers
from recohut.layers.message_passing import LightGConv, LRGCCF

# models
from recohut.models.siren import SiReN

# transforms
from recohut.transforms.bipartite import BipartiteDataset

### **1.4 Set params**

In [7]:
class Args:
    """Hyper-parameters of this run, exposed as plain class attributes.

    Later cells attach further fields (column names) to the instance.
    """

    # ---- data ----
    # Dataset
    dataset = 'ML-1M'
    # Dataset version
    version = 1
    # Criterion of likes/dislikes
    offset = 3.5

    # ---- model ----
    # Dimension
    dim = 64
    # The number of layers of a GNN model for the graph with positive edges
    num_layers = 4
    # The number of layers of MLP for the graph with negative edges
    MLP_layers = 2

    # ---- optimisation ----
    # Batch size
    batch_size = 1024
    # Learning rate
    lr = 0.005
    # The number of negative samples
    K = 40
    # The number of epochs
    epoch = 4
    # Regularization coefficient
    reg = 0.05

## **Step 2 - Data preparation**

In [8]:
class Data_loader():
    """Reads the MovieLens-1M rating files found under ./movielens_1m.

    The constructor only records file locations and the fixed id-space sizes
    (`num_u`, `num_v`); nothing is read from disk until `data_load` is called.
    """

    def __init__(self, dataset, version):
        self.dataset = dataset
        self.version = version
        # Separator of the full ratings file.
        self.sep = '::'
        # Column names applied to every file (the last column is dropped after reading).
        self.names = ['userId', 'movieId', 'rating', 'timestemp']
        self.path_for_whole = './movielens_1m/ratings.dat'
        self.path_for_train = './movielens_1m/train_1m%s.dat' % (version)
        self.path_for_test = './movielens_1m/test_1m%s.dat' % (version)
        self.num_u = 6040
        self.num_v = 3952

    def _read_split(self, path):
        """Read one pre-split file (default separator) and drop its last column."""
        frame = pd.read_csv(path, engine='python', names=self.names)
        return frame.drop('timestemp', axis=1)

    def data_load(self):
        """Load all three files and return `(train_set, test_set)`.

        The full ratings table is also read, shuffled with `version` as the
        random seed, and kept on `self.whole_`.
        """
        ratings = pd.read_csv(
            self.path_for_whole,
            names=self.names,
            sep=self.sep,
            engine='python',
        )
        ratings = ratings.drop('timestemp', axis=1)
        self.whole_ = ratings.sample(frac=1, replace=False, random_state=self.version)

        self.train_set = self._read_split(self.path_for_train)
        self.test_set = self._read_split(self.path_for_test)
        return self.train_set, self.test_set

In [9]:
def deg_dist(train, num_v):
    """Return a length-`num_v` tensor of item occurrence counts raised to 0.75.

    train : frame with a 1-based 'movieId' column
    num_v : size of the item id space; ids absent from `train` keep weight 0
    """
    item_index, occurrences = np.unique(train['movieId'].values - 1, return_counts=True)
    weights = np.zeros(num_v)
    # Smooth the raw counts with a 0.75 exponent before storing them.
    weights[item_index] = np.power(occurrences, 0.75)
    return torch.tensor(weights)

In [10]:
def gen_top_K(data_class,emb,train,directory_):
    """Rank items for every user by embedding dot product and return the top candidates.

    data_class : object exposing `num_u` and `num_v`
    emb        : 2-D tensor whose first `num_u` rows are indexed as users and whose
                 remaining rows are indexed as items
    train      : frame with 1-based 'userId' / 'movieId' columns; items a user
                 already has in `train`, and items absent from `train` altogether,
                 are masked with -inf before ranking
    directory_ : unused; kept so existing callers keep working

    Returns a dict mapping user id (1..num_u) to a numpy array of 1-based item
    ids, best first. The list has 300 entries, or fewer when there are fewer
    than 300 item rows. If a user has fewer unmasked items than that, the tail
    of the list consists of masked items.
    """
    emb = emb.detach()
    device = emb.device
    item_emb_t = emb[data_class.num_u:].t()  # loop-invariant, hoisted
    # topk raises when k exceeds the number of candidates; clamp instead.
    k = min(300, item_emb_t.shape[1])

    # Items that never occur in train. setdiff1d keeps an integer dtype even when
    # the result is empty (an empty set-difference turned into an array would be
    # float64, which is not a valid tensor index).
    no_items = np.setdiff1d(np.arange(1, data_class.num_v+1), train['movieId'].values)
    unseen_idx = torch.as_tensor(no_items-1, dtype=torch.long, device=device)

    # One pass over train instead of one boolean scan of the frame per user.
    seen_by_user = {
        user: torch.as_tensor(items.values-1, dtype=torch.long, device=device)
        for user, items in train.groupby('userId')['movieId']
    }
    no_history = torch.empty(0, dtype=torch.long, device=device)

    reco = dict()
    pbar = tqdm(desc = 'top-k recommendation...',total=data_class.num_u,position=0)
    for j in np.arange(1,data_class.num_u+1):
        scores = emb[j-1].view(1,-1).mm(item_emb_t)[0]
        scores[unseen_idx] = -np.inf
        scores[seen_by_user.get(j, no_history)] = -np.inf
        reco[j] = torch.topk(scores,k).indices.cpu().numpy()+1
        pbar.update(1)
    pbar.close()
    return reco

In [12]:
# Hyper-parameters and the loader for the selected dataset version.
args = Args()
data_class = Data_loader(args.dataset, args.version)
# To generate ground truth set
threshold = round(args.offset)

# --- read the train / test splits ---
print('data loading...')
st = time.time()
train, test = data_class.data_load()
# Force integer ids on the training frame.
train = train.astype({'userId': 'int64', 'movieId': 'int64'})
elapsed = time.time() - st
print('loading complete! time :: %s' % (elapsed))

# --- item weights handed to the training dataset below ---
print('generate negative candidates...')
st = time.time()
neg_dist = deg_dist(train, data_class.num_v)
elapsed = time.time() - st
print('complete ! time : %s' % (elapsed))

data loading...
loading complete! time :: 9.222385168075562
generate negative candidates...
complete ! time : 0.08198142051696777


## **Step 3 - Training & Evaluation**

In [11]:
class evaluate():
    """Precision, recall and NDCG at several cut-offs, overall and per user group.

    Users are split into three groups by how many rows they have in the
    training set (boundaries given by `ratings`). Metrics are computed per
    group and then combined into `self.p`, `self.r` and `self.NDCG` as an
    average weighted by the number of evaluated users in each group.
    A group with no evaluated users contributes zeros and weight 0.
    """

    def __init__(self,reco,train,test,threshold,num_u,num_v,N=[5,10,15,20,25],ratings=[20,50]):
        '''
        reco : mapping user id -> ranked sequence of recommended item ids
        train : training set
        test : test set
        threshold : To generate ground truth set from test set
        num_u, num_v : stored on the instance; not used by the metric code
        N : cut-offs at which the metrics are reported
        ratings : two training-row-count boundaries that define the three user groups
        '''
        self.reco = reco
        self.num_u = num_u;
        self.num_v = num_v;
        self.N=N
        self.p=[]
        self.r=[]
        self.NDCG=[]
        self.p_c1=[]; self.p_c2=[]; self.p_c3=[]
        self.r_c1=[]; self.r_c2=[]; self.r_c3=[]
        self.NDCG_c1=[]; self.NDCG_c2=[]; self.NDCG_c3=[]
        self.tr = train; self.te = test;
        self.threshold = threshold;
        self.gen_ground_truth_set()
        self.ratings = ratings
        self.partition_into_groups_(self.ratings)
        print('\nevaluating recommendation accuracy....')
        self.precision_and_recall_G(self.group1,1)
        self.precision_and_recall_G(self.group2,2)
        self.precision_and_recall_G(self.group3,3)
        self.Normalized_DCG_G(self.group1,1)
        self.Normalized_DCG_G(self.group2,2)
        self.Normalized_DCG_G(self.group3,3)
        self.metric_total()

    # ------------------------------------------------------------------ helpers
    def _group_users(self, group):
        """Users that have ground truth and belong to `group` (set lookup, not a list scan)."""
        members = set(group)
        return [user for user in self.GT if user in members]

    def _hits(self, user, maxn):
        """0/1 vector of length `maxn`: 1 where the recommended item is in the user's ground truth.

        Lists shorter than `maxn` are padded with zeros so the vector always
        lines up with the rank positions.
        """
        truth = set(self.GT[user])
        hits = np.zeros(maxn)
        for idx, item in enumerate(self.reco[user][:maxn]):
            if item in truth:
                hits[idx] = 1.0
        return hits

    def _ndcg_curve(self, user, discounts):
        """NDCG at every rank 1..len(discounts) for one user."""
        maxn = len(discounts)
        ideal = np.cumsum(discounts)
        # The ideal ranking cannot contain more hits than the user has relevant items.
        idcg_len = min(len(self.GT[user]), maxn)
        ideal[idcg_len:] = ideal[idcg_len-1]
        gained = np.cumsum(self._hits(user, maxn) * discounts)
        return gained / ideal

    @staticmethod
    def _discounts(maxn):
        """Per-rank gain 1/log2(rank+1) for ranks 1..maxn."""
        return 1.0 / np.log2(np.arange(2, maxn + 2))

    # ------------------------------------------------------------ ground truth
    def gen_ground_truth_set(self):
        """Build `self.GT`: user id -> list of distinct test items rated >= threshold."""
        relevant = self.te[self.te['rating']>=self.threshold]
        result = dict()
        # Single grouping pass instead of filtering the frame once per user.
        for user, items in relevant.groupby('userId')['movieId']:
            liked = list(set(items))
            if len(liked) > 0:
                result[user] = liked
        self.GT = result

    # ------------------------------------------------- ungrouped metric variants
    def precision_and_recall(self):
        """Append precision/recall at each cut-off in `N`, over all ground-truth users,
        to `self.p` / `self.r`. Not called by `__init__`."""
        user_in_GT = list(self.GT)
        count = len(user_in_GT)
        for n in self.N:
            p = 0; r = 0
            for i in user_in_GT:
                truth = self.GT[i]
                num_hit = len(set(self.reco[i][:n]).intersection(set(truth)))
                p += num_hit/n; r += num_hit/len(truth)
            self.p.append(p/count if count else 0.0)
            self.r.append(r/count if count else 0.0)

    def Normalized_DCG(self):
        """Append NDCG at each cut-off in `N`, over all ground-truth users, to `self.NDCG`.
        Not called by `__init__`."""
        maxn = max(self.N)
        discounts = self._discounts(maxn)
        user_in_GT = list(self.GT)
        ndcg = np.zeros(maxn)
        for i in user_in_GT:
            ndcg += self._ndcg_curve(i, discounts)
        if len(user_in_GT) > 0:
            ndcg /= len(user_in_GT)
        for n in self.N:
            self.NDCG.append(ndcg[n-1])

    # ---------------------------------------------------------------- aggregation
    def metric_total(self):
        """Combine the three per-group results into `self.p`, `self.r`, `self.NDCG`
        (lists, one entry per cut-off), weighting each group by its evaluated-user count."""
        total = self.len1 + self.len2 + self.len3

        def _weighted(c1, c2, c3):
            acc = self.len1 * np.array(c1) + self.len2 * np.array(c2) + self.len3 * np.array(c3)
            if total > 0:
                # With no evaluated users at all the sums are already zero; skip 0/0.
                acc = acc / total
            return list(acc)

        self.p = _weighted(self.p_c1, self.p_c2, self.p_c3)
        self.r = _weighted(self.r_c1, self.r_c2, self.r_c3)
        self.NDCG = _weighted(self.NDCG_c1, self.NDCG_c2, self.NDCG_c3)

    def partition_into_groups_(self,ratings=[20,50]):
        """Split training users into `group1` (< ratings[0] rows), `group2`
        (< ratings[1] rows and not in group1) and `group3` (all others)."""
        unique_u, counts_u = np.unique(self.tr['userId'].values,return_counts=True)
        below_low = counts_u < ratings[0]
        below_high = counts_u < ratings[1]
        self.cold_groups = ratings
        self.group1 = list(unique_u[below_low])
        self.group2 = list(unique_u[below_high & ~below_low])
        self.group3 = list(unique_u[~below_high])

    # ------------------------------------------------------- per-group metrics
    def precision_and_recall_G(self,group,gn):
        """Precision/recall at each cut-off for ground-truth users in `group`.

        Results are appended to `p_c<gn>` / `r_c<gn>` and the number of
        evaluated users is stored in `len<gn>` for gn in {1, 2, 3}; any other
        `gn` stores nothing.
        """
        maxn = max(self.N)
        ranks = np.arange(1, maxn+1)
        p = np.zeros(maxn); r = np.zeros(maxn)
        users = self._group_users(group)
        for i in users:
            cum_hits = np.cumsum(self._hits(i, maxn))
            p += cum_hits / ranks
            r += cum_hits / len(self.GT[i])
        leng = len(users)
        if leng > 0:
            # An empty group keeps zeros; dividing by 0 would give NaN and
            # poison the weighted totals (0 * NaN is NaN).
            p /= leng; r /= leng
        if gn in (1, 2, 3):
            getattr(self, 'p_c%d' % gn).extend(p[n-1] for n in self.N)
            getattr(self, 'r_c%d' % gn).extend(r[n-1] for n in self.N)
            setattr(self, 'len%d' % gn, leng)

    def Normalized_DCG_G(self,group,gn):
        """NDCG at each cut-off for ground-truth users in `group`; results are
        appended to `NDCG_c<gn>` for gn in {1, 2, 3}."""
        maxn = max(self.N)
        discounts = self._discounts(maxn)
        ndcg = np.zeros(maxn)
        users = self._group_users(group)
        for i in users:
            ndcg += self._ndcg_curve(i, discounts)
        if len(users) > 0:
            ndcg /= len(users)
        if gn in (1, 2, 3):
            getattr(self, 'NDCG_c%d' % gn).extend(ndcg[n-1] for n in self.N)

In [13]:
# Use the first GPU when one is visible, otherwise run on CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Column names of the rating frame; also read from `args` by the training dataset built later.
args.user_col = 'userId'
args.item_col = 'movieId'
args.feedback_col = 'rating'

# Build SiReN from the training frame and the hyper-parameters in `args`.
model= SiReN(train,
             data_class.num_u,
             data_class.num_v,
             offset=args.offset,
             num_layers=args.num_layers,
             MLP_layers=args.MLP_layers,
             dim=args.dim,
             device=device,
             reg=args.reg,
            graph_enc = 'lightgcn',
            user_col = args.user_col,
            item_col = args.item_col,
            rating_col = args.feedback_col)

# NOTE(review): the return value is discarded, so this relies on `data_p.to(...)`
# moving the object in place - confirm against the library.
model.data_p.to(device)
# Move the parameters to the device before handing them to the optimizer.
model.to(device)
optimizer = optim.Adam(model.parameters(), lr = args.lr)

In [None]:
print("\nTraining on {}...\n".format(device))
model.train()
training_dataset = BipartiteDataset(args, train, neg_dist, args.offset, 
                                    data_class.num_u, data_class.num_v, args.K)

for EPOCH in range(1,args.epoch+1):
    # On odd epochs negatives are generated with argument 2; the slice below then
    # picks index 0 on odd epochs and the last index on even epochs
    # (presumably one slice of pre-sampled negatives per epoch - confirm in recohut).
    if EPOCH%2-1==0:training_dataset.negs_gen_EP(2)
    training_dataset.edge_4 = training_dataset.edge_4_tot[:,:,EPOCH%2-1]
    ds = DataLoader(training_dataset,batch_size=args.batch_size,shuffle=True)

    # Sum of per-batch loss values for this epoch (previously each batch was
    # multiplied by the number of batches and the total was never reported).
    LOSS = 0.0
    pbar = tqdm(desc = 'Version : {} Epoch {}/{}'.format(args.version,EPOCH,args.epoch),total=len(ds),position=0)
    for u,v,w,negs in ds:
        optimizer.zero_grad()
        loss = model(u,v,w,negs,device) # original
        loss.backward()
        optimizer.step()
        batch_loss = loss.item()
        LOSS += batch_loss
        pbar.update(1)
        pbar.set_postfix({'loss':batch_loss})
    if len(ds) > 0:
        # Leave the mean per-batch loss of the epoch on the finished bar.
        pbar.set_postfix({'loss':batch_loss, 'mean_loss':LOSS/len(ds)})
    pbar.close()

    # Evaluate and persist results after every second epoch.
    if EPOCH%2==0 :
        directory = os.getcwd() + '/results/%s/SiReN/epoch%s_batch%s_dim%s_lr%s_offset%s_K%s_num_layers%s_MLP_layers%s_threshold%s_reg%s/'%(args.dataset,EPOCH,args.batch_size,args.dim,args.lr,args.offset,args.K,args.num_layers,args.MLP_layers,threshold,args.reg)
        # exist_ok avoids the check-then-create pattern.
        os.makedirs(directory, exist_ok=True)
        directory_ = directory+'r%s_reco.pickle'%(args.version)

        model.eval()
        # Embeddings are only ranked here, so skip building an autograd graph.
        with torch.no_grad():
            emb = model.aggregate()
        top_k_list = gen_top_K(data_class,emb,train,directory_)
        eval_ = evaluate(top_k_list,train,test,threshold,data_class.num_u,data_class.num_v,N=[10,15,20],ratings=[20,50])
        print("\n***************************************************************************************")
        print(" /* Recommendation Accuracy */")
        print('Precision at [10, 15, 20] :: ',eval_.p)
        print('Recall at [10, 15, 20] :: ',eval_.r)
        print('NDCG at [10, 15, 20] :: ',eval_.NDCG)
        print("***************************************************************************************")
        # The pickled object is the whole evaluate instance (metrics, recommendations, data frames).
        with open(directory_,'wb') as fw:
            pickle.dump(eval_,fw)
        model.train()


Training on cuda:0...

negative sampling for next epochs...


negative sampling for next epochs...:   0%|          | 0/6040 [00:00<?, ?it/s]

complete ! 44.3044056892395


Version : 1 Epoch 1/4:   0%|          | 0/782 [00:00<?, ?it/s]

Version : 1 Epoch 2/4:   0%|          | 0/782 [00:00<?, ?it/s]

top-k recommendation...:   0%|          | 0/6040 [00:00<?, ?it/s]


evaluating recommendation accuracy....

***************************************************************************************
 /* Recommendation Accuracy */
Precision at [10, 15, 20] ::  [0.1977101788400488, 0.17380355451557442, 0.15696139060671854]
Recall at [10, 15, 20] ::  [0.11815263807718367, 0.15323209109049143, 0.1826288867835281]
NDCG at [10, 15, 20] ::  [0.23763163291675646, 0.23046875513769544, 0.22915729283611078]
***************************************************************************************
negative sampling for next epochs...


negative sampling for next epochs...:   0%|          | 0/6040 [00:00<?, ?it/s]

complete ! 48.64898443222046


Version : 1 Epoch 3/4:   0%|          | 0/782 [00:00<?, ?it/s]

Version : 1 Epoch 4/4:   0%|          | 0/782 [00:00<?, ?it/s]

top-k recommendation...:   0%|          | 0/6040 [00:00<?, ?it/s]


evaluating recommendation accuracy....

***************************************************************************************
 /* Recommendation Accuracy */
Precision at [10, 15, 20] ::  [0.22543874310546613, 0.19848459524207768, 0.17864783553401203]
Recall at [10, 15, 20] ::  [0.13995855076046487, 0.1812022261733025, 0.21449465241229854]
NDCG at [10, 15, 20] ::  [0.269092620151712, 0.2622437530495195, 0.261196524578161]
***************************************************************************************


## **Closure**

For more details, you can refer to https://github.com/RecoHut-Stanzas/S138006.

<a href="https://github.com/RecoHut-Stanzas/S138006/blob/main/reports/S138006_Report.ipynb" alt="S138006_Report"> <img src="https://img.shields.io/static/v1?label=report&message=active&color=green" /></a> <a href="https://github.com/RecoHut-Stanzas/S138006" alt="S138006"> <img src="https://img.shields.io/static/v1?label=code&message=github&color=blue" /></a>

In [15]:
# Record the environment this notebook ran in (author, timestamp, machine info, versions of imported packages and of recohut).
!pip install -q watermark
%reload_ext watermark
%watermark -a "Sparsh A." -m -iv -u -t -d -p recohut

Author: Sparsh A.

Last updated: 2021-12-20 06:30:14

recohut: 0.0.4

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.4.104+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit

torch_geometric: 2.0.2
IPython        : 5.5.0
pandas         : 1.1.5
torch          : 1.10.0+cu111
numpy          : 1.19.5



---

**END**