In [None]:

from torch import nn, optim
from torch.nn import functional as F
import torch
from torch import nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
from scipy import sparse

import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm_notebook as tqdm

import seaborn as sn
sn.set()

import sys
import warnings; 
warnings.simplefilter('ignore')

In [None]:

# Load the pre-split interaction tables. Later cells read the `uid` (user id) and
# `sid` (item id) columns of these frames.
train_data = pd.read_csv('../input/movielens20m/train.csv')
test_data = pd.read_csv('../input/movielens20m/test.csv')

In [None]:
# Item-vocabulary size, taken as the number of distinct item ids in the training split.
# It is used as the column count of the test matrix and as the models' input width.
nItems = train_data.sid.nunique()

In [None]:


# Turn the (uid, sid) interaction table into a user x item CSR matrix holding a 1 for
# every row of the table.
#
# The cell rebinds `test_data` from a DataFrame to the matrix, so the guard makes a
# second execution a no-op instead of failing on `test_data.uid`.
# NOTE(review): the row count is `uid.nunique()`, which assumes test uids are exactly
# 0..n_users-1 -- confirm against the preprocessing that produced test.csv.
if isinstance(test_data, pd.DataFrame):
    test_data = sparse.csr_matrix(
        (np.ones_like(test_data.uid), (test_data.uid.values, test_data.sid.values)),
        dtype='float64',
        shape=(test_data.uid.nunique(), nItems))

In [None]:
class netflixDataset(torch.utils.data.Dataset):
    """Row-wise view over a sparse user x item interaction matrix.

    Args:
        scr_matrix: SciPy sparse matrix; row ``idx`` holds the interactions of user ``idx``.
        eval: when True, every item is split into a model input and a held-out target.
        prop: fraction of a user's interactions to hold out in eval mode.

    Each item is a dict of ``float64`` tensors:
        * train mode -> ``{'data': (1, n_items)}`` (the dense row, leading axis kept);
        * eval mode  -> ``{'data': (n_items,), 'ground_truth': (n_items,)}`` where
          ``ground_truth`` marks the held-out items and ``data`` is the row with those
          items zeroed.
    """

    def __init__(self, scr_matrix, eval = False,prop=0.2):
        self.scr_matrix = scr_matrix
        self.eval = eval
        self.prop = prop

    def __getitem__(self, idx):
        dense_row = self.scr_matrix[idx,:].toarray()

        if not self.eval:
            return {'data': torch.tensor(dense_row, dtype=torch.float64)}

        u_items = dense_row[0]
        idx_labels = np.where(u_items == 1)[0]

        # Hold out at least one item, but never more than the user actually has.
        # Without the upper clamp a user with no interactions made np.random.choice
        # raise (it was asked for 1 sample out of 0 candidates).
        val_size = max(int(u_items.sum()*self.prop), 1)
        val_size = min(val_size, len(idx_labels))

        # keep_mask is 1 for positions that stay in the input, 0 for held-out ones.
        keep_mask = np.ones_like(u_items)
        if val_size > 0:
            # Sampling uses NumPy's global RNG; seed it for a reproducible split.
            val_idx = np.random.choice(idx_labels, size=val_size, replace=False)
            keep_mask[val_idx] = 0

        return {
            'data': torch.tensor(u_items*keep_mask, dtype=torch.float64),
            'ground_truth': torch.tensor(np.logical_not(keep_mask), dtype=torch.float64),
        }

    def __len__(self):
        return self.scr_matrix.shape[0]




    

In [None]:
class VAE(nn.Module):
    """Variational autoencoder over a user's item-interaction vector.

    Args:
        n_Items: width of the input/output vectors (number of items).
        hidden: width of the single hidden layer in encoder and decoder.
        dimz: dimensionality of the latent code.
        p: dropout probability applied to the (normalised) input.

    ``forward`` returns ``(logit, mu, logvar)``: item logits of shape
    ``(batch, n_Items)`` and the latent mean / log-variance of shape ``(batch, dimz)``.
    """

    def __init__(self,n_Items, hidden=600, dimz= 200, p=0.5):
        super(VAE, self).__init__()

        self.n_Items = n_Items
        self.dimz = dimz
        self.hidden = hidden
        self.p = p
        # forward() overwrites this on every call; the default keeps
        # reparameterize() usable on a freshly constructed model.
        self.Mode = 'train'

        # Encoder: emits mean and log-variance concatenated along dim 1.
        # Layer order is part of the checkpoint format (state-dict keys are positional).
        self.inference = nn.Sequential(
            nn.Dropout(self.p),
            nn.Linear(self.n_Items,self.hidden),
            nn.Tanh(),
            nn.Linear(self.hidden,2*self.dimz)
        )
        # Decoder: latent code -> item logits.
        self.generative = nn.Sequential(
            nn.Linear(self.dimz,self.hidden),
            nn.Tanh(),
            nn.Linear(self.hidden,self.n_Items),
        )

    def reparameterize(self, mu, logvar):
        """Sample ``z = mu + std * eps`` in 'train' mode; return ``mu`` otherwise."""
        if self.Mode != 'train':
            # Returning mu directly (instead of mu + std * 0) avoids a NaN when
            # exp(0.5 * logvar) overflows to inf, and skips an unused noise draw.
            return mu
        std = torch.exp(0.5*logvar)
        eps = torch.randn_like(std)
        return mu + std*eps

    def forward(self, x,Mode='train'):
        """Encode ``x``, draw (or pick) a latent code, decode to item logits.

        ``Mode`` only controls latent sampling; dropout follows the module's own
        train()/eval() state.
        """
        self.Mode = Mode
        x = F.normalize(x, p=2, dim=1)
        distribution = self.inference(x)

        mu, logvar = distribution[:, :self.dimz], distribution[:, self.dimz:]
        z = self.reparameterize(mu, logvar)
        logit = self.generative(z)

        return logit, mu, logvar


       
class DAE(nn.Module):
    """Denoising autoencoder: dropout-corrupted input -> tanh bottleneck -> item logits.

    Args:
        n_Items: width of the input/output vectors.
        dimz: width of the bottleneck layer.
        p: dropout probability applied to the normalised input.
    """

    def __init__(self,n_Items, dimz= 200, p=0.5):
        super().__init__()
        self.n_Items, self.dimz, self.p = n_Items, dimz, p

        # Positions inside the Sequential determine the state-dict keys.
        stack = [
            nn.Dropout(p),
            nn.Linear(n_Items, dimz),
            nn.Tanh(),
            nn.Linear(dimz, n_Items),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x,Mode='train'):
        """L2-normalise every row of ``x`` and return the reconstructed logits.

        ``Mode`` is only recorded on the instance; it does not alter the computation.
        """
        self.Mode = Mode
        unit_rows = F.normalize(x, p=2, dim=1)
        return self.net(unit_rows)


In [None]:

def NDCG_at_k(labels, scores, k = 100):
  """Mean NDCG@k over the rows (users) of a batch.

  Args:
    labels: ``(batch, n_items)`` tensor of 0/1 relevance flags.
    scores: ``(batch, n_items)`` tensor of predicted scores (higher ranks first).
    k: ranking cut-off; values above ``n_items`` are clamped to ``n_items``.

  Returns:
    0-dim tensor on ``scores.device``. Rows without any relevant item have an
    undefined NDCG and are left out of the mean; if no row has one, the result is NaN.
  """
  device = scores.device
  labels = labels.to(device)
  # Never ask for more ranks than there are items (this used to break broadcasting).
  k = min(k, scores.shape[1])

  top_idx = torch.argsort(scores, 1, descending=True)[:, :k]
  pred_labels = torch.gather(labels, 1, top_idx)

  # Rank discounts 1/log(rank + 1). The log base cancels in dcg / idcg.
  discounts = 1. / torch.log(torch.arange(2, 2 + k, device=device).float())
  dcg = (discounts * pred_labels).sum(dim=1)

  # Ideal DCG = sum of the first min(n_relevant, k) discounts, read off a prefix-sum
  # table (index 0 -> no relevant item -> 0) instead of looping over rows in Python.
  n_relevant = labels.sum(1).clamp(max=k).long()
  prefix = torch.cat([discounts.new_zeros(1), discounts.cumsum(0)])
  idcg = prefix[n_relevant]

  has_relevant = idcg > 0
  return (dcg[has_relevant] / idcg[has_relevant]).mean()

def Recall_at_k(labels, scores, k = 20):
    """Mean Recall@k over the rows (users) of a batch.

    Args:
        labels: ``(batch, n_items)`` tensor of 0/1 relevance flags.
        scores: ``(batch, n_items)`` tensor of predicted scores (higher ranks first).
        k: ranking cut-off.

    Returns:
        0-dim tensor on ``scores.device``: per-row hits in the top ``k`` divided by
        ``min(k, n_relevant)``, averaged over rows. Rows without any relevant item
        are left out of the mean (they used to contribute 0/0 = NaN); if no row has
        one, the result is NaN.
    """
    device = scores.device
    labels = labels.to(device)

    top_idx = torch.argsort(scores, 1, descending=True)[:, :k]
    hits = torch.gather(labels, 1, top_idx).sum(1)

    # A row cannot retrieve more than k items, so cap the denominator at k.
    denominator = labels.sum(1).clamp(max=k)

    has_relevant = denominator > 0
    return (hits[has_relevant] / denominator[has_relevant]).mean()

In [None]:
# Shell escape: list the contents of ../input, the directory the checkpoint and CSV
# paths in this notebook point into.
!ls ../input

In [None]:
# Declare Model
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model_vae = VAE(nItems)

# map_location remaps the stored tensors onto `device`, so a checkpoint written on a
# GPU machine still loads when this notebook runs on a CPU-only host.
checkpoint_vae = torch.load('../input/test-model/ml20m-beta1.pt', map_location=device)
model_vae.load_state_dict(checkpoint_vae['model_state_dict'])

model_vae.to(device)
model_vae.eval()  # inference mode: switches the input dropout off

# Cell output: the `beta` value stored alongside the weights.
checkpoint_vae['beta']

In [None]:

# Build the DAE baseline and restore its trained weights.
model_dae = DAE(nItems)

# map_location remaps the stored tensors onto `device` (set in the VAE cell), so a
# GPU-written checkpoint still loads on a CPU-only host.
checkpoint_dae = torch.load('../input/mulvae/ml20m-DAE.pt', map_location=device)
model_dae.load_state_dict(checkpoint_dae['model_state_dict'])

model_dae.to(device)
model_dae.eval()  # inference mode: switches the input dropout off

In [None]:
# Scratch arithmetic: 0.01 is the key step and 200 the step count used by the sweep
# loop in the next cell.
0.01*200

In [None]:
# Sweep the held-out fraction and record, per step, the mean per-batch NDCG@100 of
# both models on the test users.
#
# The dataset receives prop/200 as the hold-out fraction, whereas the result dicts
# are keyed by 0.01*prop, i.e. twice that fraction. The plotting cell relies on this
# (it halves the keys), so the keys are kept as they are.
statics_vae,statics_dae = {},{}
prop_bar = tqdm(range(200),total = 200)
for prop in prop_bar:

    NDCGs_vae, NDCGs_dae = [], []

    # A fresh dataset per step: the held-out items are re-drawn at random every time.
    test_ds = netflixDataset(test_data,eval=True,prop=((1/200)*prop))
    test_dl = DataLoader(test_ds,batch_size=1024)

    eval_phase = tqdm(enumerate(test_dl),total = len(test_dl),leave = False)
    # Pure inference: no_grad() stops autograd from recording a graph for every
    # batch, which also makes the outputs safe to modify in place.
    with torch.no_grad():
        for batch_idx,data in eval_phase:
            X = data['data'].float().to(device).squeeze(1)
            # The collated batch is already (batch, n_items); no per-row re-stacking needed.
            ground_truth = data['ground_truth'].squeeze(1).to(device)

            pred_vae = model_vae(X,Mode ='eval')[0]
            pred_dae = model_dae(X,Mode ='eval')

            # Items already present in the input must never be recommended again.
            seen = X != 0
            pred_vae[seen] = -np.inf
            pred_dae[seen] = -np.inf

            NDCGs_vae.append(NDCG_at_k(ground_truth,pred_vae).item())
            NDCGs_dae.append(NDCG_at_k(ground_truth,pred_dae).item())

    statics_vae[0.01*prop] = np.mean(NDCGs_vae)
    statics_dae[0.01*prop] = np.mean(NDCGs_dae)


In [None]:
fig, ax = plt.subplots(figsize=(16, 8))

# The sweep keyed its results by 0.01*prop while holding out prop/200 of each user's
# items, so key/2 is the held-out fraction and 1 - key/2 the share kept as input.
# Both curves share the x-axis (the two dicts are filled with identical keys).
kept_fraction = 1 - np.array(list(statics_vae.keys()))/2

ax.plot(kept_fraction, list(statics_vae.values()), label='VAE')
ax.plot(kept_fraction, list(statics_dae.values()), label='DAE')

ax.set_title('So sánh giữa VAE và DAE')
# Descriptive labels in place of the former 'x label' / 'y label' placeholders.
ax.set_xlabel("Fraction of each user's interactions kept as model input", fontsize=14)
ax.set_ylabel('Mean NDCG@100', fontsize=14)
ax.legend();


In [None]:
# Inspect the sweep keys (one per sweep step, in insertion order).
list(statics_vae.keys())

In [None]:
# Cut each model's sweep results (in insertion order) into five consecutive groups:
# entries [0, 40), [40, 80), [80, 120), [120, 160) and everything from 160 onwards.
vae_fold1, vae_fold2, vae_fold3, vae_fold4, vae_fold5 = [
    list(statics_vae.values())[start:stop]
    for start, stop in ((None, 40), (40, 80), (80, 120), (120, 160), (160, None))
]

dae_fold1, dae_fold2, dae_fold3, dae_fold4, dae_fold5 = [
    list(statics_dae.values())[start:stop]
    for start, stop in ((None, 40), (40, 80), (80, 120), (120, 160), (160, None))
]


In [None]:
# Persist the sweep: after the transpose each row is one sweep step and the three
# columns are (key, DAE score, VAE score), in that order. No column names are set.
pd.DataFrame([statics_dae.keys(),statics_dae.values(),statics_vae.values()]).T.to_csv("ml20m_statics.csv")

In [None]:
# NOTE(review): nothing in the visible notebook assigns `data1`, and `sns` is only
# imported in later cells (the import cell binds seaborn as `sn`), so this cell looks
# like it fails on a fresh Restart & Run All -- confirm what it was meant to plot, or
# remove it.
ax1 = sns.boxplot(data=data1.reshape((-1,2)))

In [None]:
import seaborn as sns

# Five side-by-side panels, one per fold; every panel shows the DAE scores as the
# first box and the VAE scores as the second.
fig, axs = plt.subplots(1, 5,figsize =(35,6))

sns.set_theme(style="whitegrid")
ax1, ax2, ax3, ax4, ax5 = [
    sns.boxplot(data=[dae_scores, vae_scores], ax=panel)
    for dae_scores, vae_scores, panel in zip(
        (dae_fold1, dae_fold2, dae_fold3, dae_fold4, dae_fold5),
        (vae_fold1, vae_fold2, vae_fold3, vae_fold4, vae_fold5),
        (axs[0], axs[1], axs[2], axs[3], axs[4]),
    )
]

fig.savefig('msd_boxplot.jpg')

In [None]:
# Single horizontal box plot of all DAE sweep scores.
import seaborn as sns
sns.set_theme(style="whitegrid")
ax = sns.boxplot(x=list(statics_dae.values()))



In [None]:
# Declare Model
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = DAE(nItems)

# map_location remaps the stored tensors onto `device`, so a checkpoint written on a
# GPU machine still loads when this notebook runs on a CPU-only host.
# NOTE(review): the path names a different dataset ("milionsong", "msd") than the
# MovieLens CSVs that `nItems` is derived from -- confirm the checkpoint matches
# `nItems`, otherwise load_state_dict will reject the weight shapes.
checkpoint = torch.load('../input/milionsong-dae/DAE_msd-beta1.pt', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

epoch = checkpoint['epoch']

model.to(device)

In [None]:
# Scratch expression: range(0, 1, 100) contains only the value 0. No other visible
# cell uses the result.
range(0,1,100)

In [None]:
# Evaluate `model` on the test users with the dataset's default hold-out fraction,
# collecting one NDCG@100 / Recall@20 / Recall@50 value per batch.
test_ds = netflixDataset(test_data,eval=True)
test_dl = DataLoader(test_ds,batch_size=1024)
model.eval()
metrics = {}
eval_phase = tqdm(enumerate(test_dl),total = len(test_dl),leave = False)
NDCGs = []
RECALLs_20, RECALLs_50 = [],[]
# Pure inference: no_grad() stops autograd from recording a graph for every batch,
# which also makes the model output safe to modify in place.
with torch.no_grad():
    for batch_idx,data in eval_phase:
        X = data['data'].float().to(device).squeeze(1)
        # The collated batch is already (batch, n_items); no per-row re-stacking needed.
        ground_truth = data['ground_truth'].squeeze(1).to(device)

        pred = model(X,Mode ='eval')

        # Items already present in the input must never be recommended again.
        pred[X!=0] = -np.inf

        NDCGs.append(NDCG_at_k(ground_truth,pred).item())
        RECALLs_20.append(Recall_at_k(ground_truth,pred,20).item())
        RECALLs_50.append(Recall_at_k(ground_truth,pred,50).item())


In [None]:
# Final scores: each list holds one value per batch, so these are means over batches
# (NDCG@100, Recall@20, Recall@50).
np.mean(NDCGs),np.mean(RECALLs_20),np.mean(RECALLs_50)