In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

import torch 
import torch.nn as nn 
import torch.nn.parallel 
import torch.optim as optim 
import torch.utils.data 
from torch.autograd import Variable

In [None]:
# Load the anime metadata table; later cells read its 'Type' and 'MAL_ID' columns.
anime =  pd.read_csv('../input/anime-recommendation-database-2020/anime.csv')

In [None]:
# Preview the first rows of the anime metadata.
anime.head()

In [None]:
# Load the ratings table; later cells read its 'user_id', 'anime_id' and 'rating' columns.
rating_complete = pd.read_csv('../input/anime-recommendation-database-2020/rating_complete.csv')

In [None]:
# Print the column summary of the ratings table.
rating_complete.info()

In [None]:
# Preview the first rows of the ratings table.
rating_complete.head()

# pre-processing

In [None]:
# Count how many ratings each user has submitted.
ratings_per_user = rating_complete.groupby('user_id').size()
user_anime = ratings_per_user.reset_index(name='anime_count')
user_anime.head()

In [None]:
# Keep only users with more than 1000 ratings so the user x anime matrix is reasonably dense.
is_heavy_rater = user_anime['anime_count'] > 1000
filtered_users = user_anime.loc[is_heavy_rater]
users = set(filtered_users['user_id'])
len(users)

In [None]:
# Restrict the catalogue to TV series; their MAL_IDs become the item set.
query = anime['Type'].eq('TV')
anime_tv = anime.loc[query]
animes = set(anime_tv['MAL_ID'])
len(animes)

In [None]:
# Keep ratings that are both from a selected user and about a selected TV anime.
from_selected_user = rating_complete['user_id'].isin(users)
about_tv_anime = rating_complete['anime_id'].isin(animes)
query = from_selected_user & about_tv_anime
rating_data = rating_complete.loc[query]
rating_data.shape

In [None]:
# Randomly assign ~80% of the ratings to the training split and the rest to the test split.
# A seeded generator makes the split (and every downstream number) reproducible
# under Restart & Run All.
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)
mask = split_rng.random(rating_data.shape[0]) < 0.8
df_train = rating_data[mask]
print('train', df_train.shape[0])
df_test = rating_data[~mask]
print('test', df_test.shape[0])

## Average rating given by each user

In [None]:
# Mean rating per user. Selecting 'rating' before aggregating avoids computing
# (and then discarding) the mean of every other column.
user_rating = rating_data.groupby('user_id')['rating'].mean().reset_index()
user_rating.columns = ['user', 'avg_rating']
user_rating.head()

In [None]:
# Summary statistics of the per-user average rating.
user_rating['avg_rating'].describe()

## train datasets

In [None]:
# Dense index maps: each user seen in the training split gets a row,
# each TV anime gets a column.
train_user_ids = df_train['user_id'].unique()
tv_anime_ids = anime_tv['MAL_ID'].unique()
unique_users = {int(uid): row for row, uid in enumerate(train_user_ids)}
unique_items = {int(aid): col for col, aid in enumerate(tv_anime_ids)}

nb_users = len(unique_users)
print(nb_users, len(unique_items))

# -1 marks "no rating"; observed ratings are binarised: 1 if rating >= 8, else 0.
train_set = np.full((nb_users, len(unique_items)), -1)

for user_id, anime_id, rating in df_train.to_numpy():
    train_set[unique_users[user_id], unique_items[anime_id]] = int(rating >= 8)

train_set.shape

## test datasets

In [None]:
# Build the test matrix on the SAME row/column index as the training matrix.
# `unique_users` / `unique_items` come from the training cell: the evaluation
# loop compares train_set[i] with test_set[i], so row i must be the same user
# in both matrices. Rebuilding the user index from df_test does not guarantee that.
# The "liked" cut-off must also match the one used to binarise the training set.
LIKE_THRESHOLD = 8

print(len(unique_users), len(unique_items))
test_set = np.full((len(unique_users), len(unique_items)), -1)

n_skipped = 0
for user_id, anime_id, rating in df_test.values:
    row = unique_users.get(user_id)
    if row is None:
        # User has no training ratings, so there is no input row to predict from.
        n_skipped += 1
        continue
    test_set[row, unique_items[anime_id]] = 1 if rating >= LIKE_THRESHOLD else 0

if n_skipped:
    print('skipped', n_skipped, 'test ratings from users absent from the training split')

test_set.shape

In [None]:
# Convert both rating matrices to float32 torch tensors for the RBM.
train_set = torch.as_tensor(train_set, dtype=torch.float32)
test_set = torch.as_tensor(test_set, dtype=torch.float32)

# visualize data

In [None]:
# Side-by-side heatmaps of the training and test rating matrices.
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(10,4))

panels = (
    (ax1, train_set, "training data set"),
    (ax2, test_set, "test data set"),
)
for panel_ax, panel_data, panel_title in panels:
    sns.heatmap(panel_data, cmap="coolwarm", cbar_kws={"ticks":np.arange(-1,2)}, ax=panel_ax)
    panel_ax.set_xlabel('anime')
    panel_ax.set_ylabel('user')
    panel_ax.set_title(panel_title)

plt.suptitle("Rating: -1=no data, 0=low rating, 1=high rating")

# Restricted Boltzmann Machines (RBM)

## RBM Model

In [None]:
class RBM():
    """Restricted Boltzmann machine with binary units, updated by contrastive divergence.

    Attributes:
        W: weights, shape (num_hidden_nodes, num_visible_nodes).
        a: hidden-unit bias, shape (1, num_hidden_nodes).
        b: visible-unit bias, shape (1, num_visible_nodes).
    """

    def __init__(self, num_visible_nodes, num_hidden_nodes):
        """Initialise weights and both biases from a standard normal distribution."""
        self.W = torch.randn(num_hidden_nodes, num_visible_nodes)
        # Leading dimension of 1 lets each bias broadcast over the batch dimension.
        self.a = torch.randn(1, num_hidden_nodes)
        self.b = torch.randn(1, num_visible_nodes)

    def sample_hidden_nodes(self, x):
        """Sample the hidden layer given visible values.

        Args:
            x: visible values, shape (batch, num_visible_nodes).

        Returns:
            (p_h_given_v, h): activation probability of every hidden unit and a
            Bernoulli sample drawn from it, both of shape (batch, num_hidden_nodes).
        """
        # sigmoid(x W^T + a): probability that each hidden unit is on.
        activation = torch.mm(x, self.W.t()) + self.a
        p_h_given_v = torch.sigmoid(activation)
        return p_h_given_v, torch.bernoulli(p_h_given_v)

    def sample_visible_nodes(self, y):
        """Sample the visible layer given hidden values.

        Args:
            y: hidden values, shape (batch, num_hidden_nodes).

        Returns:
            (p_v_given_h, v): activation probability of every visible unit and a
            Bernoulli sample drawn from it, both of shape (batch, num_visible_nodes).
        """
        # sigmoid(y W + b): probability that each visible unit is on.
        activation = torch.mm(y, self.W) + self.b
        p_v_given_h = torch.sigmoid(activation)
        return p_v_given_h, torch.bernoulli(p_v_given_h)

    def train(self, v0, vk, ph0, phk, learning_rate=1.0):
        """Apply one contrastive-divergence update in place.

        Args:
            v0: visible values at the start of the chain.
            vk: visible values after k Gibbs steps.
            ph0: hidden probabilities given v0.
            phk: hidden probabilities given vk.
            learning_rate: multiplier applied to every update. The default of
                1.0 applies the raw difference summed over the batch.
        """
        # (v0^T ph0 - vk^T phk) is (visible, hidden); transpose to match W.
        self.W += learning_rate * (torch.mm(v0.t(), ph0) - torch.mm(vk.t(), phk)).t()
        # Summing over the batch gives 1-D vectors that broadcast onto the (1, n) biases.
        self.b += learning_rate * torch.sum((v0 - vk), 0)
        self.a += learning_rate * torch.sum((ph0 - phk), 0)

    def predict(self, x):
        """Reconstruct visible values with one hidden -> visible sampling pass.

        Args:
            x: visible values, shape (batch, num_visible_nodes).

        Returns:
            A Bernoulli sample of the visible layer (stochastic, values 0 or 1).
        """
        _, h = self.sample_hidden_nodes(x)
        _, v = self.sample_visible_nodes(h)
        return v

# Set up parameters and instantiate the RBM model

In [None]:
# Seed torch so the random weight initialisation and the Bernoulli sampling in
# the following cells give the same results on every Restart & Run All.
TORCH_SEED = 0
torch.manual_seed(TORCH_SEED)

# define model parameters
print(len(train_set[0]))
num_visible_nodes = len(train_set[0]) # one visible unit per anime column
num_hidden_nodes = 1000 # number of hidden units (latent features)
batch_size = 2500 # users per contrastive-divergence update

# instantiate the RBM model
rbm = RBM(num_visible_nodes, num_hidden_nodes)

In [None]:
## Train the RBM with k-step contrastive divergence.
nb_epoch = 10
cd_steps = 10       # Gibbs sampling steps per update
batch_stride = 100  # consecutive batches start this many users apart (they overlap)
train_loss_list = []

# max(..., 1) guarantees one batch even when nb_users <= batch_size; when
# nb_users > batch_size the start positions are unchanged.
# NOTE(review): the trailing users past the last window start are never
# trained on -- confirm whether that is intended.
batch_starts = range(0, max(nb_users - batch_size, 1), batch_stride)

for epoch in range(1, nb_epoch+1):
    train_loss = 0.0
    s = 0  # number of batches that contributed to the loss
    for id_user in batch_starts:
        # v0 is the fixed target; vk starts from the same data and is replaced
        # by a fresh sample on the first Gibbs step.
        v0 = train_set[id_user: id_user+batch_size]
        vk = v0
        observed = v0 >= 0
        missing = v0 < 0
        # hidden probabilities given the real ratings
        ph0, _ = rbm.sample_hidden_nodes(v0)
        for k in range(cd_steps):
            _, hk = rbm.sample_hidden_nodes(vk)
            _, vk = rbm.sample_visible_nodes(hk)
            # keep unrated cells at -1 so they contribute nothing to v0 - vk
            vk[missing] = v0[missing]
        phk, _ = rbm.sample_hidden_nodes(vk)
        # update weights and biases
        rbm.train(v0, vk, ph0, phk)
        # Mean absolute error over every observed rating (both 0 and 1);
        # masking with v0 > 0 would ignore all the "low rating" cells.
        if observed.any():
            train_loss += torch.mean(torch.abs(v0[observed] - vk[observed])).item()
            s += 1
    epoch_loss = train_loss / s if s else float('nan')
    print('epoch: '+str(epoch)+' loss: '+str(epoch_loss))
    # store the same normalised value that is printed, so the plot matches the log
    train_loss_list.append(epoch_loss)

In [None]:
# Training loss recorded at the end of each epoch.
plt.plot(train_loss_list)
plt.xlabel("epoch")
plt.ylabel("loss")
plt.title("training")

In [None]:
## Evaluate on held-out ratings: feed each user's training row through the RBM
## and compare the reconstruction with that user's test ratings.
test_loss = 0.0
s = 0  # number of users that contributed to the loss
predicted_v_input = []
test_input = []
for id_user in range(0, nb_users):
    # training ratings activate the RBM
    v_input = train_set[id_user: id_user+1]
    # held-out ratings are the target (empty slice if test_set has fewer rows)
    v_target = test_set[id_user: id_user+1]
    # Use one mask for both the guard and the loss. Guarding with >= 0 but
    # scoring with > 0 yields NaN for a user whose held-out ratings are all 0.
    observed = v_target >= 0
    if observed.any():
        # a single sampling pass is used for prediction
        v_pred = rbm.predict(v_input)
        test_loss += torch.mean(torch.abs(v_target[observed] - v_pred[observed])).item()
        predicted_v_input.append(v_pred.detach().numpy()[0])
        test_input.append(v_target.detach().numpy()[0])
        s += 1
predicted_v_input = np.array(predicted_v_input)
test_input = np.array(test_input)
if s:
    print('test loss: ' +str(test_loss/s))
else:
    print('test loss: n/a (no user has held-out ratings)')

In [None]:
# Visualise the held-out ratings next to the RBM reconstruction.
# The columns are anime (not movies), matching the earlier train/test figure.
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(10,4))
sns.heatmap(test_input, cmap="coolwarm", vmin=-1, cbar_kws={"ticks":np.arange(-1,2)}, ax=ax1)
ax1.set_xlabel('anime')
ax1.set_ylabel('users')
ax1.set_title("test input")
sns.heatmap(predicted_v_input, cmap="coolwarm", vmin=-1,  cbar_kws={"ticks":np.arange(-1,2)}, ax=ax2)
ax2.set_xlabel('anime')
ax2.set_ylabel('users')
ax2.set_title("test output")
plt.suptitle("Rating: -1=no data, 0=low rating, 1=high rating")

In [None]:
# Flatten predictions and held-out ratings into one long table, one row per (user, anime) cell.
# NOTE: this rebinds `df_test`, which previously held the test split of the ratings.
flat_predicted = np.concatenate(predicted_v_input)
flat_original = np.concatenate(test_input)
df_test = pd.DataFrame({'predict': flat_predicted,
                        'original': flat_original})

# Keep only the cells that actually have a held-out rating (-1 means "no data").
has_rating = df_test["original"] != -1.0
df_select = df_test[has_rating]
print ("number of predicted data", len(df_test))
print ("number of original data", len(df_select))

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(true_labels, predicted_labels, title):
    """Print classification metrics and draw an annotated confusion matrix.

    Args:
        true_labels: ground-truth labels.
        predicted_labels: predicted labels, aligned with ``true_labels``.
        title: title of the heatmap.

    Prints accuracy and weighted precision / recall / F1, then draws the
    confusion matrix on the current matplotlib axes. Returns None.
    """
    # get_metrics
    print('Accuracy:', metrics.accuracy_score(true_labels, predicted_labels))
    print('Precision:', metrics.precision_score(true_labels, predicted_labels, average='weighted'))
    print('Recall:', metrics.recall_score(true_labels, predicted_labels, average='weighted'))
    print('F1 Score:', metrics.f1_score(true_labels, predicted_labels,average='weighted'))

    # confusion matrix
    # Sorted union of both label sets: a deterministic axis order, and labels
    # that occur only in the predictions are not silently dropped.
    labels = sorted(set(true_labels) | set(predicted_labels))
    cm = confusion_matrix(true_labels, predicted_labels, labels=labels)
    cm_labeled = pd.DataFrame(cm, columns=labels, index=labels)
    sns.heatmap(cm_labeled, annot=True, cmap='Greens', fmt='g')
    plt.xlabel("predict")
    plt.ylabel("actual")
    plt.title(title)

In [None]:
# Score the predictions on the cells that have a held-out rating.
true_labels, predicted_labels = df_select["original"], df_select["predict"]
plot_confusion_matrix(true_labels, predicted_labels, title="rated test data")