# Setup

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pandas import DataFrame

from matplotlib import pyplot as plt

from IPython import display

import collections

import sklearn
import sklearn.manifold

import tensorflow.compat.v1 as tf

# Switch TensorFlow (imported above through its `compat.v1` API) to 1.x
# behavior; the model code in this notebook builds graphs and runs sessions.
tf.disable_v2_behavior()
# Only let TensorFlow log messages of ERROR severity through.
tf.logging.set_verbosity(tf.logging.ERROR)

In [None]:
# Load the ratings table; later cells read its `user_id`, `anime_id` and
# `rating` columns.
rating_complete = pd.read_csv('../input/anime-recommendation-database-2020/rating_complete.csv')

In [None]:
# Load the anime metadata table; later cells read its `MAL_ID` and
# `Japanese name` columns.
anime = pd.read_csv('../input/anime-recommendation-database-2020/anime.csv')

In [None]:
# Preview the first two rows of the anime metadata.
anime.head(2)

In [None]:
# Preview the first two rows of the ratings table.
rating_complete.head(2)

In [None]:
# Number of ratings submitted by each user; after reset_index() the frame has
# the columns `user_id` and `rating`, where `rating` holds that count.
ratings_per_user = rating_complete.groupby('user_id')['rating'].count()
user_rating = DataFrame(ratings_per_user.reset_index())

# Keep only the users who rated more than 280 anime.
query = user_rating['rating'] > 280
filtered_users = user_rating.loc[query]
users = set(filtered_users['user_id'])
len(users)

In [None]:
# Give every retained user a dense id 0, 1, 2, ... following the ascending
# order of the original ids.
user_id_dict = {user: new_id for new_id, user in enumerate(sorted(users))}
num = len(user_id_dict)  # total number of remapped users

# Spot check: the new id of original user 3 (raises KeyError if that user
# was filtered out).
user_id_dict[3]

In [None]:
# Keep only the ratings made by the selected users. The filtered result is
# wrapped in a new DataFrame object -- presumably so the column assignment in
# the next cell does not trigger pandas' SettingWithCopyWarning; TODO confirm.
rating = DataFrame(rating_complete[rating_complete['user_id'].isin(users)])
print(rating.shape)
rating.head()

In [None]:
# Replace each original user id with its dense 0-based id from `user_id_dict`.
# NOTE(review): the column is overwritten in place, so running this cell a
# second time would look up already-remapped ids in `user_id_dict` -- run it
# only once per freshly built `rating` frame.
rating['user_id'] = rating['user_id'].map(user_id_dict)
rating.head()

In [None]:
def split_dataframe(df, holdout_fraction=0.1):
    """Randomly partitions the rows of a DataFrame into train and test parts.
    Args:
    df: the DataFrame to split.
    holdout_fraction: share of the rows that is sampled, without replacement,
      into the test part.

    Returns:
    train: the rows of `df` whose index labels were not sampled.
    test: the sampled rows.
    """
    held_out = df.sample(frac=holdout_fraction, replace=False)
    # Membership is decided by index label, so every row sharing a label with
    # a sampled row is kept out of the training part.
    in_test = df.index.isin(held_out.index)
    remaining = df.loc[~in_test]
    return remaining, held_out

In [None]:
def build_rating_sparse_tensor(ratings_df):
    """Builds the sparse user-by-anime ratings matrix.

    The `user_id` values are used directly as row indices. The column index of
    a rating is the row position of its anime inside the global `anime` table,
    found by looking `anime_id` up in `anime['MAL_ID']`. Using the raw id as
    the column would not match the `anime.shape[0]` columns of the dense shape
    unless the ids happened to be exactly 0..anime.shape[0]-1.

    Args:
    ratings_df: a pd.DataFrame with `user_id`, `anime_id` and `rating` columns.

    Returns:
    a tf.SparseTensor representing the ratings matrix, with dense_shape
    [len(users), anime.shape[0]]. Ratings whose `anime_id` does not occur in
    `anime['MAL_ID']` are left out.
    """
    # NOTE(review): get_indexer requires the `MAL_ID` values to be unique --
    # confirm against the dataset.
    anime_positions = pd.Index(anime['MAL_ID']).get_indexer(ratings_df['anime_id'])
    # get_indexer marks ids that are missing from the anime table with -1;
    # such ratings have no column in the matrix and are skipped.
    known = anime_positions >= 0

    indices = np.column_stack([ratings_df['user_id'].values[known], anime_positions[known]])
    values = ratings_df['rating'].values[known]
    return tf.SparseTensor(indices=indices, values=values, dense_shape=[len(users), anime.shape[0]])

In [None]:
def sparse_mean_square_error(sparse_ratings, user_embeddings, anime_embeddings):
    """Mean squared error of the model on the observed ratings only.
    Args:
    sparse_ratings: A SparseTensor rating matrix, of dense_shape [N, M]
    user_embeddings: A dense Tensor U of shape [N, k] where k is the embedding
      dimension, such that U_i is the embedding of user i.
    anime_embeddings: A dense Tensor V of shape [M, k] where k is the embedding
      dimension, such that V_j is the embedding of anime j.

    Returns:
    A scalar Tensor representing the MSE between the true ratings and the
      model's predictions.
    """
    # Full prediction matrix U V^T, one score per (user, anime) pair.
    all_scores = tf.matmul(user_embeddings, anime_embeddings, transpose_b=True)
    # Pick out the scores at the positions that actually carry a rating.
    observed_scores = tf.gather_nd(all_scores, sparse_ratings.indices)
    return tf.losses.mean_squared_error(sparse_ratings.values, observed_scores)

In [None]:
class CFModel(object):
    """Simple class that represents a collaborative filtering model.

    Wraps a set of embedding variables and the loss tensor built on them,
    trains them in a lazily created `tf.Session`, and keeps the evaluated
    embeddings available through the `embeddings` property.
    """
    def __init__(self, embedding_vars, loss, metrics=None):
        """Initializes a CFModel.
        Args:
        embedding_vars: A dictionary of tf.Variables.
        loss: A float Tensor. The loss to optimize.
        metrics: optional list of dictionaries of Tensors. The metrics in each
        dictionary will be plotted in a separate subplot during training.
        """
        
        self._embedding_vars = embedding_vars
        self._loss = loss
        self._metrics = metrics
        # Evaluated embeddings; every entry stays None until train() has run.
        self._embeddings = {k: None for k in embedding_vars}
        # Created on the first call to train() and reused by later calls.
        self._session = None

    @property
    def embeddings(self):
        """The embeddings dictionary."""
        return self._embeddings

    def train(self, num_iterations=100, learning_rate=1.0, plot_results=True, optimizer=tf.train.GradientDescentOptimizer):
        """Trains the model.
        Args:
        num_iterations: number of iterations to run.
        learning_rate: optimizer learning rate.
        plot_results: whether to plot the results at the end of training.
        optimizer: the optimizer to use. Default to GradientDescentOptimizer.
    
        Returns:
        The evaluated metrics from the last iteration: one dictionary per
        entry of the `metrics` list given to the constructor.
        """
        with self._loss.graph.as_default():
            opt = optimizer(learning_rate)
            train_op = opt.minimize(self._loss)
            # Initializes the optimizer's own variables plus any local variables.
            local_init_op = tf.group(tf.variables_initializer(opt.variables()), tf.local_variables_initializer())
      
            # Global initialization happens only once, when the session is
            # first created, so later train() calls continue from the
            # current embedding values.
            if self._session is None:
                self._session = tf.Session()
                with self._session.as_default():
                    self._session.run(tf.global_variables_initializer())
                    self._session.run(tf.tables_initializer())
                    tf.train.start_queue_runners()

        with self._session.as_default():
            local_init_op.run()
            iterations = []
            # Fall back to a single empty metrics dict when none were given.
            metrics = self._metrics or ({},)
            # Iterate the fallback-aware `metrics`; iterating `self._metrics`
            # here would raise a TypeError when it is None.
            metrics_vals = [collections.defaultdict(list) for _ in metrics]

            # Train and append results.
            for i in range(num_iterations + 1):
                _, results = self._session.run((train_op, metrics))
                # Report and record every 10th iteration and the last one.
                if (i % 10 == 0) or i == num_iterations:
                    print("\r iteration %d: " % i + ", ".join(["%s=%f" % (k, v) for r in results for k, v in r.items()]),end='')
                    iterations.append(i)
                    for metric_val, result in zip(metrics_vals, results):
                        for k, v in result.items():
                            metric_val[k].append(v)

            # Evaluate the trained variables so `embeddings` holds their values.
            for k, v in self._embedding_vars.items():
                self._embeddings[k] = v.eval()
                
            if plot_results:
                # Plot the metrics.
                num_subplots = len(metrics)+1
                fig = plt.figure()
                fig.set_size_inches(num_subplots*10, 8)
                for i, metric_vals in enumerate(metrics_vals):
                    ax = fig.add_subplot(1, num_subplots, i+1)
                    for k, v in metric_vals.items():
                        ax.plot(iterations, v, label=k)
                    ax.set_xlim([1, num_iterations])
                    ax.legend()
            return results

In [None]:
def build_model(ratings, embedding_dim=3, init_stddev=1.):
    """Builds a matrix-factorization CFModel from a ratings DataFrame.
    Args:
    ratings: a DataFrame of the ratings
    embedding_dim: the dimension of the embedding vectors.
    init_stddev: float, the standard deviation of the random initial embeddings.
    Returns:
    model: a CFModel whose loss is the MSE on the training split and whose
      metrics track both the training and the test MSE.
    """

    def _random_embedding(num_rows):
        # One normally distributed embedding_dim-vector per row.
        return tf.Variable(tf.random_normal([num_rows, embedding_dim], stddev=init_stddev))

    # Hold out part of the ratings, then turn both parts into sparse matrices.
    train_ratings, test_ratings = split_dataframe(ratings)
    A_train = build_rating_sparse_tensor(train_ratings)
    A_test = build_rating_sparse_tensor(test_ratings)

    # User embeddings U (one row per matrix row) and anime embeddings V (one
    # row per matrix column).
    U = _random_embedding(A_train.dense_shape[0])
    V = _random_embedding(A_train.dense_shape[1])

    losses = {
        'train_error': sparse_mean_square_error(A_train, U, V),
        'test_error': sparse_mean_square_error(A_test, U, V),
    }
    # Only the training error is optimized; both errors are reported.
    return CFModel({"user_id": U, "anime_id": V}, losses['train_error'], [losses])

In [None]:
# Build a model with 30-dimensional embeddings and train it for 1000
# iterations with CFModel.train's default optimizer.
model = build_model(rating, embedding_dim=30, init_stddev=0.5)
model.train(num_iterations=1000, learning_rate=20.)

In [None]:
# Names of the supported similarity measures.
DOT = 'dot'
COSINE = 'cosine'


def compute_scores(query_embedding, item_embeddings, measure=DOT):
    """Computes the scores of the candidates given a query.
    Args:
    query_embedding: a vector of shape [k], representing the query embedding.
    item_embeddings: a matrix of shape [N, k], such that row i is the embedding of item i.
    measure: a string specifying the similarity measure to be used. Can be either DOT or COSINE.
      Any value other than COSINE is scored with the plain dot product.
    
    Returns:
    scores: a vector of shape [N], such that scores[i] is the score of item i.
      With COSINE, an all-zero query or item embedding scores 0 instead of NaN.
    """
    u = np.asarray(query_embedding)
    V = np.asarray(item_embeddings)
    if measure == COSINE:
        item_norms = np.linalg.norm(V, axis=1, keepdims=True)
        query_norm = np.linalg.norm(u)
        # A zero vector has norm 0; dividing by 1 instead leaves it at zero,
        # so its cosine score becomes 0 rather than NaN from a 0/0 division.
        V = V / np.where(item_norms == 0, 1.0, item_norms)
        u = u / (query_norm if query_norm != 0 else 1.0)
    scores = u.dot(V.T)
    return scores

In [None]:
def user_recommendations(user_id, model, measure=DOT, exclude_rated=False, k=6):
    """Displays the k highest-scoring anime for a user.
    Args:
    user_id: row index of the user in the model's "user_id" embeddings.
    model: a trained model exposing an `embeddings` dictionary with the
      "user_id" and "anime_id" matrices.
    measure: similarity measure passed on to compute_scores (DOT or COSINE).
    exclude_rated: if True, anime the user has already rated (according to the
      global `rating` table) are left out of the recommendations.
    k: number of rows to display.
    """
    scores = compute_scores(model.embeddings["user_id"][user_id], model.embeddings["anime_id"], measure)
    score_key = measure + ' score'
    # NOTE(review): the scores are paired with the `anime` rows by position,
    # i.e. row i of the anime embeddings is assumed to describe row i of
    # `anime` -- confirm this matches how the ratings matrix was indexed.
    df = pd.DataFrame({score_key: list(scores),
                       'anime_id': anime['MAL_ID'],
                       'titles': anime['Japanese name']})

    if exclude_rated:
        # `rating` stores the same user ids that index the user embeddings.
        rated_ids = rating.loc[rating['user_id'] == user_id, 'anime_id']
        df = df[~df['anime_id'].isin(rated_ids)]

    display.display(df.sort_values([score_key], ascending=False).head(k))

In [None]:
def similiar_user(user_id, model, measure=DOT, exclude_rated=False, k=6):
    """Displays the k users whose embeddings score highest against a user's.
    Args:
    user_id: row index of the query user in the model's "user_id" embeddings.
    model: a trained model exposing an `embeddings` dictionary with the
      "user_id" matrix.
    measure: similarity measure passed on to compute_scores (DOT or COSINE).
    exclude_rated: accepted but not used by this function.
    k: number of rows to display.
    """
    user_embeddings = model.embeddings["user_id"]
    # The query user is scored against every user, itself included.
    similarity = compute_scores(user_embeddings[user_id], user_embeddings, measure)
    column = measure + ' score'
    ranked = pd.DataFrame({column: list(similarity)}).sort_values([column], ascending=False)
    display.display(ranked.head(k))

In [None]:
# Show the 10 anime with the highest cosine score for user 5.
user_recommendations(5, model, measure=COSINE, k=10)

In [None]:
# Show the 10 users with the highest cosine score against user 5.
similiar_user(5, model, measure=COSINE, k=10)

In [None]:
# The ten highest ratings given by user 25997 (remapped id), joined with the
# anime titles.
(
    rating[rating['user_id'] == 25997]
    .sort_values('rating', ascending=False)
    .merge(anime[['MAL_ID', 'Japanese name']], left_on='anime_id', right_on='MAL_ID', how='left')
    .head(10)[['user_id', 'rating', 'Japanese name']]
)

In [None]:
# Anime rated by both user 5 and user 41332 (remapped ids), first 50 rows;
# the suffixed columns hold each user's own values.
(
    rating[rating['user_id'] == 5]
    .merge(rating[rating['user_id'] == 41332], on='anime_id', how='inner')
    .head(50)
)