# Model-based CF
* Dataset: MovieLens 100K Dataset
* Missing ratings (NaN) are set to 0
* 10% of the data is held out for testing
* 300 epochs, embedding dimension of 30

## Performance
* MSE error = 2.689

In [None]:
from __future__ import print_function
import collections
import numpy as np
import pandas as pd
import tensorflow.compat.v1 as tf
from sklearn.model_selection import train_test_split

# The code below uses the TF1 graph/session API (tf.Session, tf.train.*),
# so TensorFlow 2 behaviour is switched off before anything else is built.
tf.disable_v2_behavior()
# Restrict TensorFlow's own logging to error-level messages.
tf.logging.set_verbosity(tf.logging.ERROR)

In [None]:
# Load the ratings table from disk, then normalise the column types.
ratings = pd.read_csv('../input/movielens-dataset/ratings.csv')
# Both id columns get the same treatment: subtract one from the raw id and
# keep the result as a string.
for id_column in ("movieId", "userId"):
    ratings[id_column] = ratings[id_column].apply(
        lambda raw_id: str(raw_id - 1))
# Ratings are converted to Python floats.
ratings["rating"] = ratings["rating"].apply(lambda score: float(score))

In [None]:
USER = 668      # first dimension of the rating matrix (indexed by userId)
MOVIE = 149532  # second dimension of the rating matrix (indexed by movieId)


def build_sparse_tensor(ratings_df):
    """Wrap a ratings DataFrame as a ``tf.SparseTensor``.

    Args:
      ratings_df: DataFrame providing `userId`, `movieId` and `rating`
        columns.

    Returns:
      A tf.SparseTensor with dense shape [USER, MOVIE] whose indices are the
      (userId, movieId) pairs of each row and whose values are the ratings.
    """
    coordinates = ratings_df[['userId', 'movieId']].values
    observed_ratings = ratings_df['rating'].values
    return tf.SparseTensor(
        indices=coordinates,
        values=observed_ratings,
        dense_shape=[USER, MOVIE])

In [None]:
def sparse_mean_square_error(sparse_ratings, user_embeddings, movie_embeddings):
    """Mean squared error between stored ratings and embedding products.

    Args:
      sparse_ratings: sparse tensor of ratings; its `indices` select the
        entries to score and its `values` are the targets.
      user_embeddings: matrix multiplied on the left.
      movie_embeddings: matrix multiplied (transposed) on the right.

    Returns:
      The tensor produced by `tf.losses.mean_squared_error` for the entries
      listed in `sparse_ratings.indices`.
    """
    # The full product is materialised first; only the entries that have a
    # stored rating are then picked out of it.
    all_scores = tf.matmul(user_embeddings, movie_embeddings, transpose_b=True)
    scored_entries = tf.gather_nd(all_scores, sparse_ratings.indices)
    return tf.losses.mean_squared_error(sparse_ratings.values, scored_entries)

In [None]:
class Model(object):
    """Trains a set of embedding variables by minimising a loss tensor.

    After `train()` has run, the learned embedding values are available as
    numpy arrays through the `embeddings` property.
    """

    def __init__(self, embedding_vars, loss, metrics=None):
        """Store the pieces of the model.

        Args:
          embedding_vars: dict mapping a name to the tf.Variable to train.
          loss: tensor to minimise.
          metrics: optional collection (e.g. a list of dicts of tensors)
            evaluated alongside every training step. May be None.
        """
        self._embedding_vars = embedding_vars
        self._loss = loss
        self._metrics = metrics
        # Filled with evaluated values at the end of train().
        self._embeddings = {k: None for k in embedding_vars}
        self._session = None

    @property
    def embeddings(self):
        """Dict of embedding values by name; values are None until trained."""
        return self._embeddings

    def train(self, num_iterations=300, learning_rate=5.0, plot_results=True,
              optimizer=tf.train.GradientDescentOptimizer):
        """Run the optimizer on the loss and record the final embeddings.

        All global variables are (re-)initialised on every call, so each call
        starts training from freshly initialised embeddings.

        Args:
          num_iterations: the training op is run `num_iterations + 1` times.
          learning_rate: learning rate handed to the optimizer constructor.
          plot_results: accepted for interface compatibility; not used.
          optimizer: optimizer class, called as `optimizer(learning_rate)`.

        Returns:
          The metric values fetched on the last training step, in the same
          structure as the `metrics` passed to the constructor (or `({},)`
          when no metrics were given).

        Raises:
          ValueError: if `num_iterations` is negative, since no training step
            would run and there would be nothing to return.
        """
        if num_iterations < 0:
            raise ValueError(
                "num_iterations must be >= 0, got %r" % (num_iterations,))

        with self._loss.graph.as_default():
            opt = optimizer(learning_rate)
            train_op = opt.minimize(self._loss)
            local_init_op = tf.group(
                tf.variables_initializer(opt.variables()),
                tf.local_variables_initializer())
        if self._session is None:
            self._session = tf.Session()

        with self._session.as_default():
            self._session.run(tf.global_variables_initializer())
            self._session.run(tf.tables_initializer())
            tf.train.start_queue_runners()
            local_init_op.run()

            # Fall back to an empty fetch when no metrics were supplied. The
            # previous code additionally iterated over `self._metrics` here,
            # which raised TypeError whenever metrics was None.
            metrics = self._metrics or ({},)

            # One extra step is run on top of num_iterations (original
            # behaviour, kept so results stay comparable).
            for _ in range(num_iterations + 1):
                results = self._session.run((train_op, metrics))[1]

            for k, v in self._embedding_vars.items():
                self._embeddings[k] = v.eval()

            return results

In [None]:
def build_model(ratings, embedding_dim=30, init_stddev=0.5):
    """Assemble a matrix-factorisation `Model` from a ratings DataFrame.

    Args:
      ratings: DataFrame with `userId`, `movieId` and `rating` columns.
      embedding_dim: number of columns in each embedding matrix.
      init_stddev: standard deviation of the normal initialiser.

    Returns:
      A `Model` that minimises the training MSE and reports both the training
      and the test MSE as metrics.
    """
    # Hold out 10% of the rows for testing; the split is seeded.
    wanted_columns = ['userId', 'movieId', 'rating']
    train_df, test_df = train_test_split(
        ratings[wanted_columns], test_size=0.1, random_state=42)

    # Each split becomes its own sparse rating tensor.
    train_sparse = build_sparse_tensor(train_df)
    test_sparse = build_sparse_tensor(test_df)

    # One embedding matrix per axis of the rating tensor, normally initialised.
    row_count = train_sparse.dense_shape[0]
    user_factors = tf.Variable(
        tf.random_normal([row_count, embedding_dim], stddev=init_stddev))
    column_count = train_sparse.dense_shape[1]
    movie_factors = tf.Variable(
        tf.random_normal([column_count, embedding_dim], stddev=init_stddev))

    # Both losses share the same embeddings; only the training one is optimised.
    mse_on_train = sparse_mean_square_error(
        train_sparse, user_factors, movie_factors)
    mse_on_test = sparse_mean_square_error(
        test_sparse, user_factors, movie_factors)

    return Model(
        {"user_id": user_factors, "movie_id": movie_factors},
        mse_on_train,
        [{'train_mse_error': mse_on_train, 'test_mse_error': mse_on_test}])

In [None]:
# Build the CF model and train it.
# build_model() splits `ratings` into train/test parts and creates the
# embeddings and losses; train() is called with its default settings
# (num_iterations=300, learning_rate=5.0) and returns the metric values
# fetched on the last training step.
model = build_model(ratings)
model.train()