# Wide and Deep Model for Movie Recommendation

### Prerequisite
* `tensorflow`

In this example, we utilize TensorFlow's higher level Estimator API to build wide-and-deep model for movie recommendation.

In [12]:
import sys
sys.path.append("../../")

import tensorflow as tf
import pandas as pd

from reco_utils.dataset import movielens
from reco_utils.dataset.python_splitters import python_random_split
from reco_utils.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var)

In [2]:
from tensorflow.python.client import device_lib

devices = device_lib.list_local_devices()
[x.name for x in devices]

['/device:CPU:0', '/device:GPU:0']

### Data loading

In [3]:
MOVIELENS_DATA_SIZE = '100k'

In [4]:
data = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE,
    header=['UserId','MovieId','Rating','Timestamp'],
    # TODO For now, not using genres YET
    load_genres=False
)
data.head()


Unnamed: 0,UserId,MovieId,Rating,Timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


### Feature embedding

In [5]:
# Distinct users and items
user_list = data['UserId'].unique()
item_list = data['MovieId'].unique()

# Rule of thumb for embedding_dimensions =  number_of_categories ** 0.25
USER_EMBEDDING_DIM = int(len(user_list) ** 0.25) # = 16
ITEM_EMBEDDING_DIM = int(len(item_list) ** 0.25) # = 64
print("Embedding {} users to {}-dim vector".format(len(user_list), USER_EMBEDDING_DIM))
print("Embedding {} items to {}-dim vector".format(len(item_list), ITEM_EMBEDDING_DIM))

# Convert a categorical feature, e.g. UserId or MovieId, into a lower-dimensional vector (embedding)
user_id = tf.feature_column.categorical_column_with_vocabulary_list(
    'UserId', user_list)
user_embedding = tf.feature_column.embedding_column(
    categorical_column=user_id,
    dimension=USER_EMBEDDING_DIM,
    max_norm=USER_EMBEDDING_DIM**.5)

item_id = tf.feature_column.categorical_column_with_vocabulary_list(
    'MovieId', item_list)
item_embedding = tf.feature_column.embedding_column(
    categorical_column=item_id,
    dimension=ITEM_EMBEDDING_DIM,
    max_norm=ITEM_EMBEDDING_DIM**.5)

timestamp = tf.feature_column.numeric_column('Timestamp')

# TODO numeric_column (w/ shape)
# genres = tf.feature_column.numeric_column(
#     'Genre', shape=(NUM_GENRES,), dtype=tf.uint8)

deep_columns = [user_embedding, item_embedding, timestamp]  # TODO , genres]
wide_columns = []  # TODO cross product transformation of user and item

Embedding 943 users to 5-dim vector
Embedding 1682 items to 6-dim vector


Tran and test data split

In [6]:
train, test = python_random_split(data, ratio=0.75, seed=123)

train_x = train.copy()
train_y = train_x.pop('Rating')
test_x = test.copy()
test_y = test_x.pop('Rating')

print(train_x.head())
print("\nLabels:")
print(train_y.head())

       UserId  MovieId  Timestamp
31450     496      136  876066424
42809      64      101  889740225
52419     158      471  880132513
45663     198      652  884209569
50696     749      121  878847645

Labels:
31450    1.0
42809    2.0
52419    4.0
45663    3.0
50696    3.0
Name: Rating, dtype: float64


### Model preparation

Model selection
* `wide` - Linear model
* `deep` - DNN model
* `wide_deep` - Linear combination of the linear and DNN models

(TODO)Model type: `regressor` or `classifier`

In [7]:
# 'wide', 'deep', or 'wide_deep' 
MODEL_TYPE = 'deep'
HIDDEN_UNITS = [256, 256, 256, 128]
# Model checkpoints folder
MODEL_DIR = './models'

In [8]:
# TODO set run config if needed
if MODEL_TYPE == 'wide':
    if len(wide_columns) == 0:
        raise ValueError("No features have defined for the 'wide' model")
    model = tf.estimator.LinearRegressor(  # LinearClassifier(
        model_dir=MODEL_DIR,
        feature_columns=wide_columns,
    )
elif MODEL_TYPE == 'deep':
    if len(deep_columns) == 0:
        raise ValueError("No features have defined for the 'deep' model")
    model = tf.estimator.DNNRegressor(  # DNNClassifier(
        model_dir=MODEL_DIR,
        feature_columns=deep_columns,
        hidden_units=HIDDEN_UNITS,
        optimizer=tf.train.AdamOptimizer(),
#         activation_fn=tf.nn.sigmoid,
#         dropout=0.3,
#         loss_reduction=tf.losses.Reduction.MEAN,
#         batch_norm=False
    )
elif MODEL_TYPE == 'wide_deep':
    if len(wide_columns) == 0 and len(deep_columns) == 0:
        raise ValueError("No features have defined for the 'wide_deep' model")
    model = tf.estimator.DNNLinearCombinedRegressor(  # DNNLinearCombinedClassifier(
        model_dir=MODEL_DIR,
        # wide settings
        linear_feature_columns=wide_columns,
        # deep settings
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=HIDDEN_UNITS,
    )
else:
    raise ValueError("Model type should be either 'wide', 'deep', or 'wide_deep'")


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './models', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000026212FD8630>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


### Training

In [9]:
# Maybe should set tf.estimator.RunConfig to run on GPU?

BATCH_SIZE = 256
NUM_EPOCHS = 50

train_steps = len(train_x) / BATCH_SIZE
train_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=train_x,
    y=train_y,
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS,
    shuffle=True,
    num_threads=1
)

model.train(input_fn=train_input_fn, steps=train_steps)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into ./models\model.ckpt.
INFO:tensorflow:loss = 3.5551321e+16, step = 0
INFO:tensorflow:global_step/sec: 87.4924
INFO:tensorflow:loss = 1876942700000.0, step = 100 (1.144 sec)
INFO:tensorflow:global_step/sec: 121.508
INFO:tensorflow:loss = 67808710.0, step = 200 (0.823 sec)
INFO:tensorflow:Saving checkpoints for 293 into ./models\model.ckpt.
INFO:tensorflow:Loss for final step: 7182.6343.


<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x26212fd8400>

### Testing

In [10]:
test_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=test_x,
    y=test_y,
    num_epochs=1,
    shuffle=False
)

result = model.evaluate(input_fn=test_input_fn, steps=None)

print(result)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-10-22:14:02
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./models\model.ckpt-293
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-12-10-22:14:04
INFO:tensorflow:Saving dict for global step 293: average_loss = 17.11205, global_step = 293, loss = 2182.6594
{'average_loss': 17.11205, 'loss': 2182.6594, 'global_step': 293}


In [13]:
predictions = list(model.predict(input_fn=test_input_fn))
pred_list = [p['predictions'][0] for p in predictions]
test_x['prediction']  = pd.Series(pred_list).values
test_x.head()


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./models\model.ckpt-293
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


Unnamed: 0,UserId,MovieId,Timestamp,prediction
42083,600,651,888451492,10.501305
71825,607,494,883879556,7.001305
99535,875,1103,876465144,3.001305
47879,648,238,882213535,6.251305
36734,113,273,875935609,2.251305


In [15]:
cols = {
    'col_user': "UserId",
    'col_item': "MovieId",
    'col_rating': "Rating",
    'col_prediction': "prediction",
}


eval_rmse = rmse(test, test_x, **cols)
eval_mae = mae(test, test_x, **cols)
eval_rsquared = rsquared(test, test_x, **cols)
eval_exp_var = exp_var(test, test_x, **cols)


print("RMSE:\t\t%f" % eval_rmse,
      "MAE:\t\t%f" % eval_mae,
      "rsquared:\t%f" % eval_rsquared,
      "exp var:\t%f" % eval_exp_var, sep='\n')

RMSE:		4.136671
MAE:		3.301557
rsquared:	-12.295617
exp var:	-9.991789
