# Wide and Deep Model for Movie Recommendation

### Prerequisite
* `tensorflow`

In this example, we utilize TensorFlow's higher level Estimator API to build wide-and-deep model for movie recommendation.

In [38]:
import sys
sys.path.append("../../")

import itertools

import tensorflow as tf
import pandas as pd

from reco_utils.dataset import movielens
from reco_utils.dataset.python_splitters import python_random_split
from reco_utils.evaluation.python_evaluation import (
    rmse, mae, rsquared, exp_var,
    map_at_k, ndcg_at_k, precision_at_k, recall_at_k
)

In [39]:
from tensorflow.python.client import device_lib

devices = device_lib.list_local_devices()
[x.name for x in devices]

['/device:CPU:0', '/device:XLA_CPU:0', '/device:XLA_GPU:0', '/device:GPU:0']

### Data loading

In [3]:
MOVIELENS_DATA_SIZE = '100k'

In [4]:
data = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE,
    header=['UserId','MovieId','Rating','Timestamp'],
    # TODO For now, not using genres YET
    load_genres=False
)
data.head()


Unnamed: 0,UserId,MovieId,Rating,Timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


### Feature embedding

In [5]:
# Distinct users and items
user_list = data['UserId'].unique()
item_list = data['MovieId'].unique()

# Rule of thumb for embedding_dimensions =  number_of_categories ** 0.25
USER_EMBEDDING_DIM = int(len(user_list) ** 0.25) # or 16
ITEM_EMBEDDING_DIM = int(len(item_list) ** 0.25) # or 64
print("Embedding {} users to {}-dim vector".format(len(user_list), USER_EMBEDDING_DIM))
print("Embedding {} items to {}-dim vector".format(len(item_list), ITEM_EMBEDDING_DIM))

# Convert a categorical feature, e.g. UserId or MovieId, into a lower-dimensional vector (embedding)
user_id = tf.feature_column.categorical_column_with_vocabulary_list(
    'UserId', user_list)
user_embedding = tf.feature_column.embedding_column(
    categorical_column=user_id,
    dimension=USER_EMBEDDING_DIM,
    max_norm=USER_EMBEDDING_DIM**.5)

item_id = tf.feature_column.categorical_column_with_vocabulary_list(
    'MovieId', item_list)
item_embedding = tf.feature_column.embedding_column(
    categorical_column=item_id,
    dimension=ITEM_EMBEDDING_DIM,
    max_norm=ITEM_EMBEDDING_DIM**.5)

timestamp = tf.feature_column.numeric_column('Timestamp')

# TODO numeric_column (w/ shape)
# genres = tf.feature_column.numeric_column(
#     'Genre', shape=(NUM_GENRES,), dtype=tf.uint8)

deep_columns = [user_embedding, item_embedding, timestamp]  # TODO , genres]
wide_columns = []  # TODO cross product transformation of user and item

Embedding 943 users to 5-dim vector
Embedding 1682 items to 6-dim vector


Tran and test data split

In [6]:
train, test = python_random_split(data, ratio=0.75, seed=123)

train_x = train.copy()
train_y = train_x.pop('Rating')
test_x = test.copy()
test_y = test_x.pop('Rating')

print(train_x.head())
print("\nLabels:")
print(train_y.head())

       UserId  MovieId  Timestamp
31450     496      136  876066424
42809      64      101  889740225
52419     158      471  880132513
45663     198      652  884209569
50696     749      121  878847645

Labels:
31450    1.0
42809    2.0
52419    4.0
45663    3.0
50696    3.0
Name: Rating, dtype: float64


### Model preparation

Model selection
* `wide` - Linear model
* `deep` - DNN model
* `wide_deep` - Linear combination of the linear and DNN models

(TODO)Model type: `regressor` or `classifier`

In [7]:
# 'wide', 'deep', or 'wide_deep' 
MODEL_TYPE = 'deep'
HIDDEN_UNITS = [256, 256, 256, 128]
# Model checkpoints folder
MODEL_DIR = './models'

In [8]:
# TODO set run config if needed
if MODEL_TYPE == 'wide':
    if len(wide_columns) == 0:
        raise ValueError("No features have defined for the 'wide' model")
    model = tf.estimator.LinearRegressor(  # LinearClassifier(
        model_dir=MODEL_DIR,
        feature_columns=wide_columns,
    )
elif MODEL_TYPE == 'deep':
    if len(deep_columns) == 0:
        raise ValueError("No features have defined for the 'deep' model")
    model = tf.estimator.DNNRegressor(  # DNNClassifier(
        model_dir=MODEL_DIR,
        feature_columns=deep_columns,
        hidden_units=HIDDEN_UNITS,
        optimizer=tf.train.AdamOptimizer(),
#         activation_fn=tf.nn.sigmoid,
#         dropout=0.3,
#         loss_reduction=tf.losses.Reduction.MEAN,
#         batch_norm=False
    )
elif MODEL_TYPE == 'wide_deep':
    if len(wide_columns) == 0 and len(deep_columns) == 0:
        raise ValueError("No features have defined for the 'wide_deep' model")
    model = tf.estimator.DNNLinearCombinedRegressor(  # DNNLinearCombinedClassifier(
        model_dir=MODEL_DIR,
        # wide settings
        linear_feature_columns=wide_columns,
        # deep settings
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=HIDDEN_UNITS,
    )
else:
    raise ValueError("Model type should be either 'wide', 'deep', or 'wide_deep'")


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './models', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fae1c17f940>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


### Training

In [9]:
# Maybe should set tf.estimator.RunConfig to run on GPU?

BATCH_SIZE = 256
NUM_EPOCHS = 50

train_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=train_x,
    y=train_y,
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS,
    shuffle=True,
    num_threads=1
)

model.train(input_fn=train_input_fn)

Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
INFO:tensorflow:Saving checkpoints for 0 into ./models/model.ckpt.
INFO:tensorflow:loss = 1.975615e+17, step = 0
INFO:tensorflow:global_step/sec: 115.375
INFO:tensorflow:loss = 1075005600000.0, step = 100 (0.868 sec)
INFO:tensorflow:global_step/sec: 150.477
INFO:tensorflow:loss = 16948674.0, step = 200 (0.665 sec)
INFO:tensorflow:global_step/sec: 153.13
INFO:tensorflow:loss = 8898.426, step = 300 (0.653 sec)
INFO:tensorflow:global_step/sec: 136.722
INFO:tensorflow:loss = 8810.109, step = 400 (0.732 sec)
INFO:

INFO:tensorflow:global_step/sec: 134.764
INFO:tensorflow:loss = 420.58386, step = 7000 (0.741 sec)
INFO:tensorflow:global_step/sec: 154.285
INFO:tensorflow:loss = 381.7384, step = 7100 (0.648 sec)
INFO:tensorflow:global_step/sec: 152.577
INFO:tensorflow:loss = 508.60516, step = 7200 (0.655 sec)
INFO:tensorflow:global_step/sec: 153.977
INFO:tensorflow:loss = 440.88892, step = 7300 (0.649 sec)
INFO:tensorflow:global_step/sec: 129.189
INFO:tensorflow:loss = 458.9027, step = 7400 (0.774 sec)
INFO:tensorflow:global_step/sec: 152.093
INFO:tensorflow:loss = 433.6165, step = 7500 (0.658 sec)
INFO:tensorflow:global_step/sec: 150.411
INFO:tensorflow:loss = 415.26874, step = 7600 (0.665 sec)
INFO:tensorflow:global_step/sec: 136.508
INFO:tensorflow:loss = 432.86987, step = 7700 (0.733 sec)
INFO:tensorflow:global_step/sec: 104.175
INFO:tensorflow:loss = 472.8913, step = 7800 (0.960 sec)
INFO:tensorflow:global_step/sec: 136.073
INFO:tensorflow:loss = 455.38312, step = 7900 (0.736 sec)
INFO:tensorflo

<tensorflow.python.estimator.canned.dnn.DNNRegressor at 0x7fae1c17f6a0>

### Testing

In [10]:
test_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=test_x,
    y=test_y,
    num_epochs=1,
    shuffle=False
)

result = model.evaluate(input_fn=test_input_fn, steps=None)

print(result)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-12-13-23:23:45
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./models/model.ckpt-14649
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-12-13-23:23:46
INFO:tensorflow:Saving dict for global step 14649: average_loss = 1.5211167, global_step = 14649, label/mean = 3.5334, loss = 194.01999, prediction/mean = 3.9837782
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 14649: ./models/model.ckpt-14649
{'average_loss': 1.5211167, 'label/mean': 3.5334, 'loss': 194.01999, 'prediction/mean': 3.9837782, 'global_step': 14649}


1. Item rating prediction

In [11]:
predictions = list(model.predict(input_fn=test_input_fn))
pred_list = [p['predictions'][0] for p in predictions]
test_x['prediction']  = pd.Series(pred_list).values
test_x.head()


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./models/model.ckpt-14649
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


Unnamed: 0,UserId,MovieId,Timestamp,prediction
42083,600,651,888451492,4.022713
71825,607,494,883879556,4.007088
99535,875,1103,876465144,3.975838
47879,648,238,882213535,3.960213
36734,113,273,875935609,3.866463


In [12]:
cols = {
    'col_user': 'UserId',
    'col_item': 'MovieId',
    'col_rating': 'Rating',
    'col_prediction': 'prediction',
}

eval_rmse = rmse(test, test_x, **cols)
eval_mae = mae(test, test_x, **cols)
eval_rsquared = rsquared(test, test_x, **cols)
eval_exp_var = exp_var(test, test_x, **cols)

print("RMSE:\t\t%f" % eval_rmse,
      "MAE:\t\t%f" % eval_mae,
      "rsquared:\t%f" % eval_rsquared,
      "exp var:\t%f" % eval_exp_var, sep='\n')

RMSE:		1.233336
MAE:		0.943717
rsquared:	-0.181869
exp var:	-0.024264


2. Recommend k items

1) Remove seen items and 2) add timestamp info

In [46]:
# Get the cross join of all user-item pairs and score them.
user_item_col = ['UserId', 'MovieId']
user_item_list = list(itertools.product(user_list, item_list))
users_items = pd.DataFrame(user_item_list, columns=user_item_col)
print("Before excude seen items:", len(users_items))

# Remove seen items (items in the train set)
users_items_exclude_train = users_items.loc[
    ~users_items.set_index(user_item_col).index.isin(train.set_index(user_item_col).index)
]
print("After excude seen items:", len(users_items_exclude_train))

# Add timestamp info
users_items_exclude_train = pd.merge(test, users_items_exclude_train,
                                     on=user_item_col, how='outer')
users_items_exclude_train.drop('Rating', axis=1, inplace=True)
users_items_exclude_train.fillna(test['Timestamp'].max(), inplace=True) 
print(users_items_exclude_train.head())

Before excude seen items: 1586126
After excude seen items: 1511126
   UserId  MovieId    Timestamp
0     600      651  888451492.0
1     607      494  883879556.0
2     875     1103  876465144.0
3     648      238  882213535.0
4     113      273  875935609.0


In [47]:
reco_input_fn = tf.estimator.inputs.pandas_input_fn(
    x=users_items_exclude_train,
    num_epochs=1,
    shuffle=False
)

reco = list(model.predict(input_fn=reco_input_fn))
reco_list = [p['predictions'][0] for p in reco]
users_items_exclude_train['prediction']  = pd.Series(reco_list).values
users_items_exclude_train.head()


INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./models/model.ckpt-14649
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


Unnamed: 0,UserId,MovieId,Timestamp,prediction
0,600,651,888451492.0,4.022713
1,607,494,883879556.0,4.007088
2,875,1103,876465144.0,3.975838
3,648,238,882213535.0,3.960213
4,113,273,875935609.0,3.866463


In [48]:
k = 10
eval_map = map_at_k(test, users_items_exclude_train, k=k, **cols)
eval_ndcg = ndcg_at_k(test, users_items_exclude_train, k=k, **cols)
eval_precision = precision_at_k(test, users_items_exclude_train, k=k, **cols)
eval_recall = recall_at_k(test, users_items_exclude_train, k=k, **cols)

print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')

MAP:	0.020709
NDCG:	0.089744
Precision@K:	0.051538
Recall@K:	0.020709
