In [None]:
# Install cornac with %pip so the package lands in the environment of the
# running kernel (a bare `!pip` may target a different Python).
%pip install cornac
# Fetch and build libFM. The clone is skipped when the checkout already exists,
# so re-running this cell does not fail on "destination path already exists".
![ -d libfm ] || git clone https://github.com/srendle/libfm.git
!make all -C libfm

In [None]:
# !mkdir /content/etc && git clone https://github.com/microsoft/recommenders.git /content/etc
# !mkdir -p ./notebooks/etc && cp /content/etc/notebooks/*.ipynb ./notebooks/etc
# !rm -r /content/4CED0278/ml-100k

In [39]:
# !echo 'libfm/*' >> .gitignore

Import libraries

In [31]:
import os
import sys
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import cornac
from cornac.utils import cache

# Random seed shared by the train/test split, the cornac evaluation method,
# the MF model, and the libFM command line.
SEED = 42
# Verbosity flag forwarded to the cornac evaluation method and the MF model.
VERBOSE = False

### Data Layering

In [21]:
# Directory holding the dataset's .dat files. Keep the trailing slash: file
# names are appended to this prefix by plain string concatenation.
base_path = "data/ml-hetrec/"

In [22]:
# Ratings table (tab-separated): user ID, movie ID, rating, and the rating
# date/time broken into separate fields.
ratings_file = base_path + "user_ratedmovies.dat"
user_ratedmovies_df = pd.read_csv(ratings_file, sep="\t")
user_ratedmovies_df.head()

Unnamed: 0,userID,movieID,rating,date_day,date_month,date_year,date_hour,date_minute,date_second
0,75,3,1.0,29,10,2006,23,17,16
1,75,32,4.5,29,10,2006,23,23,44
2,75,110,4.0,29,10,2006,23,30,8
3,75,160,2.0,29,10,2006,23,16,52
4,75,163,4.0,29,10,2006,23,29,30


In [24]:
# Besides rating a movie, a user can also attach one or more tags to it;
# each row of this table is a single (user, movie, tag) assignment.
tag_assignments_file = base_path + "user_taggedmovies.dat"
user_taggedmovies_df = pd.read_csv(tag_assignments_file, sep="\t")
user_taggedmovies_df.head()

Unnamed: 0,userID,movieID,tagID,date_day,date_month,date_year,date_hour,date_minute,date_second
0,75,353,5290,29,10,2006,23,20,15
1,78,4223,5264,16,4,2007,4,43,45
2,127,1343,1544,28,8,2007,3,42,27
3,127,1343,12330,28,8,2007,3,42,27
4,127,2080,1451,28,8,2007,3,42,47


In [25]:
# Tag vocabulary: tag ID -> tag text. The file is decoded as ISO-8859-1.
tags_file = base_path + "tags.dat"
tag_df = pd.read_csv(tags_file, sep="\t", encoding="iso-8859-1")
tag_df.head()

Unnamed: 0,id,value
0,1,earth
1,2,police
2,3,boxing
3,4,painter
4,5,whale


In [26]:
# The original movie information (title and year) available in the
# MovieLens 10M dataset has been extended with public data provided by the
# IMDb and Rotten Tomatoes websites.
movies_file = base_path + "movies.dat"
movie_df = pd.read_csv(movies_file, sep="\t", encoding="iso-8859-1")
movie_df.head()

Unnamed: 0,id,title,imdbID,spanishTitle,imdbPictureURL,year,rtID,rtAllCriticsRating,rtAllCriticsNumReviews,rtAllCriticsNumFresh,rtAllCriticsNumRotten,rtAllCriticsScore,rtTopCriticsRating,rtTopCriticsNumReviews,rtTopCriticsNumFresh,rtTopCriticsNumRotten,rtTopCriticsScore,rtAudienceRating,rtAudienceNumRatings,rtAudienceScore,rtPictureURL
0,1,Toy story,114709,Toy story (juguetes),http://ia.media-imdb.com/images/M/MV5BMTMwNDU0...,1995,toy_story,9.0,73,73,0,100,8.5,17,17,0,100,3.7,102338,81,http://content7.flixster.com/movie/10/93/63/10...
1,2,Jumanji,113497,Jumanji,http://ia.media-imdb.com/images/M/MV5BMzM5NjE1...,1995,1068044-jumanji,5.6,28,13,15,46,5.8,5,2,3,40,3.2,44587,61,http://content8.flixster.com/movie/56/79/73/56...
2,3,Grumpy Old Men,107050,Dos viejos gruñones,http://ia.media-imdb.com/images/M/MV5BMTI5MTgy...,1993,grumpy_old_men,5.9,36,24,12,66,7.0,6,5,1,83,3.2,10489,66,http://content6.flixster.com/movie/25/60/25602...
3,4,Waiting to Exhale,114885,Esperando un respiro,http://ia.media-imdb.com/images/M/MV5BMTczMTMy...,1995,waiting_to_exhale,5.6,25,14,11,56,5.5,11,5,6,45,3.3,5666,79,http://content9.flixster.com/movie/10/94/17/10...
4,5,Father of the Bride Part II,113041,Vuelve el padre de la novia (Ahora también abu...,http://ia.media-imdb.com/images/M/MV5BMTg1NDc2...,1995,father_of_the_bride_part_ii,5.3,19,9,10,47,5.4,5,1,4,20,3.0,13761,64,http://content8.flixster.com/movie/25/54/25542...


### EDA

In [27]:
# Cardinalities of the three ID spaces; n_users, n_movies and n_tags are reused
# later to lay out the libFM feature index ranges.
n_users = user_ratedmovies_df["userID"].nunique()
n_movies = user_ratedmovies_df["movieID"].nunique()
n_tags = tag_df["id"].nunique()

# Sizes of the rating and tagging tables.
n_ratings = len(user_ratedmovies_df)
n_tag_assignments = len(user_taggedmovies_df)
n_tagged_movies = user_taggedmovies_df["movieID"].nunique()

print("Number of users:", n_users)
print("Number of movies:", n_movies)
print("Number of ratings:", n_ratings)
print("-" * 30)
print("Number of tags:", n_tags)
print("Number of tag assignments:", n_tag_assignments)
print("Number of tagged movies:", n_tagged_movies)

Number of users: 2113
Number of movies: 10109
Number of ratings: 855598
------------------------------
Number of tags: 13222
Number of tag assignments: 47957
Number of tagged movies: 5908


Data split

In [29]:
# Randomly hold out 20% of the ratings for testing; passing SEED as the random
# state makes the split reproducible.
train_df, test_df = train_test_split(
  user_ratedmovies_df,
  test_size=0.2,
  random_state=SEED,
)
print("Training size:", train_df.shape[0])
print("Test size:", test_df.shape[0])

Training size: 684478
Test size: 171120


### Traditional Matrix Factorization
Matrix factorization (MF) only makes use of (user, item, rating) information to train a recommendation model. We include MF as a baseline to see if context produces an improvement.

In [32]:
# Wrap the pre-computed split in a cornac evaluation method. Each row is passed
# as a tuple whose first three fields are (user, movie, rating).
# NOTE(review): exclude_unknowns=False presumably keeps test users/items that
# are absent from the training split -- confirm against the cornac docs. The
# global ID maps of this object are reused when building the libFM files.
eval_method = cornac.eval_methods.BaseMethod.from_splits(
  train_data=list(train_df.itertuples(index=False)),
  test_data=list(test_df.itertuples(index=False)),
  exclude_unknowns=False,
  verbose=VERBOSE,
  seed=SEED,
)

# Hyper-parameters of the matrix factorization baseline (10 latent factors,
# with bias terms enabled).
mf_hyperparams = dict(
  k=10,
  max_iter=20,
  learning_rate=0.01,
  lambda_reg=0.02,
  use_bias=True,
)
mf = cornac.models.MF(verbose=VERBOSE, seed=SEED, **mf_hyperparams)

# Train the model and report RMSE on the test split.
# NOTE(review): user_based=False presumably averages over all test ratings
# instead of per user -- confirm against the cornac docs.
rmse_metric = cornac.metrics.RMSE()
test_result, _ = eval_method.evaluate(
  model=mf,
  metrics=[rmse_metric],
  user_based=False,
)
print(test_result)

   |   RMSE | Train (s) | Test (s)
-- + ------ + --------- + --------
MF | 0.7576 |    0.7149 |   2.0963



### Factorization Machines with Contextual Information
The Factorization Machines (FM) model formulates rating prediction as a regression problem in which the user, the item, and additional contextual information are combined into a feature vector $\mathbf{x}_i$. The predictor consists of a global bias, first-order terms, and second-order interactions of the input features.

Prepare data

In [33]:
# First, we identify a set of tags for each pair of (user, movie)
user_movie_tags = defaultdict(set)
for uid, mid, tid, *_ in user_taggedmovies_df.itertuples(index=False):
  user_movie_tags[(uid, mid)].add(tid)

# Second, we maintain mappings from ID to index for users, movies, and tags.
# Users and movies reuse the global maps built by the cornac evaluation method.
user_id2idx = eval_method.global_uid_map
movie_id2idx = eval_method.global_iid_map

# Create the mapping for tags: every distinct tag ID receives the next free
# index, in order of first appearance. A plain dict is used (a defaultdict
# without a factory never supplies defaults anyway), and only the "id" column
# is read so the loop does not depend on how many columns tags.dat has.
tag_id2idx = {}
for tagid in tag_df["id"]:
  tag_id2idx.setdefault(tagid, len(tag_id2idx))
assert len(tag_id2idx) == n_tags

# For each feature vector, most of the values will be zeros. Thus, we will 
# save a lot of memory by storing the data in a sparse format
def to_fm_sparse_fmt(rating, uid, mid, tags):
  """Encode one rating as a line of libFM's sparse text format.

  The feature space is laid out as three consecutive index ranges:
  users first (starting at 0), then movies (starting at ``n_users``), then
  tags (starting at ``n_users + n_movies``). Every active feature has value 1.

  Args:
    rating: target value written at the start of the line.
    uid: user ID, looked up in ``user_id2idx``.
    mid: movie ID, looked up in ``movie_id2idx``.
    tags: iterable of tag IDs (possibly empty), looked up in ``tag_id2idx``.

  Returns:
    A newline-terminated string ``"<rating> <idx>:1 <idx>:1 ..."``. Tag
    indices are emitted in ascending order and the line carries no trailing
    whitespace, so the output is canonical regardless of set iteration order.

  Raises:
    KeyError: if ``uid``, ``mid`` or any tag ID is missing from its mapping.
  """
  # order of features: user, movie, tags
  movie_start_idx = n_users
  tag_start_idx = movie_start_idx + n_movies

  active_indices = [user_id2idx[uid], movie_id2idx[mid] + movie_start_idx]
  active_indices.extend(sorted(tag_id2idx[t] + tag_start_idx for t in tags))

  features = " ".join("{}:1".format(idx) for idx in active_indices)
  return "{} {}\n".format(rating, features)

# Save the training and test splits to libFM files, one line per rating.
# Tags are fetched with .get() rather than by indexing: indexing the
# defaultdict would insert an empty set for every rated (user, movie) pair
# that has no tags, silently growing the mapping while it is only being read.
for libfm_path, split_df in (("train.libfm", train_df), ("test.libfm", test_df)):
  with open(libfm_path, "w") as f:
    for uid, mid, rating, *_ in split_df.itertuples(index=False):
      tags = user_movie_tags.get((uid, mid), ())
      f.write(to_fm_sparse_fmt(rating, uid, mid, tags))

In [34]:
# Peek at the first lines of the generated training file to sanity-check the
# format: "<rating> <feature_index>:1 ..." on each line.
!head train.libfm

4.5 0:1 2113:1 
3.0 1:1 2114:1 
4.5 2:1 2115:1 
2.0 3:1 2116:1 
4.0 4:1 2117:1 
2.5 5:1 2118:1 
2.0 6:1 2119:1 
3.5 7:1 2120:1 
4.0 8:1 2121:1 
3.5 9:1 2122:1 


Training

In [35]:
# Train and evaluate libFM from the command line. IPython substitutes the
# Python variable SEED for $SEED before the shell runs the command.
# Flags, per the libFM manual:
#   -task r          regression (rating prediction)
#   -train / -test   the sparse files written above
#   -dim "1,1,10"    use the global bias, use first-order terms, and 10
#                    factors for the second-order interactions
#   -iter 200        number of training iterations
!./libfm/bin/libFM -task r -train train.libfm -test test.libfm -seed $SEED -dim "1,1,10" -iter 200

----------------------------------------------------------------------------
libFM
  Version: 1.4.4
  Author:  Steffen Rendle, srendle@libfm.org
  WWW:     http://www.libfm.org/
This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt.
This is free software, and you are welcome to redistribute it under certain
conditions; for details see license.txt.
----------------------------------------------------------------------------
Loading train...	
has x = 0
has xt = 1
num_rows=684478	num_values=1398326	num_features=25444	min_target=0.5	max_target=5
Loading test... 	
has x = 0
has xt = 1
num_rows=171120	num_values=349755	num_features=25431	min_target=0.5	max_target=5
#relations: 0
Loading meta data...	
#Iter=  0	Train=0.93938	Test=0.937978
#Iter=  1	Train=0.835144	Test=0.872688
#Iter=  2	Train=0.811651	Test=0.845675
#Iter=  3	Train=0.802268	Test=0.830956
#Iter=  4	Train=0.798476	Test=0.822256
#Iter=  5	Train=0.796305	Test=0.816585
#Iter=  6	Train=0.794784	Test=0.812691
#I

The numbers reported above are RMSE values. As we can observe, the RMSE decreases over the iterations, which indicates a stable training process. The FM model achieves a better result (lower RMSE) on the test set than the matrix factorization model. Training the model even longer could potentially improve the performance further, as the RMSE is still decreasing.