Using LightFM to build a recommendation system

In [27]:
import sys
import os
import itertools

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import lightfm
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm import cross_validation

from lightfm.evaluation import precision_at_k as lightfm_prec_at_k
from lightfm.evaluation import recall_at_k as lightfm_recall_at_k

In [2]:
from recommenders.evaluation.python_evaluation import precision_at_k, recall_at_k

from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.models.lightfm.lightfm_utils import (
    track_model_metrics, prepare_test_df, prepare_all_predictions,
    compare_metric, similar_users, similar_items)


In [3]:
# Single random seed reused by the data samples, the train/test split,
# the model initialisation and the evaluation shuffle in this notebook.
SEED = 42

In [4]:
# Load the MovieLens 100k ratings (plus a genre column) into a DataFrame,
# then preview five randomly chosen rows (seeded so the preview is repeatable).
data = movielens.load_pandas_df(
    header=["userID", "itemID", "rating"],
    genres_col="genre",
    size="100k",
)

data.sample(n=5, random_state=SEED)

100%|██████████| 4.81k/4.81k [00:00<00:00, 9.24kKB/s]


Unnamed: 0,userID,itemID,rating,genre
75721,498,693,3.0,Drama
80184,642,542,5.0,Animation|Children's|Musical|Romance
19864,58,135,4.0,Drama|Mystery|Sci-Fi|Thriller
76699,495,674,3.0,Horror
92991,618,735,3.0,Drama


In [5]:
# Dataset dimensions, one per line: distinct users, distinct items, total rows.
print(
    len(data["userID"].unique()),
    len(data["itemID"].unique()),
    len(data),
    sep="\n",
)

943
1682
100000


In [6]:
# Create a LightFM Dataset and register every user id and item id found in
# the ratings table.
dataset = Dataset()
dataset.fit(users=data["userID"], items=data["itemID"])

In [7]:
# Display the fitted Dataset object (shows only its default repr).
dataset

<lightfm.data.Dataset at 0x7fe599019550>

In [8]:
# Shape of the interaction matrix this dataset describes; the output below
# matches the distinct user and item counts printed earlier.
dataset.interactions_shape()

(943, 1682)

In [9]:
# Build the sparse interaction and weight matrices from (userID, itemID) pairs.
# zip() over the two columns yields the same pairs as DataFrame.iterrows(),
# without constructing a pandas Series for each of the 100k rows.
# Only (user, item) is passed, so `weights` does not carry the MovieLens ratings.
(interactions, weights) = dataset.build_interactions(
    zip(data["userID"], data["itemID"])
)

In [10]:
# Inspect the interactions matrix returned by build_interactions.
interactions

<943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 100000 stored elements in COOrdinate format>

In [11]:
# Inspect the weights matrix returned alongside the interactions.
weights

<943x1682 sparse matrix of type '<class 'numpy.float32'>'
	with 100000 stored elements in COOrdinate format>

In [12]:
# Randomly hold out 25% of the interactions for testing; the seeded
# RandomState makes the split repeatable.
train_interactions, test_interactions = cross_validation.random_train_test_split(
    interactions,
    test_percentage=0.25,
    random_state=np.random.RandomState(SEED),
)

In [13]:
# Inspect the training split.
train_interactions

<943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 75000 stored elements in COOrdinate format>

In [14]:
# Inspect the held-out test split.
test_interactions

<943x1682 sparse matrix of type '<class 'numpy.int32'>'
	with 25000 stored elements in COOrdinate format>

The matrices above are stored in COO (coordinate) format, a sparse representation that keeps only the non-zero entries as (row, column, value) triplets.

In [15]:
# LightFM model with 20 latent components, trained with the
# Weighted Approximate-Rank Pairwise (WARP) loss.
model1 = LightFM(
    no_components=20,
    learning_rate=0.2,
    loss="warp",
    random_state=np.random.RandomState(SEED),
)

BPR: Bayesian Personalised Ranking pairwise loss. Maximises the prediction difference between a positive example and a randomly chosen negative example. Useful when only positive interactions are present and optimising ROC AUC is desired.

WARP: Weighted Approximate-Rank Pairwise loss. Maximises the rank of positive examples by repeatedly sampling negative examples until a rank-violating one is found. Useful when only positive interactions are present and optimising the top of the recommendation list (precision@k) is desired.

In [16]:
# Train on the training split only: 10 epochs, 2 threads.
model1.fit(
    interactions=train_interactions,
    epochs=10,
    num_threads=2,
)

<lightfm.lightfm.LightFM at 0x7fe5ab65aaf0>

In [17]:

# Prepare for evaluation: shuffle the COO row/col/data arrays of the full
# interaction matrix using the same seed that was given to
# random_train_test_split earlier in the notebook.
# NOTE(review): this presumably reproduces the ordering the split used
# internally, so that a positional slice can recover the test rows — confirm
# against the LightFM source.
# `_shuffle` is underscore-prefixed (private), so it may change between
# LightFM releases.
uids, iids, interaction_data = cross_validation._shuffle(
    interactions.row, interactions.col, interactions.data, 
    random_state=np.random.RandomState(SEED))

In [18]:
# Check the shapes of the three shuffled arrays.
uids.shape, iids.shape, interaction_data.shape

((100000,), (100000,), (100000,))

In [19]:
# Positions from the 75% mark to the end of the shuffled arrays; 0.75 is the
# complement of the test_percentage=0.25 used for the train/test split.
test_idx = slice(int(len(uids) * 0.75), None)

In [20]:
# Show the slice that selects the test positions.
test_idx

slice(75000, None, None)

In [21]:
# Unpack the four mappings held by the dataset, in the order returned by
# Dataset.mapping(): user ids, user features, item ids, item features.
uid_map, ufeature_map, iid_map, ifeature_map = dataset.mapping()

In [22]:
# Sizes of the user-id, item-id and user-feature mappings.
len(uid_map), len(iid_map), len(ufeature_map)

(943, 1682, 943)

In [23]:
# Rebuild the held-out interactions as a DataFrame (userID, itemID, rating)
# for the evaluation metrics.
test_df = prepare_test_df(
    test_idx,
    uids,
    iids,
    uid_map,
    iid_map,
    weights,
)

In [24]:
# Preview the first rows of the test DataFrame.
test_df.head()

Unnamed: 0,userID,itemID,rating
0,160,952,1.0
1,83,88,1.0
2,617,1019,1.0
3,345,246,1.0
4,387,95,1.0


In [25]:
# Build the prediction table (userID, itemID, prediction) that the ranking
# metrics consume.
# NOTE(review): passing the training interactions presumably lets the helper
# leave out pairs already seen in training — confirm against the helper's docs.
all_predictions = prepare_all_predictions(
    data,
    uid_map,
    iid_map,
    interactions=train_interactions,
    model=model1,
    num_threads=2,
)

In [26]:
# Preview five random prediction rows (seeded so the sample is repeatable).
all_predictions.sample(5, random_state=SEED)

Unnamed: 0,userID,itemID,prediction
471550,626,1058,-14.199018
373280,474,1564,-22.24724
1271323,75,1452,-12.22709
310738,352,1381,-17.027611
1323612,68,1584,-10.442496


In [28]:
# Precision@10 of the predictions against the held-out interactions.
eval_precision = precision_at_k(
    rating_true=test_df,
    rating_pred=all_predictions,
    k=10,
)

In [29]:
# Recall@10 of the predictions against the held-out interactions.
eval_recall = recall_at_k(
    rating_true=test_df,
    rating_pred=all_predictions,
    k=10,
)

In [30]:
# Show precision@10 and recall@10 side by side.
eval_precision, eval_recall

(0.27221633085896074, 0.11847814225378762)

This notebook is largely based on the LightFM deep-dive notebook from the Microsoft Recommenders repository:
https://github.com/microsoft/recommenders/blob/main/examples/02_model_hybrid/lightfm_deep_dive.ipynb