In [132]:
import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm import cross_validation
import lightfm.evaluation as lfm_eval
import warnings
warnings.filterwarnings('ignore') 

In [133]:
# percentage of data used for testing
TEST_PERCENTAGE = 0.2
# model learning rate
LEARNING_RATE = 0.25
# no of latent factors
NO_COMPONENTS = 30
# no of epochs to fit model
NO_EPOCHS = 30
# no of threads to fit model
NO_THREADS = 32
# regularisation for both user and item features
ITEM_ALPHA = 1e-6
USER_ALPHA = 1e-6

# seed for pseudonumber generations
SEED = 42

## Data preparation

In [134]:
df = pd.read_csv('../data/interim/merged.csv',index_col=0)
data = pd.read_csv('../data/interim/data.csv',index_col=0)
items = pd.read_csv('../data/interim/items.csv',index_col=0)
users = pd.read_csv('../data/interim/users.csv',index_col=0)

In [135]:
genres = list(df.columns[8:-4])
occupations = df['occupation'].unique().tolist()
user_features_names = ['age','gender', *occupations]

In [136]:
dataset = Dataset()
dataset.fit(df['user_id'], df['item_id'],user_features=user_features_names, item_features=genres)

In [137]:
item_features = dataset.build_item_features((x,y[1].to_dict()) for x,y in zip(items['item_id'], items[items.columns[5:]].iterrows()))

In [138]:
def retrieve_user_features():
    for i, row in users.iterrows():
        features_map = {x:0 for x in occupations}
        user_id = row['user_id']
        features_map[row['occupation']] = 1
        features_map['age'] = row['age']
        features_map['gender'] = 1 if row['gender'] == 'M' else 0
        yield user_id, features_map

user_features = dataset.build_user_features(retrieve_user_features())

In [139]:
interactions, weights = dataset.build_interactions(data.iloc[:, :3].values)

### Train / test splitting

In [140]:
train_interactions, test_interactions = cross_validation.random_train_test_split(interactions,TEST_PERCENTAGE,random_state=np.random.RandomState(SEED))
train_weights, test_weights = cross_validation.random_train_test_split(weights,TEST_PERCENTAGE,random_state=np.random.RandomState(SEED))

## Model fitting

In [141]:
model = LightFM(loss='warp', no_components=NO_COMPONENTS, 
                 learning_rate=LEARNING_RATE, 
                 item_alpha=ITEM_ALPHA,
                 user_alpha=USER_ALPHA,
                 random_state=np.random.RandomState(SEED)
                )

In [142]:
fitted_model = model.fit(interactions=train_interactions,
           user_features=user_features,
           item_features=item_features,
           sample_weight=train_weights,
           epochs=NO_EPOCHS,
           verbose=True,
           num_threads=NO_THREADS
           )

Epoch: 100%|██████████| 30/30 [00:51<00:00,  1.72s/it]


## Caching

In [151]:
import pickle

with open('../models/lfm_1.pkl', 'wb') as outp: 
    pickle.dump(fitted_model, outp)

In [159]:
with open('../benchmark/data/benchmark_data.pkl', 'wb') as outp: 
    bench_data = {
        'test_interactions': test_interactions,
        'train_interactions': train_interactions,
        'user_features': user_features,
        'item_features': item_features
    }
    pickle.dump(bench_data, outp)

with open('../src/data/dataset_internal.pkl', 'wb') as outp: 
    pickle.dump(dataset,outp)

## Evaluation

In [153]:
print("Train auc: %.2f" % lfm_eval.auc_score(fitted_model, train_interactions,item_features=item_features,user_features=user_features).mean())
print("Test auc: %.2f" % lfm_eval.auc_score(fitted_model, test_interactions, train_interactions=train_interactions,item_features=item_features,user_features=user_features).mean())

Train auc: 0.94
Test auc: 0.93


In [144]:
k = 10

print(f"Train precision@{k}: %.2f" % lfm_eval.precision_at_k(fitted_model, train_interactions, k=k,item_features=item_features,user_features=user_features).mean())
print(f"Test precision@{k}: %.2f" % lfm_eval.precision_at_k(fitted_model, test_interactions, train_interactions=train_interactions, k=k,item_features=item_features,user_features=user_features).mean())

Train precision@10: 0.61
Test precision@10: 0.33


In [145]:
print(f"Train recall@{k}: %.2f" % lfm_eval.recall_at_k(fitted_model, train_interactions,k=k,item_features=item_features,user_features=user_features).mean())
print(f"Test recall@{k}: %.2f" % lfm_eval.recall_at_k(fitted_model, test_interactions, k=k,train_interactions=train_interactions,item_features=item_features,user_features=user_features).mean())

Train recall@10: 0.12
Test recall@10: 0.20


In [146]:
print("Train reciporial rank: %.2f" % lfm_eval.reciprocal_rank(fitted_model, train_interactions,item_features=item_features,user_features=user_features).mean())
print("Test reciporial rank: %.2f" % lfm_eval.reciprocal_rank(fitted_model, test_interactions, train_interactions=train_interactions,item_features=item_features,user_features=user_features).mean())

Train reciporial rank: 0.83
Test reciporial rank: 0.64


## Inference

In [147]:
uid_map, ufeature_map, iid_map, ifeature_map = dataset.mapping()

iid_map_reversed = {v:k for k,v in iid_map.items()}

In [148]:
def sample_recommendation(model, user_ids, k=3):

    n_users, n_items = dataset.interactions_shape()

    user_ids_internal = [uid_map[user_id] for user_id in user_ids]

    for user_id, user_id_internal in zip(user_ids, user_ids_internal):

        scores = model.predict(user_id_internal, np.arange(n_items))
        top_items_idxs_internal = np.argsort(-scores)
        top_items_idxs = [iid_map_reversed[item_id] for item_id in top_items_idxs_internal]
        top_items = [items[items['item_id'] == item_id]['title'].values[0] for item_id in top_items_idxs]

        print("User %s" % user_id)
        print("     Recommended:")

        for x in top_items[:k]:
            print("        %s" % x)


In [149]:
sample_recommendation(fitted_model, [3, 25, 450])

User 3
     Recommended:
        Starship Troopers (1997)
        Everyone Says I Love You (1996)
        Devil's Own, The (1997)
User 25
     Recommended:
        African Queen, The (1951)
        2001: A Space Odyssey (1968)
        Arsenic and Old Lace (1944)
User 450
     Recommended:
        Batman (1989)
        Ghost (1990)
        Grease (1978)
