# Implementación LightFM

## Librerias

In [1]:
from lightfm import LightFM
from lightfm.data import Dataset
import pandas as pd
from lightfm.evaluation import precision_at_k, recall_at_k
from lightfm.evaluation import auc_score
import json

## Datasets

In [2]:
df_postulantes = pd.read_csv('postulantes_procesados.csv', index_col=0)
df_establecimientos = pd.read_csv('establecimientos_procesados.csv', index_col=0)
df_postulaciones_training = pd.read_csv('postulaciones_training.csv', index_col=0)
df_postulaciones_testing = pd.read_csv('postulaciones_testing.csv', index_col=0)

In [3]:
for col in df_establecimientos.columns:
  if ((col != 'LATITUD') and (col != 'LONGITUD')):
    df_establecimientos[col] = df_establecimientos[col].astype(int)

In [4]:
df_establecimientos.drop(columns=['COD_COM_RBD', 'LATITUD', 'LONGITUD'], inplace=True)

In [5]:
dataset = Dataset()
dataset.fit(
    users=df_postulaciones_training['mrun'].unique(),
    items=df_postulaciones_training['rbd'].unique(),
    item_features=[col for col in df_establecimientos.columns if col != 'RBD']
)

In [6]:
# Construir las interacciones de entrenamiento
(interactions, weights) = dataset.build_interactions(
    (row['mrun'], row['rbd']) for _, row in df_postulaciones_training.iterrows()
)

In [7]:
# Construir las características de los ítems
item_features = dataset.build_item_features(
    [
        (row['RBD'], {col: int(row[col]) for col in df_establecimientos.columns if col != 'RBD'})
        for _, row in df_establecimientos.iterrows()
    ]
)

In [8]:
model = LightFM(loss='bpr')
model.fit_partial(interactions, item_features=item_features, epochs=100, verbose=True)

Epoch: 100%|██████████| 100/100 [08:35<00:00,  5.16s/it]


<lightfm.lightfm.LightFM at 0x7f8524982f20>

In [9]:
# Construir las interacciones de prueba
(test_interactions, test_weights) = dataset.build_interactions(
    (row['mrun'], row['rbd']) for _, row in df_postulaciones_testing.iterrows()
)


In [10]:
import numpy as np
from scipy.sparse import coo_matrix

def ndcg_at_k(
    model,
    test_interactions,
    train_interactions=None,
    k=10,
    user_features=None,
    item_features=None,
    preserve_rows=False,
    num_threads=1,
    check_intersections=True,
):
    """
    Measure the normalized discounted cumulative gain (NDCG) at k metric for a model:
    the DCG at k divided by the ideal DCG at k. A perfect score is 1.0.

    Parameters
    ----------
    model: LightFM instance
        the fitted model to be evaluated
    test_interactions: np.float32 csr_matrix of shape [n_users, n_items]
        Non-zero entries representing known positives in the evaluation set.
    train_interactions: np.float32 csr_matrix of shape [n_users, n_items], optional
        Non-zero entries representing known positives in the train set. These
        will be omitted from the score calculations to avoid re-recommending
        known positives.
    k: integer, optional
        The k parameter.
    user_features: np.float32 csr_matrix of shape [n_users, n_user_features], optional
        Each row contains that user's weights over features.
    item_features: np.float32 csr_matrix of shape [n_items, n_item_features], optional
        Each row contains that item's weights over features.
    preserve_rows: boolean, optional
        When False (default), the number of rows in the output will be equal
        to the number of users with interactions in the evaluation set.
        When True, the number of rows in the output will be equal to the
        number of users.
    num_threads: int, optional
        Number of parallel computation threads to use. Should
        not be higher than the number of physical cores.
    check_intersections: bool, optional, True by default,
        Only relevant when train_interactions are supplied.
        A flag that signals whether the test and train matrices should be checked
        for intersections to prevent optimistic ranks / wrong evaluation / bad data split.

    Returns
    -------
    np.array of shape [n_users with interactions or n_users,]
        Numpy array containing NDCG@k scores for each user.
        If there are no interactions for a given user the returned value will be 0.0.
    """

    if num_threads < 1:
        raise ValueError("Number of threads must be 1 or larger.")

    ranks = model.predict_rank(
        test_interactions,
        train_interactions=train_interactions,
        user_features=user_features,
        item_features=item_features,
        num_threads=num_threads,
        check_intersections=check_intersections,
    )

    # Only consider the top k predictions
    ranks.data = np.less(ranks.data, k, ranks.data)

    ndcg_scores = []

    test_interactions = test_interactions.tocsr()
    ranks = ranks.tocsr()

    for user_id in range(test_interactions.shape[0]):
        user_test_interactions = test_interactions[user_id].toarray().flatten()
        if user_test_interactions.sum() == 0:
            ndcg_scores.append(0.0)
            continue

        user_ranks = ranks[user_id].data

        # Consider only the top k ranks
        top_k_indices = np.argsort(user_ranks)[:k]
        user_ranks_top_k = user_ranks[top_k_indices]
        user_test_interactions_top_k = user_test_interactions[top_k_indices]

        # Discounted cumulative gain at k
        dcg = np.sum((1 / np.log2(np.arange(1, len(user_ranks_top_k) + 1) + 1)) * user_test_interactions_top_k)

        # Ideal discounted cumulative gain at k
        ideal_ranks = np.sort(user_test_interactions)[::-1][:k]
        idcg = np.sum((1 / np.log2(np.arange(1, len(ideal_ranks) + 1) + 1)) * ideal_ranks)

        ndcg = dcg / idcg if idcg > 0 else 0.0
        ndcg_scores.append(ndcg)

    return np.mean(ndcg_scores)


In [11]:
# Evaluar el modelo
test_precision = precision_at_k(model=model, test_interactions=test_interactions, item_features=item_features, k=20, train_interactions=interactions).mean()

test_auc = auc_score(model=model, test_interactions=test_interactions, item_features=item_features, train_interactions=interactions).mean()

test_recall = recall_at_k(model=model, test_interactions=test_interactions, item_features=item_features, k=20, train_interactions=interactions).mean()

test_ndcg = ndcg_at_k(model=model, test_interactions=test_interactions, item_features=item_features, k=20, train_interactions=interactions)

print('Precision: %.10f.' % (test_precision))
print('AUC: %.10f.' % (test_auc))
print('Recall: %.10f.' % (test_recall))
print('NDCG: %.10f.' % (test_ndcg))

Precision: 0.0163789019.
AUC: 0.8010588288.
Recall: 0.2137884586.
NDCG: 0.0020882798.


In [12]:
predicciones_lightfm = {}
lista_id_usuarios = list(dataset.mapping()[0].keys())
lista_id_establecimientos = list(dataset.mapping()[2].keys())

establecimientos_random = df_postulaciones_testing.sample(n=10)

for _,usuario in establecimientos_random.iterrows():
    (test_interactions, test_weights) = dataset.build_interactions(
        (usuario['mrun'], establecimiento) for establecimiento in df_establecimientos['RBD']
    )
    ranks = model.predict_rank(test_interactions, item_features=item_features)
    non_zero_values = ranks.data
    row_indices, col_indices = ranks.nonzero()

    non_zero_elements = list(zip(row_indices, col_indices, non_zero_values))
    non_zero_elements = [(int(row[2]), int(row[0]), int(row[1])) for row in non_zero_elements]
    non_zero_elements.sort()
    predicciones_lightfm[int(lista_id_usuarios[non_zero_elements[0][1]])] = [int(lista_id_establecimientos[non_zero_elements[i][2]]) for i in range(20)]

with open('../Visualizacion/data/recomendaciones_lightfm.json', 'w') as output:
    json.dump(predicciones_lightfm, output)