# Implementación ALS

## LIBRERIAS

In [1]:
import pandas as pd
import numpy as np
import implicit
import scipy.sparse as sparse
import sklearn
from sklearn.model_selection import train_test_split
import json


  from .autonotebook import tqdm as notebook_tqdm


## Dataframes


In [2]:
df_postulantes = pd.read_csv('postulantes_procesados.csv', index_col=0)
df_establecimientos = pd.read_csv('establecimientos_procesados.csv', index_col=0)
df_postulaciones_training = pd.read_csv('postulaciones_training.csv', index_col=0)
df_postulaciones_testing = pd.read_csv('postulaciones_testing.csv', index_col=0)

In [3]:
user_items_testing = {}

for row in df_postulaciones_testing.itertuples():
  if row[1] not in user_items_testing:
      user_items_testing[row[1]] = []
  user_items_testing[row[1]].append(row[2])

In [4]:
# Definicion de métricas (No editar)
# Obtenido de https://gist.github.com/bwhite/3726239

def precision_at_k_perso(r, k):
    assert k >= 1
    r = np.asarray(r)[:k] != 0
    if r.size != k:
        raise ValueError('Relevance score length < k')
    return np.mean(r)

def average_precision(r):
    r = np.asarray(r) != 0
    out = [precision_at_k_perso(r, k + 1) for k in range(r.size) if r[k]]
    if not out:
        return 0.
    return np.mean(out)

def mean_average_precision(rs):
    return np.mean([average_precision(r) for r in rs])

def dcg_at_k(r, k):
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.


def ndcg_at_k(r, k):
    idcg = dcg_at_k(sorted(r, reverse=True), k)

    if not idcg:
        return 0.
    return dcg_at_k(r, k) / idcg

def recall_at_k(r):
    recall = np.mean(r)
    return recall

In [81]:
user_items_training = {}
itemset_training = set()

counter = {}
item_map = {}
i = 0
for row in df_postulaciones_training.itertuples():
  if row[1] not in user_items_training:
      user_items_training[row[1]] = []
  if row[2] not in itemset_training:
     item_map[row[2]] = i
     counter[item_map[row[2]]] = 0
     i += 1
  user_items_training[row[1]].append(row[2])
  itemset_training.add(row[2])
  counter[item_map[row[2]]] += 1

itemset_training = np.sort(list(itemset_training))
counter = [(counter[key], key) for key in counter.keys()]
counter.sort(reverse=True)

In [88]:
class most_popular():

    def __init__(self, counter):
        self.counter = counter
        print(counter)

    def recommend(self, userid=None, user_items=None, N=None):
        return [np.array([x[1] for x in self.counter[:20]])]
        


In [66]:
import random
class randomModel():

    def __init__(self, itemset):
        self.itemset = range(len(itemset))

    def recommend(self, userid=None, user_items=None, N=20):
        return [np.array(random.sample(self.itemset,N))]

In [15]:
sparse_matrix = np.zeros((len(user_items_training), len(itemset_training)))
userid_to_i = {}

for i, items in enumerate(user_items_training.values()):
    sparse_matrix[i] = np.isin(itemset_training, items, assume_unique=True).astype(int)


matrix = sparse.csr_matrix(sparse_matrix.T)

user_ids = {key: i for i, key in enumerate(user_items_training.keys())}
user_item_matrix = matrix.T.tocsr()

In [73]:
def evaluate_model(model, n):
  mean_map = 0.
  mean_ndcg = 0.
  mean_recall = 0.
  for u, c in zip(user_items_testing.keys(), range(len(user_items_testing.keys()))):
    rec = model.recommend(c, user_item_matrix[c], n)[0]
    rel_vector = [np.isin(itemset_training[rec], user_items_testing[u], assume_unique=True).astype(int)]
    vector_for_recall = [np.isin(user_items_testing[u], itemset_training[rec], assume_unique=True).astype(int)]
    mean_map += mean_average_precision(rel_vector)
    mean_ndcg += ndcg_at_k(rel_vector, n)
    mean_recall += recall_at_k(vector_for_recall)

  mean_map /= len(user_items_testing)
  mean_ndcg /= len(user_items_testing)
  mean_recall /= len(user_items_testing)

  return mean_map, mean_ndcg, mean_recall

In [46]:
def show_recommendations(model, user, n):
  recommendations = model.recommend(userid=user, user_items=user_item_matrix[user], N=n)[0]
  return itemset_training[recommendations]

In [72]:
random_rec = randomModel(itemset=itemset_training)
most_popular_rec = most_popular(counter)

[array([ 354, 1146,   76,  725,  542,  125,  510,  721,  741,   84,  151,
        516,  969, 1105,  264,  922,  944,   39,  518,  870])]


In [86]:
r_map, r_nndcg, r_nrecall = evaluate_model(random_rec, 20)

print('map: {}\nndcg: {}\nrecall: {}'.format(r_map, r_nndcg, r_nrecall))

map: 0.004303163273600063
ndcg: 0.02450097283274483
recall: 0.017213116405665245


In [89]:
r_map, r_nndcg, r_nrecall = evaluate_model(most_popular_rec, 20)

print('map: {}\nndcg: {}\nrecall: {}'.format(r_map, r_nndcg, r_nrecall))

map: 0.008088544649959005
ndcg: 0.053599481155869426
recall: 0.035340872886091536


In [95]:
recomendaciones = {}

for u, c in zip(user_items_testing.keys(), range(len(user_items_testing.keys()))):
    rec = random_rec.recommend(c, user_item_matrix[c], 20)
    recomendaciones[u] = [int(i) for i in itemset_training[rec][0]]

with open('recomendaciones_random.json', 'w') as output:
    json.dump(recomendaciones, output)

In [96]:
recomendaciones = {}

for u, c in zip(user_items_testing.keys(), range(len(user_items_testing.keys()))):
    rec = most_popular_rec.recommend(c, user_item_matrix[c], 20)
    recomendaciones[u] = [int(i) for i in itemset_training[rec][0]]

with open('recomendaciones_mp.json', 'w') as output:
    json.dump(recomendaciones, output)