# Librerías

In [None]:
from collections import defaultdict
import pandas as pd
import ipywidgets as widgets
from sklearn.metrics import pairwise_distances
import heapq
import numpy as np
import random as rd

# Importar datos preprocesados

In [None]:
track_features = pd.read_csv('./data/processed_track_features.csv')
sessions = pd.read_csv('./data/processed_sessions.csv')
display(track_features.head(3))
display(sessions.head(3))

In [None]:
track_features_embedding = track_features.drop(labels="track_id", axis=1)

In [None]:
idx_to_track_id = {i: id_ for i, id_ in enumerate(track_features.track_id)}
track_id_to_idx = {id_:i for i, id_ in enumerate(track_features.track_id)}

In [None]:
idx_to_sessions_id = {i: id_ for i, id_ in enumerate(sessions.session_id)}
sessions_id_to_idx = {id_:i for i, id_ in enumerate(sessions.session_id)}

# Nuestro modelo

# Crear Top N verdadero por usuario

In [None]:
# top_n_verdadero_por_usuario es un diccionario de la forma
# {usuario1: [canción1, canción2, ...], usuario2: [canción3, canción4,...], ...}
# que contiene las canciones con las que ha interactuado un usuario. Consideraremos que si el
# usuario ha interactuado con la canción, entonces la canción es relevante para el usuario.
# top_n_verdadero_por_usuario = defaultdict(list)
# for i in range(len(sessions)):
#     fila = sessions.iloc[i, :]
#     top_n_verdadero_por_usuario[fila['session_id']].append(fila['track_id'])



Basado en práctico Content Based (Texto) (https://github.com/PUC-RecSys-Class/RecSysPUC-2022/blob/master/practicos/Content_Based_texto.ipynb)

In [None]:
def find_similar_songs(sessions, track_features, alpha, beta, session_id=None, metric='euclidean', topk=5):
    
    session_tracks = sessions[sessions.session_id == session_id]
    tracks_played = session_tracks.merge(track_features, left_on="track_id", right_on="track_id")
    tracks_not_played = track_features[~track_features.track_id.isin(tracks_played)]
    initial_track = np.array(tracks_played.iloc[0].drop(labels=["session_id", "track_id", "not_skipped"]))
    
    embedding = tracks_not_played.drop(labels="track_id", axis=1).values
    not_played_distances = pairwise_distances(initial_track.reshape(1,-1), embedding, metric=metric)
    tracks_not_played["distance"] = not_played_distances[0]
    tracks_not_played.sort_values(by=["distance"])
    
    embedding= tracks_played.drop(labels=["session_id", "track_id", "not_skipped"], axis=1).values
    played_distances = pairwise_distances(initial_track.reshape(1,-1), embedding, metric=metric)
    tracks_played["distance"] = played_distances[0]
    tracks_played.sort_values(by=["distance"])
    
    rec = []
    
    for i in range(topk):
        # No escuchados
        if rd.random() < alpha:
            loc = int(beta*len(tracks_not_played)//1)
            track = tracks_not_played.iloc[loc]
            while track.track_id in rec:
                loc += 1
                track = tracks_not_played.iloc[loc]
            rec.append(track.track_id)
        # Ya escuchados
        else:
            loc = int(beta*len(tracks_played)//1)
            if loc == 0:
                loc += 1
            track = tracks_played.iloc[loc]
            while track.track_id in rec:
                loc += 1
                track = tracks_played.iloc[loc]
            rec.append(track.track_id)
    
    return rec

In [None]:
random_session = rd.choice(sessions.session_id.unique())
random_session

In [None]:
alpha = widgets.FloatSlider(min=0, max=1, step=0.01, description="alpha")
beta = widgets.FloatSlider(min=0, max=1, step=0.01, description="beta")
display(alpha)
display(beta)

In [88]:
rec = find_similar_songs(sessions, track_features, alpha.value, beta.value, session_id=random_session, \
                         metric='euclidean', topk=5)
rec

['t_90ddd0d7-1784-4c49-9907-8d7f39be5266',
 't_bc88c565-06bf-4df1-83b6-fa07a18ac4ab',
 't_77d294e7-5f3a-45a4-8c38-a8aacad2e4c1',
 't_0340fe26-f946-40b5-90f0-928349402ce3',
 't_305127ff-ba67-456b-8e0d-ef01b0cc8db0']

In [91]:
rec = find_similar_songs(sessions, track_features, alpha.value, beta.value, session_id=random_session, \
                         metric='cosine', topk=5)
rec

['t_90ddd0d7-1784-4c49-9907-8d7f39be5266',
 't_bc88c565-06bf-4df1-83b6-fa07a18ac4ab',
 't_77d294e7-5f3a-45a4-8c38-a8aacad2e4c1',
 't_5186d6d8-934d-4acc-bf63-7638020992c3',
 't_321167f2-42d3-4783-8687-157b7790f61a']