# Content based filtering (first aproach)

In [1]:
## Import Librarys
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

from time import process_time

from sklearn.metrics.pairwise import cosine_similarity
from numpy.linalg import norm
from scipy.stats import pearsonr

%run ./src/__init__.py

# Functions

In [2]:
def normalize_and_select(df):
    """
    Apply some normalizations and return a selection
     of features from a dataframe with tracks' features
    """
    
    df['duration_norm'] = (df['duration_ms']*1.6666666666667e-05) / 10
    df['explicit'] = df['explicit'].astype('int')
    df['track_popularity_norm'] = df['track_popularity'] / 100
    df['loudness_norm'] = (df['loudness'] + 60) / 60
    df['tempo_norm'] = df['tempo'] / 200
    df['artist_popularity_norm'] = df['artist_popularity'] / 100
    df['artist_followers_norm'] = df['artist_followers'] / 10000000
    df['release_year_norm'] = (df['release_year'] - 1900) / 122
    
    selection = ['track_id','duration_norm','explicit',
                 'track_popularity_norm','acousticness','danceability',
                 'energy','instrumentalness','liveness','loudness_norm',
                 'speechiness','tempo_norm','valence',
                 'artist_popularity_norm','artist_followers_norm',
                 'release_year_norm']
    
    return df[selection]


def get_playlist_genres(df, seed_genres_path):
    """
    Returns a pandas dataframe whit the seed genres
     found in the plalist and a count
    """
    
    #get all genres in the playlist
    all_genres = df['artist_genres'].apply(pd.Series).stack() \
                                    .reset_index(drop=True) \
                                    .to_frame(name='artist_genres')
    
    #get seed_genres dataframe
    genres = pd.read_csv(seed_genres_path)
    
    
    all_genres = all_genres.merge(genres, how='left', left_on='artist_genres',
                                  right_on='genre')[['seed_genre']].dropna()
    
    all_genres.rename(columns={'seed_genre': 'artist_genres'}, inplace=True)
    
    
    #count unique values
    counts = all_genres.groupby(['artist_genres']).size() \
                       .reset_index(name='counts')
    counts.sort_values(by='counts', ascending=False, inplace=True)
    counts.reset_index(drop=True, inplace=True)
    
    return counts


def feature_vector(df):
    """
    Returns the feature vector of a playlist
    """
    
    features = ['duration_norm', 'explicit', 'track_popularity_norm',
                'acousticness', 'danceability', 'energy', 'instrumentalness',
                'liveness', 'loudness_norm', 'speechiness', 'tempo_norm',
                'valence','artist_popularity_norm', 'artist_followers_norm',
                'release_year_norm']

    return np.array(df[features].mean())


def similaritys(candidates, playlist):
    """
    Returns a dataframe with varius similarity normes
    comparing the playlist feature vector with every candidate.
    """
    
    # feature vector of the playlist
    playlist_vector = feature_vector(playlist)
    
    # empty lists for the different metrics
    dot_product = []
    euclidean = []
    cosine = []
    manhattan = []
    pearson = []
    
    for track in candidates.track_id:
        # track feature vector
        track_vector = candidates[candidates.track_id==track] \
                                 .drop(columns='track_id').to_numpy()[0]
        
        #metrics calculation
        dot_product.append(np.dot(playlist_vector,track_vector))
        euclidean.append(norm(playlist_vector - track_vector))
        cosine.append(cosine_similarity(playlist_vector.reshape(1, -1),
                                        track_vector.reshape(1, -1))[0][0])
        manhattan.append(norm(playlist_vector - track_vector, ord=1))
        pearson.append(pearsonr(playlist_vector,track_vector)[0])
        
    similarity_metrics = pd.DataFrame({
        'track_id': candidates.track_id,
        'dot_product': dot_product,
        'euclidean': euclidean,
        'cosine': cosine,
        'manhattan': manhattan,
        'pearson': pearson
    })
        
        
    return similarity_metrics


def select_top_tracks(candidates, metric, n=50):
    """
    Return the top n songs from candidates dataframe
    with similarity metrics.
    """
    
    ascending_merics = ['dot_product', 'euclidean', 'manhattan']
    descending_metrics = ['cosine', 'pearson', 'weight']
    
    if metric in ascending_merics:
        ascending=True
    elif metric in descending_metrics:
        ascending=False
    else:
        raise Exception(metric,'is not a valid metric')
        
    df_out = candidates.sort_values(by=metric, ascending=ascending,
                                    ignore_index=True)
    
    limit = min(n,len(candidates))
    
    return df_out[0:limit]

# Flow

In [3]:
### GENERAL DATAFRAMES

# Connect with API
sp = get_spotify_connection()

# Some playlist id
playlist_id = '0JcCtpR2cHbmy70zwa81WV'

# Get playlist data
df_playlist = playlist_features(playlist_id, sp)

# Normalization
df_playlist_norm = normalize_and_select(df_playlist)

## Get dataframe of artists and genres
artists = pd.read_csv('./data/artist_genres.csv')
artists.drop(columns=['unmapped'], inplace=True)

In [4]:
### SPECIFIC FLOW

# Relevant genres in playlist
playlist_genres = get_playlist_genres(df_playlist,
                                      './data/seed_genres_clean.csv')
playlist_genres_list = playlist_genres['artist_genres'].tolist()

# Filter genres
artists_filt = artists[['name', 'id', 'popularity'] + playlist_genres_list]

# Get weights for artists based on popularity and genders on the playlist
weights = playlist_genres.set_index('artist_genres')

# Weight of the artists based on genres
artists_filt['genres_weight'] =artists_filt[playlist_genres_list].dot(weights)
artists_filt['popularity_weight'] = artists_filt.popularity / 100
artists_filt['weight'] = artists_filt['popularity_weight'] * \
                         artists_filt['genres_weight']
artists_filt = artists_filt[artists_filt.weight>0]

# Keep only artists for the 90% quantile
quantile = artists_filt['weight'].quantile(q=0.9)
best_artists = artists_filt[artists_filt.weight>=quantile]
best_artists.sort_values(by='weight', ascending=False, inplace=True)
best_artists.reset_index(drop=True, inplace=True)

# Get top 10 songs from top 300 artists excluding songs in the playlist
top_songs = []

for id_artist in best_artists.id[:300]:
    top_10_tracks = sp.artist_top_tracks(id_artist, country='MX')['tracks']
    top_songs = top_songs + [track['id'] for track in top_10_tracks]
    
all_songs = list(set(top_songs) - set(df_playlist['track_id']))

# Songs that are already in the playlist
already_in_playlist = set(top_songs).intersection(set(df_playlist['track_id']))

# Normalize candidates
df_candidates = tracks_features(all_songs)
df_candidates_norm = normalize_and_select(df_candidates)

# Metrics
similarity_metrics = similaritys(df_candidates_norm, df_playlist_norm)

In [7]:
len(already_in_playlist)

39

## Get playlist data

In [3]:
# Connect with API

sp = get_spotify_connection()

# Some playlist id
playlist_id = '0JcCtpR2cHbmy70zwa81WV'

# Get playlist data
df_playlist = playlist_features(playlist_id, sp)

## Normalizations and feature selection

* duration_ms --------> log_duration_min = log(duration in minuts)
* explicit -----------> explicit_int (possibly drop this column)
* track_popularity ---> track_popularity_norm = track_popularity / 100
* loudness -----------> loudness_norm = (loudness + 60) / 60

In [4]:
df_playlist_norm = normalize_and_select(df_playlist)

# df_playlist_norm.describe()

In [5]:
## Get dataframe of artists and genres

artists = pd.read_csv('./data/artist_genres.csv')

artists.drop(columns=['unmapped'], inplace=True)

## Get relevant genres in the playlist

In [6]:
playlist_genres = get_playlist_genres(df_playlist,
                                      './data/seed_genres_clean.csv')

# playlist_genres

In [7]:
# playlist_artists = df_playlist['artist_name'].unique()
playlist_genres_list = playlist_genres['artist_genres'].tolist()

## filter genres
artists_filt = artists[['name', 'id', 'popularity'] + playlist_genres_list]

# artists_filt

In [8]:
## get weights for artists based on popularity and genders on the playlist

weights = playlist_genres.set_index('artist_genres')

# Weight of the artists based on genres
artists_filt['genres_weight'] =artists_filt[playlist_genres_list].dot(weights)

# Weight based on popularity
# playlist_popularity = df_playlist_norm.artist_popularity_norm.mean()
# artists_filt['popularity_norm'] = artists_filt.popularity / 100

# artists_filt['popularity_weight'] = 1 /abs(playlist_popularity - \
#                                            artists_filt.popularity_norm)

# artists_filt['popularity_weight'] = artists_filt.popularity_weight / \
#                                     artists_filt.popularity_weight.max()

artists_filt['popularity_weight'] = artists_filt.popularity / 100


artists_filt['weight'] = artists_filt['popularity_weight'] * \
                         artists_filt['genres_weight']

artists_filt = artists_filt[artists_filt.weight>0]


# artists_filt

In [9]:
# artists_filt[['weight','popularity_weight','genres_weight']].describe()

In [10]:
## keep only artists for the 90% quantile

quantile = artists_filt['weight'].quantile(q=0.9)

best_artists = artists_filt[artists_filt.weight>=quantile]

best_artists.sort_values(by='weight', ascending=False, inplace=True)
best_artists.reset_index(drop=True, inplace=True)

#best_artists.to_csv('./data/best_artists_test.csv', index=False)

# best_artists

In [21]:
# Get all songs of top 300 artists
# t_start = process_time()

# top_songs = []

# for id_artist in best_artists.id[:300]:
#     albums= sp.artist_albums(artist_id=id_artist, album_type='album')['items']
    
#     for id_album in [album['id'] for album in albums]:
#         album_tracks = sp.album_tracks(id_album)['items']
#         top_songs = top_songs + [track['id'] for track in album_tracks]
    
# all_songs = list(set(top_songs) - set(df_playlist['track_id']))

# t_end = process_time()
# print('Duration:',t_end-t_start)

Duration: 54.730760575999994


In [24]:
### get top 10 songs from top 300 artists excluding songs in the playlist

t_start = process_time()

top_songs = []

for id_artist in best_artists.id[:300]:
    top_10_tracks = sp.artist_top_tracks(id_artist, country='MX')['tracks']
    top_songs = top_songs + [track['id'] for track in top_10_tracks]
    
all_songs = list(set(top_songs) - set(df_playlist['track_id']))

t_end = process_time()
print('Duration:',t_end-t_start)

Duration: 3.405043629000005


In [25]:
len(all_songs)

2881

In [26]:
already_in_playlist = set(top_songs).intersection(set(df_playlist['track_id']))

len(already_in_playlist)

38

In [51]:
# df_playlist[df_playlist.track_id.isin(already_in_playlist)]

Unnamed: 0,track_id,track_name,duration_ms,explicit,track_popularity,acousticness,danceability,energy,instrumentalness,key,...,album_type,release_date,release_year,duration_norm,track_popularity_norm,loudness_norm,tempo_norm,artist_popularity_norm,artist_followers_norm,release_year_norm
10,4gMgiXfqyzZLMhsksGmbQV,"Another Brick in the Wall, Pt. 2",238746,0,78,0.0782,0.693,0.394,0.000694,0,...,album,1979-11-30,1979,0.39791,0.78,0.7353,0.52057,0.82,1.654545,0.647541
14,0EYOdF5FCkgOJJla8DI2Md,B.Y.O.B.,255466,1,79,0.00662,0.538,0.981,0.0,1,...,album,2005-05-17,2005,0.425777,0.79,0.9552,0.50707,0.81,0.800838,0.860656
18,2FcE7B1p3qVvLvwV5qPljV,A Beautiful Lie,245306,0,60,0.00291,0.479,0.929,0.293,9,...,album,2005,2005,0.408843,0.6,0.9149,0.799745,0.71,0.318416,0.860656
19,3f3HHRPF5vAo90GwdpDMaQ,Because I’m Me,252800,0,64,0.0491,0.472,0.805,0.0162,8,...,album,2016-07-08,2016,0.421333,0.64,0.90345,0.47461,0.63,0.043666,0.95082
20,5FZxsHWIvUsmSK1IAvm2pp,Best of You,255626,0,79,0.000769,0.366,0.94,9.4e-05,1,...,album,2005-06-14,2005,0.426043,0.79,0.914683,0.65099,0.8,0.961593,0.860656
30,1QFh8OH1e78dGd3VyJZCAC,Boys Don't Cry,155973,0,78,0.00517,0.462,0.836,0.0,11,...,album,1979-05-11,1979,0.259955,0.78,0.9039,0.84383,0.77,0.361396,0.647541
35,46Yp4JsZIh8ceg5WBqW1ZB,Buscando en la basura,208839,0,59,0.00531,0.557,0.857,0.0,2,...,album,2005-04-04,2005,0.348065,0.59,0.936517,0.475125,0.56,0.027186,0.860656
37,48UPSzbZjgc449aqz8bxox,Californication,329733,0,85,0.0021,0.592,0.767,0.00165,9,...,album,1999-06-08,1999,0.549555,0.85,0.953533,0.482415,0.86,1.767518,0.811475
45,4P5KoWXOxwuobLmHXLMobV,Come As You Are,218920,0,79,0.00016,0.5,0.824,0.00161,4,...,album,1991-09-26,1991,0.364867,0.79,0.902567,0.600625,0.85,1.478439,0.745902
46,2EqlS6tkEnglzr7tkKAAYD,Come Together - Remastered 2009,259946,0,81,0.0302,0.533,0.376,0.248,9,...,album,1969-09-26,1969,0.433243,0.81,0.80145,0.825035,0.88,2.217064,0.565574


In [52]:
### normalize candidates

df_candidates = tracks_features(all_songs)

#add realise year
df_candidates['release_year'] = df_candidates['release_date'].str[:4].astype(int)

df_candidates_norm = normalize_and_select(df_candidates)

## Create feature vector

In [53]:
playlist_feature_vector = feature_vector(df_playlist_norm)

playlist_feature_vector

array([0.39011997, 0.04225352, 0.45690141, 0.17220921, 0.51310798,
       0.71220329, 0.07075744, 0.18047465, 0.87832966, 0.0560446 ,
       0.63697617, 0.57970845, 0.72793427, 0.75491964, 0.79689063])

In [54]:
similarity_metrics = similaritys(df_candidates_norm, df_playlist_norm)

similarity_metrics

Unnamed: 0,track_id,dot_product,euclidean,cosine,manhattan,pearson
0,0hNaPXiI8XsKsbP6VCZIdg,4.178486,0.789448,0.930634,1.782126,0.775231
1,4snBTILMGyRKErKygwoDkE,3.766918,0.767702,0.931754,1.627449,0.811126
2,1rOlTL4pKQ9Y1fURua4AJR,3.663618,0.787721,0.928665,2.220921,0.765272
3,2Vfbf60Ujpom3Bcljzw7Dd,4.406862,0.699464,0.948284,1.947444,0.827822
4,6E37j4b44JGpdk297urpKM,3.947451,0.882690,0.910494,2.511075,0.706711
...,...,...,...,...,...,...
4826,3VCKdfJAL8DTDlwZw5O6Ik,3.592029,0.933244,0.896864,2.322447,0.662962
4827,3uPfVXcjnpOjyzI3jb3js4,4.067713,0.849391,0.918555,2.133570,0.725210
4828,1nEchyBF1nPPfIAq4YGRyc,3.668041,0.796176,0.926843,1.792169,0.821233
4829,6F3rgQfSns1slUnMfHXjOA,3.982170,0.870928,0.913238,2.211494,0.795878


In [55]:
#### SOLO ERA PARA SABER LOS NOMBRES DE LAS CANCIONES

## join track weghti from artist-genre table

# tracks_artists = similarity_metrics[['track_id']].merge(df_candidates[['track_id','artist_id', 'track_name', 'artist_name']])

# tracks_weights = tracks_artists.merge(best_artists[['id','weight']], left_on='artist_id', right_on='id', how='left')

# similarity_metrics = similarity_metrics.merge(tracks_weights[['track_id','weight','track_name','artist_name']], on='track_id', how='left')

# similarity_metrics

In [56]:
# similarity_metrics.describe()

Unnamed: 0,dot_product,euclidean,cosine,manhattan,pearson
count,4831.0,4831.0,4831.0,4831.0,4831.0
mean,4.122539,0.893108,0.913542,2.230772,0.742428
std,0.490666,0.301454,0.039077,0.559564,0.124325
min,2.709632,0.2994,0.558307,0.881088,0.129146
25%,3.861472,0.761563,0.901365,1.871403,0.69809
50%,4.086744,0.839514,0.921741,2.106041,0.780021
75%,4.310136,0.946693,0.936126,2.466202,0.817792
max,11.374931,8.657168,0.991695,10.463355,0.973716


In [57]:
prueba = select_top_tracks(candidates=similarity_metrics, metric='euclidean')

# prueba.to_csv('./data/metricas.csv', index=False)

In [58]:
prueba

Unnamed: 0,track_id,dot_product,euclidean,cosine,manhattan,pearson
0,02ePjHjIiszSYqeLykvpTN,4.642824,0.2994,0.991695,0.881088,0.972138
1,5etssK2rpk4SnHWWD1Q6xn,4.852416,0.367184,0.990793,1.057931,0.969389
2,7ndGFo9nZ108KPgXtfYWCe,4.547974,0.391011,0.984156,1.214997,0.955079
3,6dUfUi14QkuIpGA0GFFPyC,4.575797,0.403605,0.98347,1.339752,0.958847
4,1AdYZ6X00nXmO613Y7GJOl,4.469791,0.413808,0.981459,1.108353,0.950223
5,3OgZdIAHaDMOtYq8Z5iCQ5,4.274979,0.413931,0.980568,1.279721,0.944806
6,1gVVSmhWjci8l9Vrsl6HaS,4.886893,0.418268,0.987883,1.331464,0.971228
7,5PRRthP9SLfbXB359MfIWv,4.551593,0.423712,0.981479,1.247012,0.945582
8,3FifroUyur8j5mTM78WuXQ,4.62499,0.42495,0.982387,1.463116,0.93786
9,2Fn4gZI3MAeWTiv7cSVxZQ,4.644351,0.428595,0.982381,1.232615,0.940392
