# Music Recommendation - Clustering Oriented

In [1]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import h5py
import sqlite3
import sklearn
import progress
from progress.bar import Bar

# import tools functions
import h5df_getters

Set your environnement configuration here

In [2]:
# absolute path to music-recommendation directory
project_abspath = "/home/osboxes/Documents/python/machlrn/music-recommendation"

## Data preprocessing

Here is some information about the data.

-> Features for 10000 songs (using the subset of MillionSongDataset)
Download here : http://static.echonest.com/millionsongsubset_full.tar.gz
In order to succesfully access to the data in this huge dataset it is recommended to look at http://millionsongdataset.com/pages/tutorial/. One can try to open file using the h5py library. Then you can take a look at this tutorials to use it :http://docs.h5py.org/en/stable/quick.html#quick.

-> Listener tastes dataset. A triplet-shaped dataset (user_id, song_id, play_count)
Download here : http://millionsongdataset.com/sites/default/files/challenge/train_triplets.txt.zip

-> Access to the songs styles (MSD Allmusic Genre Dataset (Top MAGD)). We used the smallest one with 13 different styles.
The access online is direct without download : http://www.ifs.tuwien.ac.at/mir/msd/partitions/msd-topMAGD-genreAssignment.cls

In [4]:
# get the id of all the songs
path_to_tracks_id = os.path.join(project_abspath, "data/MillionSongSubset/AdditionalFiles/subset_unique_tracks.txt")
df_tracks_info = pd.read_csv(path_to_tracks_id, sep="<SEP>", header=None, names=["track_id", "song_id", "artist_name", "song_title"], engine='python')
df_tracks_info.head()

Unnamed: 0,track_id,song_id,artist_name,song_title
0,TRAAAAW128F429D538,SOMZWCG12A8C13C480,Casual,I Didn't Mean To
1,TRAAABD128F429CF47,SOCIWDW12A8C13D406,The Box Tops,Soul Deep
2,TRAAADZ128F9348C2E,SOXVLOJ12AB0189215,Sonora Santanera,Amor De Cabaret
3,TRAAAEF128F4273421,SONHOTT12A8C13493C,Adam Ant,Something Girls
4,TRAAAFD128F92F423A,SOFSOCN12A8C143F5D,Gob,Face the Ashes


### Track styles and genres

We have three datasets to link the tracks we have with their style and genres. We think we are going to work with the styles rather than the genres because this way the classification is balenced. Otherwise with the genre, half of the tracks are of genre "Pop_Rock".

In [92]:
# topMAGD genres - 13 different genres
genres_unique = ['Rap', 'Pop_Rock', 'RnB', 'New Age', 'Latin', 'International',
       'Jazz', 'Folk', 'Blues', 'Electronic', 'Country', 'Reggae',
       'Vocal']
link_songs_genre = "http://www.ifs.tuwien.ac.at/mir/msd/partitions/msd-topMAGD-genreAssignment.cls"
df_songs_genre = pd.read_csv(link_songs_genre, sep="\t", header=None, names=["track_id", "track_genre"])

In [93]:
# MAGD genres - 21 different genres
more_genres_unique = ['Rap', 'Pop_Rock', 'Religious', 'RnB', 'New Age', 'Comedy_Spoken',
       'Latin', 'International', 'Jazz', 'Folk', 'Blues', 'Electronic',
       'Stage ', 'Country', 'Reggae', 'Vocal', 'Easy_Listening',
       'Classical', 'Avant_Garde', 'Children', 'Holiday']
link_songs_more_genre = "http://www.ifs.tuwien.ac.at/mir/msd/partitions/msd-MAGD-genreAssignment.cls"
df_songs_more_genre = pd.read_csv(link_songs_more_genre, sep="\t", header=None, names=["track_id", "track_genre"])

In [68]:
# 25 different styles
styles_unique = ['Hip_Hop_Rap', 'Pop_Indie', 'Gospel', 'Pop_Contemporary',
       'Rock_Neo_Psychedelia', 'Rock_Contemporary', 'Folk_International',
       'Experimental', 'Country_Traditional', 'Blues_Contemporary',
       'Dance', 'Electronica', 'Punk', 'Rock_College', 'Rock_Hard',
       'Metal_Death', 'Metal_Alternative', 'Metal_Heavy', 'Jazz_Classic',
       'RnB_Soul', 'Grunge_Emo', 'Pop_Latin', 'Rock_Alternative',
       'Reggae', 'Big_Band']
link_songs_style = "http://www.ifs.tuwien.ac.at/mir/msd/partitions/msd-MASD-styleAssignment.cls"
df_songs_style = pd.read_csv(link_songs_style, sep="\t", header=None, names=["track_id", "track_style"])

In [76]:
# merge the dataset of info and style
df_tracks_info_style = pd.merge(df_tracks_info, df_songs_style, on="track_id", how="inner")

In [77]:
# get the number of elements per styles
print("Tracks per style :")
for style in df_tracks_info_style.track_style.unique():
    print(style, "=", len(df_tracks_info_style[df_tracks_info_style["track_style"] == style].index))

Tracks per style :
Hip_Hop_Rap = 227
Pop_Indie = 103
Gospel = 167
Pop_Contemporary = 147
Rock_Neo_Psychedelia = 87
Rock_Contemporary = 135
Folk_International = 106
Experimental = 60
Country_Traditional = 157
Blues_Contemporary = 142
Dance = 114
Electronica = 82
Punk = 80
Rock_College = 80
Rock_Hard = 187
Metal_Death = 105
Metal_Alternative = 195
Metal_Heavy = 93
Jazz_Classic = 74
RnB_Soul = 58
Grunge_Emo = 72
Pop_Latin = 107
Rock_Alternative = 75
Reggae = 82
Big_Band = 41


In [78]:
df_tracks_info_style

Unnamed: 0,track_id,song_id,artist_name,song_title,track_style
0,TRAAAAW128F429D538,SOMZWCG12A8C13C480,Casual,I Didn't Mean To,Hip_Hop_Rap
1,TRAAAEF128F4273421,SONHOTT12A8C13493C,Adam Ant,Something Girls,Pop_Indie
2,TRAAAMO128F1481E7F,SOYMRWW12A6D4FAB14,Jeff And Sheri Easter,The Moon And I (Ordinary Day Album Version),Gospel
3,TRAAAMQ128F1460CD3,SOMJBYD12A6D4F8557,Rated R,Keepin It Real (Skit),Hip_Hop_Rap
4,TRAAARJ128F9320760,SOIAZJW12AB01853F1,Planet P Project,Pink World,Pop_Contemporary
5,TRAAAVO128F93133D4,SOQHXMF12AB0182363,JennyAnyKind,Young Boy Blues,Rock_Neo_Psychedelia
6,TRAABJV128F1460C49,SOBONFF12A6D4F84D8,Lionel Richie,Tonight Will Be Alright,Rock_Contemporary
7,TRAABNV128F425CEE1,SOUQQEA12A8C134B1B,Richard Souther,High Tide,Folk_International
8,TRAACER128F4290F96,SOBBUGU12A8C13E95D,The Dillinger Escape Plan,Setting Fire to Sleeping Giants,Experimental
9,TRAACLV128F427E123,SONYPOM12A8C13B2D7,Tim Wilson,I Think My Wife Is Running Around On Me (Taco ...,Country_Traditional


### User music tastes

We need to get the musics listened by user in order to make suggestions and evaluate those suggestions.

In [22]:
path_to_user_tastes_dataset = os.path.join(project_abspath, "data/train_triplets.txt") # nrows 48373586
df_user_tastes = pd.read_csv(path_to_user_tastes_dataset, sep="\t", nrows=48373586, header=None, names=["user_id", "song_id", "play_count"])

In [24]:
# users_to_consider = df_user_tastes.user_id.unique()[:20000]
# df_user_tastes[df_user_tastes.user_id.isin(users_to_consider)]

songs_to_consider = df_tracks_info.song_id.unique()
df_user_tastes_filtered = df_user_tastes[df_user_tastes.song_id.isin(songs_to_consider)]

In [31]:
df_user_tastes_filtered = df_user_tastes_filtered.reset_index(drop=True)
df_user_tastes_filtered

Unnamed: 0,user_id,song_id,play_count
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOWEZSI12A81C21CE6,1
1,4bd88bfb25263a75bbdd467e74018f4ae570e5df,SODCXXY12AB0187452,2
2,4bd88bfb25263a75bbdd467e74018f4ae570e5df,SOWPAXV12A67ADA046,18
3,b64cdd1a0bd907e5e00b39e345194768e330d652,SOLXDDC12A6701FBFD,1
4,b64cdd1a0bd907e5e00b39e345194768e330d652,SONJBQX12A6D4F8382,4
5,b64cdd1a0bd907e5e00b39e345194768e330d652,SONQBUB12A6D4F8ED0,2
6,17aa9f6dbdf753831da8f38c71b66b64373de613,SOMGPML12A8C13AE8C,1
7,d6589314c0a9bcbca4fee0c93b14bc402363afea,SOULTKQ12AB018A183,3
8,5a905f000fc1ff3df7ca807d57edb608863db05d,SOEBCBI12AF72A154F,5
9,5a905f000fc1ff3df7ca807d57edb608863db05d,SOFKTPP12A8C1385CA,1


In [39]:
# 4e11f45d732f4861772b2906f81a7d384552ad12 => 21
# ffdaab327f2fc6b9fa01a4e3e7f41fdd0e468046 => 10
# 02192554db8fe6d17b6309aabb2b7526a2e58534 => 10
# 6ff5f3621d592b8c8f0b56bddd900a66a44909ad => 11
# 81fb7f3d2dde170e8d614a71f9a3ed3ea9e85e0b => 13
# 043d81932e75d5749ed5758d6420506e7bc457a5 => 12
# 3e6ef2a572d1f6f06df71bf28190eae9e1934a61 => 12
# 13ce57b3a25ef63fa614335fd838e8024c42ec17 => 22
# 6a944bfe30ae8d6b873139e8305ae131f1607d5f => 15


# for user_id in df_user_tastes_filtered.user_id.unique():
#     nb_songs_users = len(df_user_tastes_filtered[df_user_tastes_filtered["user_id"] == user_id].index)
#     if nb_songs_users >= 10:
#         print(user_id, "=>", nb_songs_users)
df_tmp = df_user_tastes_filtered.groupby("user_id")
df_size = df_tmp.size().sort_values(ascending=False)

user_id
ec6dfcf19485cb011e0b22637075037aae34cf26    48
c1255748c06ee3f6440c51c439446886c7807095    45
db6a78c78c9239aba33861dae7611a6893fb27d5    38
738759001498928d8dcb054cd53a1a0cfc200d36    38
d964fc033291078031d117ed10adfb615948256d    37
4e73d9e058d2b1f2dba9c1fe4a8f416f9f58364f    35
119b7c88d58d0c6eb051365c103da5caf817bea6    33
736083bd7ecd162effb7668cab6c281945762e85    31
cbc7bddbe3b2f59fdbe031b3c8d0db4175d361e6    30
3584ec93c836deac5c5c9ce1b88da731d249e099    29
b7c24f770be6b802805ac0e2106624a517643c17    29
33a1286454a3cff06e3c2324be746d2e23d7c270    26
5a3417a1955d9136413e0d293cd36497f5e00238    26
2b3e60309a89ea57df2f9ffe7dcb5871fcc8bc83    25
5ef127be2845313b04cdab97eafd5bab866eaf35    25
a2758cfd225f99b0494d98b3e7c65920345f95c8    25
8cb51abc6bf8ea29341cb070fe1e1af5e4c3ffcc    25
eaa2b3c9e086a662ab15e10ca6211a3207f40a50    25
6e0a8e9103ad7d68e63d350fbff3386c75613ab2    25
9c2dfee26bbdd4fb19e9800244bea6e7181caeae    25
57262c4ed3cb3ed2db7cab8c627091757c6437d8    24
1aa4f

In [44]:
row_group_sizes = (
    df_user_tastes_filtered['user_id']
    .groupby(df_user_tastes_filtered['user_id'])
    .transform('size')
)

In [85]:
for i in [5, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]:
    tmp = df_user_tastes_filtered[row_group_sizes >= i]
    print("#min_songs/user", i, "| nrows =",len(tmp.index), "| #user =", len(tmp.user_id.unique()), "| #diff_songs =", len(tmp.song_id.unique()), "| #all_diff_25styles =", len(pd.merge(tmp, df_tracks_info_style, on="song_id", how="inner").track_style.unique()) == 25)
    tmp = tmp.reset_index(drop=True)

#min_songs/user 5 | nrows = 151057 | #user = 22840 | #diff_songs = 2939 | #all_diff_25styles = True
#min_songs/user 8 | nrows = 51192 | #user = 5081 | #diff_songs = 2166 | #all_diff_25styles = True
#min_songs/user 10 | nrows = 26600 | #user = 2145 | #diff_songs = 1701 | #all_diff_25styles = True
#min_songs/user 11 | nrows = 19740 | #user = 1459 | #diff_songs = 1506 | #all_diff_25styles = True
#min_songs/user 12 | nrows = 14702 | #user = 1001 | #diff_songs = 1351 | #all_diff_25styles = True
#min_songs/user 13 | nrows = 10994 | #user = 692 | #diff_songs = 1189 | #all_diff_25styles = True
#min_songs/user 14 | nrows = 8342 | #user = 488 | #diff_songs = 1062 | #all_diff_25styles = True
#min_songs/user 15 | nrows = 6550 | #user = 360 | #diff_songs = 958 | #all_diff_25styles = True
#min_songs/user 16 | nrows = 5125 | #user = 265 | #diff_songs = 888 | #all_diff_25styles = True
#min_songs/user 17 | nrows = 4053 | #user = 198 | #diff_songs = 811 | #all_diff_25styles = True
#min_songs/user 18 | n

### Songs features

We need to extract features for each songs.

In [69]:
path_to_info_files = os.path.join(project_abspath, "data/MillionSongSubset/AdditionalFiles")

# connect to database
database_track_metadata = "subset_track_metadata.db"
db_track_metadata = sqlite3.connect(os.path.join(path_to_info_files, database_track_metadata))
db_track_metadata.execute("SELECT * FROM sqlite_master").fetchall()


In [None]:
path_to_songs_features_data = os.path.join(project_abspath, "data/MillionSongSubset/data")

def get_feature(basedir, fun=h5df_getters.get_num_songs, ext='.h5', nb_elements=10000):
    stock = []
    stock_path = []
    i = 0
    for root, dirs, files in Bar('Processing', max=nb_elements).iter(os.walk(basedir)):
        files = glob.glob(os.path.join(root,'*'+ext))
        for f in files:
            if str(f.split("/")[-1][:-3]) in df_paths_to_songs.track_id.unique():
#                 h5 = h5df_getters.open_h5_file_read(f)
#                 stock.append(fun(h5))
                stock_path.append(f)
                i += 1
#                 h5.close()
    return stock, stock_path

stock, stock_path = get_feature(basedir=path_to_songs_features_data, nb_elements=10000)

Get all the features.

HERE IS THE LINK THAT DESCRIBES THE FEATURES : http://millionsongdataset.com/pages/example-track-description/

In [72]:
df_paths_to_songs = pd.read_csv(os.path.join(project_abspath, "data/SongsFeatures/df_abspath_to_features.txt"))
df_paths_to_songs.head()

Unnamed: 0,abspath_to_file
0,/home/osboxes/Documents/python/machlrn/music-r...
1,/home/osboxes/Documents/python/machlrn/music-r...
2,/home/osboxes/Documents/python/machlrn/music-r...
3,/home/osboxes/Documents/python/machlrn/music-r...
4,/home/osboxes/Documents/python/machlrn/music-r...


In [153]:
FEATURES_CONSIDERED = [h5df_getters.get_duration,
                       h5df_getters.get_end_of_fade_in,
                       h5df_getters.get_start_of_fade_out,
                       h5df_getters.get_key,
#                        h5df_getters.get_key_confidence,
                       h5df_getters.get_loudness,
                       h5df_getters.get_mode,
#                        h5df_getters.get_mode_confidence,
                       h5df_getters.get_tempo,
                       h5df_getters.get_time_signature,
#                        h5df_getters.get_time_signature_confidence,
                       h5df_getters.get_tatums_start, # x3 => len, min, max, variance
                       h5df_getters.get_segments_pitches, # x4 => len, min, max, mean, var
                       h5df_getters.get_segments_timbre, # x3 => min, max, mean, var
                       h5df_getters.get_sections_start, # x2 => len, var
                       h5df_getters.get_beats_start, # x2 => len, var
                       h5df_getters.get_bars_start, # x2 => len, var
                       h5df_getters.get_year, # lacks => 0
                       h5df_getters.get_song_hotttnesss # lacks => nan
                      ]

In [177]:
def get_features(paths=df_paths_to_songs.values[:, 0], ext='.h5', nb_elements=10000):
    stock = []
    for file in Bar('Processing', max=nb_elements).iter(paths[:nb_elements]):
        h5 = h5df_getters.open_h5_file_read(file)
        feature = np.array(h5df_getters.get_sections_start(h5))
        print()
        print("HOP")
        print(len(feature))
        print(np.min(feature))
        print(np.max(feature))
        print(np.std(feature))
        h5.close()
    return stock

In [178]:
features = get_features(nb_elements=2)
features


HOP
4
0.0
198.78196
73.93993567653342

HOP
12
0.0
238.69594
77.27496291769941


[]

In [129]:
features = np.array(features)
np.mean(features[features != 0])

1998.888888888889

In [None]:
df_tmp = pd.DataFrame(data=stock_path)
df_tmp.columns = ["abspath_to_file"]
df_tmp.to_csv(os.path.join(project_abspath, "data/SongsFeatures/df_abspath_to_features.txt"), index=False)

In [8]:
df =  pd.read_csv(os.path.join(project_abspath, "data/SongsFeatures/df_uncleaned_features.csv"))
df

Unnamed: 0,get_song_id_decoded,get_duration,get_end_of_fade_in,get_start_of_fade_out,get_key,get_loudness,get_mode,get_tempo,get_time_signature,get_tatums_start_size,...,get_segments_pitches_mean,get_segments_pitches_var,get_sections_start_size,get_sections_start_var,get_beats_start_size,get_beats_start_var,get_bars_start_size,get_bars_start_var,get_year,get_song_hotttnesss
0,SOLKBKQ12A8C13400C,204.85179,0.000,204.852,5,-11.140,1,133.428,1,905,...,0.315484,0.080072,4,5467.114088,452,3465.200077,452,3465.200077,1995,
1,SOGEQKA12A6D4FA27C,252.08118,0.514,234.887,11,-8.706,0,125.022,4,1021,...,0.330419,0.085345,12,5971.419894,511,5097.385777,127,5039.677159,2003,0.270776
2,SOBJOLG12A8C13CC69,170.05669,0.000,170.057,9,-9.502,1,72.073,4,407,...,0.216669,0.084926,4,2074.411783,204,2408.163795,50,2313.813038,2006,0.270776
3,SOWWEWC12A6D4F8A87,383.16363,0.357,379.246,1,-16.486,1,121.896,4,1553,...,0.432146,0.091019,9,13741.200049,776,12161.091815,193,12035.755476,1995,0.265861
4,SOJCBAM12A6701FD04,296.85506,19.336,290.482,3,-6.966,1,145.271,4,1401,...,0.298037,0.091274,7,4654.468667,701,7013.955888,175,6993.523590,1998,0.717392
5,SOLYAFD12A6702090E,211.59138,0.142,190.769,5,-7.239,1,100.783,4,707,...,0.330268,0.096790,10,4927.242185,354,3562.121277,88,3520.025809,2005,
6,SOMLFVN12A8C136135,330.97098,2.148,315.902,7,-16.934,1,101.529,4,2214,...,0.272701,0.093443,16,8560.513082,553,8871.434269,137,8740.776074,1996,0.265861
7,SOXAOIN12A8C138D34,210.46812,0.421,191.402,9,-13.644,1,114.382,4,789,...,0.293731,0.087503,11,3772.380078,395,3578.532769,98,3524.007031,1986,
8,SOKGAEA12AB017F671,276.79302,0.000,271.290,9,-16.210,0,118.239,4,1070,...,0.230390,0.087247,16,6662.285928,535,6147.176446,133,6077.923256,0,
9,SOOSLVC12A8C13E51E,327.62730,0.000,309.052,10,-7.144,0,107.320,3,1748,...,0.372745,0.097373,14,9329.949128,583,8774.892784,193,8654.151499,2006,0.419674


In [65]:
# get_sections_start_var 3 - mean = 6392
# get_bars_start_var 4 - mean = 5459
# get_beats_start_var 4 - mean = 5532
# get_song_hotttnesses 507 - mean = 0.5
array = df.get_song_hotttnesss.values
np.any(pd.isna(array))
np.mean(array[[not e for e in pd.isna(array)]])
# len(array[pd.isna(array)])

0.4894457266651706

How to change a dataframe NaN values :

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html

In [68]:
value = {"get_song_hotttnesss": 0.5,
         "get_bars_start_var": 5459,
         "get_beats_start_var": 5532,
         "get_sections_start_var": 6392}
df_cleaned = df.fillna(value=value)
df_cleaned

Unnamed: 0,get_song_id_decoded,get_duration,get_end_of_fade_in,get_start_of_fade_out,get_key,get_loudness,get_mode,get_tempo,get_time_signature,get_tatums_start_size,...,get_segments_pitches_mean,get_segments_pitches_var,get_sections_start_size,get_sections_start_var,get_beats_start_size,get_beats_start_var,get_bars_start_size,get_bars_start_var,get_year,get_song_hotttnesss
0,SOLKBKQ12A8C13400C,204.85179,0.000,204.852,5,-11.140,1,133.428,1,905,...,0.315484,0.080072,4,5467.114088,452,3465.200077,452,3465.200077,1995,0.500000
1,SOGEQKA12A6D4FA27C,252.08118,0.514,234.887,11,-8.706,0,125.022,4,1021,...,0.330419,0.085345,12,5971.419894,511,5097.385777,127,5039.677159,2003,0.270776
2,SOBJOLG12A8C13CC69,170.05669,0.000,170.057,9,-9.502,1,72.073,4,407,...,0.216669,0.084926,4,2074.411783,204,2408.163795,50,2313.813038,2006,0.270776
3,SOWWEWC12A6D4F8A87,383.16363,0.357,379.246,1,-16.486,1,121.896,4,1553,...,0.432146,0.091019,9,13741.200049,776,12161.091815,193,12035.755476,1995,0.265861
4,SOJCBAM12A6701FD04,296.85506,19.336,290.482,3,-6.966,1,145.271,4,1401,...,0.298037,0.091274,7,4654.468667,701,7013.955888,175,6993.523590,1998,0.717392
5,SOLYAFD12A6702090E,211.59138,0.142,190.769,5,-7.239,1,100.783,4,707,...,0.330268,0.096790,10,4927.242185,354,3562.121277,88,3520.025809,2005,0.500000
6,SOMLFVN12A8C136135,330.97098,2.148,315.902,7,-16.934,1,101.529,4,2214,...,0.272701,0.093443,16,8560.513082,553,8871.434269,137,8740.776074,1996,0.265861
7,SOXAOIN12A8C138D34,210.46812,0.421,191.402,9,-13.644,1,114.382,4,789,...,0.293731,0.087503,11,3772.380078,395,3578.532769,98,3524.007031,1986,0.500000
8,SOKGAEA12AB017F671,276.79302,0.000,271.290,9,-16.210,0,118.239,4,1070,...,0.230390,0.087247,16,6662.285928,535,6147.176446,133,6077.923256,0,0.500000
9,SOOSLVC12A8C13E51E,327.62730,0.000,309.052,10,-7.144,0,107.320,3,1748,...,0.372745,0.097373,14,9329.949128,583,8774.892784,193,8654.151499,2006,0.419674


In [73]:
df_cleaned.to_csv(os.path.join(project_abspath, "data/SongsFeatures/df_cleaned_features.csv"), index=False)

In [77]:
df_test = pd.read_csv(os.path.join(project_abspath, "data/SongsFeatures/df_cleaned_features.csv"))
df_test.isna().values.any()

False