In [41]:
import numpy as np
import pandas as pd
import sklearn.preprocessing as pp
import sklearn.impute as impute

In [3]:
reldir = '../datasets/'

In [307]:
df = pd.read_pickle(reldir + 'msd_tastes_merged.csv', compression='zip')
df_msd = pd.read_pickle(reldir + 'msd.csv', compression='zip')

In [349]:
# split into numerical and object
df_num = df_msd.select_dtypes(exclude='object')
df_obj = df_msd.select_dtypes(include='object')

In [350]:
df_id = df_msd[[col for col in df_msd if col.endswith('id')]]

In [351]:
# drop all cols ending in id
id_cols = [col for col in df_num if col.endswith('id')]
df_num = df_num[[col for col in df_num if not id_cols.__contains__(col)]]

In [352]:
# drop all cols in df_num w/ std = 0
num_feat_std0 = list(df_num.describe().loc['std'].where(lambda x : x <= 0).dropna().index)
df_num = df_num[[col for col in df_num if not num_feat_std0.__contains__(col)]]

In [353]:
# drop useless columns
df_num = df_num.drop([
    'analysis_songs_end_of_fade_in',
    'metadata_songs_artist_latitude',
    'metadata_songs_artist_longitude',
    'analysis_songs_start_of_fade_out',
    'analysis_songs_loudness'
], axis=1)

In [354]:
# split into categorical numeric vs true numeric values
num_cat = [
    'analysis_songs_key',
    'analysis_songs_key_confidence',
    'analysis_songs_mode',
    'analysis_songs_mode_confidence',
    'analysis_songs_tempo',
    'analysis_songs_time_signature',
    'analysis_songs_time_signature_confidence',
    'musicbrainz_songs_year'
]

df_num_cat = df_num[num_cat]
df_num = df_num[[col for col in df_num if not num_cat.__contains__(col)]]

In [355]:
df_num_copy = df_num.copy()

In [356]:
pd.isnull(df_num_copy).sum().where(lambda x : x > 0).dropna()

metadata_songs_artist_familiarity       4.0
metadata_songs_song_hotttnesss       4352.0
dtype: float64

In [357]:
# Need to do imputations on NaN
df_num_cat['song_hotttnesss_indicator_missing'] = np.where(df_num_copy['metadata_songs_song_hotttnesss'].isna(), True, False)
df_num_cat['metadata_songs_artist_familiarity_missing'] = np.where(df_num_copy['metadata_songs_artist_familiarity'].isna(), True, False)
mean_imputer = impute.SimpleImputer(missing_values=np.nan)
df_num_copy = pd.DataFrame(mean_imputer.fit_transform(df_num_copy))
df_num_copy.columns = df_num.columns
df_num_copy.index = df_num.index

In [358]:
# Apply RobustScaler to song artist hotness to normalize the data.
# RobustScaler is robust to outliers
rb = pp.RobustScaler()
df_num_copy = pd.DataFrame(rb.fit_transform(df_num_copy))

# scaler = pp.MinMaxScaler()
# df_num_copy = pd.DataFrame(scaler.fit_transform(df_num_copy))
df_num_copy.columns = df_num.columns
df_num_copy.index = df_num.index

In [359]:
df_num = df_num_copy

In [360]:
le = pp.LabelEncoder()

In [361]:
# Bin year by decade and encode with LabelEncoder
df_num_cat['years_binned'] = pd.cut(df_num_cat['musicbrainz_songs_year'], bins=[0, 1970, 1980, 1990, 2000, 2010, 2020], include_lowest=True)
df_num_cat['years_binned'] = le.fit_transform(df_num_cat['years_binned'])
years_enc = le.classes_

In [362]:
# Perform equal width binning based on decile
df_num_cat['tempo_binned'] = pd.qcut(df_num_cat['analysis_songs_tempo'], q=10)
df_num_cat['tempo_binned'] = le.fit_transform(df_num_cat['tempo_binned'])
tempo_enc = le.classes_

In [372]:
columns = [df_num.columns, df_num_cat.columns]
columns

[Index(['analysis_songs_duration', 'metadata_songs_artist_familiarity',
        'metadata_songs_artist_hotttnesss', 'metadata_songs_song_hotttnesss'],
       dtype='object'),
 Index(['analysis_songs_key', 'analysis_songs_key_confidence',
        'analysis_songs_mode', 'analysis_songs_mode_confidence',
        'analysis_songs_tempo', 'analysis_songs_time_signature',
        'analysis_songs_time_signature_confidence', 'musicbrainz_songs_year',
        'song_hotttnesss_indicator_missing',
        'metadata_songs_artist_familiarity_missing', 'years_binned',
        'tempo_binned'],
       dtype='object')]

In [386]:
df_m = pd.DataFrame()
for col in df_id:
    df_m[col] = df_id[col]
for col in df_num_cat:
    df_m[col] = df_num_cat[col]
for col in df_num:
    df_m[col] = df_num[col]

In [388]:
df_m

Unnamed: 0,analysis_songs_track_id,metadata_songs_artist_7digitalid,metadata_songs_artist_id,metadata_songs_artist_mbid,metadata_songs_artist_playmeid,metadata_songs_release_7digitalid,metadata_songs_song_id,metadata_songs_track_7digitalid,analysis_songs_key,analysis_songs_key_confidence,...,analysis_songs_time_signature_confidence,musicbrainz_songs_year,song_hotttnesss_indicator_missing,metadata_songs_artist_familiarity_missing,years_binned,tempo_binned,analysis_songs_duration,metadata_songs_artist_familiarity,metadata_songs_artist_hotttnesss,metadata_songs_song_hotttnesss
0,TRARRZU128F4253CA2,16971,AREJXK41187B9A4ACC,c43bb0d6-94d7-410f-80fb-e5a243b18d23,2676,275907,SOGSMXL12A81C23D88,3073568,0,0.591,...,0.372,2008,False,False,4,5,-0.740644,-0.031380,0.042065,2.397801
1,TRARRJL128F92DED0E,92108,AR2XRFQ1187FB417FE,a69cd724-2f57-4ed0-bfed-ba20401eb84c,5772,382807,SOMBCOW12AAF3B229F,4249244,1,0.429,...,0.533,2004,False,False,4,0,0.298340,0.315424,0.420843,1.552509
2,TRARRUZ128F9307C57,1701,ARODOO01187FB44F4A,60bd8a1c-c093-4849-8f28-08101ca059b1,20540,490659,SOEYIHF12AB017B5F4,5436063,3,0.000,...,0.000,0,True,False,0,0,-1.445363,-0.688767,-2.960846,0.000000
3,TRARRWA128F42A0195,92184,ARJGW911187FB586CA,44b5b950-2ae2-403a-8c67-82d8fc72033d,-1,116616,SODJYEC12A8C13D757,1199928,7,0.380,...,0.369,2007,True,False,4,0,-0.592255,0.238264,-0.359450,0.000000
4,TRARRPG12903CD1DE9,278655,AR9HQ6Y1187FB3C2CB,0e6524bd-6641-46a6-bce5-96f06c19aa46,-1,767122,SOGSOUE12A58A76443,8493899,10,0.551,...,1.000,0,True,False,0,5,-0.229873,-0.980535,-0.537565,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,TRBBWSE128F9313963,327831,AR9UPVA1187FB51260,5d8f51d7-0531-46a4-b604-3e758ff58283,-1,453635,SOWJQRH12AB0186761,5034564,7,0.444,...,0.243,0,True,False,0,7,0.140579,-2.569412,-2.960846,0.000000
9996,TRBBWDJ128F42595D5,106,ARUJ5A41187FB3F5F1,a3cb23fc-acd3-4ce0-8f36-1e5aa6a18432,1636,278645,SOIFMVY12A8AE467B1,3103042,9,0.756,...,0.536,1981,True,False,2,2,-0.805467,1.396137,3.459743,0.000000
9997,TRBBWNS128F426F760,197865,ARWC3XN1187B9A8D82,bccf45e7-bca3-45c7-ae7b-c8b0ee7e7fa3,168634,258315,SOGVDLQ12A58A7E3C5,2868556,7,0.180,...,0.307,2007,False,False,4,1,1.230329,0.159247,-0.276075,-0.899601
9998,TRBBWIP128F4287400,180187,AR0GGKI1187FB391D5,f930ba6a-a87f-44d6-abe5-0c172772d9e4,-1,219488,SOVVGSH12A8C14085F,2394328,8,0.042,...,0.484,2007,False,False,4,8,0.200456,0.294098,-0.054379,3.681032


## Object

In [363]:
# drop all cols ending in id
id_cols_obj = [col for col in df_obj if col.endswith('id')]
df_obj = df_obj[[col for col in df_obj if not id_cols_obj.__contains__(col)]]

In [364]:
df_obj = df_obj.drop([
    'analysis_bars_confidence',
    'analysis_bars_start',
    'analysis_beats_confidence',
    'analysis_beats_start',
    'analysis_sections_confidence',
    'analysis_sections_start',
    'analysis_segments_confidence',
    'analysis_segments_loudness_max',
    'analysis_segments_loudness_max_time',
    'analysis_segments_loudness_start',
    'analysis_segments_pitches',
    'analysis_segments_start',
    'analysis_tatums_confidence',
    'analysis_tatums_start',
    'analysis_segments_timbre',
    'analysis_songs_audio_md5',
    'metadata_songs_artist_location',
    'metadata_artist_terms_freq',
    'metadata_artist_terms_weight',
    'metadata_similar_artists',
    'musicbrainz_artist_mbtags_count',
    'metadata_songs_analyzer_version',
    'metadata_songs_genre'], axis=1)

In [365]:
list(df_obj.columns)

['metadata_artist_terms',
 'metadata_songs_artist_name',
 'metadata_songs_release',
 'metadata_songs_title',
 'musicbrainz_artist_mbtags']

In [366]:
df_obj['musicbrainz_artist_mbtags'] = ['|'.join(map(str, l)) for l in df_obj['musicbrainz_artist_mbtags']]

In [367]:
df_obj['metadata_artist_terms'] = ['|'.join(map(str, l)) for l in df_obj['metadata_artist_terms']]

In [385]:
df_obj

Unnamed: 0,metadata_artist_terms,metadata_songs_artist_name,metadata_songs_release,metadata_songs_title,musicbrainz_artist_mbtags
0,b'chanson'|b'visual kei'|b'hip hop'|b'pop rock...,Raphaël,Je Sais Que La Terre Est Plate (Deluxe),Je Sais Que La Terre Est Plate,
1,b'chanson'|b'dance pop'|b'pop rock'|b'soft roc...,Julie Zenatti,Comme Vous,On Efface,
2,b'early music'|b'celtic'|b'mediaeval'|b'folk'|...,The Baltimore Consort,Watkins Ale - Music of the English Renaissance,Howells Delight,
3,b'post-hardcore'|b'doomcore'|b'metalcore'|b'sc...,I Hate Sally,Don't Worry Lady,Martha Served,
4,b'orchestra'|b'musical theater'|b'british'|b'b...,Orlando Pops Orchestra,Easy Listening: Cartoon Songs,Zip-A-Dee-Doo-Dah (Song of the South),
...,...,...,...,...,...
9995,b'country gospel'|b'ccm'|b'country'|b'aor'|b'a...,Brent Lamb,Reflections Of A Simple Man,One About Heaven,
9996,b'dance rock'|b'pop rock'|b'british pop'|b'bal...,U2,October,October,b'irish'|b'rock'|b'ireland'|b'irlandais'|b'cla...
9997,b'hard rock'|b'modern rock'|b'glam metal'|b'ro...,ZO2,Ain't It Beautiful,Comin' Home,
9998,b'frevo'|b'samba'|b'banda'|b'rockabilly'|b'bos...,Eddie,Brazil Classics 7: What's Happening in Pernamb...,Pode Me Chamar,


In [392]:
list(df_id.columns)

['analysis_songs_track_id',
 'metadata_songs_artist_7digitalid',
 'metadata_songs_artist_id',
 'metadata_songs_artist_mbid',
 'metadata_songs_artist_playmeid',
 'metadata_songs_release_7digitalid',
 'metadata_songs_song_id',
 'metadata_songs_track_7digitalid']

In [390]:
df_id.head()

Unnamed: 0,analysis_songs_track_id,metadata_songs_artist_7digitalid,metadata_songs_artist_id,metadata_songs_artist_mbid,metadata_songs_artist_playmeid,metadata_songs_release_7digitalid,metadata_songs_song_id,metadata_songs_track_7digitalid
0,TRARRZU128F4253CA2,16971,AREJXK41187B9A4ACC,c43bb0d6-94d7-410f-80fb-e5a243b18d23,2676,275907,SOGSMXL12A81C23D88,3073568
1,TRARRJL128F92DED0E,92108,AR2XRFQ1187FB417FE,a69cd724-2f57-4ed0-bfed-ba20401eb84c,5772,382807,SOMBCOW12AAF3B229F,4249244
2,TRARRUZ128F9307C57,1701,ARODOO01187FB44F4A,60bd8a1c-c093-4849-8f28-08101ca059b1,20540,490659,SOEYIHF12AB017B5F4,5436063
3,TRARRWA128F42A0195,92184,ARJGW911187FB586CA,44b5b950-2ae2-403a-8c67-82d8fc72033d,-1,116616,SODJYEC12A8C13D757,1199928
4,TRARRPG12903CD1DE9,278655,AR9HQ6Y1187FB3C2CB,0e6524bd-6641-46a6-bce5-96f06c19aa46,-1,767122,SOGSOUE12A58A76443,8493899


In [391]:
df_m = pd.DataFrame()
for col in df_id:
    df_m[col] = df_id[col]
for col in df_num_cat:
    df_m[col] = df_num_cat[col]
for col in df_num:
    df_m[col] = df_num[col]
for col in df_obj:
    df_m[col] = df_obj[col]

In [400]:
df_m = df_m.drop([
    'metadata_songs_artist_7digitalid', 
    'metadata_songs_artist_mbid',
    'metadata_songs_release_7digitalid',
    'metadata_songs_artist_playmeid',
    'metadata_songs_track_7digitalid'], axis=1)

In [401]:
df_master = df[['user','song','count']].merge(df_m, left_on='song', right_on='metadata_songs_song_id')

In [402]:
df_master.shape

(772661, 27)

In [403]:
df_master.head()

Unnamed: 0,user,song,count,analysis_songs_track_id,metadata_songs_artist_id,metadata_songs_song_id,analysis_songs_key,analysis_songs_key_confidence,analysis_songs_mode,analysis_songs_mode_confidence,...,tempo_binned,analysis_songs_duration,metadata_songs_artist_familiarity,metadata_songs_artist_hotttnesss,metadata_songs_song_hotttnesss,metadata_artist_terms,metadata_songs_artist_name,metadata_songs_release,metadata_songs_title,musicbrainz_artist_mbtags
0,b80344d063b5ccb3212f76538f3d9e43d87dca9e,SOWEZSI12A81C21CE6,1,TRAUCNU128F42671EB,AR2UQQ51187B9AC816,SOWEZSI12A81C21CE6,5,0.351,0,0.318,...,8,-0.280898,0.732042,1.243373,5.09644,b'flamenco'|b'soundtrack'|b'folk'|b'spanish'|b...,Gipsy Kings,Greatest Hits,Tu Quieres Volver,b'classic pop and rock'|b'folk'
1,833c530ecda3d99deb8395f70400aa3999783d91,SOWEZSI12A81C21CE6,2,TRAUCNU128F42671EB,AR2UQQ51187B9AC816,SOWEZSI12A81C21CE6,5,0.351,0,0.318,...,8,-0.280898,0.732042,1.243373,5.09644,b'flamenco'|b'soundtrack'|b'folk'|b'spanish'|b...,Gipsy Kings,Greatest Hits,Tu Quieres Volver,b'classic pop and rock'|b'folk'
2,d6c5bd2b570b4faf8964d7ed04f3392ff505d2be,SOWEZSI12A81C21CE6,1,TRAUCNU128F42671EB,AR2UQQ51187B9AC816,SOWEZSI12A81C21CE6,5,0.351,0,0.318,...,8,-0.280898,0.732042,1.243373,5.09644,b'flamenco'|b'soundtrack'|b'folk'|b'spanish'|b...,Gipsy Kings,Greatest Hits,Tu Quieres Volver,b'classic pop and rock'|b'folk'
3,724534729c9f5dc72a009269c2c225883e4775d2,SOWEZSI12A81C21CE6,1,TRAUCNU128F42671EB,AR2UQQ51187B9AC816,SOWEZSI12A81C21CE6,5,0.351,0,0.318,...,8,-0.280898,0.732042,1.243373,5.09644,b'flamenco'|b'soundtrack'|b'folk'|b'spanish'|b...,Gipsy Kings,Greatest Hits,Tu Quieres Volver,b'classic pop and rock'|b'folk'
4,ee7aa84c164038c963cfd02a7e52a5598aa470c3,SOWEZSI12A81C21CE6,2,TRAUCNU128F42671EB,AR2UQQ51187B9AC816,SOWEZSI12A81C21CE6,5,0.351,0,0.318,...,8,-0.280898,0.732042,1.243373,5.09644,b'flamenco'|b'soundtrack'|b'folk'|b'spanish'|b...,Gipsy Kings,Greatest Hits,Tu Quieres Volver,b'classic pop and rock'|b'folk'


In [404]:
df_master.to_pickle(reldir + 'final/combined.csv', compression='zip')
df_m.to_pickle(reldir + 'final/songdata.csv', compression='zip')

In [406]:
pd.Series(years_enc).to_pickle(reldir + 'final/years_enc.csv', compression='zip')
pd.Series(tempo_enc).to_pickle(reldir + 'final/tempo_enc.csv', compression='zip')