In [8]:
import pandas as pd
import os
import numpy as np
import ast

In [10]:
def load(filepath):

    filename = os.path.basename(filepath)

    if 'features' in filename:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])

    if 'echonest' in filename:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])

    if 'genres' in filename:
        return pd.read_csv(filepath, index_col=0)

    if 'tracks' in filename:
        tracks = pd.read_csv(filepath, index_col=0, header=[0, 1])

        COLUMNS = [('track', 'tags'), ('album', 'tags'), ('artist', 'tags'),
                   ('track', 'genres'), ('track', 'genres_all')]
        for column in COLUMNS:
            tracks[column] = tracks[column].map(ast.literal_eval)

        COLUMNS = [('track', 'date_created'), ('track', 'date_recorded'),
                   ('album', 'date_created'), ('album', 'date_released'),
                   ('artist', 'date_created'), ('artist', 'active_year_begin'),
                   ('artist', 'active_year_end')]
        for column in COLUMNS:
            tracks[column] = pd.to_datetime(tracks[column])

        SUBSETS = ('small', 'medium', 'large')
        try:
            tracks['set', 'subset'] = tracks['set', 'subset'].astype(
                    'category', categories=SUBSETS, ordered=True)
        except (ValueError, TypeError):
            # the categories and ordered arguments were removed in pandas 0.25
            tracks['set', 'subset'] = tracks['set', 'subset'].astype(
                     pd.CategoricalDtype(categories=SUBSETS, ordered=True))

        COLUMNS = [('track', 'genre_top'), ('track', 'license'),
                   ('album', 'type'), ('album', 'information'),
                   ('artist', 'bio')]
        for column in COLUMNS:
            tracks[column] = tracks[column].astype('category')

        return tracks


In [14]:
features = load("../../fma_metadata/features.csv")

In [15]:
tracks = load("../../fma_metadata/tracks.csv")

In [38]:
tracks['track','favorites']

track_id
2           2
3           1
5           6
10        178
20          0
         ... 
155316      1
155317      1
155318      2
155319      0
155320      1
Name: (track, favorites), Length: 106574, dtype: int64

In [44]:
tracks['track','interest']


track_id
2          4656
3          1470
5          1933
10        54881
20          978
          ...  
155316      122
155317      194
155318      214
155319      336
155320      972
Name: (track, interest), Length: 106574, dtype: int64

In [48]:
tracks.track.columns

Index(['bit_rate', 'comments', 'composer', 'date_created', 'date_recorded',
       'duration', 'favorites', 'genre_top', 'genres', 'genres_all',
       'information', 'interest', 'language_code', 'license', 'listens',
       'lyricist', 'number', 'publisher', 'tags', 'title'],
      dtype='object')

In [8]:
onlySmallTid = tracks[tracks['set','subset'].isin(["small"])]

In [16]:
onlySmallTid = pd.DataFrame(onlySmallTid.index)

In [18]:
onlySmallTid.to_csv("onlySmallTid.csv")

In [9]:
genres = load("../../Genre Prediction/Data/Original Data/fma_metadata/genres.csv")

In [28]:
genres.sort_values("#tracks",ascending=False).head(10)

Unnamed: 0_level_0,#tracks,parent,title,top_level
genre_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
38,38154,0,Experimental,38
15,34413,0,Electronic,15
12,32923,0,Rock,12
1235,14938,0,Instrumental,1235
10,13845,0,Pop,10
17,12706,0,Folk,17
25,9261,12,Punk,12
1,8693,38,Avant-Garde,38
21,8389,0,Hip-Hop,21
32,7268,38,Noise,38


In [315]:
needed_genres = genres.sort_values(by="#tracks",ascending=False).head(17).index.to_list()

In [316]:
track=tracks['track']
tracks_genre = pd.DataFrame(track['genres_all'],index=track.index)

In [317]:
tracks_genre

Unnamed: 0_level_0,genres_all
track_id,Unnamed: 1_level_1
2,[21]
3,[21]
5,[21]
10,[10]
20,"[17, 10, 76, 103]"
...,...
155316,"[25, 12]"
155317,"[25, 12]"
155318,"[25, 12]"
155319,"[25, 12]"


In [318]:
tracks_genre_hot = tracks_genre['genres_all'].apply(lambda x: pd.Series({val: 1 for val in x})).fillna(0)

In [319]:
tracks_genre_hot_needed = tracks_genre_hot[needed_genres]
tracks_genre_hot_needed = tracks_genre_hot_needed.astype(int)

In [320]:
tracks_genre_hot_needed = tracks_genre_hot_needed.loc[(tracks_genre_hot_needed!=0).any(axis=1)]

In [321]:
tracks_genre_hot_needed

Unnamed: 0_level_0,38,15,12,1235,10,17,25,1,21,32,107,76,41,27,18,42,66
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
10,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
20,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155316,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
155317,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
155318,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
155319,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [322]:

tracks_genre_hot_needed.reset_index(inplace=True)

In [323]:
feacol = features.columns 

In [324]:
newCols = list(map(lambda x: (x,'',''),tracks_genre_hot_needed.columns.to_list())) + features.columns.to_list()

In [325]:
features = features.droplevel(0,axis=1).droplevel(0,axis=1)

In [327]:
features.reset_index(inplace=True)
features

number,track_id,01,02,03,04,05,06,07,08,09,...,04.1,05.1,06.1,01.1,01.2,01.3,01.4,01.5,01.6,01.7
0,2,7.180653,5.230309,0.249321,1.347620,1.482478,0.531371,1.481593,2.691455,0.866868,...,0.054125,0.012226,0.012111,5.758890,0.459473,0.085629,0.071289,0.000000,2.089872,0.061448
1,3,1.888963,0.760539,0.345297,2.295201,1.654031,0.067592,1.366848,1.054094,0.108103,...,0.063831,0.014212,0.017740,2.824694,0.466309,0.084578,0.063965,0.000000,1.716724,0.069330
2,5,0.527563,-0.077654,-0.279610,0.685883,1.937570,0.880839,-0.923192,-0.927232,0.666617,...,0.040730,0.012691,0.014759,6.808415,0.375000,0.053114,0.041504,0.000000,2.193303,0.044861
3,10,3.702245,-0.291193,2.196742,-0.234449,1.367364,0.998411,1.770694,1.604566,0.521217,...,0.074358,0.017952,0.013921,21.434212,0.452148,0.077515,0.071777,0.000000,3.542325,0.040800
4,20,-0.193837,-0.198527,0.201546,0.258556,0.775204,0.084794,-0.289294,-0.816410,0.043851,...,0.095003,0.022492,0.021355,16.669037,0.469727,0.047225,0.040039,0.000977,3.189831,0.030993
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106569,155316,-0.490129,0.463834,2.321970,-0.084352,1.662914,2.115189,-0.237794,5.695442,0.830353,...,0.128410,0.022547,0.019816,4.448255,0.172852,0.028773,0.028320,0.003906,0.955388,0.012385
106570,155317,-0.461559,-0.229601,-0.496632,-0.422033,0.130612,-0.263825,-0.628103,-0.082687,-0.229483,...,0.132964,0.023548,0.026527,3.270612,0.196289,0.031116,0.027832,0.002441,1.283060,0.019059
106571,155318,0.552473,-0.110498,-0.532014,0.263131,-0.224011,-0.530972,1.713526,1.418444,1.325197,...,0.108324,0.017540,0.020471,2.356727,0.212891,0.038450,0.037109,0.003418,0.828569,0.017904
106572,155319,-0.176901,0.187208,-0.050664,0.368843,0.066005,-0.857354,-0.780860,0.626281,-0.630938,...,0.088311,0.018328,0.017936,6.188604,0.167480,0.041480,0.038086,0.004883,1.818740,0.020133


In [328]:
tracks_genre_features_hot_needed = tracks_genre_hot_needed.merge(features,on="track_id",how="inner")

In [329]:
tracks_genre_features_hot_needed.set_index("track_id",inplace=True)

In [330]:
tracks_genre_features_hot_needed[38].dropna(inplace=True)

In [331]:
tracks_genre_features_hot_needed.columns[0:17]

Index([38, 15, 12, 1235, 10, 17, 25, 1, 21, 32, 107, 76, 41, 27, 18, 42, 66], dtype='object')

In [338]:
tracks_genre_features_hot_needed.columns = pd.MultiIndex.from_tuples(newCols[1:])

In [480]:
tracks_genre_hot_needed.sum()

track_id    7967304054
38               38154
15               34413
12               32923
1235             14938
10               13845
17               12706
25                9261
1                 8693
21                8389
32                7268
107               7206
76                7144
41                6110
27                6041
18                5913
42                5723
66                5432
dtype: int64

In [354]:
X = tracks_genre_features_hot_needed.drop(list(map(lambda x: (x,'',''),tracks_genre_hot_needed.columns.to_list()))[1:],axis=1)

In [356]:
y = tracks_genre_features_hot_needed[list(map(lambda x: (x,'',''),tracks_genre_hot_needed.columns.to_list()))[1:]]

In [494]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, log_loss, mean_squared_error, r2_score
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier


In [496]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [498]:
model = BalancedRandomForestClassifier(n_estimators=100)
multi_target_clf = MultiOutputClassifier(model)

In [500]:
multi_target_clf.fit(x_train,y_train)

In [504]:
y_pred=multi_target_clf.predict_proba(x_test)

In [506]:
y_pred = list(map(lambda x:x.tolist(),y_pred))

In [508]:
ypred = pd.DataFrame(y_pred)

In [510]:
y_pred = ypred.transpose()

In [446]:
y_pred = ypred.transpose()

In [512]:
y_pred= y_pred.map(lambda x: 1 if x[0]<0.5 else 0)

In [514]:
y_pred

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,1,0,0,0,0,0,1,1,0,1,1,0,1,1,0,1,0
1,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0
2,1,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0
3,1,1,0,1,0,1,0,1,0,0,1,1,1,0,1,1,0
4,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29692,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0
29693,0,0,1,1,1,1,0,0,0,0,0,0,0,1,1,0,1
29694,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0
29695,1,0,0,1,1,1,0,1,0,0,0,1,0,1,1,0,0


In [516]:
y_test 

Unnamed: 0_level_0,38,15,12,1235,10,17,25,1,21,32,107,76,41,27,18,42,66
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
84649,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
30255,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0
33104,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
16595,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
105253,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17879,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
95997,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0
131474,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
35835,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [518]:
accuracy= accuracy_score(y_pred,y_test)

In [520]:
accuracy

0.013738761491059704