In [2]:
import pandas as pd
import os
import ast


In [4]:
def load(filepath):

    filename = os.path.basename(filepath)

    if 'features' in filename:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])

    if 'echonest' in filename:
        return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])

    if 'genres' in filename:
        return pd.read_csv(filepath, index_col=0)

    if 'tracks' in filename:
        tracks = pd.read_csv(filepath, index_col=0, header=[0, 1])

        COLUMNS = [('track', 'tags'), ('album', 'tags'), ('artist', 'tags'),
                   ('track', 'genres'), ('track', 'genres_all')]
        for column in COLUMNS:
            tracks[column] = tracks[column].map(ast.literal_eval)

        COLUMNS = [('track', 'date_created'), ('track', 'date_recorded'),
                   ('album', 'date_created'), ('album', 'date_released'),
                   ('artist', 'date_created'), ('artist', 'active_year_begin'),
                   ('artist', 'active_year_end')]
        for column in COLUMNS:
            tracks[column] = pd.to_datetime(tracks[column])

        SUBSETS = ('small', 'medium', 'large')
        try:
            tracks['set', 'subset'] = tracks['set', 'subset'].astype(
                    'category', categories=SUBSETS, ordered=True)
        except (ValueError, TypeError):
            # the categories and ordered arguments were removed in pandas 0.25
            tracks['set', 'subset'] = tracks['set', 'subset'].astype(
                     pd.CategoricalDtype(categories=SUBSETS, ordered=True))

        COLUMNS = [('track', 'genre_top'), ('track', 'license'),
                   ('album', 'type'), ('album', 'information'),
                   ('artist', 'bio')]
        for column in COLUMNS:
            tracks[column] = tracks[column].astype('category')

        return tracks


In [6]:
tracks = load("../../../fma_metadata/tracks.csv")

In [7]:
all_features = pd.read_csv("../all_features.csv",header=[0,1,2],index_col=[0])

In [8]:
all_features

Unnamed: 0_level_0,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,...,mel_spec,mel_spec,mel_spec,mel_spec,mel_spec,mel_spec,mel_spec,mel_spec,mel_spec,mel_spec
statistics,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,std,std,std,std,std,std,std
number,01,02,03,04,05,06,07,08,09,10,...,90,91,92,93,94,95,96,97,98,99
track_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2,7.180653,5.230309,0.249321,1.347620,1.482478,0.531371,1.481593,2.691455,0.866868,1.341231,...,0.005500,0.004066,0.003965,0.002347,0.001663,0.002632,0.001452,0.000114,0.000002,0.000002
3,1.888963,0.760539,0.345297,2.295201,1.654031,0.067592,1.366848,1.054094,0.108103,0.619185,...,0.014708,0.019320,0.078528,0.072792,0.069122,0.013616,0.008447,0.001303,0.000003,0.000002
5,0.527563,-0.077654,-0.279610,0.685883,1.937570,0.880839,-0.923192,-0.927232,0.666617,1.038546,...,0.000010,0.000009,0.000008,0.000006,0.000005,0.000007,0.000004,0.000002,0.000002,0.000001
10,3.702245,-0.291193,2.196742,-0.234449,1.367364,0.998411,1.770694,1.604566,0.521217,1.982386,...,0.018939,0.015547,0.010535,0.006567,0.006368,0.003965,0.003788,0.002343,0.001213,0.000316
20,-0.193837,-0.198527,0.201546,0.258556,0.775204,0.084794,-0.289294,-0.816410,0.043851,-0.804761,...,0.008917,0.008339,0.006849,0.006172,0.005579,0.004760,0.004246,0.004115,0.003781,0.003704
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155316,-0.490129,0.463834,2.321970,-0.084352,1.662914,2.115189,-0.237794,5.695442,0.830353,1.951819,...,0.001007,0.000663,0.000429,0.000345,0.000208,0.000123,0.000121,0.000038,0.000002,0.000001
155317,-0.461559,-0.229601,-0.496632,-0.422033,0.130612,-0.263825,-0.628103,-0.082687,-0.229483,-0.492753,...,0.002338,0.001594,0.001051,0.000818,0.000603,0.000557,0.000391,0.000214,0.000067,0.000098
155318,0.552473,-0.110498,-0.532014,0.263131,-0.224011,-0.530972,1.713526,1.418444,1.325197,0.120333,...,0.003115,0.002223,0.001846,0.001351,0.000899,0.000688,0.000458,0.000194,0.000042,0.000019
155319,-0.176901,0.187208,-0.050664,0.368843,0.066005,-0.857354,-0.780860,0.626281,-0.630938,-0.787229,...,0.002461,0.001665,0.001135,0.000730,0.000473,0.000313,0.000198,0.000103,0.000006,0.000005


In [9]:
genres = tracks[tracks['track','genres'].map(lambda x: len(x) > 0)]['track','genres']

In [10]:
track_genres = pd.DataFrame(genres)

In [11]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Fit-transform the 'tags' column
genres_encoded = mlb.fit_transform(genres)

# Turn it into a DataFrame
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_)

In [12]:
track_genres.columns = ["genres"]

In [13]:
track_one_hot_genres = pd.concat([track_genres,genres_df],axis=1).drop("genres",axis=1)

In [14]:
track_one_hot_genres.columns =pd.MultiIndex.from_tuples(track_one_hot_genres.columns.map(lambda x: ('','',x)))

In [15]:
individual_genre_value_counts = pd.DataFrame(track_one_hot_genres.sum(),columns=["count"])

In [16]:
individual_genre_value_counts[individual_genre_value_counts["count"]<2]

Unnamed: 0,Unnamed: 1,Unnamed: 2,count
,,377,1.0


In [17]:
track_one_hot_genres.drop(('','',377),axis=1,inplace=True)

In [18]:
track_one_hot_genres.sum(axis=1)

2         1.0
3         1.0
5         2.0
10        2.0
20        1.0
         ... 
104335    1.0
104336    1.0
104337    1.0
104338    1.0
104339    1.0
Length: 142049, dtype: float64

In [19]:
features_genre_one_hot_encoded = all_features.merge(track_one_hot_genres,left_index=True,right_index=True)


In [20]:
features_genre_one_hot_encoded.dropna(inplace=True)

In [23]:
X = features_genre_one_hot_encoded.drop(track_one_hot_genres.columns.to_list(),axis=1)
y = features_genre_one_hot_encoded[track_one_hot_genres.columns.to_list()]

In [24]:
pd.DataFrame(y_tem.sum())

NameError: name 'y_tem' is not defined

In [None]:
from sklearn.preprocessing import StandardScaler
scaler_standard = StandardScaler()
X = scaler_standard.fit_transform(X)

In [None]:
X

In [None]:
y.shape

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, mean_squared_error, r2_score
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [None]:
x_train, x_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3)
x_val, x_test, y_val, y_test = train_test_split(
    x_temp, y_temp, test_size=0.5, random_state=42)

In [None]:
# y_train_resampled.value_counts()

In [None]:
model = XGBClassifier(n_estimators = 50)

In [None]:
# from sklearn.model_selection import RandomizedSearchCV
# param_grid = {
#     'n_estimators': [100, 300, 500],
#     'max_depth': [3, 5, 7],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'subsample': [0.7, 0.8, 1.0],
#     'colsample_bytree': [0.7, 0.8, 1.0],
#     'gamma': [0, 0.1, 0.2],
#     'reg_lambda': [1, 1.5, 2]
# }
# random_search = RandomizedSearchCV(
#     estimator=model,
#     param_distributions=param_grid,
#     n_iter=30,            # number of random combinations to try
#     scoring='accuracy',
#     cv=3,
#     verbose=2,
#     random_state=42,
#     n_jobs=-1
# )

# random_search.fit(x_train, y_train)

# print("Best Random Search Parameters:", random_search.best_params_)
# best_xgb_random = random_search.best_estimator_


In [None]:
# best_xgb_random

In [None]:
# y_train_resampled

In [None]:
# y_train.value_counts()

In [None]:
# y_train_resampled = y_train_resampled.replace(dictG)

In [None]:
# # from imblearn.over_sampling import SMOTE

# # smote = SMOTE(sampling_strategy='auto', random_state=42)
# # x_train, y_train = smote.fit_resample(x_train, y_train)
# from sklearn.utils.class_weight import compute_class_weight
# # import numpy as np

# # # Suppose y_train has classes like [0, 1, 2, 3]
# # classes = np.unique(y_train)
# # class_weights = compute_class_weight('balanced', classes=classes, y=y_train)

# # print(class_weights)  # one weight for each class

# # weights = np.array([class_weights[i] for i in y_train])
# import numpy as np
# classes = np.unique(y_train)
# class_weights = compute_class_weight('balanced', classes=classes, y=y_train)

# # Now create sample weights
# sample_weights = np.array([class_weights[label] for label in y_train])

In [None]:
eval_set = [(x_train, y_train), (x_val, y_val)]

In [None]:
model.fit(x_train,y_train,eval_set=[(x_val, y_val)],verbose=True)

In [None]:
y_pred=model.predict(x_test)

In [None]:
y_pred = pd.DataFrame(y_pred)

In [None]:
# def top_two_columns(row):
#     # Sort values, get top two columns
#     top2 = row.sort_values(ascending=False).index[:2]
#     return top2.to_list()

# # Apply function row-wise (axis=1)
# y_pred['top_two_genres'] = y_pred.apply(top_two_columns, axis=1)

# y_pred

In [27]:
y_test

NameError: name 'y_test' is not defined

In [None]:
from sklearn.metrics import hamming_loss, f1_score, accuracy_score

# Hamming Loss (lower is better)
hamm_loss = hamming_loss(y_test, y_pred)
print("Hamming Loss:", hamm_loss)

# Macro F1 (average F1 across all genres equally)
macro_f1 = f1_score(y_test, y_pred, average='macro')
print("Macro F1 Score:", macro_f1)

# Micro F1 (better for imbalanced labels)
micro_f1 = f1_score(y_test, y_pred, average='micro')
print("Micro F1 Score:", micro_f1)

# Exact Match Accuracy (very strict)
exact_match = accuracy_score(y_test, y_pred)
print("Exact Match Accuracy:", exact_match)

In [254]:
# def get_active_columns(row):
#     return row.index[row == 1].to_list()

# # Apply row-wise
# y_test['labels'] = y_test.apply(get_active_columns, axis=1)

# y_test

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0,Unnamed: 20_level_0,labels
Unnamed: 0_level_1,1,2,3,4,5,6,7,8,9,10,...,808,810,811,906,1032,1060,1156,1193,1235,Unnamed: 21_level_1
9555,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[(, , 10), (, , 37), (, , 130)]"
46150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[(, , 27), (, , 76)]"
34439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[(, , 12)]"
69703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[(, , 21)]"
75429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[(, , 33), (, , 94)]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[(, , 10), (, , 76)]"
47082,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[(, , 70), (, , 107)]"
91786,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[(, , 53), (, , 101), (, , 109), (, , 167)]"
78122,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[(, , 76), (, , 85)]"


In [123]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       831
           1       0.00      0.00      0.00       158
           2       0.00      0.00      0.00       180
           3       0.00      0.00      0.00       270
           4       0.00      0.00      0.00       204
           5       0.00      0.00      0.00        35
           6       0.00      0.00      0.00        24
           7       0.00      0.00      0.00        77
           8       0.00      0.00      0.00        84
           9       0.00      0.00      0.00       650
          10       0.00      0.00      0.00        31
          11       0.00      0.00      0.00       747
          12       0.00      0.00      0.00        16
          13       0.00      0.00      0.00        40
          14       0.80      0.00      0.00      2246
          15       0.00      0.00      0.00        11
          16       0.00      0.00      0.00       703
          17       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [425]:
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(score_func=f_classif, k=1000)  # select top 50 features
X_train_selected = selector.fit_transform(x_train, y_train)
X_test_selected = selector.transform(x_test)
X_val_selected = selector.transform(x_val)

In [426]:
# import matplotlib.pyplot as plt

# # Plot top 50 features
# top_n = 50

# plt.figure(figsize=(10, 12))
# plt.barh(feat_importances['feature'][:top_n], feat_importances['importance'][:top_n])
# plt.gca().invert_yaxis()  # Most important on top
# plt.xlabel('Importance')
# plt.title(f'Top {top_n} Feature Importances')
# plt.show()

In [436]:
best_xgb_random.fit(X_train_selected,y_train,sample_weight=sample_weights,eval_set=[(X_val_selected, y_val)],verbose=True)

[0]	validation_0-mlogloss:1.56132
[1]	validation_0-mlogloss:1.51920
[2]	validation_0-mlogloss:1.48154
[3]	validation_0-mlogloss:1.44850
[4]	validation_0-mlogloss:1.41932
[5]	validation_0-mlogloss:1.39339
[6]	validation_0-mlogloss:1.36929
[7]	validation_0-mlogloss:1.34718
[8]	validation_0-mlogloss:1.32707
[9]	validation_0-mlogloss:1.30856
[10]	validation_0-mlogloss:1.29222
[11]	validation_0-mlogloss:1.27706
[12]	validation_0-mlogloss:1.26229
[13]	validation_0-mlogloss:1.24901
[14]	validation_0-mlogloss:1.23569
[15]	validation_0-mlogloss:1.22375
[16]	validation_0-mlogloss:1.21208
[17]	validation_0-mlogloss:1.20157
[18]	validation_0-mlogloss:1.19218
[19]	validation_0-mlogloss:1.18312
[20]	validation_0-mlogloss:1.17515
[21]	validation_0-mlogloss:1.16746
[22]	validation_0-mlogloss:1.15952
[23]	validation_0-mlogloss:1.15268
[24]	validation_0-mlogloss:1.14549
[25]	validation_0-mlogloss:1.13929
[26]	validation_0-mlogloss:1.13305
[27]	validation_0-mlogloss:1.12681
[28]	validation_0-mlogloss:1.1

In [437]:
y_test_pred = best_xgb_random.predict(X_test_selected)

In [438]:
print(classification_report(y_test_pred, y_test))

              precision    recall  f1-score   support

           0       0.72      0.79      0.76      3132
           1       0.80      0.74      0.77      3514
           2       0.36      0.40      0.38       877
           3       0.64      0.64      0.64      1000
           4       0.69      0.60      0.64      1068

    accuracy                           0.70      9591
   macro avg       0.64      0.63      0.64      9591
weighted avg       0.70      0.70      0.70      9591



In [638]:
we_will_rock_you = pd.read_csv("we_will_rock_you.csv",header=[0,1,2],index_col=[0])

In [640]:
we_will_rock_you

feature,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,chroma_cens,...,tonnetz,tonnetz,tonnetz,zcr,zcr,zcr,zcr,zcr,zcr,zcr
statistics,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,kurtosis,...,std,std,std,kurtosis,max,mean,median,min,skew,std
number,01,02,03,04,05,06,07,08,09,10,...,04,05,06,01,01,01,01,01,01,01
0,2.14157,0.411554,-0.419062,0.632222,0.564811,0.001753,0.424778,2.937335,0.471645,0.067198,...,0.102356,0.034659,0.03353,33.296363,0.554199,0.062411,0.050293,0.0,5.113267,0.060578


In [642]:
best_xgb_random.predict_proba(selector.transform(we_will_rock_you))

array([[0.31377733, 0.5654845 , 0.02500724, 0.09083951, 0.00489141]],
      dtype=float32)