# Modelo de Classificação Músical 

In [2]:
#Importando as bibliotecas 
import pandas as pd

# MODELO COM PIPELINE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer


In [3]:
# lê o dataset com as musicas
tracks = pd.read_csv('datasets/fma-rock-vs-hiphop.csv')

# lê o dataset com as características de cada estilo
echonest_metrics = pd.read_json('datasets/echonest-metrics.json',precise_float=True)

# une os dois datasets
echo_tracks = pd.merge(echonest_metrics,
tracks[['track_id','genre_top']],
on='track_id')

In [4]:

#Verificando o dataset antes da união:
tracks.head()

Unnamed: 0,track_id,bit_rate,comments,composer,date_created,date_recorded,duration,favorites,genre_top,genres,...,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
0,135,256000,1,,2008-11-26 01:43:26,2008-11-26 00:00:00,837,0,Rock,"[45, 58]",...,,2484,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1832,,0,,[],Father's Day
1,136,256000,1,,2008-11-26 01:43:35,2008-11-26 00:00:00,509,0,Rock,"[45, 58]",...,,1948,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1498,,0,,[],Peel Back The Mountain Sky
2,151,192000,0,,2008-11-26 01:44:55,,192,0,Rock,[25],...,,701,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,148,,4,,[],Untitled 04
3,152,192000,0,,2008-11-26 01:44:58,,193,0,Rock,[25],...,,637,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,98,,11,,[],Untitled 11
4,153,256000,0,Arc and Sender,2008-11-26 01:45:00,2008-11-26 00:00:00,405,5,Rock,[26],...,,354,en,Attribution-NonCommercial-NoDerivatives (aka M...,424,,2,,[],Hundred-Year Flood


In [5]:
#Verificando o dataset das características 
echonest_metrics.head()

Unnamed: 0,track_id,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence
0,2,0.416675,0.675894,0.634476,0.010628,0.177647,0.15931,165.922,0.576661
1,3,0.374408,0.528643,0.817461,0.001851,0.10588,0.461818,126.957,0.26924
2,5,0.043567,0.745566,0.70147,0.000697,0.373143,0.124595,100.26,0.621661
3,10,0.95167,0.658179,0.924525,0.965427,0.115474,0.032985,111.562,0.96359
4,134,0.452217,0.513238,0.56041,0.019443,0.096567,0.525519,114.29,0.894072


In [6]:
# verifica como ficou o dataset resultante da união
echo_tracks.head()

Unnamed: 0,track_id,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence,genre_top
0,2,0.416675,0.675894,0.634476,0.010628,0.177647,0.15931,165.922,0.576661,Hip-Hop
1,3,0.374408,0.528643,0.817461,0.001851,0.10588,0.461818,126.957,0.26924,Hip-Hop
2,5,0.043567,0.745566,0.70147,0.000697,0.373143,0.124595,100.26,0.621661,Hip-Hop
3,134,0.452217,0.513238,0.56041,0.019443,0.096567,0.525519,114.29,0.894072,Hip-Hop
4,153,0.988306,0.255661,0.979774,0.973006,0.121342,0.05174,90.241,0.034018,Rock


In [7]:
# variáveis que utilizaremos para fazer a predição
features = echo_tracks.drop(columns=['genre_top', 'track_id'], axis=1)

# variável que queremos prever
labels = echo_tracks['genre_top']


# divisão em treino e teste
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=0)

In [8]:
# Criando o Preprocessamento no Pipeline
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())])

preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, X_train.columns)])

# Pipeline com random forest
from sklearn.ensemble import RandomForestClassifier
rf = Pipeline(steps=[('preprocessor', preprocessor),
('classifier', RandomForestClassifier())])

# treina o modelo
rf.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['acousticness', 'danceability', 'energy', 'instrumentalness',
       'liveness', 'speechiness', 'tempo', 'valence'],
      dtype='object'))])),
                ('classifier', RandomForestClassifier())])

In [9]:
# predicoes
y_pred = rf.predict(X_test)

# Acurácia do modelo
from sklearn.metrics import accuracy_score
print("Random Forest Accuracy: \n",accuracy_score(y_test, y_pred))

# Classification report
from sklearn.metrics import classification_report
class_rep_rf = classification_report(y_test, y_pred)

print("Random Forest: \n", class_rep_rf)

# Matriz de Confusao
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

Random Forest Accuracy: 
 0.926439972241499
Random Forest: 
               precision    recall  f1-score   support

     Hip-Hop       0.85      0.75      0.80       279
        Rock       0.94      0.97      0.96      1162

    accuracy                           0.93      1441
   macro avg       0.90      0.86      0.88      1441
weighted avg       0.92      0.93      0.92      1441



array([[ 209,   70],
       [  36, 1126]], dtype=int64)

In [12]:
from sklearn.metrics import precision_recall_fscore_support

def trying_class_weight(class_weight):
    
#FUNÇÃO PARA TESTAR O RESULTADO DE DIFERENTES PESOS PARA AS CLASSES """
# Criando o Preprocessamento no Pipeline
numeric_transformer = Pipeline(
steps=[
("imputer", SimpleImputer(strategy="median")),
("scaler", StandardScaler()),
]
)

preprocessor = ColumnTransformer(
transformers=[("num", numeric_transformer, X_train.columns)]
)

# Pipeline com random forest
rf = Pipeline(
steps=[
("preprocessor", preprocessor),
("classifier", RandomForestClassifier(class_weight=class_weight)),
]
)

# Fita o modelo
rf.fit(X_train, y_train)

# predicoes
y_pred = rf.predict(X_test)

# Matriz de Confusao
cm = confusion_matrix(y_test, y_pred)
accuracy = (cm[0, 0] + cm[1, 1]) / len(y_test)

precision, recall, fscore, support = score(y_test, y_pred, average="macro")

return accuracy, fscore

IndentationError: expected an indented block (<ipython-input-12-bea2231c9dae>, line 7)

In [14]:
# Primeiro, algumas tentativas padrão:
accuracy, fscore = trying_class_weight("balanced")
print("For balanced:")
print(accuracy)
print(fscore)

accuracy, fscore = trying_class_weight(None)
print("For None:")
print(accuracy)
print(fscore)

accuracy, fscore = trying_class_weight({"Hip-Hop": 1, "Rock": 4})
print("For 1:4")
print(accuracy)
print(fscore)

NameError: name 'trying_class_weight' is not defined