In [1]:
import pandas as pd
import numpy as np
import librosa
import glob
import os
import seaborn as sns

import IPython.display as ipd
import matplotlib.pyplot as plt
#import librosa.display

from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation,Flatten
from tensorflow.keras.optimizers import Adam
from sklearn import metrics

from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime 

In [2]:
singers = {"1_izumi_iori":"iori",
          "2_nikaido_yamato":"yamato",
          "3_izumi_mitsuki":"mitsuki",
          "4_yotsuba_tamaki":"tamaki",
          "5_osaka_sougo":"sougo",
          "6_rokuya_nagi":"nagi",
          "7_nanase_riku":"riku",
          }

In [3]:
filenames = []
labels = []
folder = []

for file_folder, label in singers.items():
    
    filepath = f"datasets/music_files/{file_folder}/clips/"
    filelist = glob.glob(os.path.join(os.getcwd(), filepath, "*.mp3"))
    
    for file in filelist:
        filename = os.path.basename(file)
        
        folder.append(file_folder)
        filenames.append(filename)
        labels.append(singers[file_folder])

In [4]:
filename_dict = {"folder":folder,
                 "filenames":filenames,
                "labels":labels}

In [5]:
filename_df = pd.DataFrame(filename_dict)

filename_df.head()

Unnamed: 0,folder,filenames,labels
0,1_izumi_iori,iori_1.mp3,iori
1,1_izumi_iori,iori_10.mp3,iori
2,1_izumi_iori,iori_100.mp3,iori
3,1_izumi_iori,iori_101.mp3,iori
4,1_izumi_iori,iori_102.mp3,iori


In [6]:
filename_df.tail()

Unnamed: 0,folder,filenames,labels
994,7_nanase_riku,riku_95.mp3,riku
995,7_nanase_riku,riku_96.mp3,riku
996,7_nanase_riku,riku_97.mp3,riku
997,7_nanase_riku,riku_98.mp3,riku
998,7_nanase_riku,riku_99.mp3,riku


In [7]:
filename_df.count()

folder       999
filenames    999
labels       999
dtype: int64

In [8]:
filename_df['labels'].unique()

array(['iori', 'yamato', 'mitsuki', 'tamaki', 'sougo', 'nagi', 'riku'],
      dtype=object)

In [9]:
filename_df['labels'].value_counts()

riku       163
mitsuki    157
tamaki     145
yamato     141
nagi       133
iori       132
sougo      128
Name: labels, dtype: int64

In [18]:
# transform audio data into MFCCs
# sr=None will use native sample rate of audio file

def features_extractor(file, sr=None):
    
    
    #load audio file - get sound_sample and sample_rate
    audio, sample_rate = librosa.load(file, sr=sr) 
    #extract mfcc, 1st and 2nd derivatives
    mfcc_features = librosa.feature.mfcc(y=audio, sr=sample_rate)
    mfcc1_features = librosa.feature.delta(mfcc_features,order=1)
    mfcc2_features = librosa.feature.delta(mfcc_features,order=2)
    #reduce dimensionality for scaled feature by calculating mean
    mfcc_scaled_features = np.mean(mfcc_features.T,axis=0)
    mfcc1_scaled_features = np.mean(mfcc1_features.T,axis=0)
    mfcc2_scaled_features = np.mean(mfcc2_features.T,axis=0)
    
    concat_mfcc_scaled = np.concatenate((mfcc_scaled_features, mfcc1_scaled_features, mfcc2_scaled_features))
    return concat_mfcc_scaled

In [19]:
# iterate through all audio files and extract features

extracted_features=[]

for index_num,row in filename_df.iterrows():
    
    file_folder = row['folder']
    dataset_path = f"datasets/music_files/{file_folder}/clips/"
    
    filepath = os.path.join(os.getcwd(), dataset_path, row['filenames'])
    
    feature_data=features_extractor(filepath)
    
    extracted_features.append(feature_data)

In [20]:
filename_df['features'] = extracted_features

filename_df.head()

Unnamed: 0,folder,filenames,labels,features
0,1_izumi_iori,iori_1.mp3,iori,"[-212.99126, 81.818504, -59.78514, 16.213083, ..."
1,1_izumi_iori,iori_10.mp3,iori,"[-213.6378, 83.18009, -45.80968, 28.044502, -3..."
2,1_izumi_iori,iori_100.mp3,iori,"[-208.48625, 88.34909, -23.720839, 37.77376, -..."
3,1_izumi_iori,iori_101.mp3,iori,"[-221.32008, 102.64978, -24.54152, 30.993345, ..."
4,1_izumi_iori,iori_102.mp3,iori,"[-244.0426, 94.649635, -20.543169, 33.919144, ..."


In [21]:
# checking the value for the first row of data
# checking the shape of the array
filename_df['features'][0]

array([-2.12991257e+02,  8.18185043e+01, -5.97851410e+01,  1.62130833e+01,
       -2.82210579e+01, -9.54763293e-01, -4.48582420e+01,  1.75307322e+00,
       -1.66345387e+01, -8.28979301e+00, -1.63693485e+01,  2.10300970e+00,
       -8.55566788e+00, -4.05240107e+00, -6.12985277e+00, -6.00301266e+00,
       -1.33579950e+01, -1.06783886e+01, -4.16215086e+00, -1.15102015e+01,
        1.01522613e+00,  1.16658375e-01,  2.99505711e-01,  2.53887177e-01,
       -1.97675020e-01, -2.34436877e-02, -1.24401845e-01, -4.72518355e-02,
       -1.22273818e-01, -7.91277215e-02, -9.40342620e-02, -4.02564146e-02,
       -5.38735390e-02, -2.37187278e-03, -9.09994096e-02,  2.43451055e-02,
       -1.10673234e-01, -1.54779423e-02, -3.62798423e-02, -1.25949845e-01,
        6.36945218e-02,  4.47250716e-02,  6.02905564e-02, -2.96153370e-02,
        3.77080850e-02,  7.02312589e-03,  1.22778760e-02,  1.66906994e-02,
       -1.74562968e-02,  1.08389538e-02,  2.96856109e-02, -1.72340702e-02,
        1.14990478e-04,  

In [22]:
# length of a feature array - this will be used in model input shape
filename_df['features'][0].shape

(60,)

In [23]:
# separate input features from output variables
X=np.array(filename_df['features'].tolist())
y=np.array(filename_df['labels'].tolist())

In [24]:
# change word category labels into numerical category labels
labelencoder=LabelEncoder()
y=to_categorical(labelencoder.fit_transform(y))

In [25]:
# separate into train and test data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=97)

In [26]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(799, 60)
(200, 60)
(799, 7)
(200, 7)


In [27]:
filename_df['features'][0].shape[0]

60

In [28]:
feature_shape = filename_df['features'][0].shape[0]

In [30]:
# number of classes (label categories)
num_labels=y.shape[1]

print(num_labels)

7


In [31]:
model=Sequential()
###first layer
model.add(Dense(256,input_shape=(feature_shape,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
###second layer
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dropout(0.5))
###third layer
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))
###final layer
model.add(Dense(num_labels))
model.add(Activation('softmax'))

In [41]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 256)               15616     
                                                                 
 activation_3 (Activation)   (None, 256)               0         
                                                                 
 dropout_3 (Dropout)         (None, 256)               0         
                                                                 
 dense_4 (Dense)             (None, 128)               32896     
                                                                 
 activation_4 (Activation)   (None, 128)               0         
                                                                 
 dropout_4 (Dropout)         (None, 128)               0         
                                                                 
 dense_5 (Dense)             (None, 64)               

In [32]:
model.compile(loss='categorical_crossentropy',metrics=['accuracy'],optimizer='adam')

In [33]:
num_epochs = 200
num_batch_size = 32

start = datetime.now()
model.fit(X_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(X_test, y_test), verbose=2)
duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/200
25/25 - 2s - loss: 24.9545 - accuracy: 0.1289 - val_loss: 1.9075 - val_accuracy: 0.2900 - 2s/epoch - 69ms/step
Epoch 2/200
25/25 - 0s - loss: 9.8158 - accuracy: 0.1665 - val_loss: 1.8709 - val_accuracy: 0.2100 - 177ms/epoch - 7ms/step
Epoch 3/200
25/25 - 0s - loss: 5.7308 - accuracy: 0.1539 - val_loss: 1.9045 - val_accuracy: 0.2250 - 172ms/epoch - 7ms/step
Epoch 4/200
25/25 - 0s - loss: 4.2478 - accuracy: 0.1452 - val_loss: 1.8854 - val_accuracy: 0.2900 - 175ms/epoch - 7ms/step
Epoch 5/200
25/25 - 0s - loss: 3.4338 - accuracy: 0.1414 - val_loss: 1.8908 - val_accuracy: 0.2600 - 183ms/epoch - 7ms/step
Epoch 6/200
25/25 - 0s - loss: 2.7705 - accuracy: 0.1602 - val_loss: 1.9006 - val_accuracy: 0.2500 - 175ms/epoch - 7ms/step
Epoch 7/200
25/25 - 0s - loss: 2.5113 - accuracy: 0.1677 - val_loss: 1.9266 - val_accuracy: 0.1800 - 181ms/epoch - 7ms/step
Epoch 8/200
25/25 - 0s - loss: 2.3546 - accuracy: 0.1790 - val_loss: 1.9324 - val_accuracy: 0.1150 - 177ms/epoch - 7ms/step
Epoch 9/2

In [34]:
test_accuracy=model.evaluate(X_test,y_test,verbose=0)
print(test_accuracy[1])

0.8399999737739563


In [35]:
predict_x=model.predict(X_test) 
classes_x=np.argmax(predict_x,axis=1)
predictions = np.array(classes_x)



In [36]:
true_labels_ytest = np.argmax(y_test,axis=1)

In [37]:
predictions_col = pd.Series(predictions)
true_col = pd.Series(true_labels_ytest)

In [38]:
prediction_df = pd.DataFrame({'predictions':predictions_col, 'true_label':true_col})

prediction_df.head()

Unnamed: 0,predictions,true_label
0,3,3
1,1,1
2,5,5
3,3,3
4,1,6


In [39]:
cm = pd.DataFrame(
    confusion_matrix(true_col, predictions_col),
    columns=['pred_iori',
             'pred_yamato',
            'pred_mitsuki',
            'pred_tamaki',
            'pred_sougo',
            'pred_nagi',
            'pred_riku'],
    index=['true_iori',
             'true_yamato',
            'true_mitsuki',
            'true_tamaki',
            'true_sougo',
            'true_nagi',
            'true_riku']
)

cm

Unnamed: 0,pred_iori,pred_yamato,pred_mitsuki,pred_tamaki,pred_sougo,pred_nagi,pred_riku
true_iori,30,0,0,1,3,4,0
true_yamato,0,25,0,2,0,0,1
true_mitsuki,0,0,23,0,2,0,0
true_tamaki,0,1,0,20,1,1,1
true_sougo,0,1,0,2,23,1,0
true_nagi,2,0,1,1,0,24,0
true_riku,0,5,0,2,0,0,23


In [42]:
vocalists = ['iori', 'yamato', 'mitsuki', 'tamaki', 'sougo', 'nagi', 'riku']

In [44]:
print(classification_report(true_col, predictions_col, target_names=vocalists))

              precision    recall  f1-score   support

        iori       0.94      0.79      0.86        38
      yamato       0.78      0.89      0.83        28
     mitsuki       0.96      0.92      0.94        25
      tamaki       0.71      0.83      0.77        24
       sougo       0.79      0.85      0.82        27
        nagi       0.80      0.86      0.83        28
        riku       0.92      0.77      0.84        30

    accuracy                           0.84       200
   macro avg       0.84      0.84      0.84       200
weighted avg       0.85      0.84      0.84       200

