# Extract Spectrogram Features + Model Prediction

### Given a 30-second song intro, extract spectrogram features

https://gist.github.com/parulnith/7f8c174e6ac099e86f0495d3d9a4c01e#file-music_genre_classification-ipynb

In [1]:
import librosa
import pandas as pd
import numpy as np
import csv
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

import tensorflow as tf
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization
from tensorflow.keras.layers import Conv2D, MaxPooling2D
import pickle

# Turn on TensorFlow's memory-growth option for every GPU it can see.
# When no GPU is listed, the whole block is skipped.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Report how many physical and logical GPU devices are available.
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        # The error is only printed, so the notebook keeps running.
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [2]:
def extract_features(songname):
    """Compute the summary audio features of one song.

    Loads at most the first 30 seconds of ``songname`` as mono audio and
    reduces each librosa descriptor to its mean and variance. Every
    descriptor is computed once and reused for both statistics.

    Args:
        songname: Path of the audio file, handed to ``librosa.load``.

    Returns:
        dict: 58 entries in a fixed insertion order:
        ``length``; ``<name>_mean`` / ``<name>_var`` for ``chroma_stft``,
        ``rms``, ``spectral_centroid``, ``spectral_bandwidth``, ``rolloff``,
        ``zero_crossing_rate``, ``harmony`` and ``perceptr``; ``tempo``;
        and ``mfccN_mean`` / ``mfccN_var`` for N = 1..20.
        Callers that flatten the dict into a feature row rely on this order.
    """
    y, sr = librosa.load(songname, mono=True, duration=30)

    # NOTE(review): 'length' is a fixed constant, not measured from the
    # loaded signal -- presumably the value the training data used; confirm.
    features = {'length': 661794}

    def add_stats(name, values):
        # Store the mean and variance of one descriptor under its two keys.
        features[name + '_mean'] = np.mean(values)
        features[name + '_var'] = np.var(values)

    # Frame-wise spectral descriptors.
    add_stats('chroma_stft', librosa.feature.chroma_stft(y=y, sr=sr))
    add_stats('rms', librosa.feature.rms(y=y))
    add_stats('spectral_centroid', librosa.feature.spectral_centroid(y=y, sr=sr))
    add_stats('spectral_bandwidth', librosa.feature.spectral_bandwidth(y=y, sr=sr))
    add_stats('rolloff', librosa.feature.spectral_rolloff(y=y, sr=sr))
    add_stats('zero_crossing_rate', librosa.feature.zero_crossing_rate(y=y))

    # Harmonic / percussive components of the waveform.
    y_harm, y_perc = librosa.effects.hpss(y)
    add_stats('harmony', y_harm)
    add_stats('perceptr', y_perc)

    # Depending on the librosa release, beat_track may return the tempo as a
    # scalar or as a one-element array; always store a scalar so the feature
    # row can be converted to a float array.
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    features['tempo'] = np.asarray(tempo).ravel()[0]

    # One row per MFCC coefficient; keys are numbered from 1.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
    for number, coefficient in enumerate(mfcc, start=1):
        add_stats('mfcc{}'.format(number), coefficient)

    return features

In [3]:
# Load the saved Keras model from 'best_model_75.h5' (path is relative to the
# current working directory).
model = tf.keras.models.load_model('best_model_75.h5')

In [4]:
# Print the loaded model's layers, output shapes and parameter counts.
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_24 (Dense)             (None, 256)               15104     
_________________________________________________________________
dense_25 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_26 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_27 (Dense)             (None, 10)                650       
Total params: 56,906
Trainable params: 56,906
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Audio file to classify: a sample taken from the dataset's "metal" folder.
filename = '../dataset/gitzan/genres_original/metal/metal.00000.wav'

In [21]:
# Build the feature dict for the chosen audio file.
features = extract_features(filename)

In [22]:
# Show the extracted features as the cell output.
features

{'length': 661794,
 'chroma_stft_mean': 0.48779988,
 'chroma_stft_var': 0.06744512,
 'rms_mean': 0.11614155,
 'rms_var': 0.00030277608,
 'spectral_centroid_mean': 3274.342156991105,
 'spectral_centroid_var': 268370.15639422735,
 'spectral_bandwidth_mean': 2722.3163670049926,
 'spectral_bandwidth_var': 151347.69595078588,
 'rolloff_mean': 6407.227923035991,
 'rolloff_var': 1688083.8619361662,
 'zero_crossing_rate_mean': 0.18307976973684212,
 'zero_crossing_rate_var': 0.0021831421699045304,
 'harmony_mean': 0.0003913453,
 'harmony_var': 0.00695561,
 'perceptr_mean': 0.002437511,
 'perceptr_var': 0.002319706,
 'tempo': 95.703125,
 'mfcc1_mean': -52.901485,
 'mfcc1_var': 350.3016,
 'mfcc2_mean': 50.085777,
 'mfcc2_var': 197.13062,
 'mfcc3_mean': -11.487131,
 'mfcc3_var': 260.34912,
 'mfcc4_mean': 46.34991,
 'mfcc4_var': 221.39444,
 'mfcc5_mean': 6.5513625,
 'mfcc5_var': 116.3445,
 'mfcc6_mean': 10.126185,
 'mfcc6_var': 81.80571,
 'mfcc7_mean': 5.4812465,
 'mfcc7_var': 66.73194,
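 'mfcc8_mean': 8.855128,
 'mfcc8_var': 65.14533,
 'mfcc9_mean': 5.941896,
 'mfcc9_var': 57.46595,
 'mfcc10_mean': 6.623278,
 'mfcc10_var': 51.432953,
 'mfcc11_mean': -4.045889,
 'mfcc11_var': 56.517776,
 'mfcc12_mean': 9.771428,
 'mfcc12_var': 54.274494,
 'mfcc13_mean': -3.1174026,
 'mfcc13_var': 53.380817,
 'mfcc14_mean': 3.6594033,
 'mfcc14_var': 57.866287,
 'mfcc15_mean': -6.507284,
 'mfcc15_var': 49.80213,
 'mfcc16_mean': 1.0878415,
 'mfcc16_var': 47.44548,
 'mfcc17_mean': -4.4854007,
 'mfcc17_var': 41.09154,
 'mfcc18_mean': 4.725541,
 'mfcc18_var': 36.171886,
 'mfcc19_mean': -2.7550318,
 'mfcc19_var': 44.879578,
 'mfcc20_mean': 0.50596565,
 'mfcc20_var': 37.08108}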

In [23]:
# Flatten the feature dict into a single-row 2-D list, keeping the dict's
# insertion order, then show it as the cell output.
feature_array = [list(features.values())]
feature_array

[[661794,
  0.48779988,
  0.06744512,
  0.11614155,
  0.00030277608,
  3274.342156991105,
  268370.15639422735,
  2722.3163670049926,
  151347.69595078588,
  6407.227923035991,
  1688083.8619361662,
  0.18307976973684212,
  0.0021831421699045304,
  0.0003913453,
  0.00695561,
  0.002437511,
  0.002319706,
  95.703125,
  -52.901485,
  350.3016,
  50.085777,
  197.13062,
  -11.487131,
  260.34912,
  46.34991,
  221.39444,
  6.5513625,
  116.3445,
  10.126185,
  81.80571,
  5.4812465,
  66.73194,
  8.855128,
  65.14533,
  5.941896,
  57.46595,
  6.623278,
  51.432953,
  -4.045889,
  56.517776,
  9.771428,
  54.274494,
  -3.1174026,
  53.380817,
  3.6594033,
  57.866287,
  -6.507284,
  49.80213,
  1.0878415,
  47.44548,
  -4.4854007,
  41.09154,
  4.725541,
  36.171886,
  -2.7550318,
  44.879578,
  0.50596565,
  37.08108]]

In [25]:
# Load the pickled scaler and apply it to the extracted feature row.
# NOTE: unpickling can execute arbitrary code -- only load a scaler.pkl that
# comes from a trusted source.
with open('scaler.pkl', 'rb') as scaler_file:  # closes the handle after loading
    scaler = pickle.load(scaler_file)

X = np.array(feature_array, dtype=float)
# transform only: the scaler loaded from disk is applied as-is rather than
# being re-fitted on this single row.
X_transformed = scaler.transform(X.reshape(1,-1))
print(X.shape)
print(X_transformed)

(1, 58)
[[-0.13282213  1.33618427 -2.44388196 -0.22525658 -0.75663742  1.49882262
  -0.50242554  0.91202801  0.14800251  1.16624774 -0.10970533  1.90113726
  -0.26572068  0.44992986 -0.47944947  2.6195039  -0.51567322 -0.8425559
   0.91407627 -1.22692735 -1.57969596 -1.16146771 -0.11839762 -0.72452063
   0.6038223   0.00815313  0.63004985 -0.57646394 -0.38088818 -0.66373152
   1.06791776 -0.85034028 -0.12086815 -0.56190487  1.56235563 -0.78787889
  -0.13947551 -0.82528415  0.28988766 -0.49935451  0.78954066 -0.44148018
   0.27216728 -0.43729339  0.37505958 -0.20032518 -0.54154981 -0.38574005
  -0.01317609 -0.39346825 -0.1142125  -0.64376927  1.09067988 -0.8009563
  -0.11354876 -0.57466707  0.4175429  -0.73098912]]


In [27]:
# Score the scaled feature row with the loaded model and print the raw output.
y_pred = model.predict(X_transformed)
print(y_pred)

# Position of the highest score, computed once and reused for the label lookup.
predicted_index = np.argmax(y_pred)
print(predicted_index)

# Genre names indexed by output position.
# NOTE(review): presumably the label order used in training -- confirm.
categories = [
    'blues', 'classical', 'country', 'disco', 'hiphop',
    'jazz', 'metal', 'pop', 'reggae', 'rock',
]
print(categories[predicted_index])

[[3.6417348e-06 5.2628474e-11 3.9219591e-04 1.0080415e-02 4.3267362e-05
  1.1700623e-09 9.8816013e-01 4.0674192e-07 3.8071976e-05 1.2820113e-03]]
6
metal
