In [2]:
!pip install python_speech_features==0.6

Collecting python_speech_features==0.6
  Using cached python_speech_features-0.6.tar.gz (5.6 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: python_speech_features
  Building wheel for python_speech_features (setup.py) ... [?25ldone
[?25h  Created wheel for python_speech_features: filename=python_speech_features-0.6-py3-none-any.whl size=5889 sha256=b93c231013f8d830240abb52c86df5741b56ccae52a2eac924a4a07820f5356e
  Stored in directory: /root/.cache/pip/wheels/09/a1/04/08e2688d2562d8f9ff89e77c6ddfbf7268e07dae1a6f22455e
Successfully built python_speech_features
Installing collected packages: python_speech_features
Successfully installed python_speech_features-0.6
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.[0m[33m
[0m

In [9]:
from python_speech_features import mfcc
import scipy.io.wavfile as wav
import numpy as np
from tempfile import TemporaryFile
import os
import pickle
import random 
import operator
import math
import numpy as np
import librosa
import librosa.display

import seaborn as sns
import json

In [25]:
import matplotlib
# Select the non-interactive "Agg" backend; this is done before pyplot is imported.
# NOTE(review): with Agg, plt.show() presumably cannot display figures inline —
# confirm this backend is what the notebook is meant to use.
matplotlib.use("Agg")
import matplotlib.pyplot as plt

In [4]:
# Single clip used for the exploratory plots below
example_file = "/work/pop000.wav"

In [5]:
# Decode the example clip; `sr` is not passed, so librosa's default rate is used
signal, sample_rate = librosa.load(path=example_file)

In [6]:
# Amplitude of the sound, one value per sample
print(signal)

[ 0.10225765  0.24458705  0.2634704  ... -0.52111185 -0.49635828
 -0.3400721 ]


In [7]:
# Sample rate returned by the loader (the default rate)
print(sample_rate)

22050


In [26]:
# Waveform: amplitude of the example clip over time
plt.figure(figsize=(16, 8))
librosa.display.waveshow(signal, sr=sample_rate, alpha=0.5)
plt.title("Waveforming")
plt.ylabel("Amplitude")
plt.xlabel("Time")
plt.show()

In [15]:
# Move to the frequency domain: discrete Fourier transform of the whole clip
fft = np.fft.fft(a=signal)

In [16]:
# Magnitude of each complex FFT coefficient
spectrum = np.absolute(fft)

In [17]:
# Frequency (Hz) of every FFT bin. Bin k of an N-point FFT sits at
# k * sample_rate / N for k = 0..N-1, so the endpoint (sample_rate itself)
# must be excluded; including it stretches the spacing to sample_rate / (N - 1).
f = np.linspace(0, sample_rate, len(spectrum), endpoint=False)

In [27]:
# Full magnitude spectrum over the whole frequency axis
plt.figure(figsize=(16, 8))
plt.plot(f, spectrum, alpha=0.5)
plt.ylabel("Magnitude")
plt.xlabel("Frequency")
plt.title("Spectrum")

Text(0.5, 1.0, 'Spectrum')

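The magnitude is strong at the lowest frequencies and again at the highest frequencies. The upper half of the plot is a mirror image of the lower half, because the Fourier transform of a real-valued signal is symmetric.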

In [19]:
# Keep only the first half of the bins (frequencies and magnitudes alike)
left_spectrum = spectrum[: len(spectrum) // 2]
left_f = f[: len(left_spectrum)]

In [28]:
# Magnitude of the first half of the spectrum only
plt.figure(figsize=(16, 8))
plt.plot(left_f, left_spectrum, alpha=0.5)
plt.ylabel("Magnitude")
plt.xlabel("Frequency")
plt.title("Power Spectrum")

Text(0.5, 1.0, 'Power Spectrum')

In [21]:
# Spectrogram (STFT) framing parameters, both expressed in samples
n_fft = 2048      # samples per analysis window
hop_length = 512  # samples between consecutive windows

In [22]:
# Short-time Fourier transform of the clip with the framing chosen above
stft = librosa.stft(y=signal, hop_length=hop_length, n_fft=n_fft)
     

In [23]:
# The STFT is complex-valued; keep the magnitude of every coefficient
spectrogram = np.absolute(stft)

In [29]:

# Heat map of the magnitude spectrogram with a colour scale
plt.figure(figsize=(16, 8))
librosa.display.specshow(spectrogram, hop_length=hop_length, sr=sample_rate)
plt.ylabel("Frequency")
plt.xlabel("Time")
plt.colorbar()
plt.title("Spectrogram")

Text(0.5, 1.0, 'Spectrogram')

This shows how the spectrum changes as a function of time.

In [31]:
# Convert the amplitude spectrogram to a logarithmic (decibel) scale
log_spectrogram = librosa.amplitude_to_db(S=spectrogram)

In [32]:

# Heat map of the decibel-scaled spectrogram; colour bar ticks are labelled in dB
plt.figure(figsize=(16, 8))
librosa.display.specshow(log_spectrogram, hop_length=hop_length, sr=sample_rate)
plt.ylabel("Frequency")
plt.xlabel("Time")
plt.colorbar(format="%+2.0f dB")
plt.title("Spectrogram (dB)")

Text(0.5, 1.0, 'Spectrogram (dB)')

MFCCs are computed by dividing the audio signal into short time frames and applying the Fourier transform to each frame. The resulting representation is also insensitive to small changes and variations in the spectrum. MFCCs extract relevant features from the audio that differentiate the music genres. This is why we want the MFCC feature.

In [33]:
# Compute 13 MFCCs per frame, using the same framing as the STFT
MFCCs = librosa.feature.mfcc(
    y=signal,
    sr=sample_rate,
    n_mfcc=13,
    n_fft=n_fft,
    hop_length=hop_length,
)

In [34]:
# Heat map of the MFCC matrix with a colour scale
plt.figure(figsize=(16, 8))
librosa.display.specshow(MFCCs, hop_length=hop_length, sr=sample_rate)
plt.ylabel("MFCC coefficients")
plt.xlabel("Time")
plt.colorbar()
plt.title("MFCCs")
     

Text(0.5, 1.0, 'MFCCs')

### Get MFCCs for all the files in the dataset

In [35]:
# Where the genre folders live and where the extracted features are written
dataset_path = "/datasets/my-drive/genre_wav"
json_path = "data_10.json"

# Track geometry used to size the segments
track_duration = 30  # seconds
sample_rate = 22050  # samples per second
samples_per_track = track_duration * sample_rate

In [39]:
# Genre folders found under the configured dataset root. Entries whose name
# contains a dot are skipped. The root comes from `dataset_path` rather than a
# second hard-coded copy of the same path, so the two cannot drift apart.
classes = [entry for entry in os.listdir(dataset_path) if '.' not in entry]
print(classes)
     

['rock', 'pop', 'mpop', 'indie', 'hiphop', 'folk']


In [44]:
def save_mfcc(dataset_path, json_path, num_mfcc=13, n_fft=2048, hop_length=512, num_segments=5):
    """Extract MFCCs from a music dataset and save them to a JSON file with genre labels.

    Every non-hidden folder below ``dataset_path`` is treated as one genre. Each
    audio file in it is loaded with librosa, cut into ``num_segments`` equal
    slices, and one MFCC matrix is computed per slice.

    Reads the notebook-level variables ``sample_rate`` and ``samples_per_track``;
    both must be defined before the function is called. Neither is modified.

    Args:
        dataset_path: Root folder that contains one sub-folder per genre.
        json_path: Path of the JSON file to write.
        num_mfcc: Number of MFCC coefficients per frame.
        n_fft: Window length, in samples, passed to the MFCC computation.
        hop_length: Hop, in samples, passed to the MFCC computation.
        num_segments: Number of slices each track is divided into.

    Returns:
        None. The JSON file holds a dict with three keys: ``"mapping"`` (genre
        folder names), ``"labels"`` (index into ``"mapping"`` for every stored
        segment) and ``"mfcc"`` (one frames-by-coefficients list per segment).
    """
    # dictionary to store mapping, labels, and MFCCs
    data = {
        "mapping": [],
        "labels": [],
        "mfcc": []
    }

    samples_per_segment = int(samples_per_track / num_segments)
    num_mfcc_vectors_per_segment = math.ceil(samples_per_segment / hop_length)

    # compare paths by value (not object identity) to recognise the root folder
    root = os.path.normpath(dataset_path)

    for dirpath, dirnames, filenames in os.walk(dataset_path):

        # prune hidden folders (e.g. ".ipynb_checkpoints") in place so that
        # os.walk never descends into them and they never become a genre
        dirnames[:] = [name for name in dirnames if not name.startswith(".")]

        # the root itself is not a genre folder
        if os.path.normpath(dirpath) == root:
            continue

        # register the genre; its label is its position in the mapping list
        semantic_label = os.path.basename(dirpath)
        data["mapping"].append(semantic_label)
        label = len(data["mapping"]) - 1
        print("\nProcessing: {}".format(semantic_label))

        for filename in filenames:

            # hidden files (e.g. ".DS_Store") are not audio
            if filename.startswith("."):
                continue

            # load audio file; the returned rate equals the requested one,
            # so the notebook-level sample_rate is left untouched
            file_path = os.path.join(dirpath, filename)
            signal, _ = librosa.load(file_path, sr=sample_rate)

            for d in range(num_segments):

                # calculate start and finish sample for current segment
                start = samples_per_segment * d
                finish = start + samples_per_segment
                segment = signal[start:finish]

                # track is shorter than expected: nothing left to analyse
                if len(segment) == 0:
                    break

                # frames-by-coefficients MFCC matrix for this segment
                segment_mfcc = librosa.feature.mfcc(
                    y=segment,
                    sr=sample_rate,
                    n_mfcc=num_mfcc,
                    n_fft=n_fft,
                    hop_length=hop_length,
                ).T

                # store only mfcc feature with expected number of vectors
                if len(segment_mfcc) == num_mfcc_vectors_per_segment:
                    data["mfcc"].append(segment_mfcc.tolist())
                    data["labels"].append(label)
                    print("{}, segment:{}".format(file_path, d+1))

    # save MFCCs to json file
    with open(json_path, "w") as fp:
        json.dump(data, fp, indent=4)

    return

In [45]:
# Extract the features for the whole dataset, ten segments per track
save_mfcc(dataset_path=dataset_path, json_path=json_path, num_segments=10)

/datasets/my-drive/genre_wav/pop/pop025.wav, segment:5
/datasets/my-drive/genre_wav/pop/pop025.wav, segment:6
/datasets/my-drive/genre_wav/pop/pop025.wav, segment:7
/datasets/my-drive/genre_wav/pop/pop025.wav, segment:8
/datasets/my-drive/genre_wav/pop/pop025.wav, segment:9
/datasets/my-drive/genre_wav/pop/pop025.wav, segment:10
/datasets/my-drive/genre_wav/pop/pop026.wav, segment:1
/datasets/my-drive/genre_wav/pop/pop026.wav, segment:2
/datasets/my-drive/genre_wav/pop/pop026.wav, segment:3
/datasets/my-drive/genre_wav/pop/pop026.wav, segment:4
/datasets/my-drive/genre_wav/pop/pop026.wav, segment:5
/datasets/my-drive/genre_wav/pop/pop026.wav, segment:6
/datasets/my-drive/genre_wav/pop/pop026.wav, segment:7
/datasets/my-drive/genre_wav/pop/pop026.wav, segment:8
/datasets/my-drive/genre_wav/pop/pop026.wav, segment:9
/datasets/my-drive/genre_wav/pop/pop027.wav, segment:1
/datasets/my-drive/genre_wav/pop/pop027.wav, segment:2
/datasets/my-drive/genre_wav/pop/pop027.wav, segment:3
/datasets

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=b07dafd9-52d0-496f-b99d-0bc116229374' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>