## References

  + [Deep Learning For Audio, part 12: dataset preparation](https://github.com/musikalkemist/DeepLearningForAudioWithPython/blob/44a0e1880eee57a523780a1862cb8bf44963fbe8/12-%20Music%20genre%20classification%3A%20Preparing%20the%20dataset/code/extract_data.py)

# Setup

Use a copy of the [GTZAN music dataset](https://www.kaggle.com/datasets/andradaolteanu/gtzan-dataset-music-genre-classification) that is stored in a Google Drive folder. Reference the path to this "gtzan" dataset, and update the `DATASET_PATH` below, as necessary.



In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Location of the GTZAN dataset inside the mounted Google Drive.
# You might need to update the path below, or create a Drive shortcut that resolves to it.
DATASET_PATH = '/content/drive/MyDrive/Research/DS Research Shared 2023/data/gtzan'

print(DATASET_PATH)
# Fail with an actionable message: a bare `assert` reports nothing useful
# and is removed entirely when Python runs with -O.
if not os.path.isdir(DATASET_PATH):
    raise FileNotFoundError(
        f"DATASET_PATH is not an existing directory: {DATASET_PATH!r}. "
        "Mount Google Drive and update DATASET_PATH (or add a Drive shortcut) before continuing."
    )
print(os.listdir(DATASET_PATH))


/content/drive/MyDrive/Research/DS Research Shared 2023/data/gtzan
['features_3_sec.csv', 'features_30_sec.csv', 'genres_original', 'images_original']


# Directory Traversal

In [None]:
# Folder whose entries are listed (alphabetically) as the genre names.
GENRES_DIR = os.path.join(DATASET_PATH, "genres_original")

GENRES = os.listdir(GENRES_DIR)
GENRES.sort()
print("GENRES:", GENRES)


GENRES: ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']


In [None]:

# Walk every genre folder and print how many files it holds, plus the first
# and last file name in alphabetical order.
for genre in GENRES:
    print("--------")
    genre_dirpath = os.path.join(GENRES_DIR, genre)
    audio_filenames = os.listdir(genre_dirpath)
    audio_filenames.sort()
    first_filename, last_filename = audio_filenames[0], audio_filenames[-1]
    print(len(audio_filenames), first_filename, "...", last_filename)
    for audio_filename in audio_filenames:
        # todo: process audio and get mfccs
        # todo: save mfccs
        continue

--------
100 blues.00000.wav ... blues.00099.wav
--------
100 classical.00000.wav ... classical.00099.wav
--------
100 country.00000.wav ... country.00099.wav
--------
100 disco.00000.wav ... disco.00099.wav
--------
100 hiphop.00000.wav ... hiphop.00099.wav
--------
100 jazz.00000.wav ... jazz.00099.wav
--------
100 metal.00000.wav ... metal.00099.wav
--------
100 pop.00000.wav ... pop.00099.wav
--------
100 reggae.00000.wav ... reggae.00099.wav
--------
100 rock.00000.wav ... rock.00099.wav


## Tracks

In [None]:
# Pick a single known track to experiment with before processing everything.
audio_filename = "classical.00099.wav"
genre_dirpath = os.path.join(GENRES_DIR, "classical")
audio_filepath = os.path.join(GENRES_DIR, "classical", audio_filename)
print(audio_filepath)
# Last expression of the cell, so its value is displayed as the cell output.
os.path.isfile(audio_filepath)

/content/drive/MyDrive/Research/DS Research Shared 2023/data/gtzan/genres_original/classical/classical.00099.wav


True

In [None]:
# https://librosa.org/doc/main/generated/librosa.load.html

import librosa

# Sample rate (in Hz) requested from librosa.load for every track.
SAMPLE_RATE = 22_050

# librosa.load returns a pair: the audio time series and its sample rate.
loaded_track = librosa.load(audio_filepath, sr=SAMPLE_RATE)
track, sample_rate = loaded_track
print(track.shape)

(661794,)


In [None]:
# https://librosa.org/doc/main/generated/librosa.feature.mfcc.html

# MFCC extraction settings.
num_mfcc = 13     # number of coefficients kept per frame
n_fft = 2_048     # FFT window size, in samples
hop_length = 512  # number of samples between successive frames

# `y` and `sr` are passed by keyword: librosa >= 0.10 made them keyword-only,
# so the positional form raises a TypeError there. Keywords also work on
# older releases.
mfcc = librosa.feature.mfcc(y=track, sr=sample_rate, n_mfcc=num_mfcc, n_fft=n_fft, hop_length=hop_length)
#print(mfcc.shape) #> row per mfcc

# Transpose so that each row is one frame and each column one coefficient.
mfcc = mfcc.T
print(mfcc.shape) #>  col per mfcc

(1293, 13)


In [None]:
len(track) / len(mfcc) #> hop_length

511.8283062645012

In [None]:
len(mfcc[0])

mfcc[0]

array([-3.8486108e+02,  1.3096635e+02, -1.9904621e+01,  2.8951111e+01,
        1.4113569e-01,  1.0240509e+01, -9.5239773e+00,  6.7944188e+00,
        6.0813303e+00,  3.2001930e+01,  1.6242659e+01, -8.4883595e+00,
       -8.3745480e+00], dtype=float32)

In [None]:
# Bundle the track name with its MFCC matrix, converted to plain nested lists.
result = dict(
    track=audio_filename,
    mfcc_shape=mfcc.shape,
    mfcc=mfcc.tolist(),
)
print(result)

{'track': 'classical.00099.wav', 'mfcc_shape': (1293, 13), 'mfcc': [[-384.861083984375, 130.9663543701172, -19.904621124267578, 28.95111083984375, 0.14113569259643555, 10.240509033203125, -9.523977279663086, 6.794418811798096, 6.081330299377441, 32.001930236816406, 16.242658615112305, -8.488359451293945, -8.374547958374023], [-397.16082763671875, 136.9587860107422, -20.314861297607422, 35.79778289794922, -3.612337112426758, 13.264921188354492, -10.143945693969727, 16.813772201538086, 5.256778240203857, 31.219106674194336, 9.874444007873535, -5.389386177062988, -6.883050918579102], [-407.7560729980469, 135.43907165527344, -24.69814682006836, 38.4129638671875, -10.196651458740234, 18.297866821289062, -10.599720001220703, 29.443866729736328, -1.248809576034546, 30.71044921875, 6.367610931396484, 3.1525542736053467, -13.563739776611328], [-406.5735168457031, 134.37557983398438, -19.25736427307129, 41.049835205078125, -14.360234260559082, 15.627386093139648, -9.671116828918457, 29.991573333

In [None]:
json_filename = audio_filename.replace(".wav", ".json")
json_filename

'classical.00099.json'

In [None]:
# import json
# 
# json_filepath = json_filename
# with open(json_filepath, "w") as json_file:
#     json.dump(result, json_file)
# 

In [None]:
# Output folders for the extracted features: <dataset>/mfcc/<genre>/.
mfcc_dirpath = os.path.join(DATASET_PATH, "mfcc")
genre_mfcc_dirpath = os.path.join(mfcc_dirpath, "classical")

# makedirs also creates the parent "mfcc" folder when needed, and exist_ok=True
# makes the cell re-runnable without a race-prone "check, then create" step.
os.makedirs(genre_mfcc_dirpath, exist_ok=True)

In [None]:
json_filepath = os.path.join(genre_mfcc_dirpath, json_filename)
json_filepath

'/content/drive/MyDrive/Research/DS Research Shared 2023/data/gtzan/mfcc/classical/classical.00099.json'

In [None]:
import json

with open(json_filepath, "w") as json_file:
    json.dump(result, json_file)

In [None]:
# https://librosa.org/doc/main/generated/librosa.load.html

import librosa

SAMPLE_RATE = 22_050

def process_track(genre, audio_filename, sr=SAMPLE_RATE):
    """Load one audio file from the dataset.

    Params:
        genre (str): name of the genre folder under GENRES_DIR.
        audio_filename (str): file name inside that folder, e.g. "blues.00000.wav".
        sr (int): sample rate passed on to librosa.load.

    Returns the audio time series produced by librosa.load (the sample rate
    it also returns is not needed and is discarded).

    Raises FileNotFoundError if the audio file does not exist.
    """
    audio_filepath = os.path.join(GENRES_DIR, genre, audio_filename)
    # Fail early with a clear message instead of an error from deep inside the
    # audio decoder.
    if not os.path.isfile(audio_filepath):
        raise FileNotFoundError(f"Audio file not found: {audio_filepath}")

    # https://librosa.org/doc/main/generated/librosa.load.html
    track, _ = librosa.load(audio_filepath, sr=sr)
    return track
    

def process_mfcc(track, sr=SAMPLE_RATE, n_mfcc=13, n_fft=2048, hop_length=512):
    """Compute the MFCCs of an audio signal.

    Params:
        track: audio time series, as returned by process_track.
        sr (int): sample rate of `track`.
        n_mfcc (int): number of coefficients to compute per frame.
        n_fft (int): FFT window size, in samples.
        hop_length (int): number of samples between successive frames.

    Returns the transposed librosa result: one row per frame and one column
    per coefficient.
    """
    # https://librosa.org/doc/main/generated/librosa.feature.mfcc.html
    # The signal must be passed as `y=`: librosa >= 0.10 made it keyword-only,
    # so passing it positionally raises a TypeError there.
    mfcc = librosa.feature.mfcc(y=track, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)

    # librosa returns one row per coefficient; flip to one row per frame.
    return mfcc.T


In [None]:
import json

MFCC_DIRPATH = os.path.join(DATASET_PATH, "mfcc")
if not os.path.isdir(MFCC_DIRPATH):
    os.mkdir(MFCC_DIRPATH)

def save_mfcc(genre, audio_filename, mfcc, mfcc_dirpath=None):
    """Write the MFCCs of one track to "<mfcc_dirpath>/<genre>/<track>.json".

    Params:
        genre (str): genre name, used as the sub-folder name.
        audio_filename (str): name of the source audio file, e.g. "blues.00000.wav";
            its extension is swapped for ".json" to name the output file.
        mfcc: array exposing `.shape` and `.tolist()` (e.g. a numpy array).
        mfcc_dirpath (str): root output folder; defaults to MFCC_DIRPATH.

    The JSON document has the keys "track", "mfcc_shape" and "mfcc".
    """
    if mfcc_dirpath is None:
        mfcc_dirpath = MFCC_DIRPATH

    genre_mfcc_dirpath = os.path.join(mfcc_dirpath, genre)
    # exist_ok avoids a race-prone "check, then create" and also creates the
    # root folder if it is missing.
    os.makedirs(genre_mfcc_dirpath, exist_ok=True)

    json_filename = os.path.splitext(audio_filename)[0] + ".json"
    json_filepath = os.path.join(genre_mfcc_dirpath, json_filename)

    # Build the payload from this call's arguments. Dumping a notebook-level
    # global here would write the same data into every file.
    payload = {
        "track": audio_filename,
        "mfcc_shape": mfcc.shape,
        "mfcc": mfcc.tolist(),
    }
    with open(json_filepath, "w") as json_file:
        json.dump(payload, json_file)

In [None]:
# Extract and save the MFCCs of every track, collecting a one-row summary
# per track that was processed.
results = []

for genre in GENRES:
    genre_dirpath = os.path.join(GENRES_DIR, genre)

    # Only audio files are processed; any other entry in the folder is skipped.
    audio_filenames = sorted(
        fname for fname in os.listdir(genre_dirpath) if fname.endswith(".wav")
    )
    print(genre, len(audio_filenames))

    for audio_filename in audio_filenames:
        try:
            track = process_track(genre, audio_filename)

            mfcc = process_mfcc(track)
            results.append({
                "audio_filename": audio_filename,
                "track_length": len(track),
                "mfcc_rows": mfcc.shape[0],
                "mfcc_cols": mfcc.shape[1],
            })

            save_mfcc(genre, audio_filename, mfcc)
        except Exception as err:
            # Best-effort batch run: one unreadable file must not abort the
            # whole extraction. repr() is printed because some exceptions
            # carry an empty message, which would leave the log line blank.
            print("ERR:", audio_filename, repr(err))


from pandas import DataFrame
results_df = DataFrame(results)
results_df

--------
BLUES
100
--------
CLASSICAL
100
--------
COUNTRY
100
--------
DISCO
100
--------
HIPHOP
100
--------
JAZZ
100




ERR: jazz.00054.wav 
--------
METAL
100
--------
POP
100
--------
REGGAE
100
--------
ROCK
100


Unnamed: 0,audio_filename,track_length,mfcc_rows,mfcc_cols
0,blues.00000.wav,661794,1293,13
1,blues.00001.wav,661794,1293,13
2,blues.00002.wav,661794,1293,13
3,blues.00003.wav,661794,1293,13
4,blues.00004.wav,661794,1293,13
...,...,...,...,...
994,rock.00095.wav,661794,1293,13
995,rock.00096.wav,661794,1293,13
996,rock.00097.wav,661794,1293,13
997,rock.00098.wav,661794,1293,13


In [None]:
results_df.to_csv(os.path.join(MFCC_DIRPATH, "results.csv"), index=False)

## Track Segments

## Scratch Work

In [None]:
# Scratch work: point at the first blues track.
audio_filename = "blues.00000.wav"
genre_dirpath = os.path.join(GENRES_DIR, "blues")
audio_filepath = os.path.join(GENRES_DIR, "blues", audio_filename)
print(audio_filepath)
# Last expression of the cell, so its value is displayed as the cell output.
os.path.isfile(audio_filepath)

/content/drive/MyDrive/Research/DS Research Shared 2023/data/gtzan/genres_original/blues/blues.00000.wav


True

In [None]:
%%capture

!pip install librosa

### Audio Signal Processing

In [None]:
# https://github.com/musikalkemist/DeepLearningForAudioWithPython/blob/44a0e1880eee57a523780a1862cb8bf44963fbe8/12-%20Music%20genre%20classification%3A%20Preparing%20the%20dataset/code/extract_data.py


import librosa

# 22_050 Hz is also the default `sr` documented for librosa.load.
SAMPLE_RATE = 22_050

# librosa.load returns a pair: the audio time series and its sample rate.
loaded_signal = librosa.load(audio_filepath, sr=SAMPLE_RATE)
signal, sample_rate = loaded_signal
print(type(signal), signal.shape) #> <class 'numpy.ndarray'> (661794,)
print(sample_rate) #> 22050

<class 'numpy.ndarray'>
<class 'int'>


In [None]:
TRACK_DURATION = 30 # measured in seconds
SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION
print(SAMPLES_PER_TRACK) #> 661_500

661500


In [None]:
num_segments = 5
samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
print(samples_per_segment) #> 132_300

132300


In [None]:
#from math import ceil
#
#hop_length = 512
#
#num_mfcc_vectors_per_segment = ceil(samples_per_segment / hop_length)
#print(num_mfcc_vectors_per_segment) #> 259

259


In [None]:




# Cut the loaded signal into `num_segments` consecutive slices of equal length.
for i in range(num_segments):
    # Sample indices bounding the i-th segment (the end index is exclusive).
    finish = samples_per_segment * (i + 1)
    start = finish - samples_per_segment

    segment = signal[slice(start, finish)]
    print(segment.shape, start, finish) #> each of n segments has length 132_300

    # extract mfcc
    #num_mfcc = 13
    #n_fft = 2048
    #mfcc = librosa.feature.mfcc(segment, sample_rate, n_mfcc=num_mfcc, n_fft=n_fft, hop_length=hop_length)
    #mfcc = mfcc.T
    #
    ## store only mfcc feature with expected number of vectors
    #if len(mfcc) == num_mfcc_vectors_per_segment:
    #    data["mfcc"].append(mfcc.tolist())
    #    data["labels"].append(i-1)
    #    print("{}, segment:{}".format(file_path, d+1))

(132300,) 0 132300
(132300,) 132300 264600
(132300,) 264600 396900
(132300,) 396900 529200
(132300,) 529200 661500


In [None]:
segment

array([0.1331482 , 0.08804321, 0.02166748, ..., 0.18188477, 0.1637268 ,
       0.15093994], dtype=float32)

In [None]:
from math import ceil

hop_length = 512

num_mfcc_vectors_per_segment = ceil(samples_per_segment / hop_length)
print(num_mfcc_vectors_per_segment) #> 259

259


In [None]:
# Extract the MFCCs of the last segment produced by the loop above.
num_mfcc = 13  # number of coefficients kept per frame
n_fft = 2048   # FFT window size, in samples

# `y` and `sr` are passed by keyword: librosa >= 0.10 made them keyword-only,
# so the positional form raises a TypeError there.
mfcc = librosa.feature.mfcc(y=segment, sr=sample_rate, n_mfcc=num_mfcc, n_fft=n_fft, hop_length=hop_length)
print(mfcc.shape) #> row per mfcc

# Transpose so that each row is one frame and each column one coefficient.
mfcc = mfcc.T
print(mfcc.shape) #>  col per mfcc

(13, 259)
(259, 13)


In [None]:
print(len(mfcc), num_mfcc_vectors_per_segment)
len(mfcc) == num_mfcc_vectors_per_segment

259 259


True

(numpy.ndarray, (13, 259))

In [None]:
mfcc.tolist()

In [None]:
print(len(mfcc.tolist()))
#print(len(max(mfcc.tolist()))) # each is length 13
#print(len(min(mfcc.tolist()))) # each is length 13
print(mfcc.tolist())

259
[[-117.90886688232422, 155.34262084960938, -36.403350830078125, 34.48040771484375, -5.858351707458496, 8.968637466430664, -19.260128021240234, 0.7487397193908691, -14.720978736877441, 11.206427574157715, -7.416699409484863, 1.2523670196533203, 2.3261494636535645], [-113.97077178955078, 147.74484252929688, -38.89734649658203, 43.98476028442383, -5.780422210693359, 23.27222442626953, -23.569931030273438, 9.320098876953125, -13.726792335510254, 12.35094165802002, -5.651129245758057, 4.012964248657227, -1.831825613975525], [-106.36287689208984, 131.99267578125, -39.43109893798828, 51.41212463378906, -4.727408409118652, 32.766693115234375, -23.325923919677734, 10.54241943359375, -13.951803207397461, 12.975793838500977, -2.6781277656555176, 4.66048002243042, -8.287091255187988], [-118.74872589111328, 130.39808654785156, -37.42637634277344, 54.163055419921875, -1.2674015760421753, 27.4476261138916, -21.594623565673828, 11.580585479736328, -20.20451545715332, 7.8146209716796875, -2.5202610