In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from os import listdir
import os
import librosa
from librosa import display,util,load,feature,stft

In [2]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

* mfcc: Mel-frequency cepstral coefficients
* melspectrogram: Compute a Mel-scaled power spectrogram
* chroma_stft: Compute a chromagram from a waveform or power spectrogram.
In music, the term chroma feature or chromagram closely relates to the twelve different pitch classes. Chroma-based features, which are also referred to as "pitch class profiles", are a powerful tool for analyzing music whose pitches can be meaningfully categorized and whose tuning approximates to the equal-tempered scale.
* spectral_contrast: Compute spectral contrast
* Spectral contrast is defined as the level difference between peaks and valleys in the spectrum
* tonnetz: Computes the tonal centroid features (tonnetz), following the method of [2]

In [26]:
def listdir_nohidden(path):
    """Lazily yield the entry names in ``path`` that do not begin with a dot.

    The directory is only read once iteration starts, because this is a
    generator function.
    """
    visible_names = (entry for entry in os.listdir(path) if not entry.startswith('.'))
    yield from visible_names

In [3]:
def extract_feat(audiofile_path):
    """Load one audio file and summarise it as five averaged feature vectors.

    Every librosa feature matrix is transposed and averaged over axis 0, so
    each returned vector holds one mean value per feature row.

    Parameters
    ----------
    audiofile_path
        Path of the audio file, passed straight to ``load``.

    Returns
    -------
    tuple
        ``(mfcc, chroma, mel, contrast, tonnetz)``. The original author notes
        the shapes as (40,), (12,), (128,), (7,), (6,); only the 40 is set
        explicitly here (``n_mfcc=40``), the rest come from librosa defaults.
    """
    def _row_means(matrix):
        # Same reduction for every feature: transpose, then mean over axis 0.
        return np.mean(matrix.T, axis = 0)

    # signal = audio samples, sample_rate = sampling rate reported by load()
    signal, sample_rate = load(audiofile_path)

    mfcc_vec = _row_means(feature.mfcc(y = signal, sr = sample_rate, n_mfcc=40))
    mel_vec = _row_means(feature.melspectrogram(y = signal, sr = sample_rate))

    # Magnitude of the short-time Fourier transform, shared by the next two features.
    magnitude = np.abs(librosa.stft(signal))
    chroma_vec = _row_means(feature.chroma_stft(S = magnitude, y = signal, sr = sample_rate))
    contrast_vec = _row_means(feature.spectral_contrast(S = magnitude, y = signal, sr = sample_rate))

    # Tonnetz is computed on the harmonic component of the signal only.
    harmonic_part = librosa.effects.harmonic(signal)
    tonnetz_vec = _row_means(feature.tonnetz(y = harmonic_part, sr = sample_rate))

    return mfcc_vec, chroma_vec, mel_vec, contrast_vec, tonnetz_vec

In [33]:
def _read_master_dir(master_media_dir_path):
    """Walk a three-level audio directory and build one feature row per file.

    The directory is traversed as ``<master>/<id>/<subfolder>/<audiofile>``,
    skipping hidden (dot-prefixed) entries at every level. Each audio file is
    passed to ``extract_feat`` and its five feature vectors are concatenated
    behind the numeric id of the top-level folder it belongs to.

    Parameters
    ----------
    master_media_dir_path
        Path of the top-level directory. A trailing slash is optional; paths
        are assembled with ``os.path.join``.

    Returns
    -------
    pandas.DataFrame
        One row per audio file and 194 columns: ``'id'`` followed by 40
        ``'mfcc'``, 12 ``'chroma'``, 128 ``'mel'``, 7 ``'contrast'`` and 6
        ``'tonnetz'`` columns (column labels repeat within each group). The
        frame is empty, with the same columns, when no audio file is found.

    Raises
    ------
    ValueError
        If the name of an id folder that contains audio cannot be parsed as
        an integer after dropping its first two characters.
    """
    # Column layout: the id label plus the length of each feature vector.
    columns = ['id'] + ['mfcc']*40 + ['chroma']*12 + ['mel']*128 + ['contrast']*7 + ['tonnetz']*6

    # Rows are collected in a list and turned into a DataFrame once at the
    # end: DataFrame.append no longer exists in pandas 2.x, and growing a
    # frame row by row is quadratic.
    rows = []

    for idd in listdir_nohidden(master_media_dir_path):
        id_path = os.path.join(master_media_dir_path, str(idd))

        for sample_folder in listdir_nohidden(id_path):
            sample_folder_path = os.path.join(id_path, str(sample_folder))

            for audiofile in listdir_nohidden(sample_folder_path):
                audiofile_path = os.path.join(sample_folder_path, str(audiofile))
                mfcc,chroma,mel,contrast,tonnetz = extract_feat(audiofile_path)

                # Numeric id: the folder name without its two-character prefix.
                idd_int = int(idd[2:])

                # id first, then the features -> 1-D row of length 194
                rows.append(np.hstack([[idd_int], mfcc, chroma, mel, contrast, tonnetz]))

    data = np.vstack(rows) if rows else np.empty((0, len(columns)))

    return pd.DataFrame(data, columns = columns)

http://www.robots.ox.ac.uk/~vgg/data/voxceleb/
    
VoxCeleb is an audio-visual dataset consisting of short clips of human speech, extracted from interview videos uploaded to YouTube.

VoxCeleb1 contains over 100,000 utterances for 1,251 celebrities.

In [5]:
# Print a marker line when this cell is executed.
print("Done")

Done
