In [1]:
# Imports
import os
import pandas as pd
import numpy as np
import librosa
import librosa.display

In [2]:
def extract_audio_features(directory):
    '''
    This function takes in a directory of .wav files and returns a 
    DataFrame that includes several numeric features of the audio file 
    as well as the corresponding genre labels.
    
    The numeric features incuded are the first 13 mfccs, zero-crossing rate, 
    spectral centroid, and spectral rolloff.
    
    Parameters:
    directory (int): a directory of audio files in .wav format
    
    Returns:
    df (DataFrame): a table of audio files that includes several numeric features 
    and genre labels.
    '''
    
    # Creating an empty list to store all file names
    files = []
    labels = []
    zcrs = []
    spec_centroids = []
    spec_rolloffs = []
    mfccs_1 = []
    mfccs_2 = []
    mfccs_3 = []
    mfccs_4 = []
    mfccs_5 = []
    mfccs_6 = []
    mfccs_7 = []
    mfccs_8 = []
    mfccs_9 = []
    mfccs_10 = []
    mfccs_11 = []
    mfccs_12 = []
    mfccs_13 = []
    
    # Looping through each file in the directory
    for file in os.scandir(directory):
        
        # Loading in the audio file
        y, sr = librosa.load(file)
        
        # Adding the file to our list of files
        files.append(file)
        
        # Adding the label to our list of labels
        label = str(file).split('.')[0]
        labels.append(label)
        
        # Calculating zero-crossing rates
        zcr = librosa.feature.zero_crossing_rate(y)
        zcrs.append(np.mean(zcr))
        
        # Calculating the spectral centroids
        spec_centroid = librosa.feature.spectral_centroid(y=y)
        spec_centroids.append(np.mean(spec_centroid))
        
        # Calculating the spectral rolloffs
        spec_rolloff = librosa.feature.spectral_rolloff(y=y)
        spec_rolloffs.append(np.mean(spec_rolloff))
        
        # Calculating the first 13 mfcc coefficients
        mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=512, n_mfcc=13)
        mfcc_scaled = np.mean(mfcc.T, axis=0)
        mfccs_1.append(mfcc_scaled[0])
        mfccs_2.append(mfcc_scaled[1])
        mfccs_3.append(mfcc_scaled[2])
        mfccs_4.append(mfcc_scaled[3])
        mfccs_5.append(mfcc_scaled[4])
        mfccs_6.append(mfcc_scaled[5])
        mfccs_7.append(mfcc_scaled[6])
        mfccs_8.append(mfcc_scaled[7])
        mfccs_9.append(mfcc_scaled[8])
        mfccs_10.append(mfcc_scaled[9])
        mfccs_11.append(mfcc_scaled[10])
        mfccs_12.append(mfcc_scaled[11])
        mfccs_13.append(mfcc_scaled[12])
    
    # Creating a data frame with the values we collected
    df = pd.DataFrame({
        'files': files,
        'zero_crossing_rate': zcrs,
        'spectral_centroid': spec_centroids,
        'spectral_rolloff': spec_rolloffs,
        'mfcc_1': mfccs_1,
        'mfcc_2': mfccs_2,
        'mfcc_3': mfccs_3,
        'mfcc_4': mfccs_4,
        'mfcc_5': mfccs_5,
        'mfcc_6': mfccs_6,
        'mfcc_7': mfccs_7,
        'mfcc_8': mfccs_8,
        'mfcc_9': mfccs_9,
        'mfcc_10': mfccs_10,
        'mfcc_11': mfccs_11,
        'mfcc_12': mfccs_12,
        'mfcc_13': mfccs_13,
        'labels': labels
    })
    
    # Returning the data frame
    return df

In [3]:
df = extract_audio_features("../data/wavfiles")

In [4]:
genre = df.reset_index(drop=True)
genre.head()

Unnamed: 0,files,zero_crossing_rate,spectral_centroid,spectral_rolloff,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,labels
0,<DirEntry 'blues.00000.wav'>,0.083045,1784.122641,3805.72303,-113.598824,121.570671,-19.162258,42.363937,-6.362263,18.621929,-13.699731,15.3398,-12.274303,10.970943,-8.326058,8.802087,-3.669939,<DirEntry 'blues
1,<DirEntry 'blues.00001.wav'>,0.05604,1530.261767,3550.713616,-207.523834,123.985138,8.947024,35.867146,2.909599,21.51947,-8.55651,23.370674,-10.103606,11.89924,-5.558822,5.377874,-2.23449,<DirEntry 'blues
2,<DirEntry 'blues.00002.wav'>,0.076291,1552.832481,3042.410115,-90.757164,140.440887,-29.084547,31.686693,-13.976547,25.753752,-13.66499,11.634441,-11.778321,9.714755,-13.125314,5.791246,-8.901966,<DirEntry 'blues
3,<DirEntry 'blues.00003.wav'>,0.033309,1070.153418,2184.879029,-199.575134,150.086121,5.663406,26.855278,1.770073,14.232647,-4.827844,9.286853,-0.756119,8.134434,-3.200025,6.078081,-2.478445,<DirEntry 'blues
4,<DirEntry 'blues.00004.wav'>,0.101461,1835.128513,3579.957471,-160.354172,126.209496,-35.581394,22.139254,-32.473549,10.850698,-23.350067,0.493245,-11.796535,1.203516,-13.084955,-2.810502,-6.934469,<DirEntry 'blues


In [5]:
# Mapping the labels to numeric values
label_map = {
    'blues': 1,
    'classical': 2,
    'country': 3,
    'disco': 4,
    'hiphop': 5,
    'jazz': 6,
    'metal': 7,
    'pop': 8,
    'reggae': 9,
    'rock': 10
}
genre.to_csv('../data/genre.csv', index=False)
genre['y'] = genre['labels'].map(label_map)

In [6]:
genre.to_csv('../data/genre_clean.csv', index=False)

In [9]:
def make_mel_spectrogram_df(directory):
    '''
    This function takes in a directory of audio files in .wav format, computes the
    mel spectrogram for each audio file, reshapes them so that they are all the 
    same size, flattens them, and stores them in a dataframe.
    
    Genre labels are also computed and added to the dataframe.
    
    Parameters:
    directory (int): a directory of audio files in .wav format
    
    Returns:
    df (DataFrame): a dataframe of flattened mel spectrograms and their 
    corresponding genre labels
    '''
    
    # Creating empty lists for mel spectrograms and labels
    labels = []
    mel_specs = []
    
    # Looping through each file in the directory
    for file in os.scandir(directory):
        
        # Loading in the audio file
        y, sr = librosa.core.load(file)
        
        # Extracting the label and adding it to the list
        label = str(file).split('.')[0][11:]
        labels.append(label)
        
        # Computing the mel spectrograms
        spect = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=1024)
        spect = librosa.power_to_db(spect, ref=np.max)
        
        # Adjusting the size to be 128 x 660
        if spect.shape[1] != 660:
            spect.resize(128,660, refcheck=False)
        
        # Flattening to fit into dataframe and adding to the list
        spect = spect.flatten()
        mel_specs.append(spect)
        
    # Converting the lists to arrays so we can stack them
    mel_specs = np.array(mel_specs)
    labels = np.array(labels).reshape(1000,1)
    
    # Create dataframe
    df = pd.DataFrame(np.hstack((mel_specs,labels)))
    
    # Returning the mel spectrograms and labels
    return df

In [10]:
# Using the above function to create a dataframe with all of the flattened mel spectrograms and genre labels
df = make_mel_spectrogram_df('../data/wavfiles')
df.to_csv('../data/genre_mel_specs.csv', index=False)