## 2.0 Further Pre-Processing
This notebook does further pre-processing of data to be ready for input into machine learning models.

### Table of Contents
[2.1. Setup](#1.)<br>
[2.1.1 Loading libraries](#1.1)<br>
[2.1.2 Setting data directories](#1.2)<br>
[2.1.3 Defining functions](#1.3)<br>

[2.2. Further Pre-processing](#2.)<br>
[2.2.1 Reading in train, validation, and test data sets](#2.1)<br>
[2.2.2 Scaling the spectrograms for min max](#2.2)<br>
[2.2.3 Setting genre classes](#2.3)<br>
[2.2.4 OPTIONAL: Conversion to 3-channel input](#2.4)<br>
[2.2.5 OPTIONAL: Normalization with mean and standard deviation](#2.5)<br>

[2.3. Saving Pre-Processed Data](#3.)<br>
[2.3.1 Shuffling the data and saving as .npy files](#3.1)<br>
[2.3.2 OPTIONAL: No shuffling for embedding and saving as .npy files](#3.2)<br>

### 2.1. Setup <a class="anchor" id="1."></a>

#### 2.1.1 Loading libraries <a class="anchor" id="1.1"></a>

In [1]:
import os
import numpy as np
import pandas as pd
import random
import librosa
import librosa.display
import matplotlib.pyplot as plt
%matplotlib inline

import timeit
import datetime

from sklearn import preprocessing

#### 2.1.2 Setting data directories <a class="anchor" id="1.2"></a>
V2 edits:
* added seed num

In [2]:
ds_description = '2x10s'
# Set seed num, if no seed number, set as None
ds_seed_num = 'seed119'

# Set the directory for the spectrograms
data_dir = f'./data/spect_subsample_{ds_description}_np'
print("Directory of spectrograms: {}".format(data_dir))

Directory of spectrograms: ./data/spect_subsample_2x10s_np


#### 2.1.3 Defining functions <a class="anchor" id="1.3"></a>

In [3]:
def load_data(data_dir, ds_description, ds_seed_num, str_X, str_Y):
    '''
    Loads the .npy data files generated previously from the pre-processing ipynb
    Note: .npy files need to be in the format: train_spect_{ds_description}_np.npy
    
    Inputs
    ------
    data_dir: directory of the .npy files
    ds_description: e.g. '5x10s'  5 subsamples of 10s length
    str_X: str name of the 'X' data, either: 'spect' or 'X'
    str_Y: str name of the 'Y' data, either: 'labels' or 'Y'
    
    Returns
    -------
    6 numpy arrays of:
        train_{str_X}, train_{str_Y}, val_{str_X}, val_{str_Y}, test_{str_X}, test_{str_Y}
    '''
    assert (str_X in ['spect','X']), "Assertion Error, str_X must be either 'spect' or 'X'."
    assert (str_Y in ['labels','Y']), "Assertion Error, str_Y must be either 'labels' or 'Y'."
    
    if ds_seed_num != None:
        ds_description_with_seed = f'{ds_description}_{ds_seed_num}'
    else:
        ds_description_with_seed = f'{ds_description}'
    
    print("Loading .npy data files...")
    # Start timer
    start_time = timeit.default_timer()

    train_str_X = np.load(f'{data_dir}/train_{str_X}_{ds_description_with_seed}_np.npy')
    val_str_X = np.load(f'{data_dir}/val_{str_X}_{ds_description_with_seed}_np.npy')
    test_str_X = np.load(f'{data_dir}/test_{str_X}_{ds_description_with_seed}_np.npy')
    
    if str_Y == 'labels':
        train_str_Y = np.load(f'{data_dir}/train_{str_Y}_{ds_description}_np.npy')
        val_str_Y = np.load(f'{data_dir}/val_{str_Y}_{ds_description}_np.npy')
        test_str_Y = np.load(f'{data_dir}/test_{str_Y}_{ds_description}_np.npy')
    else:
        train_str_Y = np.load(f'{data_dir}/train_{str_Y}_{ds_description_with_seed}_np.npy')
        val_str_Y = np.load(f'{data_dir}/val_{str_Y}_{ds_description_with_seed}_np.npy')
        test_str_Y = np.load(f'{data_dir}/test_{str_Y}_{ds_description_with_seed}_np.npy')        
    
    elapsed = str(datetime.timedelta(seconds = timeit.default_timer() - start_time))
    print("", end='\n')
    print("Total processing time (h:mm:ss): {}".format(elapsed[:-7]))
    print("\nLoaded .npy data files, verifying shape of saved data...")
    print(f"Shape of 'train_{str_X}':", train_str_X.shape)
    print(f"Shape of 'train_{str_Y}':", train_str_Y.shape)

    print(f"Shape of 'val_{str_X}':", val_str_X.shape)
    print(f"Shape of 'val_{str_Y}':", val_str_Y.shape)

    print(f"Shape of 'test_{str_X}':", test_str_X.shape)
    print(f"Shape of 'test_{str_Y}':", test_str_Y.shape)
    
    return train_str_X, train_str_Y, val_str_X, val_str_Y, test_str_X, test_str_Y

In [4]:
def min_max_scaler_3d(array_3d, feature_range=[0,1]):
    '''
    Takes in a 3D numpy array, converts to 2D to apply scikit-learn's 
    preprocessing.MinMaxScaler() method, and then converts to 3D
    
    Returns
    -------
    3D numpy array with values [0,1] (scaled with MinMaxScaler)    
    '''
    (s0, s1, s2) = array_3d.shape
    array_2d = np.reshape(array_3d, (s0 * s1, s2))
    min_max_scaler = preprocessing.MinMaxScaler(feature_range=feature_range)
    array_2d = min_max_scaler.fit_transform(array_2d)
    array_3d = np.reshape(array_2d, (s0, s1, s2))
    
    return array_3d

In [5]:
def map_classes(data_labels, class_dict):
    '''
    Takes in a 1D numpy array of labels, and converts to a 2D array of labels
    based on the class_dict
    
    Inputs
    ------
    Class_dict: dictionary of int keys starting from 0, and str label values
        e.g. genre_dict = {0 : 'Hip-Hop', 1 : 'Pop', 2 : 'Folk',}
    
    Returns
    -------
    data_classified: 2D numpy int array of [0,1], 1 indicating the class for the jth location.
    '''
    # Reverse the dict to have str as the keys
    class_dict_reverse = {v:k for k,v in class_dict.items()}
    n_obs = len(data_labels)
    n_cls = len(class_dict)
    data_classified = np.zeros((n_obs, n_cls), dtype=int)
    
    for i in range(n_obs):
        data_classified[i][class_dict_reverse[data_labels[i]]] = 1
    
    return data_classified

In [6]:
def unison_shuffled_copies(a, b):
    '''
    Shuffles two arrays in unison along the first axis by using a permutation
    Returns
    -------
    a and b numpy arrays shuffled in unison
    '''
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

### 2.2. Further Preprocessing <a class="anchor" id="2."></a>

#### 2.2.1 Reading in train, validation, and test data sets <a class="anchor" id="2.1"></a>

In [7]:
# Read in the spectrogram and labels data from the .npy files
train_spect, train_labels, val_spect, val_labels, test_spect, test_labels = load_data(
    data_dir, ds_description, ds_seed_num, 'spect', 'labels')

Loading .npy data files...

Total processing time (h:mm:ss): 0:00:50

Loaded .npy data files, verifying shape of saved data...
Shape of 'train_spect': (12788, 431, 128)
Shape of 'train_labels': (12788,)
Shape of 'val_spect': (1600, 431, 128)
Shape of 'val_labels': (1600,)
Shape of 'test_spect': (1600, 431, 128)
Shape of 'test_labels': (1600,)


In [8]:
# Checking if all train, val, test sets have same min and max ranges
print("Train [min, max]:", [train_spect.min(), train_spect.max()])
print("Val [min, max]:", [val_spect.min(), val_spect.max()])
print("Test [min, max]:", [test_spect.min(), test_spect.max()])

assert(train_spect.min() == val_spect.min() == test_spect.min()), 'minimum values do not match'
assert(train_spect.max() == val_spect.max() == test_spect.max()), 'maximum values do not match'

Train [min, max]: [-80.0, 3.814697265625e-06]
Val [min, max]: [-80.0, 3.814697265625e-06]
Test [min, max]: [-80.0, 3.814697265625e-06]


#### 2.2.2 Scaling the spectrograms for min max <a class="anchor" id="2.2"></a>

In [9]:
# Define the feature range, [0,1] or [0,255]
feature_range = [0,255]

val_spect_minmax = min_max_scaler_3d(val_spect, feature_range)
test_spect_minmax = min_max_scaler_3d(test_spect, feature_range)
train_spect_minmax = min_max_scaler_3d(train_spect, feature_range)

# Cheecking shape and min and max values
print("Shape of 'train_spect':", train_spect_minmax.shape)
print("Train [min, max]:", [train_spect_minmax.min(), train_spect_minmax.max()])
print()
print("Shape of 'val_spect':", val_spect_minmax.shape)
print("Val [min, max]:", [val_spect_minmax.min(), val_spect_minmax.max()])
print()
print("Shape of 'test_spect':", test_spect_minmax.shape)
print("Test [min, max]:", [test_spect_minmax.min(), test_spect_minmax.max()])

Shape of 'train_spect': (12788, 431, 128)
Train [min, max]: [0.0, 255.0]

Shape of 'val_spect': (1600, 431, 128)
Val [min, max]: [0.0, 255.00000000000006]

Shape of 'test_spect': (1600, 431, 128)
Test [min, max]: [0.0, 255.00000000000003]


#### 2.2.3 Setting genre classes <a class="anchor" id="2.3"></a>

In [10]:
genre_dict = {0 : 'Hip-Hop',
              1 : 'Pop',
              2 : 'Folk',
              3 : 'Experimental',
              4 : 'Rock',
              5 : 'International',
              6 : 'Electronic',
              7 : 'Instrumental'}

In [11]:
# map labels to classes
train_classes = map_classes(train_labels, genre_dict)
val_classes = map_classes(val_labels, genre_dict)
test_classes = map_classes(test_labels, genre_dict)

#### 2.2.4 OPTIONAL: Conversion to 3-channel input <a class="anchor" id="2.4"></a>
V2 edit:
* added this for direct feed into pre-trained CNNs

In [12]:
train_spect_minmax = np.stack([train_spect_minmax,train_spect_minmax,train_spect_minmax], axis=-1)
val_spect_minmax = np.stack([val_spect_minmax,val_spect_minmax,val_spect_minmax], axis=-1)
test_spect_minmax = np.stack([test_spect_minmax,test_spect_minmax,test_spect_minmax], axis=-1)

print("Shape of 'train_spect':", train_spect_minmax.shape)
print("Train [min, max]:", [train_spect_minmax.min(), train_spect_minmax.max()])
print()
print("Shape of 'val_spect':", val_spect_minmax.shape)
print("Val [min, max]:", [val_spect_minmax.min(), val_spect_minmax.max()])
print()
print("Shape of 'test_spect':", test_spect_minmax.shape)
print("Test [min, max]:", [test_spect_minmax.min(), test_spect_minmax.max()])

Shape of 'train_spect': (12788, 431, 128, 3)
Train [min, max]: [0.0, 255.0]

Shape of 'val_spect': (1600, 431, 128, 3)
Val [min, max]: [0.0, 255.00000000000006]

Shape of 'test_spect': (1600, 431, 128, 3)
Test [min, max]: [0.0, 255.00000000000003]


#### 2.2.5 OPTIONAL: Normalization with mean and standard deviation <a class="anchor" id="2.5"></a>
V2 edit:
* added this but better to rely on preprocess_input from Keras for pre-trained CNNs

In [None]:
# train_spect_minmax = (train_spect_minmax - np.mean(train_spect_minmax))/np.std(train_spect_minmax))
# val_spect_minmax = (val_spect_minmax - np.mean(val_spect_minmax))/np.std(val_spect_minmax)
# test_spect_minmax = (test_spect_minmax - np.mean(test_spect_minmax))/np.std(test_spect_minmax)

# print("Shape of 'train_spect':", train_spect_minmax.shape)
# print("Train [min, max]:", [train_spect_minmax.min(), train_spect_minmax.max()])
# print("Train (mean, stdev) :", [np.mean(train_spect_minmax), np.std(train_spect_minmax)])
# print()
# print("Shape of 'val_spect':", val_spect_minmax.shape)
# print("Val [min, max]:", [val_spect_minmax.min(), val_spect_minmax.max()])
# print("Val (mean, stdev) :", [np.mean(val_spect_minmax), np.std(val_spect_minmax)])
# print()
# print("Shape of 'test_spect':", test_spect_minmax.shape)
# print("Test [min, max]:", [test_spect_minmax.min(), test_spect_minmax.max()])
# print("Test (mean, stdev) :", [np.mean(test_spect_minmax), np.std(test_spect_minmax)])

### 2.3. Saving Pre-Processed Data <a class="anchor" id="3."></a>

#### 2.3.1 Shuffling the data and saving as .npy files<a class="anchor" id="3.1"></a>

In [13]:
# shuffle the data and save the the pre-processed 'X' and 'Y' data files

train_X, train_Y = unison_shuffled_copies(train_spect_minmax, train_classes)
val_X, val_Y = unison_shuffled_copies(val_spect_minmax, val_classes)
test_X, test_Y = unison_shuffled_copies(test_spect_minmax, test_classes)

if ds_seed_num != None:
    ds_description_with_seed = f'{ds_description}_{ds_seed_num}'
else:
    ds_description_with_seed = f'{ds_description}'

np.save(f'{data_dir}/train_X_{ds_description_with_seed}_np', train_X)
np.save(f'{data_dir}/val_X_{ds_description_with_seed}_np', val_X)
np.save(f'{data_dir}/test_X_{ds_description_with_seed}_np', test_X)

np.save(f'{data_dir}/train_Y_{ds_description_with_seed}_np', train_Y)
np.save(f'{data_dir}/val_Y_{ds_description_with_seed}_np', val_Y)
np.save(f'{data_dir}/test_Y_{ds_description_with_seed}_np', test_Y)


#### 2.3.2 OPTIONAL: No shuffling for embedding and saving as .npy files<a class="anchor" id="3.2"></a>

In [15]:
# direct saving without shuffling the data.  This is for the embedding set.

if ds_seed_num != None:
    ds_description_with_seed = f'{ds_description}_{ds_seed_num}'
else:
    ds_description_with_seed = f'{ds_description}'

np.save(f'{data_dir}/train_X_{ds_description_with_seed}_np', train_spect_minmax)
np.save(f'{data_dir}/val_X_{ds_description_with_seed}_np', val_spect_minmax)
np.save(f'{data_dir}/test_X_{ds_description_with_seed}_np', test_spect_minmax)

np.save(f'{data_dir}/train_Y_{ds_description_with_seed}_np', train_classes)
np.save(f'{data_dir}/val_Y_{ds_description_with_seed}_np', val_classes)
np.save(f'{data_dir}/test_Y_{ds_description_with_seed}_np', test_classes)
