# UrbanSound8K metadata
- I wrote this notebook to learn how to handle an audio dataset.
- Reference (almost copied from): [Link](https://github.com/keunwoochoi/UrbanSound8K-preprocessing/blob/master/preprocess_urban.ipynb)
- It creates three `.h5` files: train, valid, and test
- Split: folds 1-8: train, fold 9: valid, fold 10: test

In [None]:
# Imports
import time
import os

import h5py
import numpy as np
import pandas as pd
import librosa
import tqdm

In [None]:
# Paths
# Root directory of the dataset; the metadata CSV and the audio folds are read from below it.
SRC_ROOT = 'UrbanSound8K'
# Prefix that is prepended to each output HDF5 file name.
HDF_PATH = 'urbansound8k_hdf'

## About metadata

In [None]:
# Load the metadata table that describes every audio slice
metadata_csv = SRC_ROOT + "/metadata/UrbanSound8K.csv"
metadata = pd.read_csv(metadata_csv)

# Distinct class ids and fold ids, in ascending order
class_ids = sorted(metadata['classID'].unique())
fold_ids = sorted(metadata['fold'].unique())
print("class id list: {}".format(class_ids))
print("fold id list: {}".format(fold_ids))

# Sort metadata by class id
metadata = metadata.sort_values(by=["classID"])
metadata.head()

In [None]:
# Summarize the metadata table: row count, number of classes, class names
class_names = metadata['class']
print("Number of data: {}".format(len(metadata)))
print("Number of classes: {}".format(class_names.nunique()))
print("Classes: {}".format(class_names.unique()))

In [None]:
# Initialize description list (it collects one per-class count Series per fold)
data_description = list()

In [None]:
# For every fold, count how many slices each class has and collect the counts
data_description.extend(
    metadata.loc[metadata['fold'] == fold_num, 'class'].value_counts()
    for fold_num in fold_ids
)

In [None]:
# Turn the per-fold class counts into a table with one row per fold,
# indexed by the fold's folder name ("fold1", "fold2", ...)
folder_names = ["fold" + str(x) for x in fold_ids]
data_description = (
    pd.DataFrame(data_description)
    .assign(folder=folder_names)
    .set_index('folder')
)
data_description

- fold9 will be **validation set** and fold10 will be **test set**

## Shuffling
- Shuffling should happen WITHIN each dataset
    - For example, the train data should only be shuffled among the train dataset

In [None]:
# Read a fresh copy of the metadata (separate from the class-sorted `metadata`)
df = pd.read_csv(SRC_ROOT + "/metadata/UrbanSound8K.csv", header=0)

# Split sizes by fold number: fold < 9 -> train, fold 9 -> valid, fold 10 -> test
fold_column = df['fold']
N_TRAIN = int((fold_column < 9).sum())
N_VALID = int((fold_column == 9).sum())
N_TEST = int((fold_column == 10).sum())

print("Number of Train data: {}".format(N_TRAIN))
print("Number of Valid data: {}".format(N_VALID))
print("Number of Test data: {}".format(N_TEST))

In [None]:
# Generate the shuffled row orders once and cache them on disk, so that every
# later run reuses exactly the same order for each split.
if not os.path.exists('shuffled_idxs.npy'):
    np.random.seed(42)  # for reproducibility
    train_shfl_idxs = np.random.permutation(N_TRAIN)
    valid_shfl_idxs = np.random.permutation(N_VALID)
    test_shfl_idxs = np.random.permutation(N_TEST)

    # The three permutations have different lengths. Recent NumPy versions
    # refuse to build an array from such a ragged list implicitly, so store
    # them in an explicit object array, filled element by element.
    shuffled_idxs = np.empty(3, dtype=object)
    shuffled_idxs[0] = train_shfl_idxs
    shuffled_idxs[1] = valid_shfl_idxs
    shuffled_idxs[2] = test_shfl_idxs
    np.save("shuffled_idxs.npy", shuffled_idxs)
    print("Generated new shuffled index file")

# allow_pickle is needed because the file holds an object array.
# NOTE: unpickling executes code from the file; only load a file that this
# notebook generated itself.
train_shfl_idxs, valid_shfl_idxs, test_shfl_idxs = np.load(
    'shuffled_idxs.npy', allow_pickle=True)
print("Shuffle index loaded")

# Create dataset

In [None]:
# Configuration for saving the raw audio

# Sample rate in Hz
SAMPLE_RATE = 44100

# Maximum stored clip length in seconds.
# Should not exceed 4.0. Changing it is not recommended.
MAX_LEN_SEC = 4.0

# Number of samples stored per clip
LEN_RAW = int(MAX_LEN_SEC * SAMPLE_RATE)

# Number of distinct class ids (width of the one-hot label rows)
CLASS_COUNT = metadata['classID'].unique().size

In [None]:
def row_to_label(row_index, row, dataset):
    """Write the one-hot label of ``row`` into ``dataset``.

    Reads ``row.classID`` and sets ``dataset[row_index, classID]`` to
    ``True``. No other entry of ``dataset`` is modified.
    """
    label_column = row.classID
    dataset[row_index, label_column] = True

In [None]:
def row_to_raw(row_index, row, dataset):
    """Load the audio clip described by ``row`` into ``dataset[row_index]``.

    The file is read from
    ``SRC_ROOT/audio/fold<row.fold>/<row.slice_file_name>`` and resampled to
    ``SAMPLE_RATE``. At most ``LEN_RAW`` samples are written, starting at
    column 0. When the clip is shorter than ``LEN_RAW``, the remaining
    columns of the row are left untouched.

    Args:
        row_index: Destination row in ``dataset``.
        row: Metadata record exposing ``slice_file_name`` and ``fold``.
        dataset: 2-D array-like (e.g. an h5py dataset) that supports slice
            assignment and is at least ``LEN_RAW`` columns wide.
    """
    src_path = SRC_ROOT + "/audio/fold" + str(row.fold) + '/' + row.slice_file_name

    # Pass the sample rate by keyword: newer librosa releases accept only the
    # path positionally, so a positional sample rate raises TypeError there.
    # The returned sample rate is not needed.
    src, _ = librosa.load(src_path, sr=SAMPLE_RATE)

    # Truncate to the fixed length; len(clip) == min(LEN_RAW, len(src)).
    clip = src[:LEN_RAW]
    dataset[row_index, :len(clip)] = clip

In [None]:
def save_urban_sound_8k_hdf(hdf_file_path, dataframe_subset, shuffle_index):
    """Write one split of the dataset to an HDF5 file.

    The file gets two float32 datasets:

    * ``'raw'``   -- shape ``(num_data, LEN_RAW)``, filled by ``row_to_raw``
    * ``'label'`` -- shape ``(num_data, CLASS_COUNT)``, filled by
      ``row_to_label``

    Rows are written in the order given by ``shuffle_index``.

    Args:
        hdf_file_path: Path of the HDF5 file to create. It is opened in
            ``'w'`` mode, so an existing file is overwritten.
        dataframe_subset: Metadata rows of the split to save.
        shuffle_index: Positional indices into ``dataframe_subset`` that
            define the write order.
    """
    start_time = time.time()
    num_data = len(dataframe_subset)
    shuffled_rows = dataframe_subset.iloc[shuffle_index]
    with h5py.File(hdf_file_path, 'w') as f_hdf:
        ds_raw_data = f_hdf.create_dataset('raw', (num_data, LEN_RAW), dtype='float32')
        ds_label = f_hdf.create_dataset('label', (num_data, CLASS_COUNT), dtype='float32')
        for row_index, row in enumerate(shuffled_rows.itertuples()):
            row_to_raw(row_index, row, ds_raw_data)
            row_to_label(row_index, row, ds_label)
            if row_index % 100 == 0:
                # Progress line, rewritten in place via the carriage return.
                # print() is used because `sys` is not imported in this file;
                # flush so the line shows up immediately.
                print("\r{0}/{1}-th sample was written".format(row_index + 1, num_data),
                      end="", flush=True)
        # Terminate the in-place progress line before the summary message.
        print()
        print("The file {0} save done: It took {1} seconds.".format(
            hdf_file_path, int(time.time() - start_time)))

In [None]:
# Create the train / validation / test HDF5 files
# NOTE(review): HDF_PATH is concatenated directly with the file name (no path
# separator), so it acts as a file-name prefix -- confirm this is intended.
split_jobs = (
    ('train.h5', df['fold'] < 9, train_shfl_idxs),
    ('validate.h5', df['fold'] == 9, valid_shfl_idxs),
    ('test.h5', df['fold'] == 10, test_shfl_idxs),
)
for file_suffix, fold_mask, split_shfl_idxs in split_jobs:
    save_urban_sound_8k_hdf(HDF_PATH + file_suffix, df[fold_mask], split_shfl_idxs)