#### Data Munging Notebook!

This notebook is for doing data manipulation and various munging to get chiptune .wav data ready for prediction/analysis.

Steps: load the WAV data into memory, then save it both as a raw NumPy array (`.npy`) and as a pickled dictionary keyed by file name.

In [None]:
import numpy as np
import scipy.io.wavfile as wav
import pandas as pd
import pickle
import os

from pprint import pprint

In [None]:
# Paths used throughout the notebook.
DATA_DIR = "./20-test-dataset"               # directory the .wav files are read from
SINGLE_ARCHIVE_NAME = "20-test-dataset.npy"  # target of np.save (single array)
ARCHIVE_NAME = "20-test-archive.pkl"         # target of pickle.dump (per-file dict)

Let's write a quick function that reads all the WAV files in a directory and returns one giant (wavFileNum, ...) numpy array with ALL THE AUDIO DATA.

In [None]:
# Read a single file from the dataset as a quick sanity check.
# os.listdir returns entries in arbitrary order, so sort to make the pick
# reproducible, and keep only .wav names so a stray entry such as .DS_Store
# is never handed to wav.read.
wav_files = sorted(
    name for name in os.listdir(DATA_DIR) if name.lower().endswith('.wav')
)
test_wav = wav_files[0]
samps_per_sec, data = wav.read(os.path.join(DATA_DIR, test_wav))

In [None]:
def create_wav_dict(data_dir):
    """ Read every ``.wav`` file in a directory into a dictionary keyed by file name.

    Only regular files whose name ends in ``.wav`` (case-insensitive) are read, so
    stray entries such as ``.DS_Store`` or sub-directories are skipped instead of
    making ``wav.read`` fail. Files are visited in sorted name order so the result
    is reproducible between runs.

    Parameters
    ----------
    data_dir: str
        The directory of wavs

    Returns
    -------
    dict
        Maps each file name to a dict with the keys ``'rate'`` (sample rate
        returned by ``wav.read``), ``'data'`` (the sample array) and ``'dtype'``
        (``data.dtype``). Empty if the directory holds no ``.wav`` files.
    """
    data_dict = {}
    # os.listdir order is arbitrary; sort for a deterministic insertion order.
    for file_name in sorted(os.listdir(data_dir)):
        path = os.path.join(data_dir, file_name)
        if not (os.path.isfile(path) and file_name.lower().endswith('.wav')):
            continue
        rate, data = wav.read(path)
        data_dict[file_name] = {
            'rate': rate,
            'data': data,
            'dtype': data.dtype
        }

    return data_dict

In [None]:
def extract_wavs(data_dir):
    """ Read the sample data of every ``.wav`` file in a directory into one array.

    Only regular files whose name ends in ``.wav`` (case-insensitive) are read, in
    sorted name order, so stray entries (``.DS_Store``, sub-directories) are
    skipped and row ``i`` refers to the same file on every run.

    Parameters
    ----------
    data_dir: str
        The directory of wavs

    Returns
    -------
    np.array:
        * If every file has the same sample shape: the stacked array
          ``(num_files, *sample_shape)`` with the dtype numpy infers from the
          files.
        * Otherwise: a 1-D ``dtype=object`` array of length ``num_files`` whose
          elements are the individual sample arrays. No padding is applied.
        * If no ``.wav`` files are found: an empty array.
    """
    clips = []
    for file_name in sorted(os.listdir(data_dir)):
        path = os.path.join(data_dir, file_name)
        if not (os.path.isfile(path) and file_name.lower().endswith('.wav')):
            continue
        _, data = wav.read(path)
        clips.append(data)

    # Uniform shapes (or nothing at all) stack cleanly into a regular array.
    if len({clip.shape for clip in clips}) <= 1:
        return np.array(clips)

    # Ragged input: np.array(list) raises on NumPy >= 1.24, so build the object
    # array explicitly and fill it element by element.
    ragged = np.empty(len(clips), dtype=object)
    for index, clip in enumerate(clips):
        ragged[index] = clip
    return ragged

In [None]:
# Read the dataset directory twice: once into a single array (extract_wavs)
# and once into a per-file dictionary (create_wav_dict).
audio_data = extract_wavs(DATA_DIR)
audio_data_archive = create_wav_dict(DATA_DIR)

In [None]:
# save the archive
# audio_data is written with np.save; the per-file dictionary is pickled to a
# separate file opened in binary mode.
np.save(SINGLE_ARCHIVE_NAME, audio_data)
with open(ARCHIVE_NAME, 'wb') as f:
    pickle.dump(audio_data_archive, f)

In [None]:
# load it back, both the single archive and the new .pkl dict archive
# allow_pickle=True: when the clips differ in length the saved array has
# dtype=object, which np.load refuses to read by default. This is acceptable here
# because the file was written by this notebook; do not use it on untrusted files.
audio_restored = np.load(SINGLE_ARCHIVE_NAME, allow_pickle=True)

In [None]:
# Display the shape of the first entry of the restored array as a quick check.
audio_restored[0].shape

In [None]:
# load the .pkl archive to make sure that it works
# NOTE: pickle.load can execute arbitrary code; only load archives written by
# this notebook, never files from an untrusted source.
with open(ARCHIVE_NAME, 'rb') as f:
    audio_archive_restored = pickle.load(f)

# Show a compact per-file summary (rate, sample shape, dtype) rather than
# printing the raw sample arrays.
pprint({
    file_name: (entry['rate'], entry['data'].shape, entry['dtype'])
    for file_name, entry in audio_archive_restored.items()
})