In [1]:
import os
import matplotlib.pyplot as plt
plt.style.use('dark_background')
import seaborn as sns
import re
import numpy as np
from scipy import signal
from IPython import display
import time
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from pdb import set_trace
import librosa
import soundfile as sf

import glob
import audiofile as af


from sklearn.model_selection import train_test_split

import glob
import audiofile as af

SoX could not be found!

    If you do not have SoX, proceed here:
     - - - http://sox.sourceforge.net/ - - -

    If you do (or think that you should) have SoX, double-check your
    path variables.
    


# Setup

In [2]:
# Paths of the csv files that list the tracks within each dataset
crowd_csv_path = './BBCSoundEffects_crowd.csv' # contains a list of tracks from BBC Sound Effects
voice_csv_path = './track_speaker.csv' # contains the list of audio tracks from librivox

# Base directory of the audio samples
crowd_filepath = '/data/crowd_wav' # detail loading handled by get_crowd function
voice_filepath = '/data/librivox' # detail loading handled by get_voice function

# Sampling frequency (Hz) of the resulting samples. Defined first so that
# `duration` below is derived from it instead of repeating the number.
merged_fs = 16000

# Number of samples to propose (raise it for a full-size dataset).
# Note that the final output is not necessarily going to contain this many samples:
# the program checks whether the voice and crowd sound in each sample last long enough.
# If not, the sample is not created. This is to prevent creating samples with silence.
n_samples = 32
duration = 65536 / merged_fs # duration of the sample in seconds (65536 points per sample)

# Output
save_dir = '/data/simulated_wav10' # directory to save the audio samples
audio_sample_path = './audio_samples.csv' # proposed audio samples
audio_sample_path2 = './audio_samples_written.csv' # Executed (files saved) audio sample

# csvs used to build models (these are csv file paths, despite the *_dir names)
train_dir = './train.csv'
test_dir = './test.csv'
val_dir = './val.csv'

# Draw audio samples from the two datasets

In [3]:
def get_crowd(df_crowd, i):
    """
    Look up one crowd track in the crowd table.
    df_crowd: table with the columns 'location', 'secs' and 'bitRate'
    i: row label of the track
    returns (path, duration, fs): the 'location' value joined onto the global
            crowd_filepath, the 'secs' value and the 'bitRate' value
    """
    location, secs, bit_rate = (df_crowd.loc[i, column]
                                for column in ('location', 'secs', 'bitRate'))
    return os.path.join(crowd_filepath, str(location)), secs, bit_rate

def get_voice(df_voice, i, track=0):
    """
    Look up one voice track in the voice table.
    df_voice: table with the columns 'subset', 'reader_id', 'chapter_id',
              'trackFile', 'trackDuration' and 'trackfs'
    i: row label of the track
    track: accepted but not used
    returns (path, duration, fs): the file path
            <voice_filepath>/<subset>/<reader_id>/<chapter_id>/<trackFile>,
            the 'trackDuration' value and the 'trackfs' value
    """
    subset = df_voice.loc[i, 'subset']
    reader = str(int(df_voice.loc[i, 'reader_id']))
    chapter = str(int(df_voice.loc[i, 'chapter_id']))

    # The ids are cast to int first so they appear without a decimal part in the path
    folder_template = os.path.join(voice_filepath, '{}/{}/{}')
    folder = folder_template.format(subset, reader, chapter)
    full_path = os.path.join(folder, df_voice.loc[i, 'trackFile'])

    return full_path, df_voice.loc[i, 'trackDuration'], df_voice.loc[i, 'trackfs']

def load_sound(sound, fs=None, normalize=False, always_2d=False, *args, **kwargs):
    """
    Load an audio clip and optionally down-mix, resample and peak-normalize it.

    sound: path of an audio file (read with soundfile.read), or a tuple (y, fs)
           holding an already loaded time series and its sampling frequency
    fs: target sampling frequency in Hz. None keeps the rate of the source
    normalize: scale the output so that its peak absolute value is 1 - 1/2^16
               (per channel for 2d output). A silent channel is left untouched.
    always_2d: return a (samples, channels) array; a single channel is duplicated
               into two identical channels. When False, a multi-channel clip is
               averaged down to one channel.
    *args, **kwargs: additional arguments for soundfile.read

    returns y, N, fs_out
        y: the processed time series
        N: number of data points (rows) of y
        fs_out: sampling frequency of the SOURCE, i.e. before any resampling
    raises TypeError if sound is neither a str nor a tuple
    """
    if isinstance(sound, str):
        y, fs_out = sf.read(sound, *args, always_2d=always_2d, **kwargs)
    elif isinstance(sound, tuple):
        y, fs_out = sound
        y = np.asarray(y)
    else:
        raise TypeError('Unrecognized input type for sound')

    if always_2d:
        if y.ndim == 1: # a 1d series passed as a tuple has no channel axis yet
            y = y[:, np.newaxis]
        if y.shape[1] == 1: # convert 1 channel to 2 channels
            y = np.c_[y, y]
    elif y.ndim > 1:
        # convert to monotone if not forced to load 2d
        y = np.mean(y, axis=1)

    # Resample the sounds if it is different from the target output.
    # The rates are passed by keyword: recent librosa releases no longer accept
    # them positionally, and older releases accept the keywords as well.
    if fs is not None and fs_out != fs:
        if y.ndim > 1:
            # librosa resamples along the last axis, hence the transposes
            y = librosa.resample(y.T, orig_sr=fs_out, target_sr=fs).T
        else:
            y = librosa.resample(y, orig_sr=fs_out, target_sr=fs)

    # Normalizing to -1 and 1
    if normalize:
        peak = np.max(np.abs(y), axis=0)
        peak = np.where(peak > 0, peak, 1.0) # avoid 0/0 -> NaN on silent input
        y = 0.9999847412109375 / peak * y # (1-1/2^16) / (max(abs(y)))

    # Num of data points
    N = y.shape[0]

    return y, N, fs_out

def slice_audio(audio, fs, duration=4, overlap=1, include_end=True):
    """
    Cut an audio clip into overlapping pieces.
    audio: audio time series
    fs: sampling frequency of the audio, in Hz
    duration: length of each piece, in sec
    overlap: overlap between consecutive pieces, in sec
    include_end: when the last window runs past the end of the audio, clip it
            to the end of the audio instead of dropping it

    returns (audio_segments, segment_indices): the list of pieces and an [N, 2]
            int32 array holding the [start, end) index of each piece.
            A last piece shorter than the hop (duration - overlap) is moved
            back so that it keeps its end index but spans a full duration.
    """
    n_total = len(audio) # length of audio
    seg_len = int(duration * fs)
    hop = int(seg_len - int(overlap * fs))

    # Candidate start and end positions, one hop apart
    starts = np.floor(np.arange(0, n_total, hop)).astype(np.int32)
    stop_limit = n_total + hop if include_end else n_total + 1
    stops = np.floor(np.arange(seg_len, stop_limit, hop)).astype(np.int32)
    if include_end:
        # the last window may overshoot the audio; clip it
        stops[-1] = min(n_total, stops[-1])

    # Pair them up, discarding whichever list is longer
    n_pairs = min(len(starts), len(stops))
    segment_indices = np.column_stack((starts[:n_pairs], stops[:n_pairs]))

    # Re-anchor a too-short final piece (the row is a view, so this edits in place)
    last = segment_indices[-1]
    if last[1] - last[0] < hop:
        last[0] = last[1] - seg_len

    # Do the segmentation
    audio_segments = [audio[begin:end] for begin, end in segment_indices]

    return audio_segments, segment_indices

def combine_audio(audio_segments, segment_indices, duration=None, normalize=False):
    """
    Stitching the audio segments back together again. Reverses the process of slice_audio.
    Samples covered by several segments are averaged; samples covered by no
    segment (or only by NaN values) come out as NaN.

    audio_segments: list of audios, length N
    segment_indices: [start, end) index of each segment in the output, shape of [N, 2]
    duration: minimum length of the output, in number of samples (not seconds).
              Default is the end of the last segment: segment_indices[-1, 1]
    normalize: scale the output so that its peak absolute value is 1 - 1/2^16

    returns the stitched 1d audio
    raises AssertionError if the number of segments and of index rows differ, or
           if a segment's length does not match its index range
    """
    num_segments = len(audio_segments)
    assert num_segments == segment_indices.shape[0] # Make sure it's the same number of segments

    total_len = segment_indices[-1, 1]
    if duration is not None:
        total_len = max(duration, total_len)

    # Running sum and coverage count. This gives the same result as a NaN-aware
    # mean over one column per segment, without allocating a
    # (samples x segments) matrix.
    acc = np.zeros(total_len)
    cnt = np.zeros(total_len)

    for aud, seg in zip(audio_segments, segment_indices):
        assert seg[1]-seg[0] == len(aud) # for debugging only
        aud = np.asarray(aud, dtype=float)
        valid = ~np.isnan(aud) # NaN samples do not take part in the average
        acc[seg[0]:seg[1]] += np.where(valid, aud, 0.0)
        cnt[seg[0]:seg[1]] += valid

    # Take the average; uncovered samples are 0/0 and deliberately become NaN
    with np.errstate(invalid='ignore', divide='ignore'):
        audio = acc / cnt

    #  Normalize
    if normalize and audio.size:
        peak = np.nanmax(np.abs(audio)) # ignore the NaN gaps when looking for the peak
        if peak > 0: # leave silent (or all-NaN) audio untouched
            audio = 0.9999847412109375/peak * audio # (1-1/2^16) / (max(abs(y)))

    return audio

def filter_audio(audio, fs=16000, band=(3800, 4100), order=5):
    """
    Remove a frequency band from the audio with a zero-phase Butterworth
    band-stop filter (applied forward and backward with filtfilt).

    audio: audio time series; filtfilt needs it to be longer than its default
           padding (3 * the number of filter coefficients)
    fs: sampling frequency of the audio, in Hz. The default of 16000 gives the
        Nyquist frequency of 8000 Hz that was previously hard-coded
    band: (low, high) edges of the band to remove, in Hz
    order: order of the Butterworth filter

    returns the filtered audio, same shape as the input
    """
    nyquist = fs / 2
    # butter/filtfilt live in scipy.signal, which this file imports as `signal`
    b, a = signal.butter(order, [band[0] / nyquist, band[1] / nyquist], 'bandstop')
    audio_out = signal.filtfilt(b, a, audio)
    return audio_out

def propose_audio_segment(N_crowd, N_voice, fs, duration, from_start=False, til_end=False, seed=42):
    """Randomly choose where to cut a crowd clip and a voice clip, and where to
    place the voice inside the crowd clip.
    Both audios are assumed to share the sampling frequency fs.

    N_crowd, N_voice: number of data points of the crowd / voice audio
    fs: sampling frequency, in Hz
    duration: length of the sample to create, in sec
    from_start: merge the sampled voice from the very start
    til_end: voice continues to the end of the sample
    seed: seed of numpy's global random generator (it is re-seeded on every call)

    returns start_crowd_index, end_crowd_index, start_voice_index,
            end_voice_index, start_merge_index
    """
    np.random.seed(seed)

    # number of data points of the sample to create
    clip_len = int(duration * fs)

    # Where to start clipping the crowd and the voice (drawn in this order)
    start_crowd_index = np.random.randint(0, N_crowd - clip_len)
    start_voice_index = np.random.randint(0, N_voice)
    end_crowd_index = start_crowd_index + clip_len # crowd clip always spans the whole sample

    # Offset of the voice inside the sample
    start_merge_index = 0 if from_start else np.random.randint(0, clip_len)

    # Number of voice data points to keep: everything left after the offset,
    # or a random part of it
    voice_span = clip_len - start_merge_index
    if not til_end:
        voice_span = np.random.randint(0, voice_span)

    # The voice clip cannot run past the end of the voice audio
    end_voice_index = min(voice_span + start_voice_index, N_voice)

    return start_crowd_index, end_crowd_index, start_voice_index, end_voice_index, start_merge_index

def draw_audio_samples(df_crowd, df_voice, n_samples=10000, duration=4.096, merged_fs=16000, seed=42,
                      crowd_offset=0, voice_offset=0, shuffle=True, from_start=False, til_end=False):
    """
    Propose n_samples crowd/voice pairs, with the cut positions and mixing
    weights needed to merge each pair. No audio is loaded here; only the
    durations listed in the tables are used.

    df_crowd, df_voice: tables to draw samples from
    n_samples: number of samples to create
    duration: duration of the sample, in sec. Only tracks longer than
              duration + 1 sec are used
    merged_fs: sampling frequency (Hz) after merging
    seed: random seed
    crowd_offset: starting row index of the crowd table to draw
    voice_offset: starting row index of the voice table to draw
    shuffle: shuffle before sampling
    from_start, til_end: passed on to propose_audio_segment

    returns a table with one row per proposed sample. A sample whose segment
            proposal fails is reported and left out, so the table can have
            fewer than n_samples rows.
    raises ValueError if samples are requested but no crowd or voice track is
           long enough
    """
    columns = ['crowd_row', 'crowd_path', 'crowd_duration', 'crowd_fs',
               'crowd_start_index','crowd_end_index', 'crowd_weight',
               'crowd_sample_duration', # duration of subset
               'voice_row', 'voice_path', 'voice_duration', 'voice_fs',
               'voice_start_index', 'voice_end_index', 'voice_weight',
               'voice_gender', 'voice_sample_duration', # duration of voice subset sample
               'start_merge_index', 'merged_fs']

    # Keep only the tracks that are long enough
    df_crowd = df_crowd.loc[df_crowd['secs'] > duration+1, :]
    df_voice = df_voice.loc[df_voice['trackDuration'] > duration+1, :]

    # Shuffle the dataframes
    if shuffle:
        df_crowd = df_crowd.sample(frac=1, replace=False, random_state=seed)
        df_voice = df_voice.sample(frac=1, replace=False, random_state=seed+1)

    # The rows are addressed by position 0..nrows-1 below, so the index must be
    # rebuilt after filtering whether or not the tables were shuffled
    df_crowd = df_crowd.reset_index(drop=True)
    df_voice = df_voice.reset_index(drop=True)

    nrows_crowd = df_crowd.shape[0]
    nrows_voice = df_voice.shape[0]
    if n_samples > 0 and (nrows_crowd == 0 or nrows_voice == 0):
        raise ValueError('No crowd or voice track is longer than duration + 1 seconds')

    records = [] # one dict per proposed sample; turned into a table in one go
    kept = []    # sample numbers of the proposals that succeeded

    printProgressBar(0, n_samples, prefix='Samples:',suffix='Complete',length=50,mode="counts")
    for n in range(n_samples): # generate the samples over and over
        printProgressBar(n+1,n_samples,prefix='Samples:',suffix='Complete',length=50,mode="counts")
        index_crowd = (n + crowd_offset) % nrows_crowd # cycling through the rows
        index_voice = (n + voice_offset) % nrows_voice

        crowd_path, crowd_duration, crowd_fs = get_crowd(df_crowd, index_crowd)
        voice_path, voice_duration, voice_fs = get_voice(df_voice, index_voice)

        # Draw the cut positions. The track lengths are converted to whole
        # numbers of data points at the merged sampling frequency.
        try:
            crowd_start, crowd_end, voice_start, voice_end, start_merge = \
                propose_audio_segment(int(crowd_duration * merged_fs),
                                      int(voice_duration * merged_fs),
                                      merged_fs,
                                      duration=duration,
                                      seed=seed + n,
                                      from_start=from_start, til_end=til_end)
        except (ValueError, TypeError) as err:
            # e.g. a missing duration in the table: report it and go to next sample
            print('\nSkipping sample {}: {}'.format(n, err))
            continue

        records.append({
            'crowd_row': index_crowd, 'crowd_path': crowd_path,
            'crowd_duration': crowd_duration, 'crowd_fs': crowd_fs,
            'crowd_start_index': crowd_start, 'crowd_end_index': crowd_end,
            'crowd_sample_duration': (crowd_end - crowd_start) / merged_fs,
            'voice_row': index_voice, 'voice_path': voice_path,
            'voice_duration': voice_duration, 'voice_fs': voice_fs,
            'voice_start_index': voice_start, 'voice_end_index': voice_end,
            'voice_gender': df_voice.loc[index_voice, 'sex'],
            'voice_sample_duration': (voice_end - voice_start) / merged_fs,
            'start_merge_index': start_merge,
        })
        kept.append(n)

    df = pd.DataFrame(records, columns=columns)

    # The rest of the columns does not depend on the loaded data
    # Target sampling frequency
    df['merged_fs'] = merged_fs

    # weights: one is drawn per requested sample so that a skipped sample does
    # not shift the weights of the samples after it
    np.random.seed(seed+9527) # avoid using the same seed
    weights = np.random.uniform(0.3, 0.7, size=n_samples)
    df['crowd_weight'] = weights[np.asarray(kept, dtype=int)]
    df['voice_weight'] = 1 - df['crowd_weight']

    return df

def printProgressBar (iteration, total, prefix = 'Progress', suffix = 'Complete', decimals = 1, length = 100, fill = '█', mode='percentage'):
    """
    Redraw a one-line text progress bar; meant to be called on every pass of a loop.
    @params:
        iteration   - Required  : current iteration (Int)
        total       - Required  : total iterations (Int)
        prefix      - Optional  : text shown before the bar (Str)
        suffix      - Optional  : text shown after the bar (Str)
        decimals    - Optional  : number of decimals of the percentage (Int)
        length      - Optional  : width of the bar in characters (Int)
        fill        - Optional  : character used for the completed part (Str)
        mode        - Optional  : "percentage" shows e.g. 90.0%, "counts" shows e.g. 90 / 100;
                                  any other value draws nothing

    # Typical use:
        printProgressBar(0, len(items), prefix = 'Progress:', length = 50)
        for i, item in enumerate(items):
            ...  # work on item
            printProgressBar(i + 1, len(items), prefix = 'Progress:', length = 50)

    Every line starts and ends with a carriage return so that the next call
    overwrites it. Once iteration reaches total, a line break and 'ended' are printed.
    """
    n_filled = int(length * iteration // total)
    bar = ''.join([fill * n_filled, '-' * (length - n_filled)])

    line = None
    if mode == "percentage":
        percent = '{:.{}f}'.format(100 * (iteration / float(total)), decimals)
        line = '\r%s |%s| %s%% %s' % (prefix, bar, percent, suffix)
    elif mode == "counts":
        line = '\r%s  |%s| %d / %d %s' % (prefix, bar, iteration, total, suffix)

    if line is not None:
        print(line, end = '\r')

    # Print New Line on Complete
    if total <= iteration:
        print('\nended')

In [4]:
# Load the track listings of the two datasets
df_crowd = pd.read_csv(crowd_csv_path)
df_voice = pd.read_csv(voice_csv_path)
# Propose the samples. With from_start=True and til_end=True the voice is merged
# from the very start of each sample and continues to its end.
df = draw_audio_samples(df_crowd, df_voice, n_samples=n_samples, duration=duration, merged_fs=merged_fs, seed=124, 
                        from_start=True, til_end=True)

# Save the proposed samples; the audio files themselves are written further down
df.to_csv(audio_sample_path, index=False)

Samples:  |--------------------------------------------------| 0 / 32 CompleteSamples:  |█-------------------------------------------------| 1 / 32 CompleteSamples:  |███-----------------------------------------------| 2 / 32 CompleteSamples:  |████----------------------------------------------| 3 / 32 CompleteSamples:  |██████--------------------------------------------| 4 / 32 CompleteSamples:  |███████-------------------------------------------| 5 / 32 CompleteSamples:  |█████████-----------------------------------------| 6 / 32 CompleteSamples:  |██████████----------------------------------------| 7 / 32 CompleteSamples:  |████████████--------------------------------------| 8 / 32 CompleteSamples:  |██████████████------------------------------------| 9 / 32 CompleteSamples:  |███████████████-----------------------------------| 10 / 32 CompleteSamples:  |█████████████████---------------------------------| 11 / 32 CompleteSamples:  |██████████████████--------

# Merge 2 audios

In [5]:
def percent_audio(audio, win=5, threshold=1E-3):
    """
    Percentage of the data points at which the audio is not silent.
    A data point counts as audio when the standard deviation of the win data
    points ending at it exceeds threshold. The first win-1 points have no full
    window and therefore never count as audio.

    audio: 1d audio time series
    win: size of the rolling window, in data points
    threshold: smallest rolling standard deviation regarded as audio
    returns the percentage, between 0 and 100
    """
    rolling_sd = pd.Series(audio).rolling(win).std()
    has_audio = rolling_sd > threshold
    return np.sum(has_audio) / len(rolling_sd) * 100 # percent that has audio

def load_sound(sound, fs=None, start=0, stop=None, normalize=False, always_2d=False):
    """
    Load an audio clip and optionally down-mix, resample and peak-normalize it.

    sound: path of an audio file (read with soundfile.read), or a tuple (y, fs)
           holding an already loaded time series and its sampling frequency
    fs: target sampling frequency in Hz. None keeps the rate of the source
    start, stop: accepted but currently ignored; the whole clip is always loaded
    normalize: scale the output so that its peak absolute value is 1 - 1/2^16
               (per channel for 2d output). A silent channel is left untouched.
    always_2d: return a (samples, channels) array; a single channel is duplicated
               into two identical channels. When False, a multi-channel clip is
               averaged down to one channel.

    returns y, N, fs_out
        y: the processed time series
        N: number of data points (rows) of y
        fs_out: sampling frequency of the SOURCE, i.e. before any resampling
    raises TypeError if sound is neither a str nor a tuple
    """
    if isinstance(sound, str):
        y, fs_out = sf.read(sound, always_2d=always_2d)
    elif isinstance(sound, tuple):
        y, fs_out = sound
        y = np.asarray(y)
    else:
        raise TypeError('Unrecognized input type for sound')

    if always_2d:
        if y.ndim == 1: # a 1d series passed as a tuple has no channel axis yet
            y = y[:, np.newaxis]
        if y.shape[1] == 1: # convert 1 channel to 2 channels
            y = np.c_[y, y]
    elif y.ndim > 1:
        # convert to monotone if not forced to load 2d
        y = np.mean(y, axis=1)

    # Resample the sounds if it is different from the target output.
    # The rates are passed by keyword: recent librosa releases no longer accept
    # them positionally, and older releases accept the keywords as well.
    if fs is not None and fs_out != fs:
        if y.ndim > 1:
            # librosa resamples along the last axis, hence the transposes
            y = librosa.resample(y.T, orig_sr=fs_out, target_sr=fs).T
        else:
            y = librosa.resample(y, orig_sr=fs_out, target_sr=fs)

    # Normalizing to -1 and 1
    if normalize:
        peak = np.max(np.abs(y), axis=0)
        peak = np.where(peak > 0, peak, 1.0) # avoid 0/0 -> NaN on silent input
        y = 0.9999847412109375 / peak * y # (1-1/2^16) / (max(abs(y)))

    # Num of data points
    N = y.shape[0]

    return y, N, fs_out

def create_merged_audios(crowd,voice,params,fs=16000,noise=None,stereo=True,
                         path=None,crowd_savepath=None,voice_savepath=None,
                         min_audio_percent=(30, 90)):
    """
    Cut a crowd clip and a voice clip, overlay them and optionally save the result.

    crowd: directory of the crowd wavefile, or a tuple (y, fs) for crowd
    voice: directory of the voice wavefile, or a tuple (y, fs) for voice
    params: [start_crowd_index, end_crowd_index,
             start_voice_index, end_voice_index,
             start_merge_index, crowd_weight, voice_weight]
             returned by propose_audio_segment / draw_audio_samples table
    fs: target sampling frequency to output the audio
    noise: add Gaussian noise on top of the merged track. Usually 0.05 or 5%
    stereo: load the audio as 2d stereo

    path: path to write a merged file
    crowd_savepath: path to write the segmented crowd
    voice_savepath: path to write the segmented voice

    min_audio_percent: minimum of audio percentage, as (voice, crowd).
                       Default 30% for voice, and 90% for crowd

    Returns a tuple (y, y_crowd, y_voice, v, c, long_enough):
        * y: merged crowd and voice sound clips, peak-normalized
        * y_crowd: the crowd segment
        * y_voice: the voice segment, zero-padded to the length of y_crowd
        * v, c: percentage of the voice / crowd segment that contains audio
        * long_enough: 1 if both percentages reach min_audio_percent, else 0.
          When 0, nothing is written and y, y_crowd and y_voice are None.
    """
    # Load the sounds
    y_crowd, _, _ = load_sound(crowd, fs=fs, always_2d=stereo, normalize=True)
    y_voice, _, _ = load_sound(voice, fs=fs, always_2d=stereo, normalize=True)

    # Getting the parameters
    start_crowd_index, end_crowd_index, start_voice_index, \
    end_voice_index, start_merge_index, crowd_weight, voice_weight = params

    start_crowd_index = int(start_crowd_index)
    end_crowd_index = int(end_crowd_index)
    start_voice_index = int(start_voice_index)
    end_voice_index = int(end_voice_index)
    start_merge_index = int(start_merge_index)

    # Subsetting along the time axis (works for both 1d mono and 2d stereo)
    y_crowd = y_crowd[start_crowd_index:end_crowd_index]
    y_voice = y_voice[start_voice_index:end_voice_index]

    target_len = y_crowd.shape[0]
    if target_len < end_crowd_index - start_crowd_index:
        print('There are crowd samples with duration less than that of requested segment length')

    # Place the voice at start_merge_index, then fit it to the length of the
    # crowd SEGMENT: zeros are added at the end, or the excess is cut off.
    channel_shape = y_voice.shape[1:] # () for mono, (channels,) for stereo
    lead = np.zeros((start_merge_index,) + channel_shape)
    y_voice = np.concatenate([lead, y_voice])[:target_len]
    tail = np.zeros((target_len - y_voice.shape[0],) + channel_shape)
    y_voice = np.concatenate([y_voice, tail])

    # Percent of audio in the sources (first channel only when stereo)
    v = percent_audio(y_voice[:, 0] if stereo else y_voice)
    c = percent_audio(y_crowd[:, 0] if stereo else y_crowd)

    long_enough = v >= min_audio_percent[0] and c >= min_audio_percent[1]
    if not long_enough:
        return None, None, None, v, c, int(long_enough)

    # Weighted average of the two
    y = y_crowd * crowd_weight + y_voice * voice_weight

    # Add noise
    if noise:
        y = y + np.random.randn(*y.shape) * noise

    # Normalize (per channel when stereo); a silent channel is left untouched
    peak = np.max(np.abs(y), axis=0)
    peak = np.where(peak > 0, peak, 1.0)
    y = 0.9999847412109375 / peak * y # (1-1/2^16) / (max(abs(y)))

    # Save the outputs with soundfile: librosa.output.write_wav is not available
    # in recent librosa releases. subtype='FLOAT' stores floating-point samples
    # rather than soundfile's default 16-bit integers for WAV.
    if path:
        sf.write(path, y, fs, subtype='FLOAT')

    if crowd_savepath:
        sf.write(crowd_savepath, y_crowd, fs, subtype='FLOAT') # save normalized, original segments

    if voice_savepath:
        sf.write(voice_savepath, y_voice, fs, subtype='FLOAT') # save normalized, original segments

    return y, y_crowd, y_voice, v, c, int(long_enough)

In [6]:
# Iterating through the proposed samples to create the merged audios
fmt = '{:03d}-{:06d}_{}.wav' # saved file format
batch_num = 0 # use to keep track of batches. For changing seeds of simulation, etc
n_samples = df.shape[0]

# Make sure the output directory exists before the first file is written
os.makedirs(save_dir, exist_ok=True)

printProgressBar(0, n_samples,prefix='Samples:',suffix='Complete',length=50,mode="counts")
for i in df.index:
    # Skip the samples already processed. The 'simName' column only exists
    # once at least one sample has been processed.
    if 'simName' in df.columns and not pd.isnull(df.loc[i, 'simName']):
        continue
    printProgressBar(i+1, n_samples,prefix='Samples:',suffix='Complete',length=50,mode="counts")
    params = df.loc[i, ['crowd_start_index', 'crowd_end_index', 'voice_start_index', 'voice_end_index', \
                        'start_merge_index', 'crowd_weight', 'voice_weight']]
    params = list(params)

    crowd = df.loc[i, 'crowd_path']
    voice = df.loc[i, 'voice_path']

    merge_name = fmt.format(batch_num, i, 'merged')
    merge_path = os.path.join(save_dir, merge_name)
    crowd_name = fmt.format(batch_num, i, 'crowd')
    crowd_savepath = os.path.join(save_dir, crowd_name)
    voice_name = fmt.format(batch_num, i, 'voice')
    voice_savepath = os.path.join(save_dir, voice_name)

    # Only the audio percentages and the long-enough flag are needed here
    _, _, _, voice_percent, crowd_percent, long_enough = create_merged_audios(
        crowd, voice, params, fs=merged_fs, path=merge_path,
        crowd_savepath=crowd_savepath, voice_savepath=voice_savepath)

    df.loc[i, 'simName'] = merge_name
    df.loc[i, 'simPath'] = merge_path #.replace(save_dir, aws_dir)
    df.loc[i, 'crowdSegName'] = crowd_name
    df.loc[i, 'crowdSegPath'] = crowd_savepath#.replace(save_dir, aws_dir)
    df.loc[i, 'voiceSegName'] = voice_name
    df.loc[i, 'voiceSegPath'] = voice_savepath#.replace(save_dir, aws_dir)
    df.loc[i, 'voiceDurPercent'] = voice_percent
    df.loc[i, 'crowdDurPercent'] = crowd_percent
    df.loc[i, 'longEnough'] = long_enough

    if i % 100 == 0: # write to file every 100 files made
        df.to_csv(audio_sample_path2, index=False)

# Get rid of the ones with no audio
df = df.loc[df['longEnough'] == 1, :].reset_index(drop=True)
df.to_csv(audio_sample_path2, index=False)

Samples:  |██████████████████████████████████████████████████| 32 / 32 Complete
ended


# Train-test split

In [9]:
# 75% of the samples go to training; the remaining 25% are split evenly
# between the test and the validation set
df_train, df_holdout = train_test_split(df, test_size=0.25, random_state=21)
df_test, df_val = train_test_split(df_holdout, test_size=0.5, random_state=20)

# Save each subset to its csv file
for subset, csv_path in ((df_train, train_dir), (df_test, test_dir), (df_val, val_dir)):
    subset.to_csv(csv_path, index=False)

# Inspecting the sound quality

In [10]:
index = 0 # row of the sample to listen to

# Read the merged sample and its two sources back from disk
merge, fs = sf.read(df.loc[index, 'simPath'])
voice, fs = sf.read(df.loc[index, 'voiceSegPath'])
crowd, fs = sf.read(df.loc[index, 'crowdSegPath'])

# Keep the first channel only
merge, voice, crowd = merge[:, 0], voice[:, 0], crowd[:, 0]

# One labelled audio player per clip
for label, clip in (('Merge', merge), ('Voice', voice), ('Crowd', crowd)):
    print(label)
    display.display(display.Audio(clip, rate=fs))

Merge


Voice


Crowd
