# Steps in data preparation

1. Load the audio using librosa
2. Get the duration using librosa.get_duration
3. Calculate each frame width in ms
4. Split the audio on VAD (Below 20db is silence)
5. For each split calculate mel (180 frames) 
6. np.transpose the data Ex: (1,40,180) to (180,1,40)

In [1]:
# All imports
import os, sys, logging
import datetime
import time, shutil, pickle
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity
from scipy.ndimage.filters import gaussian_filter
from configuration import get_config

config = get_config()
log_file = os.path.abspath("data-generation-for-uisrnn.logs")
logging.basicConfig(
    filename=log_file,
    level=logging.DEBUG,
    format="%(asctime)s:%(levelname)s:%(message)s"
    )
print(f'Log path: {log_file}')

%matplotlib inline

Namespace(noise_path='/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/noise', train_path='/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/train', test_path='/datadrive2/dalon/diarization-experiments/Speaker_Verification/data/VCTK-Corpus/test', tdsv=False, sr=16000, nfft=512, window=0.025, hop=0.01, tdsv_frame=160, tisv_frame=160, tisv_frame_min=50, hidden=768, proj=256, num_layer=3, restore=False, model_path='./tisv_model', model_num=3, train=False, N=4, M=5, noise_filenum=16, loss='softmax', optim='sgd', lr=0.001, beta1=0.5, beta2=0.9, iteration=100000, comment='', max_batch_utterances=1000)
Log path: /home/jovyan/work/voxsrc21-dia/data-prep/data-generation-for-uisrnn.logs


# All configurations below:

In [3]:
random_state = 222 # random seed
config.N = 64 # Number of speakers per batch
config.M = 10 # Number of utterences per speaker
config.iteration = 50000000 # Number of iterations to run
config.lr = 1e-3
config.hidden = 768 # hidden state dimension of lstm
config.proj = 256 # projection dimension of lstm
# config.restore = True
config.model_num = 1
logging.info(f'N={config.N}, M={config.M}')
logging.info(f'Model restore: {config.restore}, Model number: {config.model_num}')

# Configurations

#_____________ Parameters to tune on dev set _______________________
# VAD param
# Changing to 25, which will give slightly better intervals, 20 gives very short intervals
vad_threshold = 25 # threshold for voice activity detection

# Segment param
acceptable_shortseg_dur = 0.2 # in second
#^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# model parameters
model_path = '/home/jovyan/work/voxsrc21-dia/models/model.uisrnn-1' # model save path
dataset_path = '/home/jovyan/work/datasets/voxceleb-1/sample/wav'
save_dir_path = '/home/jovyan/work/voxsrc21-dia/embeddings'
os.makedirs(save_dir_path, exist_ok=True)

# Data Preparation

I'm saving only 2 embeddings i.e. first and last tisv_frames for given interval in an audio. So each .npy
embedding file will have a shape of (2, 256)

In [8]:
# Each embedding saved file will have (2, 256)

for folder in os.listdir(dataset_path):
    speakerid = folder
    folder = os.path.join(dataset_path, folder)
    # for base_id in os.listdir(data_path):
    #     for video_id in os.listdir(os.path.join(data_path, base_id)):
    #         if video_id.startswith('.'): #hidden folders
    #             continue;
    #         for audio_id in os.listdir(os.path.join(data_path, base_id, video_id)):
    
    for audio_file_name in os.listdir(folder):
        audio_path = os.path.join(folder, audio_file_name)
        audio_file_number = audio_file_name.split('.')[0].split('_')[1]
        utter, sr = librosa.core.load(
            audio_path, sr=config.sr)        # load audio
        # lower bound of utterance length
        utter_min_len = (config.tisv_frame *
                         config.hop + config.window) * sr
        # Get the duration
        duration = librosa.get_duration(utter, sr)
        # Duration of each window
        duration_per_frame = (duration / utter.shape[0])
#             print(f'Duration: {duration}\nDuration per frame: {duration_per_frame}s\nMin length of utterance: {utter_min_len * duration_per_frame}s')
        tisv_frame_duration_s = utter_min_len * duration_per_frame
        intervals = librosa.effects.split(
            utter, top_db=vad_threshold)         # voice activity detection

        for idx, current_interval in enumerate(intervals):
            utterances_spec = []
            # save first and last 160 frames of spectrogram.
            utter_part = utter[current_interval[0]:current_interval[1]]
            S = librosa.core.stft(y=utter_part, n_fft=config.nfft,
                                  win_length=int(config.window * sr), hop_length=int(config.hop * sr))
            S = np.abs(S) ** 2
            mel_basis = librosa.filters.mel(
                sr=sr, n_fft=config.nfft, n_mels=40)
            # log mel spectrogram of utterances
            S = np.log10(np.dot(mel_basis, S) + 1e-6)
        #         print(S.shape)
            utterances_spec.append(S[:, :config.tisv_frame])
            utterances_spec.append(S[:, -config.tisv_frame:])

            utterances_spec = np.array(utterances_spec)
            utter_batch = np.transpose(utterances_spec, axes=(
                2, 0, 1))     # transpose [frames, batch, n_mels]
        #         print(utter_batch.shape)

            data = sess.run(embedded, feed_dict={verif: utter_batch})
            save_embedding_path = os.path.join(
                save_dir_path, f'vctk-{speakerid}-{audio_file_number}-{idx}.npy')
            np.save(save_embedding_path, data)
#                 print(data.shape)


/home/jovyan/work/datasets/voxceleb-1/sample/wav/id10001/.DS_Store
/home/jovyan/work/datasets/voxceleb-1/sample/wav/id10001/1zcIwhmdeo4


IndexError: list index out of range

## structuring the dataset

In [10]:
from collections import defaultdict
list_of_embedding_path = sorted(os.listdir(save_dir_path))
print(f'Total number of files {len(list_of_embedding_path)}')
list_of_embedding_path[:5]

embedding_dict = defaultdict(list)

for file in list_of_embedding_path:
    embedding_dict[file.split('-')[1]].append(os.path.join(save_dir_path, file))

for key in embedding_dict:
    if len(embedding_dict[key]) < 5: # remove the keys if its embeddings is less than 5
        embedding_dict.pop(key)
        print(f'Poped {key}')

def shuffle_two(dict_of_two, train_sequence_path, train_ids):
    """Shuffle the given 2 labels"""
    max_to_pick = 5
    key1, key2 = dict_of_two.keys()
#     print(len(dict_of_two[key1]), len(dict_of_two[key2]))
    while dict_of_two[key1] and dict_of_two[key2]:
#         print('in while')
        no_to_pick = np.random.randint(1, max_to_pick)
        if no_to_pick <= len(dict_of_two[key1]):
#             print(no_to_pick, len(dict_of_two[key1]))
            train_sequence_path.extend(dict_of_two[key1][:no_to_pick])
            del dict_of_two[key1][:no_to_pick]
            train_ids.extend([key1] * no_to_pick)
        else: break
        no_to_pick = np.random.randint(1, max_to_pick)
        if no_to_pick <= len(dict_of_two[key2]):
#             print(no_to_pick)
            train_sequence_path.extend(dict_of_two[key2][:no_to_pick])
            del dict_of_two[key2][:no_to_pick]
            train_ids.extend([key2] * no_to_pick)
        else: break
    no_to_pick = len(dict_of_two[key1])
    train_sequence_path.extend(dict_of_two[key1])
    train_ids.extend([key1] * no_to_pick)
    no_to_pick = len(dict_of_two[key2])
    train_sequence_path.extend(dict_of_two[key2])
    train_ids.extend([key2] * no_to_pick)

Total number of files 93417


In [11]:
train_sequence_path = []
train_ids = []
while len(embedding_dict) >= 2:
    first2pairs = {k: embedding_dict[k] for k in list(embedding_dict)[:2]}
    shuffle_two(first2pairs, train_sequence_path, train_ids)
    # remove the keys from embedding_dict
    for key in first2pairs:
        embedding_dict.pop(key)
print(len(embedding_dict), len(train_sequence_path), len(train_ids))

0 93417 93417


In [12]:
# train_sequence_path[81234:81239], train_ids[81234:81239]

In [13]:
train_sequence = []
train_cluster_id = []
for idx, item in enumerate(train_sequence_path):
    embeddings = np.load(item)
    train_sequence.extend(embeddings.tolist())
    train_cluster_id.extend([train_ids[idx], train_ids[idx]])
#     break

In [14]:
len(train_sequence), len(train_cluster_id)

(186834, 186834)

In [15]:
np.savez('/datadrive/dalon/diarizer-dataset/VCTK-Corpus/vctk_training_data.npz',
         train_sequence=train_sequence, train_cluster_id=train_cluster_id)

# Prepare test dataset

In [88]:
videoid = 'zPFptdATk_s'
save_utter_label_interval = f'/datadrive/dalon/uis-rnn/Notebooks/{videoid}_5min.b'
save_test_data = f'/datadrive/dalon/uis-rnn/Notebooks/{videoid}_test.npz' # it will have embeddings and data

In [89]:
with open(save_utter_label_interval, 'rb') as f:
    _tmp = pickle.load(f)

In [90]:
embeddings = []
# Each embedding saved file will have (2, 256)
with tf.Session(config=config_tensorflow) as sess:
    tf.global_variables_initializer().run()
    saver.restore(sess, model_path)

    utter = _tmp['utter']
    sr = config.sr
    utter_min_len = (config.tisv_frame * config.hop + config.window) * sr    # lower bound of utterance length
    # Get the duration
    duration = librosa.get_duration(utter, sr)
    # Duration of each window
    duration_per_frame = (duration / utter.shape[0])
#             print(f'Duration: {duration}\nDuration per frame: {duration_per_frame}s\nMin length of utterance: {utter_min_len * duration_per_frame}s')
    tisv_frame_duration_s = utter_min_len * duration_per_frame
    intervals = _tmp['intervals']

    for idx, current_interval in enumerate(intervals):
        utterances_spec = []
        utter_part = utter[current_interval[0]:current_interval[1]]         # save first and last 160 frames of spectrogram.
        S = librosa.core.stft(y=utter_part, n_fft=config.nfft,
                              win_length=int(config.window * sr), hop_length=int(config.hop * sr))
        S = np.abs(S) ** 2
        mel_basis = librosa.filters.mel(sr=sr, n_fft=config.nfft, n_mels=40)
        S = np.log10(np.dot(mel_basis, S) + 1e-6)           # log mel spectrogram of utterances
#         print(S.shape)
        utterances_spec.append(S[:, :config.tisv_frame])
        utterances_spec.append(S[:, -config.tisv_frame:])

        utterances_spec = np.array(utterances_spec)
        utter_batch = np.transpose(utterances_spec, axes=(2,0,1))     # transpose [frames, batch, n_mels]
        data = sess.run(embedded, feed_dict={verif:utter_batch})
        embeddings.extend(data)

INFO:tensorflow:Restoring parameters from /datadrive/dalon/models/m-64-10-768-256/Check_Point/model.ckpt-46


In [92]:
test_cluster_ids = []
for item in _tmp['labels_list']:
    test_cluster_ids.extend([item, item])

In [93]:
len(test_cluster_ids)

542

In [94]:
np.savez(save_test_data,
         test_sequences=embeddings,
         test_cluster_ids=test_cluster_ids)

# Create single npz file for testing

In [206]:
videoids = ['zPFptdATk_s', 'VqF96Um0HQw']
save_test_files = []
for videoid in videoids:
    save_test_files.append(f'/datadrive/dalon/uis-rnn/Notebooks/{videoid}_test.npz')

In [207]:
test_sequences = []
test_cluster_ids = []

In [208]:
512//125

4

In [209]:
# convert these into smaller chunks
def split_to_chunks(test_sequences, test_cluster_ids, ids, seqs):
    max_size = 125
    start = 0
    for i in range(len(ids) // max_size):
        test_sequences.append(seqs[start:max_size * (i + 1)])
        test_cluster_ids.append(ids[start:max_size * (i + 1)])
        start = max_size * (i + 1)
    test_sequences.append(seqs[start:])
    test_cluster_ids.append(ids[start:])

In [210]:
for file in save_test_files:
    _tmp = np.load(file)
    if len(list(_tmp['test_cluster_ids'])) > 150:
        split_to_chunks(test_sequences,
                        test_cluster_ids,
                        list(_tmp['test_cluster_ids']),
                        np.float64(_tmp['test_sequences']))
    else:
        test_sequences.append(np.float64(_tmp['test_sequences']))
        test_cluster_ids.append(list(_tmp['test_cluster_ids']))

In [211]:
test_sequences = np.array(test_sequences)
test_cluster_ids = np.array(test_cluster_ids)

In [212]:
test_data_path = '/datadrive/dalon/uis-rnn/data/testing_data_custom_2vid.npz'
np.savez(test_data_path,
         test_sequences=test_sequences,
         test_cluster_ids=test_cluster_ids)

## VCTK data prep for UIS-RNN

In [25]:
import os
from glob import glob
from collections import defaultdict

In [22]:

# path to vctk dataset
# There are sub folders for each speaker with wav file inside them
dataset_path = '/datadrive/dalon/diarizer-dataset/VCTK-Corpus/wav48/'


In [27]:
speakers_desc = defaultdict(list) # speaker id is the key and value is list of utterences
for folder in os.listdir(dataset_path):
    speakerid = folder
    folder = os.path.join(dataset_path, folder)
    for utter_path in os.listdir(folder):
        utter_path = os.path.join(folder, utter_path)
        speakers_desc[f'vctk-{speakerid}'].append(utter_path)

In [30]:
utter_desc = []
for key in speakers_desc:
#     print(f'{key} = {len(speakers_desc[key])}')
    utter_desc.append(len(speakers_desc[key]))
utter_desc = np.array(utter_desc)

In [31]:
utter_desc.min(), utter_desc.max(), utter_desc.mean()

(172, 503, 405.89908256880733)