In [1]:
import os
import logging
import datetime
import time
import math
import json
import librosa
import numpy as np
from utils import normalize

import tensorflow as tf
from tensorflow.contrib import rnn
from sklearn.preprocessing import normalize as sk_normalize
from sklearn.cluster import KMeans
from scipy.ndimage.filters import gaussian_filter

from collections import defaultdict
from configuration import get_config
from rttm import load_rttm
from VAD_segments import VAD_chunk

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Namespace(audio_file='../data/voxconverse/sample/abjxc.wav', hidden=768, hop=0.01, log_path='main.logs', model_num=3, model_path='../models/model.ckpt-46', nfft=512, num_layer=3, number_of_speakers=2, output_dir='output.json', proj=256, random_state=123, restore=False, sr=16000, srt_path='xxx.en.srt', tdsv=False, tdsv_frame=160, tisv_frame=160, tisv_frame_min=50, window=0.025)


In [2]:
# Fetch the configuration object and point this notebook's log at its own file.
config = get_config()
config.log_path = 'voxconverse-sample-embeddings.logs'

# Resolve the absolute location once so the logger and the printed message agree.
log_file = os.path.abspath(config.log_path)
logging.basicConfig(format="%(asctime)s:%(levelname)s:%(message)s",
                    level=logging.DEBUG,
                    filename=log_file)
print(f'Log path: {log_file}')

Log path: /home/jovyan/work/voxsrc21-dia/embeddings/voxconverse-sample-embeddings.logs


In [3]:
# Input locations (wav + rttm) and the output directory for the embedding sequences.
# NOTE(review): these are absolute paths tied to one machine's directory layout.
data_path = '/home/jovyan/work/voxsrc21-dia/data/voxconverse/sample/wav'
rttm_path = '/home/jovyan/work/voxsrc21-dia/data/voxconverse/sample/rttm'
save_dir_path = '/home/jovyan/work/voxsrc21-dia/embeddings/sequences/voxconverse-sample'
os.makedirs(save_dir_path, exist_ok=True)

# Data prep
# Original note: only 2 embeddings are kept per interval (first and last tisv_frames), so each .npy
# embedding file has a shape of (2, 256).
# NOTE(review): the extraction cell further down saves one concatenated sequence instead - confirm
# whether this note is still current.
tf.reset_default_graph()
batch_size = 2 # Fixing to 2 since we take 2 for each interval #utter_batch.shape[1]
verif = tf.placeholder(shape=[None, batch_size, 40], dtype=tf.float32)  # verification batch (time x batch x n_mel)
# Concatenating a single-element list leaves the tensor values unchanged; `batch` mirrors `verif`.
batch = tf.concat([verif,], axis=1)
# embedding lstm (3-layer default)
with tf.variable_scope("lstm"):
    # One projected LSTM cell per layer; sizes come from config.hidden / config.proj / config.num_layer.
    lstm_cells = [tf.contrib.rnn.LSTMCell(num_units=config.hidden, num_proj=config.proj) for i in range(config.num_layer)]
    lstm = tf.contrib.rnn.MultiRNNCell(lstm_cells)    # make lstm op and variables
    outputs, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=batch, dtype=tf.float32, time_major=True)   # for TI-VS must use dynamic rnn
    embedded = outputs[-1]                            # the last output (final time step) is the embedded d-vector
    embedded = normalize(embedded)                    # normalize (helper imported from utils)
# Session options with the GPU device count set to 0.
config_tensorflow = tf.ConfigProto(device_count = {'GPU': 0})
# Saver over every global variable currently in the graph; used later to restore the checkpoint.
saver = tf.train.Saver(var_list=tf.global_variables())

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [4]:
def concat_segs(times, segs):
    """Merge back-to-back voiced segments into longer continuous runs.

    Two consecutive segments are treated as continuous when the offset of the
    first is exactly equal (``==``) to the onset of the next.

    Args:
        times: sequence of ``(onset, offset)`` pairs, one per segment.
        segs: sequence of sample sequences aligned index-by-index with
            ``times`` (assumed to be 1-D audio sample arrays - TODO confirm
            against VAD_chunk).

    Returns:
        ``(concat_seg, concat_times)`` where ``concat_seg`` is a list with one
        concatenated array per continuous run and ``concat_times`` is a list of
        2-element arrays ``[onset, offset]``, one per run. Both lists are empty
        when ``times`` is empty.
    """
    concat_seg = []
    concat_times = []
    if len(times) == 0:
        # Nothing voiced: return empty results instead of failing on segs[0].
        return concat_seg, concat_times

    # Collect the pieces of the current run and join them once when the run
    # ends, rather than re-copying the growing array on every step.
    run_parts = [segs[0]]
    run_onset = times[0][0]
    for i in range(1, len(times)):
        if times[i - 1][1] == times[i][0]:
            # Still inside the same continuous run.
            run_parts.append(segs[i])
            continue
        # Gap found: close the current run at the previous segment's offset.
        concat_seg.append(np.concatenate(run_parts))
        concat_times.append(np.array([run_onset, times[i - 1][1]]))
        run_parts = [segs[i]]
        run_onset = times[i][0]

    # Close the last run. Indexing times[-1] (instead of reusing the loop
    # variable) keeps this correct when there is only one segment.
    concat_seg.append(np.concatenate(run_parts))
    concat_times.append(np.array([run_onset, times[-1][1]]))
    return concat_seg, concat_times

def get_STFTs(segs, time_segs=None):
    """Cut each segment's log-mel spectrogram into pairs of overlapping windows.

    Every pair holds two 240 ms windows offset by half a window (50% overlap);
    consecutive pairs start one full window apart. A pair is only emitted while
    ``j + 1.5 * window < n_frames`` (strict), so the trailing frames of a
    segment that cannot satisfy this are dropped.

    Args:
        segs: iterable of audio sample arrays (one per continuous segment).
        time_segs: optional, accepted so that callers may pass the segment
            times alongside the segments; it is not used by the computation.

    Returns:
        ``np.array`` built from the list of ``[first_window, second_window]``
        pairs, each window being a ``(40, frames_per_window)`` log-mel slice.
    """
    sr = config.sr
    # Window geometry in spectrogram frames (24 / 12 / 36 for a 10 ms hop).
    # round() guards against float division landing just below the integer.
    frames_per_window = int(round(.24 / config.hop))
    half_window = frames_per_window // 2
    pair_span = frames_per_window + half_window
    # The mel filterbank only depends on the configuration, so build it once.
    mel_basis = librosa.filters.mel(sr=sr, n_fft=config.nfft, n_mels=40)

    STFT_windows = []
    for seg in segs:
        S = librosa.core.stft(y=seg, n_fft=config.nfft, win_length=int(config.window * sr), hop_length=int(config.hop * sr))
        S = np.abs(S) ** 2
        # log mel spectrogram of utterances
        S = np.log10(np.dot(mel_basis, S) + 1e-6)
        for j in range(0, S.shape[1], frames_per_window):
            if j + pair_span >= S.shape[1]:
                break
            # in order to fit on the expected shape of the embedding network we double the window
            STFT_windows.append([S[:, j:j + frames_per_window], S[:, j + half_window:j + pair_span]])
    return np.array(STFT_windows)

def align_embeddings(embeddings):
    """Average window-level embeddings into segment-level embeddings.

    Embedding ``i`` is treated as covering the time span
    ``[i * 0.12, i * 0.12 + 0.24)`` seconds. Embeddings are grouped in order;
    a new group is started at the first embedding whose end time reaches the
    next multiple of 0.401 s, and each group is replaced by its mean.

    Args:
        embeddings: 2-D array-like of shape ``(n_windows, dim)``.

    Returns:
        float64 array of shape ``(n_groups, dim)``. An empty input yields an
        array with zero rows instead of a row of NaNs.
    """
    window_shift = .12   # seconds between consecutive window starts
    window_len = .24     # seconds covered by one window
    segment_len = .401   # target segment length; slightly above 0.4 s, which avoids exact ties

    embeddings = np.asarray(embeddings)
    # Take the width from the data; 256 is only the fallback for inputs that
    # carry no second dimension (e.g. an empty list).
    n_dims = embeddings.shape[1] if embeddings.ndim == 2 else 256
    n_windows = embeddings.shape[0]
    if n_windows == 0:
        return np.zeros((0, n_dims))

    partitions = []
    start = 0
    segment_idx = 1
    for i in range(n_windows):
        if (i * window_shift) + window_len >= segment_idx * segment_len:
            # Window i ends past the current segment boundary: close the group
            # before it and let window i open the next one.
            partitions.append((start, i))
            start = i
            segment_idx += 1
    partitions.append((start, n_windows))

    avg_embeddings = np.zeros((len(partitions), n_dims))
    for row, (lo, hi) in enumerate(partitions):
        avg_embeddings[row] = np.average(embeddings[lo:hi], axis=0)
    return avg_embeddings

In [5]:
all_unique_extensions = []
# Using List as default factory: both mappings go from audio id to a list of paths.
audio_files = defaultdict(list)
rttm_files = defaultdict(list)

for audio_file in os.listdir(data_path):
    # Skip dot-prefixed (hidden) entries.
    if audio_file.startswith('.'):
        continue
    audio_id, extension = os.path.splitext(audio_file)
    all_unique_extensions.append(extension)
    if extension != '.wav':
        print(f'Wrong file type in {os.path.join(data_path, audio_file)}')
        continue
    # Register the wav file and the rttm path that shares its id.
    audio_files[audio_id].append(os.path.join(data_path, audio_file))
    rttm_files[audio_id].append(os.path.join(rttm_path, audio_id + '.rttm'))

audio_quantity = len(audio_files)
print(f'Unique file extensions: {set(all_unique_extensions)}')
print(f'Number of audios: {audio_quantity}')
print(f'Number of rttms: {len(rttm_files)}')

Unique file extensions: {'.wav'}
Number of audios: 19
Number of rttms: 19


In [6]:
# Print, for every recording, the length of the `speakers` value returned by load_rttm.
# The loop variable is named `rttm_paths` so that it does not overwrite the global
# `rttm_path` directory string, which the file-discovery cell needs when re-run.
for audio_id, rttm_paths in rttm_files.items():
    _, speakers, _ = load_rttm(rttm_paths[0])
    print(audio_id, len(speakers))

bwzyf 4
aufkn 3
akthc 2
aisvi 8
blwmj 2
ahnss 4
bspxd 3
bkwns 2
bdopb 7
bxpwa 5
abjxc 1
ampme 3
azisu 4
afjiv 5
asxwr 3
bravd 3
bydui 3
atgpi 1
bauzd 5


In [14]:
# Load the turns from the RTTM file registered for recording 'abjxc'.
turns, _, _ = load_rttm(rttm_files.get('abjxc')[0])

# Attributes read from the first turn. Only the value of the cell's last
# expression (speaker_id) is shown as the cell output.
turns[0].onset
turns[0].offset
turns[0].dur
turns[0].speaker_id

'spk00'

In [15]:
# NOTE(review): `audio_id` is not assigned in this cell; it holds whatever value an
# earlier cell left in the kernel - confirm the intended recording and set it explicitly.
times, segs = VAD_chunk(2, audio_files.get(audio_id)[0])

In [51]:
# Merge back-to-back VAD segments; concat_segs returns the merged segments and their times.
concat_seg, concat_times = concat_segs(times, segs)

In [61]:
# This call relies on get_STFTs accepting a second `time_segs` argument
# (the signature defined in cell In [60]).
STFT_windows = get_STFTs(concat_seg, concat_times)

709 [0.1  7.18]
127 [7.32 8.58]
273 [ 8.74 11.46]
275 [11.5  14.24]
661 [14.28 20.88]
1017 [21.46 31.62]
249 [31.76 34.24]
1161 [34.54 46.14]
349 [46.26 49.74]
5745 [ 49.88 107.32]
701 [107.96 114.96]
817 [115.66 123.82]
514 [123.9  129.04]
579 [129.08 134.86]
1345 [134.94 148.38]
397 [148.46 152.42]
509 [153.12 158.2 ]
809 [158.4  166.48]
1061 [166.94 177.54]
181 [178.02 179.82]
329 [180.1  183.38]
361 [183.94 187.54]
691 [188.12 195.02]
1277 [195.38 208.14]
1489 [208.44 223.32]
291 [223.34 226.24]
865 [226.42 235.06]
473 [235.7  240.42]
1519 [240.62 255.8 ]
389 [255.92 259.8 ]
135 [260.78 262.12]
213 [262.32 264.44]
609 [264.48 270.56]
465 [271.2  275.84]
287 [276.   278.86]
123 [279.02 280.24]
81 [280.5 281.3]
241 [281.62 284.02]
175 [284.24 285.98]
205 [286.02 288.06]
1305 [288.44 301.48]
285 [301.66 304.5 ]
791 [305.34 313.24]
383 [313.26 317.08]
301 [317.16 320.16]
411 [320.32 324.42]
627 [325.22 331.48]
133 [331.5  332.82]
597 [332.98 338.94]
179 [339.36 341.14]
225 [342.04 344.

In [69]:
# Number of window pairs, and the shape of the first pair.
print(len(STFT_windows), STFT_windows[0].shape)

1900 (2, 40, 24)


In [60]:
def get_STFTs(segs, time_segs=None):
    """Cut each segment's log-mel spectrogram into pairs of overlapping windows.

    Every pair holds two 240 ms windows offset by half a window (50% overlap);
    consecutive pairs start one full window apart. A pair is only emitted while
    ``j + 1.5 * window < n_frames`` (strict), so the trailing frames of a
    segment that cannot satisfy this are dropped.

    Args:
        segs: iterable of audio sample arrays (one per continuous segment).
        time_segs: optional ``[onset, offset]`` per segment. It is not used by
            the computation, so it defaults to None and one-argument calls
            keep working. Original author's note: the frame count of segment
            ``i`` is roughly ``math.ceil((time_segs[i][1] - time_segs[i][0])*100+1)``.

    Returns:
        ``np.array`` built from the list of ``[first_window, second_window]``
        pairs, each window being a ``(40, frames_per_window)`` log-mel slice.
    """
    sr = config.sr
    # Window geometry in spectrogram frames (24 / 12 / 36 for a 10 ms hop).
    # round() guards against float division landing just below the integer.
    frames_per_window = int(round(.24 / config.hop))
    half_window = frames_per_window // 2
    pair_span = frames_per_window + half_window
    # The mel filterbank only depends on the configuration, so build it once.
    mel_basis = librosa.filters.mel(sr=sr, n_fft=config.nfft, n_mels=40)

    STFT_windows = []
    for seg in segs:
        S = librosa.core.stft(y=seg, n_fft=config.nfft, win_length=int(config.window * sr), hop_length=int(config.hop * sr))
        S = np.abs(S) ** 2
        # log mel spectrogram of utterances
        S = np.log10(np.dot(mel_basis, S) + 1e-6)
        for j in range(0, S.shape[1], frames_per_window):
            if j + pair_span >= S.shape[1]:
                break
            # in order to fit on the expected shape of the embedding network we double the window
            STFT_windows.append([S[:, j:j + frames_per_window], S[:, j + half_window:j + pair_span]])
    return np.array(STFT_windows)

In [None]:
# Extract embeddings
# For every recording: VAD -> merged segments -> log-mel window pairs -> d-vectors ->
# segment-level averages. The averages of all recordings are concatenated into one
# (N, 256) array and saved together with one cluster id per row.
with tf.Session(config=config_tensorflow) as sess:
    tf.global_variables_initializer().run()
    saver.restore(sess, config.model_path)

    audio_count = 0
    train_sequence = np.array([]).reshape(0,256)
    train_cluster_ids = []

    for audio_id, audio_paths in audio_files.items():
        logging.info(f'loading {audio_id} {audio_count}/{audio_quantity}')

        # audio_files maps every id to a *list* of paths; VAD_chunk is given a single path.
        audio_path = audio_paths[0]

        # voice activity detection
        times, segs = VAD_chunk(2, audio_path)
        # concat_segs returns (segments, times); unpack so that only the segments
        # are iterated by get_STFTs.
        concat_seg, concat_times = concat_segs(times, segs)
        STFT_windows = get_STFTs(concat_seg, concat_times)
        # print(len(STFT_windows), STFT_windows[0].shape)

        # Collect the per-pair outputs and join them once. The empty float64 seed keeps
        # the result shaped (0, 256) when there are no windows and keeps the dtype float64.
        embedding_batches = [np.array([]).reshape(0,256)]
        for STFT_window in STFT_windows:
            # (2, 40, 24) -> (24, 2, 40): time x batch of 2 x mels, as the placeholder expects
            STFT_batch = np.transpose(STFT_window, axes=(2,0,1))
            embedding_batches.append(sess.run(embedded, feed_dict={verif:STFT_batch}))
        embeddings = np.concatenate(embedding_batches)

        print(len(STFT_windows))
        print(len(embeddings))
        aligned_embeddings = align_embeddings(embeddings) # Turn window-level embeddings to segment-level (400ms)
        print(len(aligned_embeddings))

        train_sequence = np.concatenate((train_sequence, aligned_embeddings))

        # TODO: the speaker has to be obtained for each 0.4 s interval defined in aligned_embeddings.
        # Take into account the 'times' returned by the VAD and compare them with the
        # turns returned by load_rttm.
        # FIXME: `speaker_count` is not defined in any visible cell, so this loop raises
        # NameError unless an earlier session left that name in the kernel.
        for embedding in aligned_embeddings:
            train_cluster_ids.append(str(speaker_count))

        audio_count += 1

    # TODO: check that we are not concatenating everything into one single sequence;
    # that may hurt training.
    train_sequence_path = os.path.join(save_dir_path, 'voxcon-dev-train-sequence.npy')
    np.save(train_sequence_path, train_sequence)

    train_cluster_ids_path = os.path.join(save_dir_path, 'voxcon-dev-train-cluster-ids.npy')
    train_cluster_ids = np.asarray(train_cluster_ids)
    np.save(train_cluster_ids_path, train_cluster_ids)
    logging.info('saved train sequence')

In [None]:
# Wrap the most recent aligned embeddings in a one-element list and report its length.
train_sequence = [aligned_embeddings]
print(len(train_sequence))

In [None]:
# Split the waveform with librosa.effects.split and, for each interval, build a pair of
# log-mel excerpts: the first and the last `config.tisv_frame` frames.
# NOTE(review): `utter`, `vad_threshold` and `sr` are not defined in any visible cell.
intervals = librosa.effects.split(utter, top_db=vad_threshold)
for idx, current_interval in enumerate(intervals):
    utter_part = utter[current_interval[0]:current_interval[1]]
    mel_basis = librosa.filters.mel(sr=sr, n_fft=config.nfft, n_mels=40)
    # power spectrogram of the interval
    S = np.abs(
        librosa.core.stft(
            y=utter_part,
            n_fft=config.nfft,
            win_length=int(config.window * sr),
            hop_length=int(config.hop * sr),
        )
    ) ** 2
    # log mel spectrogram of utterances
    S = np.log10(np.dot(mel_basis, S) + 1e-6)

    # leading and trailing excerpts, stacked into one array
    utterances_spec = np.array([S[:, :config.tisv_frame], S[:, -config.tisv_frame:]])

In [None]:
# duration

In [None]:
# data.shape

In [None]:
# all_data.dim

In [None]:
# if __name__ == "__main__":
#     """
#     Speaker embeddings program:
#     input: audio files
#     output: npy file with shape (2, 256) [first and last tisv_frames for given interval in an audio]
#     """
#     main()
#     print('Program completed!')

In [None]:
# def get_STFTs(segs):
#     #Get 240ms STFT windows with 50% overlap
#     sr = config.sr
#     STFT_frames = []
#     for seg in segs:
#         S = librosa.core.stft(y=seg, n_fft=config.nfft, win_length=int(config.window * sr), hop_length=int(config.hop * sr))
#         S = np.abs(S) ** 2
#         mel_basis = librosa.filters.mel(sr=sr, n_fft=config.nfft, n_mels=40)
#         # log mel spectrogram of utterances
#         S = np.log10(np.dot(mel_basis, S) + 1e-6)        
#         for j in range(0, S.shape[1], int(.12/config.hop)):
#             if j + 24 < S.shape[1]:
#                 STFT_frames.append(S[:,j:j+24])
#             else:
#                 break
#     return STFT_frames

# import torch
# import torch.nn as nn

# from hparam import hparam as hp

# class SpeechEmbedder(nn.Module):
    
#     def __init__(self):
#         super(SpeechEmbedder, self).__init__()    
#         self.LSTM_stack = nn.LSTM(hp.data.nmels, hp.model.hidden, num_layers=hp.model.num_layer, batch_first=True)
#         for name, param in self.LSTM_stack.named_parameters():
#           if 'bias' in name:
#              nn.init.constant_(param, 0.0)
#           elif 'weight' in name:
#              nn.init.xavier_normal_(param)
#         self.projection = nn.Linear(hp.model.hidden, hp.model.proj)
        
#     def forward(self, x):
#         x, _ = self.LSTM_stack(x.float()) #(batch, frames, n_mels)
#         #only use last frame
#         x = x[:,x.size(1)-1]
#         x = self.projection(x.float())
#         x = x / torch.norm(x, dim=1).unsqueeze(1)
#         return x

# embedder_net = SpeechEmbedder()
# # embedder_net.load_state_dict(torch.load(hp.model.model_path))
# embedder_net.eval()

# times, segs = VAD_chunk(2, audio_path)
# concat_seg = concat_segs(times, segs)
# STFT_frames = get_STFTs(concat_seg)
# STFT_frames = np.stack(STFT_frames, axis=2)
# STFT_frames = torch.tensor(np.transpose(STFT_frames, axes=(2,1,0)))

# embeddings = embedder_net(STFT_frames)

# embeddings.shape

# STFT_frames.shape