In [1]:
import os
import logging
import datetime
import time
import math
import json
import librosa
import numpy as np
from utils import normalize

import tensorflow as tf
from tensorflow.contrib import rnn
from sklearn.preprocessing import normalize as sk_normalize
from sklearn.cluster import KMeans
from scipy.ndimage.filters import gaussian_filter

from collections import defaultdict
from configuration import get_config
from rttm import load_rttm
from VAD_segments import VAD_chunk

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Namespace(audio_file='../data/voxconverse/sample/abjxc.wav', hidden=768, hop=0.01, log_path='main.logs', model_num=3, model_path='../models/model.ckpt-46', nfft=512, num_layer=3, number_of_speakers=2, output_dir='output.json', proj=256, random_state=123, restore=False, sr=16000, srt_path='xxx.en.srt', tdsv=False, tdsv_frame=160, tisv_frame=160, tisv_frame_min=50, window=0.025)


In [2]:
# Load the project configuration, point logging at a notebook-specific file
# and show the absolute location of that file.
config = get_config()
config.log_path = 'voxconverse-sample-embeddings.logs'
log_file = os.path.abspath(config.log_path)

logging_options = {
    'filename': log_file,
    'level': logging.DEBUG,
    'format': "%(asctime)s:%(levelname)s:%(message)s",
}
logging.basicConfig(**logging_options)

print(f'Log path: {log_file}')

Log path: /home/jovyan/work/voxsrc21-dia/embeddings/voxconverse-sample-embeddings.logs


In [30]:
# Input directories (audio and reference RTTM files) and the directory where
# the extracted sequences are written. The output directory is created if missing.
data_path = '/home/jovyan/work/voxsrc21-dia/data/voxconverse/sample/wav'
rttm_path = '/home/jovyan/work/voxsrc21-dia/data/voxconverse/sample/rttm'
# Alternative dataset location, kept for switching by hand:
# data_path = '/app/datasets/voxconverse/test/wav'
# rttm_path = '/app/datasets/voxconverse/test/rttm'
save_dir_path = '/home/jovyan/work/voxsrc21-dia/embeddings/sequences/voxconverse-sample'
os.makedirs(save_dir_path, exist_ok=True)

# Data prep
# I'm saving only 2 embeddings i.e. first and last tisv_frames for given interval in an audio. So each .npy
# embedding file will have a shape of (2, 256)
# NOTE(review): the comment above looks stale - the extraction cell in this notebook
# saves one concatenated sequence file rather than (2, 256) files; confirm and update.
tf.reset_default_graph()
batch_size = 2 # Fixed to 2: each feed holds a pair of windows (previously utter_batch.shape[1])
verif = tf.placeholder(shape=[None, batch_size, 40], dtype=tf.float32)  # verification batch (time x batch x n_mel)
# Single-element concat: `batch` carries the same values as `verif`.
batch = tf.concat([verif,], axis=1)
# embedding lstm (config.num_layer layers, each with config.hidden units projected to config.proj)
with tf.variable_scope("lstm"):
    lstm_cells = [tf.contrib.rnn.LSTMCell(num_units=config.hidden, num_proj=config.proj) for i in range(config.num_layer)]
    lstm = tf.contrib.rnn.MultiRNNCell(lstm_cells)    # make lstm op and variables
    outputs, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=batch, dtype=tf.float32, time_major=True)   # for TI-VS must use dynamic rnn
    embedded = outputs[-1]                            # the last output (final time step) is the embedded d-vector
    embedded = normalize(embedded)                    # normalize (utils.normalize - presumably L2 normalization; confirm)
# Session configuration with the GPU device count set to 0.
config_tensorflow = tf.ConfigProto(device_count = {'GPU': 0})
# Saver over every global variable that exists in the graph at this point;
# later cells use it to restore the checkpoint at config.model_path.
saver = tf.train.Saver(var_list=tf.global_variables())

In [158]:
def concat_segs(times, segs):
    """Merge back-to-back voiced chunks into continuous segments.

    Two consecutive chunks are treated as continuous when the offset of the
    first is exactly equal to the onset of the second (plain `==` comparison,
    so the times are assumed to come from the same computation).

    Args:
        times: sequence of (onset, offset) pairs, one per chunk.
        segs: sequence of sample arrays, parallel to `times`.

    Returns:
        (concat_seg, concat_times): a list with one sample array per continuous
        run of chunks, and a parallel list of [onset, offset] pairs covering
        each run. Both lists are empty when `times` is empty.
    """
    # Nothing to merge; the original indexing of times[0] would fail here.
    if len(times) == 0:
        return [], []

    def _join(parts):
        # A run made of a single chunk is returned as-is (no copy).
        return parts[0] if len(parts) == 1 else np.concatenate(parts)

    concat_seg = []
    concat_times = []
    run_parts = [segs[0]]
    run_onset = times[0][0]
    for idx in range(1, len(times)):
        if times[idx - 1][1] == times[idx][0]:
            # Continuous with the previous chunk: extend the current run.
            run_parts.append(segs[idx])
        else:
            # Gap: close the current run at the previous chunk's offset
            # and start a new run at this chunk.
            concat_seg.append(_join(run_parts))
            concat_times.append([run_onset, times[idx - 1][1]])
            run_parts = [segs[idx]]
            run_onset = times[idx][0]

    # Close the last run. Using times[-1] (instead of a leftover loop index)
    # keeps this correct when there is only one chunk and the loop never runs.
    concat_seg.append(_join(run_parts))
    concat_times.append([run_onset, times[-1][1]])

    return concat_seg, concat_times

def get_STFTs(segs, time_segs):
    """Slice every voiced segment into pairs of overlapping log-mel windows.

    For each segment a 40-band log10 mel power spectrogram is computed with the
    FFT size, window length and hop taken from `config`. Starting every
    int(0.24 / config.hop) frames, two 24-frame windows offset by 12 frames are
    cut out and stored together as one pair, matching the batch of two that the
    embedding network is fed with.

    The frame counts 12, 24 and 36 correspond to 0.12 s, 0.24 s and 0.36 s only
    when config.hop is 0.01.

    Args:
        segs: sequence of 1-D audio sample arrays, one per voiced segment.
        time_segs: sequence parallel to `segs`; item i starts with the onset
            (in seconds) of segs[i].

    Returns:
        (STFT_windows, time_windows) as numpy arrays. STFT_windows holds one
        entry per pair, each made of two (40, 24) windows; time_windows holds
        one [onset, offset] row per window, i.e. two rows per pair.
    """
    sr = config.sr
    hop = config.hop
    frames_per_step = int(.24 / hop)  # 0.24 / 0.01 = 24.0
    # The mel filterbank depends only on the configuration, so build it once
    # instead of once per segment.
    mel_basis = librosa.filters.mel(sr=sr, n_fft=config.nfft, n_mels=40)

    STFT_windows = []
    time_windows = []
    for i, seg in enumerate(segs):
        S = librosa.core.stft(y=seg, n_fft=config.nfft, win_length=int(config.window * sr), hop_length=int(hop * sr))
        S = np.abs(S) ** 2
        # log mel spectrogram of the segment
        S = np.log10(np.dot(mel_basis, S) + 1e-6)

        # S.shape[1] ~= math.ceil((time_segs[i][1] - time_segs[i][0])*100+1)
        segment_time_onset = time_segs[i][0]
        for j in range(0, S.shape[1], frames_per_step):
            # NOTE(review): the strict `<` skips a pair whose second window
            # would end exactly on the last frame - confirm this is intended.
            if j + 36 >= S.shape[1]:
                break
            # Two windows per entry so the pair fits the network's batch of two.
            STFT_windows.append([S[:, j:j+24], S[:, j+12:j+36]])
            # Frame index -> seconds uses the same hop as the STFT above
            # (previously a literal 0.01, identical for the default hop).
            window_onset = segment_time_onset + hop * j
            time_windows.extend([[window_onset, window_onset+0.24], [window_onset+0.12, window_onset+0.36]])
    return np.array(STFT_windows), np.array(time_windows)

def align_embeddings(embeddings, intervals):
    """Average window-level embeddings into segment-level embeddings.

    Windows are grouped by position only: window i stays in the current
    segment while i*0.12 + 0.24 < j*0.401, where j is the 1-based number of
    the current segment; otherwise the current segment is closed and window i
    opens the next one. The times in `intervals` are not used for grouping.

    Each segment's interval runs from the onset of its first window to the
    offset of its second-to-last window. For back-to-back windows that start
    every 0.12 s and last 0.24 s this makes consecutive segment intervals abut
    instead of overlap, which is what the gap check in this notebook expects.
    A segment holding a single window uses that window's own offset.

    Args:
        embeddings: array-like of shape (n_windows, dim).
        intervals: sequence of [onset, offset] pairs, parallel to `embeddings`.

    Returns:
        (avg_embeddings, segment_intervals): arrays of shape (n_segments, dim)
        and (n_segments, 2). Both have zero rows when `embeddings` is empty.
    """
    embeddings = np.asarray(embeddings)
    if len(embeddings) == 0:
        # No windows: return empty results instead of averaging an empty slice.
        dim = embeddings.shape[1] if embeddings.ndim == 2 else 256
        return np.zeros((0, dim)), np.zeros((0, 2))

    partitions = []
    start = 0
    end = 0
    j = 1
    for i in range(len(embeddings)):
        if (i*.12)+.24 < j*.401:
            end += 1
        else:
            partitions.append((start, end))
            start = end
            end += 1
            j += 1
    # Close the segment that was still open when the windows ran out.
    partitions.append((start, end))

    # Take the embedding size from the input rather than assuming 256.
    avg_embeddings = np.zeros((len(partitions), embeddings.shape[1]))
    segment_intervals = []
    for row, (first, stop) in enumerate(partitions):
        avg_embeddings[row] = np.average(embeddings[first:stop], axis=0)

        partition_interval = intervals[first:stop]
        interval_onset = partition_interval[0][0]   # start of the first window
        # Second-to-last window when there is one, otherwise the only window.
        last_idx = -2 if len(partition_interval) > 1 else -1
        interval_offset = partition_interval[last_idx][1]
        segment_intervals.append([interval_onset, interval_offset])
    return avg_embeddings, np.array(segment_intervals)

In [31]:
# Index the recordings: every '.wav' entry under data_path is stored under its
# audio id, together with the RTTM path expected for it under rttm_path.
all_unique_extensions = []
# defaultdict(list): each audio id maps to a list of paths
audio_files = defaultdict(list)
rttm_files = defaultdict(list)

for audio_file in os.listdir(data_path):
    # Entries whose name starts with a dot (hidden) are ignored.
    if audio_file.startswith('.'):
        continue

    audio_id, extension = os.path.splitext(audio_file)
    all_unique_extensions.append(extension)

    # Anything that is not a .wav file is reported and left out of the index.
    if extension != '.wav':
        print(f'Wrong file type in {os.path.join(data_path, audio_file)}')
        continue

    audio_files[audio_id].append(os.path.join(data_path, audio_file))
    rttm_files[audio_id].append(os.path.join(rttm_path, audio_id + '.rttm'))

audio_quantity = len(audio_files)
print(f'Unique file extensions: {set(all_unique_extensions)}')
print(f'Number of audios: {audio_quantity}')
print(f'Number of rttms: {len(rttm_files)}')

Unique file extensions: {'.wav'}
Number of audios: 19
Number of rttms: 19


In [33]:
# Print how many speakers the RTTM file of each recording contains.
# The loop variable is deliberately not named `rttm_path`: that name holds the
# RTTM directory used to build `rttm_files`, and rebinding it to a list here
# would break os.path.join(rttm_path, ...) if the indexing cell is run again.
for audio_id, rttm_file_paths in rttm_files.items():
    _, speakers, _ = load_rttm(rttm_file_paths[0])
    print(audio_id, len(speakers))

abjxc 1
afjiv 5
ahnss 4
aisvi 8
akthc 2
ampme 3
asxwr 3
atgpi 1
aufkn 3
azisu 4
bauzd 5
bdopb 7
bkwns 2
blwmj 2
bravd 3
bspxd 3
bwzyf 4
bxpwa 5
bydui 3


In [14]:
# Load the turns of recording 'abjxc' from its RTTM file and look at the
# attributes of the first turn.
turns, _, _ = load_rttm(rttm_files.get('abjxc')[0])

# Only the value of the last expression in a cell is displayed; the three
# expressions before it are evaluated and discarded.
turns[0].onset
turns[0].offset
turns[0].dur
turns[0].speaker_id

'spk00'

In [134]:
# Run voice activity detection on one recording.
# NOTE(review): `audio_id` is not assigned in this cell - it is whatever an
# earlier loop over the files left behind; confirm which recording is intended.
# NOTE(review): the first argument (2) is interpreted by VAD_chunk - presumably
# the VAD aggressiveness; confirm against VAD_segments.
times, segs = VAD_chunk(2, audio_files.get(audio_id)[0])

In [145]:
# Merge back-to-back VAD chunks into continuous voiced segments and keep the
# [onset, offset] of every merged segment.
concat_seg, concat_times = concat_segs(times, segs)

In [159]:
# Cut each voiced segment into pairs of overlapping log-mel windows and record
# the [onset, offset] of every window.
STFT_windows, time_windows = get_STFTs(concat_seg, concat_times)

In [164]:
# Compute one embedding per STFT window with the restored checkpoint.
with tf.Session(config=config_tensorflow) as sess:
    tf.global_variables_initializer().run()
    saver.restore(sess, config.model_path)

    # Start from an empty (0, 256) float array so the result has the same shape
    # and dtype as before, even when there are no windows at all.
    embedding_batches = [np.array([]).reshape(0,256)]
    for STFT_window in STFT_windows:
        # (2, 40, 24) -> (24, 2, 40): time x batch x n_mel, the layout of `verif`
        STFT_batch = np.transpose(STFT_window, axes=(2,0,1))
        embedding_batches.append(sess.run(embedded, feed_dict={verif:STFT_batch}))
    # Concatenate once at the end instead of re-copying the growing array on
    # every iteration.
    embeddings = np.concatenate(embedding_batches)

INFO:tensorflow:Restoring parameters from ../models/model.ckpt-46


In [191]:
# Average the window-level embeddings into segment-level embeddings, with one
# [onset, offset] interval per segment.
# NOTE(review): align_embeddings is defined in two cells of this notebook; which
# definition runs here depends on the order in which the cells were executed.
aligned_embeddings, segment_intervals = align_embeddings(embeddings, time_windows)

In [206]:
# Sanity check: walk over neighbouring segment intervals and report every pair
# where the end of one and the start of the next differ by more than 1 ms.
for current, following in zip(segment_intervals[:-1], segment_intervals[1:]):
    gap = current[1] - following[0]
    if abs(gap) > 0.001:
        print(current, following, gap)

[99.56 99.92] [100.6  100.96] -0.6799999999999926
[104.2  104.56] [104.76 105.24] -0.20000000000001705
[107.16 107.64] [108.08 108.44] -0.4399999999999977
[129.68 130.04] [130.32 130.8 ] -0.28000000000000114
[181.32 181.68] [181.84 182.32] -0.1599999999999966
[206.92 207.28] [207.96 208.32] -0.6799999999999784
[230.88 231.24] [231.4  231.76] -0.1599999999999966
[253.5  253.98] [254.4  254.76] -0.4199999999999875
[264.   264.36] [264.7  265.18] -0.339999999999975


In [208]:
# Inspect the (onset, offset) chunks returned by VAD_chunk.
# NOTE(review): this displays the whole list; a slice such as times[:10] would
# keep the stored output short.
times

[(0.04, 0.44),
 (0.44, 0.84),
 (0.84, 1.24),
 (1.24, 1.64),
 (1.64, 2.04),
 (2.04, 2.44),
 (2.44, 2.84),
 (2.84, 3.24),
 (3.24, 3.64),
 (3.64, 4.04),
 (4.04, 4.44),
 (4.44, 4.84),
 (4.84, 5.24),
 (5.24, 5.64),
 (5.64, 6.04),
 (6.04, 6.44),
 (6.44, 6.84),
 (6.84, 7.24),
 (7.24, 7.64),
 (7.64, 8.04),
 (8.04, 8.44),
 (8.44, 8.84),
 (8.84, 9.24),
 (9.24, 9.64),
 (9.64, 10.04),
 (10.04, 10.44),
 (10.44, 10.84),
 (10.84, 11.24),
 (11.24, 11.64),
 (11.64, 12.04),
 (12.04, 12.44),
 (12.44, 12.84),
 (12.84, 13.24),
 (13.24, 13.64),
 (13.64, 14.04),
 (14.04, 14.44),
 (14.44, 14.84),
 (14.84, 15.24),
 (15.24, 15.64),
 (15.64, 16.04),
 (16.04, 16.44),
 (16.44, 16.84),
 (16.84, 17.24),
 (17.24, 17.64),
 (17.64, 18.04),
 (18.04, 18.44),
 (18.44, 18.84),
 (18.84, 19.24),
 (19.24, 19.64),
 (19.64, 20.04),
 (20.04, 20.44),
 (20.44, 20.84),
 (20.84, 21.24),
 (21.24, 21.64),
 (21.64, 22.04),
 (22.04, 22.44),
 (22.44, 22.84),
 (22.84, 23.24),
 (23.24, 23.64),
 (23.64, 24.04),
 (24.04, 24.44),
 (24.44, 24.

In [190]:
def align_embeddings(embeddings, intervals):
    """Average window-level embeddings into segment-level embeddings.

    Note: this cell redefines a function of the same name that also appears in
    an earlier cell of the notebook.

    Windows are grouped by position only: window i stays in the current
    segment while i*0.12 + 0.24 < j*0.401, where j is the 1-based number of
    the current segment; otherwise the current segment is closed and window i
    opens the next one. The times in `intervals` are not used for grouping.

    Each segment's interval runs from the onset of its first window to the
    offset of its second-to-last window. For back-to-back windows that start
    every 0.12 s and last 0.24 s this makes consecutive segment intervals abut
    instead of overlap. A segment holding a single window uses that window's
    own offset (indexing [-2] there would raise IndexError).

    Args:
        embeddings: array-like of shape (n_windows, dim).
        intervals: sequence of [onset, offset] pairs, parallel to `embeddings`.

    Returns:
        (avg_embeddings, segment_intervals): arrays of shape (n_segments, dim)
        and (n_segments, 2). Both have zero rows when `embeddings` is empty.
    """
    embeddings = np.asarray(embeddings)
    if len(embeddings) == 0:
        # No windows: return empty results instead of averaging an empty slice.
        dim = embeddings.shape[1] if embeddings.ndim == 2 else 256
        return np.zeros((0, dim)), np.zeros((0, 2))

    partitions = []
    start = 0
    end = 0
    j = 1
    for i in range(len(embeddings)):
        if (i*.12)+.24 < j*.401:
            end += 1
        else:
            partitions.append((start, end))
            start = end
            end += 1
            j += 1
    # Close the segment that was still open when the windows ran out.
    partitions.append((start, end))

    # Take the embedding size from the input rather than assuming 256.
    avg_embeddings = np.zeros((len(partitions), embeddings.shape[1]))
    segment_intervals = []
    for row, (first, stop) in enumerate(partitions):
        avg_embeddings[row] = np.average(embeddings[first:stop], axis=0)

        partition_interval = intervals[first:stop]
        interval_onset = partition_interval[0][0]   # start of the first window
        # Second-to-last window when there is one, otherwise the only window.
        last_idx = -2 if len(partition_interval) > 1 else -1
        interval_offset = partition_interval[last_idx][1]
        segment_intervals.append([interval_onset, interval_offset])
    return avg_embeddings, np.array(segment_intervals)

In [None]:
# Extract segment-level embeddings for every recording and save them as one
# (n_segments, 256) training sequence plus one speaker label per segment.
def _dominant_speaker(interval, turns):
    """Return the speaker_id whose turns overlap `interval` the most, or None.

    Assumes each turn exposes `onset`, `offset` and `speaker_id`, with times in
    seconds on the same clock as the segment intervals - TODO confirm.
    """
    onset, offset = interval
    overlap_by_speaker = defaultdict(float)
    for turn in turns:
        shared = min(offset, turn.offset) - max(onset, turn.onset)
        if shared > 0:
            overlap_by_speaker[turn.speaker_id] += shared
    if not overlap_by_speaker:
        return None
    return max(overlap_by_speaker, key=overlap_by_speaker.get)


with tf.Session(config=config_tensorflow) as sess:
    tf.global_variables_initializer().run()
    saver.restore(sess, config.model_path)

    audio_count = 0
    sequence_rows = []
    train_cluster_ids = []

    for audio_id, audio_paths in audio_files.items():
        logging.info(f'loading {audio_id} {audio_count}/{audio_quantity}')
        audio_count += 1

        # voice activity detection
        # audio_files maps each id to a list of paths, so take the first entry.
        times, segs = VAD_chunk(2, audio_paths[0])
        concat_seg, concat_times = concat_segs(times, segs)
        # Named time_windows (not `time`) to avoid shadowing the time module.
        STFT_windows, time_windows = get_STFTs(concat_seg, concat_times)

        # One sess.run per pair of windows: (24, 2, 40) = time x batch x n_mel.
        embedding_batches = [np.array([]).reshape(0,256)]
        for STFT_window in STFT_windows:
            STFT_batch = np.transpose(STFT_window, axes=(2,0,1))
            embedding_batches.append(sess.run(embedded, feed_dict={verif:STFT_batch}))
        embeddings = np.concatenate(embedding_batches)

        if len(embeddings) == 0:
            # Nothing long enough to embed in this recording.
            logging.warning(f'no embeddings extracted for {audio_id}')
            continue

        # Turn window-level embeddings into segment-level ones (about 400 ms each).
        aligned_embeddings, segment_intervals = align_embeddings(embeddings, time_windows)

        # Label each segment with the reference speaker that overlaps it the
        # most, comparing the segment interval with the turns from the RTTM file.
        turns = load_rttm(rttm_files[audio_id][0])[0]
        for embedding, interval in zip(aligned_embeddings, segment_intervals):
            speaker = _dominant_speaker(interval, turns)
            if speaker is None:
                # Segment with no annotated speech: leave it out so the
                # sequence and the labels stay the same length.
                continue
            sequence_rows.append(embedding)
            # Prefix with the audio id so equal speaker ids from different
            # recordings do not end up in the same cluster.
            train_cluster_ids.append(f'{audio_id}_{speaker}')

    # TODO: check whether concatenating every recording into one single
    # sequence is a problem for training.
    train_sequence = np.asarray(sequence_rows, dtype=float).reshape(-1, 256)
    train_sequence_path = os.path.join(save_dir_path, 'voxcon-dev-train-sequence.npy')
    np.save(train_sequence_path, train_sequence)

    train_cluster_ids_path = os.path.join(save_dir_path, 'voxcon-dev-train-cluster-ids.npy')
    train_cluster_ids = np.asarray(train_cluster_ids)
    np.save(train_cluster_ids_path, train_cluster_ids)
    logging.info('saved train sequence')

In [None]:
# Scratch check: wrap the aligned embeddings of one recording in a list and
# show how many entries the list holds.
train_sequence = [aligned_embeddings]
print(len(train_sequence))

In [None]:
# all_data.dim

In [None]:
# def get_STFTs(segs):
#     #Get 240ms STFT windows with 50% overlap
#     sr = config.sr
#     STFT_frames = []
#     for seg in segs:
#         S = librosa.core.stft(y=seg, n_fft=config.nfft, win_length=int(config.window * sr), hop_length=int(config.hop * sr))
#         S = np.abs(S) ** 2
#         mel_basis = librosa.filters.mel(sr=sr, n_fft=config.nfft, n_mels=40)
#         # log mel spectrogram of utterances
#         S = np.log10(np.dot(mel_basis, S) + 1e-6)        
#         for j in range(0, S.shape[1], int(.12/config.hop)):
#             if j + 24 < S.shape[1]:
#                 STFT_frames.append(S[:,j:j+24])
#             else:
#                 break
#     return STFT_frames

# import torch
# import torch.nn as nn

# from hparam import hparam as hp

# class SpeechEmbedder(nn.Module):
    
#     def __init__(self):
#         super(SpeechEmbedder, self).__init__()    
#         self.LSTM_stack = nn.LSTM(hp.data.nmels, hp.model.hidden, num_layers=hp.model.num_layer, batch_first=True)
#         for name, param in self.LSTM_stack.named_parameters():
#           if 'bias' in name:
#              nn.init.constant_(param, 0.0)
#           elif 'weight' in name:
#              nn.init.xavier_normal_(param)
#         self.projection = nn.Linear(hp.model.hidden, hp.model.proj)
        
#     def forward(self, x):
#         x, _ = self.LSTM_stack(x.float()) #(batch, frames, n_mels)
#         #only use last frame
#         x = x[:,x.size(1)-1]
#         x = self.projection(x.float())
#         x = x / torch.norm(x, dim=1).unsqueeze(1)
#         return x

# embedder_net = SpeechEmbedder()
# # embedder_net.load_state_dict(torch.load(hp.model.model_path))
# embedder_net.eval()

# times, segs = VAD_chunk(2, audio_path)
# concat_seg = concat_segs(times, segs)
# STFT_frames = get_STFTs(concat_seg)
# STFT_frames = np.stack(STFT_frames, axis=2)
# STFT_frames = torch.tensor(np.transpose(STFT_frames, axes=(2,1,0)))

# embeddings = embedder_net(STFT_frames)

# embeddings.shape

# STFT_frames.shape