In [1]:
import os
import logging
import datetime
import time
import math
import json
import librosa
import numpy as np
from utils import normalize

import tensorflow as tf
from tensorflow.contrib import rnn
from sklearn.preprocessing import normalize as sk_normalize
from sklearn.cluster import KMeans
from scipy.ndimage.filters import gaussian_filter

from collections import defaultdict
from configuration import get_config
from rttm import load_rttm, Turn
from VAD_segments import VAD_chunk

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Namespace(audio_file='../data/voxconverse/sample/abjxc.wav', hidden=768, hop=0.01, log_path='main.logs', model_num=3, model_path='../models/model.ckpt-46', nfft=512, num_layer=3, number_of_speakers=2, output_dir='output.json', proj=256, random_state=123, restore=False, sr=16000, srt_path='xxx.en.srt', tdsv=False, tdsv_frame=160, tisv_frame=160, tisv_frame_min=50, window=0.025)


In [2]:
config = get_config()
config.log_path = 'voxconverse-sample-embeddings.logs'
log_file = os.path.abspath(config.log_path)
logging.basicConfig(
    filename=log_file,
    level=logging.DEBUG,
    format="%(asctime)s:%(levelname)s:%(message)s"
    )
print(f'Log path: {log_file}')

Log path: /home/jovyan/work/voxsrc21-dia/embeddings/voxconverse-sample-embeddings.logs


In [3]:
data_path = '/home/jovyan/work/voxsrc21-dia/data/voxconverse/sample/wav'
rttm_path = '/home/jovyan/work/voxsrc21-dia/data/voxconverse/sample/rttm'
# data_path = '/app/datasets/voxconverse/test/wav'
# rttm_path = '/app/datasets/voxconverse/test/rttm'
save_dir_path = '/home/jovyan/work/voxsrc21-dia/embeddings/sequences/voxconverse-sample'
os.makedirs(save_dir_path, exist_ok=True)

# Data prep
# I'm saving only 2 embeddings i.e. first and last tisv_frames for given interval in an audio. So each .npy
# embedding file will have a shape of (2, 256)
tf.reset_default_graph()
batch_size = 2 # Fixing to 2 since we take 2 for each interval #utter_batch.shape[1]
verif = tf.placeholder(shape=[None, batch_size, 40], dtype=tf.float32)  # verification batch (time x batch x n_mel)
batch = tf.concat([verif,], axis=1)
# embedding lstm (3-layer default)
with tf.variable_scope("lstm"):
    lstm_cells = [tf.contrib.rnn.LSTMCell(num_units=config.hidden, num_proj=config.proj) for i in range(config.num_layer)]
    lstm = tf.contrib.rnn.MultiRNNCell(lstm_cells)    # make lstm op and variables
    outputs, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=batch, dtype=tf.float32, time_major=True)   # for TI-VS must use dynamic rnn
    embedded = outputs[-1]                            # the last ouput is the embedded d-vector
    embedded = normalize(embedded)                    # normalize
config_tensorflow = tf.ConfigProto(device_count = {'GPU': 0})
saver = tf.train.Saver(var_list=tf.global_variables())

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [4]:
def concat_segs(times, segs):
    # Concatenate continuous voiced segments
    # with segment time information (onset and offset)
    concat_seg = []
    concat_times=[]
    seg_concat = segs[0]
    seg_onset = times[0][0]
    for i in range(0, len(times)-1):
        if times[i][1] == times[i+1][0]:
            # If segments are continuous, concatenate them
            seg_concat = np.concatenate((seg_concat, segs[i+1]))
        else:
            # If not, append a new segment sequence
            concat_seg.append(seg_concat)
            seg_concat = segs[i+1]
            # Save segment time offset and append a new one
            seg_offset = times[i][1]
            seg_interval = [seg_onset, seg_offset]
            concat_times.append(seg_interval)
            seg_onset = times[i+1][0]
    else:
        concat_seg.append(seg_concat)
        # Save last time offset
        seg_offset = times[i+1][1]
        seg_interval = [seg_onset, seg_offset]
        concat_times.append(seg_interval)
        
    return concat_seg, concat_times

def get_STFTs(segs, time_segs):
    #Get 240ms STFT windows with 50% overlap
    sr = config.sr
    STFT_windows = []
    time_windows = []
    for i, seg in enumerate(segs):
        S = librosa.core.stft(y=seg, n_fft=config.nfft, win_length=int(config.window * sr), hop_length=int(config.hop * sr))
        S = np.abs(S) ** 2
        mel_basis = librosa.filters.mel(sr=sr, n_fft=config.nfft, n_mels=40)
        # log mel spectrogram of utterances
        S = np.log10(np.dot(mel_basis, S) + 1e-6)
        
        STFT_couple = []
        segment_time_onset = time_segs[i][0]
        for j in range(0, S.shape[1], int(.12/config.hop)):# 0.24 / 0.01 = 24.0
            # if hop != 0.01, we can't use 12 or 24 frames (they stop making sense)
            if j + 24 < S.shape[1]:
                if len(STFT_couple) < 2:
                    STFT_couple.append(S[:,j:j+24])
                else:
                    STFT_windows.append(STFT_couple)
                    STFT_couple = [S[:,j:j+24]]
                    # returns the time intervals for each STFT window
                    window_onset = segment_time_onset + 0.01*j
                    time_windows.extend([[window_onset-0.12, window_onset+0.12], [window_onset, window_onset+0.24]])
            else:
                break
    return np.array(STFT_windows), np.array(time_windows)

def align_embeddings(embeddings, intervals):
    partitions = []
    start = 0
    end = 0
    j = 1
    for i, embedding in enumerate(embeddings):
        if (i*.12)+.24 < j*.401:
            end = end + 1
        else:
            partitions.append((start,end))
            start = end
            end = end + 1
            j += 1
    else:
        partitions.append((start,end))
    
    avg_embeddings = np.zeros((len(partitions),256))
    segment_intervals = [] 
    for i, partition in enumerate(partitions):
        avg_embeddings[i] = np.average(embeddings[partition[0]:partition[1]],axis=0)
        
        partition_interval = intervals[partition[0]:partition[1]]
        interval_onset = partition_interval[0][0]   #start of first partition
        interval_offset = partition_interval[-2][1] #end of last partition
        segment_intervals.append([interval_onset, interval_offset])
    return avg_embeddings, np.array(segment_intervals)

def getOnsets(turn):
    return turn.onset

In [78]:
all_unique_extensions = []
# Using List as default factory
audio_files = defaultdict(list)
rttm_files = defaultdict(list)

for audio_file in os.listdir(data_path):
    if audio_file.startswith('.'): #hidden folders
        continue;
    audio_id = os.path.splitext(audio_file)[0]
    extension = os.path.splitext(audio_file)[1]
    all_unique_extensions.append(extension)
#     print(f'Audio id: {audio_id}')
    if extension == '.wav':
        audio_files[audio_id].append(os.path.join(data_path, audio_file))
        rttm_files[audio_id].append(os.path.join(rttm_path, audio_id + '.rttm'))
    else:
        print(f'Wrong file type in {os.path.join(data_path, audio_file)}')

audio_quantity = len(audio_files)
print(f'Unique file extensions: {set(all_unique_extensions)}')
print(f'Number of audios: {audio_quantity}')
print(f'Number of rttms: {len(rttm_files)}')

Unique file extensions: {'.wav'}
Number of audios: 19
Number of rttms: 19


In [79]:
for audio_id, rttm_path in rttm_files.items():
    _, speakers, _ = load_rttm(rttm_files.get(audio_id)[0])
    print(audio_id, len(speakers))

bwzyf 4
aufkn 3
akthc 2
aisvi 8
blwmj 2
ahnss 4
bspxd 3
bkwns 2
bdopb 7
bxpwa 5
abjxc 1
ampme 3
azisu 4
afjiv 5
asxwr 3
bravd 3
bydui 3
atgpi 1
bauzd 5


In [82]:
turns, _, _ = load_rttm(rttm_files.get(audio_id)[0])

print(
turns[0].onset,
turns[0].offset,
turns[0].dur,
turns[0].speaker_id)

35.48 36.36 0.88 spk00


In [72]:
for i in range(0, len(segment_intervals)-1):
    if (abs(segment_intervals[i][1] - segment_intervals[i+1][0]) > 0.0001):
        print(segment_intervals[i], segment_intervals[i+1],segment_intervals[i][1] - segment_intervals[i+1][0])

[0.22 0.58] [0.46 0.94] 0.12
[0.46 0.94] [0.82 1.42] 0.12
[0.82 1.42] [1.3  1.78] 0.11999999999999988
[1.3  1.78] [1.66 2.14] 0.11999999999999988
[1.66 2.14] [2.02 2.62] 0.1200000000000001
[2.02 2.62] [2.5  2.98] 0.1200000000000001
[2.5  2.98] [2.86 3.34] 0.12000000000000055
[2.86 3.34] [3.22 3.82] 0.1200000000000001
[3.22 3.82] [3.7  4.18] 0.1200000000000001
[3.7  4.18] [4.06 4.54] 0.1200000000000001
[4.06 4.54] [4.42 5.02] 0.1200000000000001
[4.42 5.02] [4.9  5.38] 0.1200000000000001
[4.9  5.38] [5.26 5.74] 0.1200000000000001
[5.26 5.74] [5.62 6.22] 0.1200000000000001
[5.62 6.22] [6.1  6.58] 0.1200000000000001
[6.1  6.58] [6.46 6.94] 0.1200000000000001
[6.46 6.94] [6.82 7.92] 0.1200000000000001
[6.82 7.92] [7.8  8.28] 0.1200000000000001
[7.8  8.28] [8.16 9.1 ] 0.11999999999999922
[8.16 9.1 ] [8.98 9.58] 0.11999999999999922
[8.98 9.58] [9.46 9.94] 0.11999999999999922
[9.46 9.94] [ 9.82 10.3 ] 0.11999999999999922
[ 9.82 10.3 ] [10.18 10.78] 0.11999999999999922
[10.18 10.78] [10.66 11.1

[88.52 89.  ] [88.88 89.48] 0.12000000000000455
[88.88 89.48] [89.36 89.84] 0.12000000000000455
[89.36 89.84] [89.72 90.2 ] 0.12000000000000455
[89.72 90.2 ] [90.08 90.68] 0.12000000000000455
[90.08 90.68] [90.56 91.04] 0.11999999999999034
[90.56 91.04] [90.92 91.4 ] 0.12000000000000455
[90.92 91.4 ] [91.28 91.88] 0.11999999999999034
[91.28 91.88] [91.76 92.24] 0.12000000000000455
[91.76 92.24] [92.12 92.6 ] 0.12000000000000455
[92.12 92.6 ] [92.48 93.08] 0.12000000000001876
[92.48 93.08] [92.96 93.44] 0.11999999999999034
[92.96 93.44] [93.32 93.8 ] 0.12000000000000455
[93.32 93.8 ] [93.68 94.28] 0.11999999999999034
[93.68 94.28] [94.16 94.64] 0.12000000000000455
[94.16 94.64] [94.52 95.  ] 0.12000000000000455
[94.52 95.  ] [94.88 95.48] 0.12000000000000455
[94.88 95.48] [95.36 95.84] 0.12000000000000455
[95.36 95.84] [95.72 96.2 ] 0.12000000000000455
[95.72 96.2 ] [96.08 96.68] 0.12000000000000455
[96.08 96.68] [96.56 97.04] 0.11999999999999034
[96.56 97.04] [96.92 97.4 ] 0.1200000000

[165.12 165.6 ] [165.48 166.08] 0.12000000000003297
[165.48 166.08] [165.96 167.3 ] 0.12000000000000455
[165.96 167.3 ] [167.18 167.66] 0.12000000000000455
[167.18 167.66] [167.54 168.14] 0.12000000000000455
[167.54 168.14] [168.02 168.5 ] 0.12000000000003297
[168.02 168.5 ] [168.38 168.86] 0.12000000000000455
[168.38 168.86] [168.74 169.34] 0.12000000000003297
[168.74 169.34] [169.22 169.7 ] 0.12000000000000455
[169.22 169.7 ] [169.58 170.06] 0.12000000000000455
[169.58 170.06] [169.94 170.54] 0.12000000000000455
[169.94 170.54] [170.42 170.9 ] 0.12000000000003297
[170.42 170.9 ] [170.78 171.38] 0.12000000000000455
[170.78 171.38] [171.26 171.74] 0.12000000000000455
[171.26 171.74] [171.62 172.1 ] 0.12000000000000455
[171.62 172.1 ] [171.98 172.58] 0.12000000000000455
[171.98 172.58] [172.46 172.94] 0.12000000000000455
[172.46 172.94] [172.82 173.3 ] 0.12000000000000455
[172.82 173.3 ] [173.18 173.78] 0.12000000000000455
[173.18 173.78] [173.66 174.14] 0.12000000000000455
[173.66 174.

[247.34 247.82] [247.7  248.18] 0.12000000000003297
[247.7  248.18] [248.06 248.66] 0.12000000000000455
[248.06 248.66] [248.54 249.02] 0.12000000000000455
[248.54 249.02] [248.9  249.38] 0.12000000000000455
[248.9  249.38] [249.26 249.86] 0.12000000000000455
[249.26 249.86] [249.74 250.22] 0.12000000000000455
[249.74 250.22] [250.1  250.58] 0.12000000000003297
[250.1  250.58] [250.46 251.06] 0.12000000000000455
[250.46 251.06] [250.94 251.42] 0.12000000000000455
[250.94 251.42] [251.3  251.78] 0.12000000000000455
[251.3  251.78] [251.66 252.26] 0.12000000000000455
[251.66 252.26] [252.14 252.62] 0.12000000000000455
[252.14 252.62] [252.5  252.98] 0.12000000000000455
[252.5  252.98] [252.86 253.46] 0.12000000000000455
[252.86 253.46] [253.34 253.82] 0.12000000000000455
[253.34 253.82] [253.7  254.18] 0.12000000000003297
[253.7  254.18] [254.06 254.66] 0.12000000000000455
[254.06 254.66] [254.54 255.02] 0.12000000000000455
[254.54 255.02] [254.9  255.38] 0.12000000000000455
[254.9  255.

[334.18 334.66] [334.54 335.14] 0.12000000000000455
[334.54 335.14] [335.02 335.5 ] 0.12000000000000455
[335.02 335.5 ] [335.38 335.86] 0.12000000000000455
[335.38 335.86] [335.74 336.34] 0.12000000000000455
[335.74 336.34] [336.22 336.7 ] 0.12000000000000455
[336.22 336.7 ] [336.58 337.06] 0.12000000000000455
[336.58 337.06] [336.94 337.54] 0.12000000000000455
[336.94 337.54] [337.42 337.9 ] 0.12000000000000455
[337.42 337.9 ] [337.78 338.26] 0.12000000000000455
[337.78 338.26] [338.14 338.74] 0.12000000000006139
[338.14 338.74] [339.48 339.96] -0.7400000000000091
[339.48 339.96] [339.84 340.32] 0.12000000000000455
[339.84 340.32] [340.2 340.8] 0.12000000000006139
[340.2 340.8] [340.68 342.4 ] 0.12000000000000455
[340.68 342.4 ] [342.28 342.76] 0.12000000000000455
[342.28 342.76] [342.64 343.24] 0.12000000000000455
[342.64 343.24] [343.12 343.6 ] 0.12000000000000455
[343.12 343.6 ] [343.48 343.96] 0.12000000000000455
[343.48 343.96] [343.84 345.04] 0.12000000000000455
[343.84 345.04] 

[413.5  413.98] [413.86 414.46] 0.12000000000000455
[413.86 414.46] [414.34 414.82] 0.12000000000000455
[414.34 414.82] [414.7  415.18] 0.12000000000000455
[414.7  415.18] [415.06 415.66] 0.12000000000000455
[415.06 415.66] [415.54 416.02] 0.12000000000006139
[415.54 416.02] [415.9  416.38] 0.12000000000000455
[415.9  416.38] [416.26 416.86] 0.12000000000000455
[416.26 416.86] [416.74 417.22] 0.12000000000000455
[416.74 417.22] [417.1  417.58] 0.12000000000000455
[417.1  417.58] [417.46 418.06] 0.12000000000000455
[417.46 418.06] [417.94 418.42] 0.12000000000000455
[417.94 418.42] [418.3  418.78] 0.12000000000000455
[418.3  418.78] [418.66 419.26] 0.12000000000006139
[418.66 419.26] [419.14 419.62] 0.12000000000000455
[419.14 419.62] [419.5  419.98] 0.12000000000000455
[419.5  419.98] [419.86 420.46] 0.12000000000000455
[419.86 420.46] [420.34 420.82] 0.12000000000000455
[420.34 420.82] [420.7  421.18] 0.12000000000000455
[420.7  421.18] [421.06 421.66] 0.12000000000000455
[421.06 421.

[497.62 498.1 ] [497.98 498.58] 0.12000000000000455
[497.98 498.58] [498.46 498.94] 0.12000000000000455
[498.46 498.94] [498.82 499.3 ] 0.12000000000000455
[498.82 499.3 ] [499.18 499.78] 0.12000000000000455


In [98]:
time_windows[0]

array([0.22, 0.46])

In [101]:
turns

[Turn(0.160000, 7.280000, None, 'spk03', 'bauzd'),
 Turn(7.320000, 20.560000, None, 'spk03', 'bauzd'),
 Turn(21.520000, 29.440000, None, 'spk01', 'bauzd'),
 Turn(30.080000, 31.760000, None, 'spk01', 'bauzd'),
 Turn(31.800000, 34.360000, None, 'spk01', 'bauzd'),
 Turn(34.520000, 43.600000, None, 'spk01', 'bauzd'),
 Turn(35.480000, 36.360000, None, 'spk00', 'bauzd'),
 Turn(44.400000, 45.920000, None, 'spk01', 'bauzd'),
 Turn(46.560000, 62.560000, None, 'spk01', 'bauzd'),
 Turn(50.840000, 51.840000, None, 'spk02', 'bauzd'),
 Turn(62.800000, 64.520000, None, 'spk00', 'bauzd'),
 Turn(62.800000, 67.080000, None, 'spk02', 'bauzd'),
 Turn(64.840000, 66.080000, None, 'spk01', 'bauzd'),
 Turn(67.200000, 68.000000, None, 'spk01', 'bauzd'),
 Turn(68.480000, 70.120000, None, 'spk02', 'bauzd'),
 Turn(68.800000, 69.600000, None, 'spk01', 'bauzd'),
 Turn(70.160000, 72.040000, None, 'spk01', 'bauzd'),
 Turn(70.440000, 70.760000, None, 'spk00', 'bauzd'),
 Turn(71.640000, 75.760000, None, 'spk02', 'bauzd

In [83]:
# Comparar com o turns retornado pelo load_rttm para montar o train_cluster_ids
turns, _, _ = load_rttm(rttm_files.get(audio_id)[0])
turns.sort(key=getOnsets)

interval_turn = Turn(0, offset=0.001)
for interval in time_windows:
    if interval[0] > interval_turn.offset:
        interval_turn
    train_cluster_ids.append(str(speaker_count))

In [None]:
# Extract embeddings
# Each embedding saved file will have (2, 256)
with tf.Session(config=config_tensorflow) as sess:
    tf.global_variables_initializer().run()
    saver.restore(sess, config.model_path)
   
    audio_count = 0
    train_sequences = np.array([]).reshape(0, 256)
    train_cluster_ids = []
    
    for audio_id, audio_path in audio_files.items():
        # Path: audio_files.get(audio_id)[0]
        logging.info(f'loading {audio_id} {audio_count}/{audio_quantity}')

        # voice activity detection            
        times, segs = VAD_chunk(2, audio_path)
        concat_seg, concat_times = concat_segs(times, segs)
        STFT_windows, time_windows = get_STFTs(concat_seg, concat_times)
        # print(len(STFT_windows), STFT_windows[0].shape)

        embeddings = np.array([]).reshape(0,256)
        for STFT_window in STFT_windows:
            STFT_batch = np.transpose(STFT_window, axes=(2,0,1))
            # print(STFT_batch.shape) (24, 2, 40) (240ms window * batch 2 * mels 40)
            embeddings_batch = sess.run(embedded, feed_dict={verif:STFT_batch})
            embeddings = np.concatenate((embeddings, embeddings_batch))
            
        # Turn window-level embeddings to segment-level (400ms)
        aligned_embeddings, segment_intervals = align_embeddings(embeddings, time_windows)
        
        # Comparar com o turns retornado pelo load_rttm para montar o train_cluster_ids
        turns, _, _ = load_rttm(rttm_files.get(audio_id)[0])
        for interval in time_windows:
            train_cluster_ids.append(str(speaker_count))
            
        train_sequences = np.stack((train_sequences, aligned_embeddings))

        audio_count += 1
        
        if (audio_count == audio_quantity or audio_count % 2 == 0):
            train_sequences_path = os.path.join(save_dir_path, f'voxcon-dev-train-sequences.npy')
            np.save(train_sequences_path, train_sequence)
            
            train_cluster_ids_path = os.path.join(save_dir_path, f'voxcon-dev-train-cluster-ids.npy')
            train_cluster_ids = np.asarray(train_cluster_ids)
            np.save(train_cluster_ids_path, train_cluster_ids)
            logging.info(f'saved train sequence')

In [None]:
train_sequence = []
train_sequence.append(aligned_embeddings)
print(len(train_sequence))

In [None]:
# all_data.dim

In [None]:
# def get_STFTs(segs):
#     #Get 240ms STFT windows with 50% overlap
#     sr = config.sr
#     STFT_frames = []
#     for seg in segs:
#         S = librosa.core.stft(y=seg, n_fft=config.nfft, win_length=int(config.window * sr), hop_length=int(config.hop * sr))
#         S = np.abs(S) ** 2
#         mel_basis = librosa.filters.mel(sr=sr, n_fft=config.nfft, n_mels=40)
#         # log mel spectrogram of utterances
#         S = np.log10(np.dot(mel_basis, S) + 1e-6)        
#         for j in range(0, S.shape[1], int(.12/config.hop)):
#             if j + 24 < S.shape[1]:
#                 STFT_frames.append(S[:,j:j+24])
#             else:
#                 break
#     return STFT_frames

# import torch
# import torch.nn as nn

# from hparam import hparam as hp

# class SpeechEmbedder(nn.Module):
    
#     def __init__(self):
#         super(SpeechEmbedder, self).__init__()    
#         self.LSTM_stack = nn.LSTM(hp.data.nmels, hp.model.hidden, num_layers=hp.model.num_layer, batch_first=True)
#         for name, param in self.LSTM_stack.named_parameters():
#           if 'bias' in name:
#              nn.init.constant_(param, 0.0)
#           elif 'weight' in name:
#              nn.init.xavier_normal_(param)
#         self.projection = nn.Linear(hp.model.hidden, hp.model.proj)
        
#     def forward(self, x):
#         x, _ = self.LSTM_stack(x.float()) #(batch, frames, n_mels)
#         #only use last frame
#         x = x[:,x.size(1)-1]
#         x = self.projection(x.float())
#         x = x / torch.norm(x, dim=1).unsqueeze(1)
#         return x

# embedder_net = SpeechEmbedder()
# # embedder_net.load_state_dict(torch.load(hp.model.model_path))
# embedder_net.eval()

# times, segs = VAD_chunk(2, audio_path)
# concat_seg = concat_segs(times, segs)
# STFT_frames = get_STFTs(concat_seg)
# STFT_frames = np.stack(STFT_frames, axis=2)
# STFT_frames = torch.tensor(np.transpose(STFT_frames, axes=(2,1,0)))

# embeddings = embedder_net(STFT_frames)

# embeddings.shape

# STFT_frames.shape