In [2]:
import os
import logging
import datetime
import time
import json
import librosa
import numpy as np
from utils import normalize

import tensorflow as tf
from tensorflow.contrib import rnn
from sklearn.preprocessing import normalize as sk_normalize
from sklearn.cluster import KMeans
from scipy.ndimage.filters import gaussian_filter

from collections import defaultdict
from configuration import get_config

from VAD_segments import VAD_chunk

In [3]:
config = get_config()
log_file = os.path.abspath(config.log_path)
logging.basicConfig(
    filename=log_file,
    level=logging.DEBUG,
    format="%(asctime)s:%(levelname)s:%(message)s"
    )
print(f'Log path: {log_file}')

Log path: /home/jovyan/work/voxsrc21-dia/embeddings/main.logs


In [22]:
# Changing to 25, which will give slightly better intervals, 20 gives very short intervals
vad_threshold = 25 # threshold for voice activity detection

data_path = '/home/jovyan/work/datasets/voxceleb-1/sample/wav'
save_dir_path = '/home/jovyan/work/voxsrc21-dia/embeddings/sequences'
os.makedirs(save_dir_path, exist_ok=True)

# Data prep
# I'm saving only 2 embeddings i.e. first and last tisv_frames for given interval in an audio. So each .npy
# embedding file will have a shape of (2, 256)
tf.reset_default_graph()
batch_size = 2 # Fixing to 2 since we take 2 for each interval #utter_batch.shape[1]
verif = tf.placeholder(shape=[None, batch_size, 40], dtype=tf.float32)  # verification batch (time x batch x n_mel)
batch = tf.concat([verif,], axis=1)
# embedding lstm (3-layer default)
with tf.variable_scope("lstm"):
    lstm_cells = [tf.contrib.rnn.LSTMCell(num_units=config.hidden, num_proj=config.proj) for i in range(config.num_layer)]
    lstm = tf.contrib.rnn.MultiRNNCell(lstm_cells)    # make lstm op and variables
    outputs, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=batch, dtype=tf.float32, time_major=True)   # for TI-VS must use dynamic rnn
    embedded = outputs[-1]                            # the last ouput is the embedded d-vector
    embedded = normalize(embedded)                    # normalize
config_tensorflow = tf.ConfigProto(device_count = {'GPU': 0})
saver = tf.train.Saver(var_list=tf.global_variables())

In [5]:
all_unique_extensions = []
all_files = defaultdict(list)
audio_quantity = 0
for base_id in os.listdir(data_path):
#     print(f'Base id: {base_id}')
    if base_id.startswith('.'): #hidden folders
        continue;
    for video_id in os.listdir(os.path.join(data_path, base_id)):
#         print(f'Base id: {base_id} Video id: {video_id}')
        if video_id.startswith('.'): #hidden folders
            continue;
        for audio_id in os.listdir(os.path.join(data_path, base_id, video_id)):
#             print(f'Base id: {base_id} Video id: {video_id} Audio id: {audio_id}')
            all_unique_extensions.append(os.path.splitext(audio_id)[1])
            if os.path.splitext(audio_id)[1] == '.wav':
                # append the file path and save path to all_files
                all_files[base_id].append(os.path.join(data_path, base_id, video_id, audio_id))
                audio_quantity += 1
            else:
                print(f'Wrong file type in {os.path.join(data_path, base_id, video_id, audio_id)}')
print(f'Unique file extensions: {set(all_unique_extensions)}')
print(f'Number of speakers: {len(all_files)}')
print(f'Number of audios: {audio_quantity}')

Unique file extensions: {'.wav'}
Number of speakers: 19
Number of audios: 2198


In [127]:
def concat_segs(times, segs):
    #Concatenate continuous voiced segments
    concat_seg = []
    seg_concat = segs[0]
    for i in range(0, len(times)-1):
        if times[i][1] == times[i+1][0]:
            seg_concat = np.concatenate((seg_concat, segs[i+1]))
        else:
            concat_seg.append(seg_concat)
            seg_concat = segs[i+1]
    else:
        concat_seg.append(seg_concat)
    return concat_seg

def align_embeddings(embeddings):
    partitions = []
    start = 0
    end = 0
    j = 1
    for i, embedding in enumerate(embeddings):
        if (i*.12)+.24 < j*.401:
            end = end + 1
        else:
            partitions.append((start,end))
            start = end
            end = end + 1
            j += 1
    else:
        partitions.append((start,end))
    avg_embeddings = np.zeros((len(partitions),256))
    for i, partition in enumerate(partitions):
        avg_embeddings[i] = np.average(embeddings[partition[0]:partition[1]],axis=0) 
    return avg_embeddings

def get_STFTs(segs):
    #Get 240ms STFT windows with 50% overlap
    sr = config.sr
    STFT_windows = []
    for seg in segs:
        S = librosa.core.stft(y=seg, n_fft=config.nfft, win_length=int(config.window * sr), hop_length=int(config.hop * sr))
        S = np.abs(S) ** 2
        mel_basis = librosa.filters.mel(sr=sr, n_fft=config.nfft, n_mels=40)
        # log mel spectrogram of utterances
        S = np.log10(np.dot(mel_basis, S) + 1e-6)        
        for j in range(0, S.shape[1], int(.12/config.hop)):
            if j + 48 < S.shape[1]:
                # in order to fit on the expected shape of the embedding network we double the window
                STFT_windows.append([S[:, j:j+24], S[:, j+24:j+48]])                
            else:
                break
    return np.array(STFT_windows)


In [149]:
## AACCEEERRTTAAARRRR

a = np.load(train_sequences_path)
print(a.shape)

(1, 43, 256)


In [146]:
# Extract embeddings
# Each embedding saved file will have (2, 256)
with tf.Session(config=config_tensorflow) as sess:
    tf.global_variables_initializer().run()
    saver.restore(sess, config.model_path)
    
    speaker_count = 0
    total_speakers = len(all_files)
    speakers_per_batch = 50 # config.N
    
    speaker_label = 0
    train_sequences = np.array([]).reshape(0,256)
    train_cluster_ids = []
    
    for speaker_id, audio_paths in all_files.items():
        for audio_path in audio_paths:
            video_id = audio_path.split('/')[-2]
            audio_id = audio_path.split('/')[-1].replace('.wav','')
            audio_count += 1
            
            logging.info(f'loading {speaker_id}-{video_id}-{audio_id} {audio_count}/{audio_quantity}')
            utter, sr = librosa.core.load(audio_path, sr=config.sr)
            # lower bound of utterance length
            # utter_min_len = (config.tisv_frame_min * config.hop + config.window) * sr
            # Get the duration
            # duration = librosa.get_duration(utter, sr)
            # Duration of each window
            # duration_per_frame = (duration / utter.shape[0])
            # logging.info(f'Duration: {duration}\nDuration per frame: {duration_per_frame}s\nMin length of utterance: {utter_min_len * duration_per_frame}s')
            # tisv_frame_duration_s = utter_min_len * duration_per_frame
            
            # voice activity detection            
            times, segs = VAD_chunk(2, audio_path)
            concat_seg = concat_segs(times, segs)
            STFT_windows = get_STFTs(concat_seg)
            # print(len(STFT_windows), STFT_windows[0].shape)

            embeddings = np.array([]).reshape(0,256)
            for STFT_window in STFT_windows:
                STFT_batch = np.transpose(STFT_window, axes=(2,0,1))
                # print(STFT_frames2.shape) (24, 2, 40) (240ms window * batch 2 * mels 40)
                embeddings_batch = sess.run(embedded, feed_dict={verif:STFT_batch})
                embeddings = np.concatenate((embeddings, embeddings_batch))
    
            aligned_embeddings = align_embeddings(embeddings) # Turn window-level embeddings to segment-level (400ms)
            
            train_sequences = np.concatenate((train_sequences, aligned_embeddings))
            for embedding in aligned_embeddings:
                train_cluster_ids.append(str(speaker_label))
        
        speaker_count += 1
        speaker_label += 1
        if (speaker_count != total_speakers or speaker_count % speakers_per_batch == 0):
            train_sequences_path = os.path.join(save_dir_path, f'vox1-train-sequences-{speaker_count}.npy')
            np.save(train_sequences_path, train_sequence)
            
            train_cluster_ids_path = os.path.join(save_dir_path, f'vox1-train-cluster-ids-{speaker_count}.npy')
            train_cluster_ids = np.asarray(train_cluster_ids)
            np.save(train_cluster_ids_path, train_cluster_ids)
            logging.info(f'saved batch {speaker_count/total_speakers}/{speakers_per_batch/total_speakers}')
            
            train_sequences = np.array([]).reshape(0,256)
            train_cluster_ids = []

INFO:tensorflow:Restoring parameters from ../models/model.ckpt-46


KeyboardInterrupt: 

In [141]:
train_sequence = []
train_sequence.append(aligned_embeddings)
print(len(train_sequence))

1


In [128]:
intervals = librosa.effects.split(utter, top_db=vad_threshold)
for idx, current_interval in enumerate(intervals):
    utterances_spec = []
    utter_part = utter[current_interval[0]:current_interval[1]]
    S = librosa.core.stft(y=utter_part, n_fft=config.nfft, win_length=int(config.window * sr), hop_length=int(config.hop * sr))
    S = np.abs(S) ** 2
    mel_basis = librosa.filters.mel(sr=sr, n_fft=config.nfft, n_mels=40)
    # log mel spectrogram of utterances
    S = np.log10(np.dot(mel_basis, S) + 1e-6)
    
    utterances_spec.append(S[:, :config.tisv_frame])
    utterances_spec.append(S[:, -config.tisv_frame:])
    
    utterances_spec = np.array(utterances_spec)

(2, 40, 36)
(2, 40, 160)


In [None]:
# duration

In [None]:
# data.shape

In [None]:
# all_data.dim

In [None]:
# if __name__ == "__main__":
#     """
#     Speaker embeddings program:
#     input: audio files
#     output: npy file with shape (2, 256) [first and last tisv_frames for given interval in an audio]
#     """
#     main()
#     print('Program completed!')

In [119]:
# def get_STFTs(segs):
#     #Get 240ms STFT windows with 50% overlap
#     sr = config.sr
#     STFT_frames = []
#     for seg in segs:
#         S = librosa.core.stft(y=seg, n_fft=config.nfft, win_length=int(config.window * sr), hop_length=int(config.hop * sr))
#         S = np.abs(S) ** 2
#         mel_basis = librosa.filters.mel(sr=sr, n_fft=config.nfft, n_mels=40)
#         # log mel spectrogram of utterances
#         S = np.log10(np.dot(mel_basis, S) + 1e-6)        
#         for j in range(0, S.shape[1], int(.12/config.hop)):
#             if j + 24 < S.shape[1]:
#                 STFT_frames.append(S[:,j:j+24])
#             else:
#                 break
#     return STFT_frames

# import torch
# import torch.nn as nn

# from hparam import hparam as hp

# class SpeechEmbedder(nn.Module):
    
#     def __init__(self):
#         super(SpeechEmbedder, self).__init__()    
#         self.LSTM_stack = nn.LSTM(hp.data.nmels, hp.model.hidden, num_layers=hp.model.num_layer, batch_first=True)
#         for name, param in self.LSTM_stack.named_parameters():
#           if 'bias' in name:
#              nn.init.constant_(param, 0.0)
#           elif 'weight' in name:
#              nn.init.xavier_normal_(param)
#         self.projection = nn.Linear(hp.model.hidden, hp.model.proj)
        
#     def forward(self, x):
#         x, _ = self.LSTM_stack(x.float()) #(batch, frames, n_mels)
#         #only use last frame
#         x = x[:,x.size(1)-1]
#         x = self.projection(x.float())
#         x = x / torch.norm(x, dim=1).unsqueeze(1)
#         return x

# embedder_net = SpeechEmbedder()
# # embedder_net.load_state_dict(torch.load(hp.model.model_path))
# embedder_net.eval()

# times, segs = VAD_chunk(2, audio_path)
# concat_seg = concat_segs(times, segs)
# STFT_frames = get_STFTs(concat_seg)
# STFT_frames = np.stack(STFT_frames, axis=2)
# STFT_frames = torch.tensor(np.transpose(STFT_frames, axes=(2,1,0)))

# embeddings = embedder_net(STFT_frames)

# embeddings.shape

# STFT_frames.shape