In [44]:
import os
import logging
import datetime
import time
import json
import librosa
import numpy as np
from utils import normalize

import tensorflow as tf
from tensorflow.contrib import rnn
from sklearn.preprocessing import normalize as sk_normalize
from sklearn.cluster import KMeans
from scipy.ndimage.filters import gaussian_filter

from collections import defaultdict
from configuration import get_config

In [None]:
# Load the project configuration and send log records to the configured file.
config = get_config()
log_file = os.path.abspath(config.log_path)
# basicConfig opens the log file right away and raises if the parent directory
# does not exist yet, so make sure it is there first.
os.makedirs(os.path.dirname(log_file), exist_ok=True)
# NOTE: basicConfig does nothing when the root logger already has handlers,
# e.g. when this cell is re-run in the same kernel; restart the kernel to
# switch to a different log file.
logging.basicConfig(
    filename=log_file,
    level=logging.DEBUG,
    format="%(asctime)s:%(levelname)s:%(message)s"
    )
print(f'Log path: {log_file}')

In [None]:
# Voice activity detection threshold (passed to librosa as top_db).
# 25 gives slightly better intervals; 20 gave very short intervals.
vad_threshold = 25

# Input audio root and output directory for the saved embeddings
data_path = '/home/jovyan/work/datasets/voxceleb-1/sample/wav'
save_dir_path = '/home/jovyan/work/voxsrc21-dia/embeddings/embeddings'
os.makedirs(save_dir_path, exist_ok=True)

# Run inference on CPU only (no GPU devices are exposed to the session)
config_tensorflow = tf.ConfigProto(device_count={'GPU': 0})

# Data prep
# Only 2 embeddings are saved per interval of an audio file: one for the first
# and one for the last tisv_frames. Each .npy embedding file therefore has a
# shape of (2, 256).
tf.reset_default_graph()
batch_size = 2  # fixed at 2: two spectrogram slices are fed per interval
# verification batch, laid out as (time x batch x n_mel)
verif = tf.placeholder(dtype=tf.float32, shape=[None, batch_size, 40])
batch = tf.concat([verif], axis=1)

# Embedding LSTM stack; the number of layers comes from the configuration
with tf.variable_scope("lstm"):
    lstm_cells = []
    for _ in range(config.num_layer):
        lstm_cells.append(rnn.LSTMCell(num_units=config.hidden, num_proj=config.proj))
    lstm = rnn.MultiRNNCell(lstm_cells)
    # dynamic_rnn is required because the number of time steps varies
    outputs, _ = tf.nn.dynamic_rnn(
        cell=lstm,
        inputs=batch,
        dtype=tf.float32,
        time_major=True,
    )
    # the output at the last time step is the d-vector; normalize it
    embedded = normalize(outputs[-1])

# Saver over every global variable so the checkpoint can be restored later
saver = tf.train.Saver(var_list=tf.global_variables())

In [None]:
# Index every .wav file under data_path, grouped by the top-level folder name.
# The walk is three levels deep: <data_path>/<base_id>/<video_id>/<audio file>.
# Hidden entries and stray regular files at the two folder levels are skipped
# (listing a regular file would raise NotADirectoryError). Listings are sorted
# so the processing order is the same on every run and every filesystem.
all_unique_extensions = []
all_files = defaultdict(list)
for base_id in sorted(os.listdir(data_path)):
    speaker_dir = os.path.join(data_path, base_id)
    if base_id.startswith('.') or not os.path.isdir(speaker_dir):
        continue
    for video_id in sorted(os.listdir(speaker_dir)):
        video_dir = os.path.join(speaker_dir, video_id)
        if video_id.startswith('.') or not os.path.isdir(video_dir):
            continue
        for audio_id in sorted(os.listdir(video_dir)):
            audio_path = os.path.join(video_dir, audio_id)
            extension = os.path.splitext(audio_id)[1]
            all_unique_extensions.append(extension)
            if extension == '.wav':
                # keep the full path, keyed by the top-level (speaker) folder
                all_files[base_id].append(audio_path)
            else:
                print(f'Wrong file type in {audio_path}')
print(f'Unique file extensions: {set(all_unique_extensions)}')
print(f'Number of speakers: {len(all_files)}')

In [45]:
# Extract embeddings
# For every voiced interval of every indexed audio file, feed the first and the
# last config.tisv_frame spectrogram frames through the network and save the
# two resulting embeddings in one .npy file (shape (2, 256) per the data-prep
# notes).
with tf.Session(config=config_tensorflow) as sess:
    tf.global_variables_initializer().run()
    # overwrite the freshly initialized variables with the checkpoint weights
    saver.restore(sess, config.model_path)

    for speaker_id, audio_paths in all_files.items():
        for audio_path in audio_paths:
            # parent folder name is the video id; drop only the trailing extension
            video_id = os.path.basename(os.path.dirname(audio_path))
            audio_id = os.path.splitext(os.path.basename(audio_path))[0]

            logging.info(f'loading audio {speaker_id}-{video_id}-{audio_id}')
            utter, sr = librosa.load(audio_path, sr=config.sr)
            if utter.shape[0] == 0:
                # an empty signal would divide by zero below and has no intervals
                logging.warning(f'skipping empty audio {speaker_id}-{video_id}-{audio_id}')
                continue

            # lower bound of utterance length, in samples
            utter_min_len = (config.tisv_frame_min * config.hop + config.window) * sr
            # Total duration in seconds (keyword arguments: newer librosa
            # releases reject positional ones here)
            duration = librosa.get_duration(y=utter, sr=sr)
            # Seconds covered by a single sample
            duration_per_frame = duration / utter.shape[0]
            tisv_frame_duration_s = utter_min_len * duration_per_frame
            logging.info(f'Duration: {duration}\nDuration per frame: {duration_per_frame}s\nMin length of utterance: {tisv_frame_duration_s}s')
            # NOTE(review): utter_min_len is only logged; it is not used to
            # drop short intervals. When an interval has fewer than
            # config.tisv_frame frames, both slices below cover the whole
            # interval and are identical. Confirm whether short intervals
            # should be skipped instead.

            # voice activity detection
            intervals = librosa.effects.split(utter, top_db=vad_threshold)

            # These depend only on the file's sample rate, so compute them once
            # per file instead of once per interval.
            win_length = int(config.window * sr)
            hop_length = int(config.hop * sr)
            mel_basis = librosa.filters.mel(sr=sr, n_fft=config.nfft, n_mels=40)

            for idx, current_interval in enumerate(intervals):
                utter_part = utter[current_interval[0]:current_interval[1]]
                S = librosa.stft(y=utter_part, n_fft=config.nfft, win_length=win_length, hop_length=hop_length)
                S = np.abs(S) ** 2
                # log mel spectrogram of the interval
                S = np.log10(np.dot(mel_basis, S) + 1e-6)
                # first and last config.tisv_frame frames of the spectrogram
                utterances_spec = np.stack([S[:, :config.tisv_frame], S[:, -config.tisv_frame:]])
                # transpose to [frames, batch, n_mels]
                utter_batch = np.transpose(utterances_spec, axes=(2, 0, 1))

                data = sess.run(embedded, feed_dict={verif: utter_batch})
                save_embedding_path = os.path.join(save_dir_path, f'vox1-{speaker_id}-{video_id}-{audio_id}-{idx}.npy')
                np.save(save_embedding_path, data)

INFO:tensorflow:Restoring parameters from ../models/model.ckpt-46


  n_fft, y.shape[-1]


KeyboardInterrupt: 

In [47]:
# Sanity check: reload the most recently written embedding file and show its shape.
# Relies on `save_embedding_path` still holding the value assigned in the
# extraction loop, so that cell must have saved at least one file first.
emb = np.load(save_embedding_path)
emb.shape

(2, 256)

In [None]:
# duration

In [None]:
# data.shape

In [None]:
# all_data.dim

In [None]:
# if __name__ == "__main__":
#     """
#     Speaker embeddings program:
#     input: audio files
#     output: npy file with shape (2, 256) [first and last tisv_frames for given interval in an audio]
#     """
#     main()
#     print('Program completed!')