<a href="https://colab.research.google.com/github/sedol1339/voice_score/blob/main/search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the project repository and make it the working directory for the rest of the notebook.
!git clone -q https://github.com/sedol1339/voice_score
%cd voice_score
# Execute utils2.ipynb in this kernel; presumably it defines `data` and the helper
# functions used by later cells -- TODO confirm against utils2.ipynb.
%run utils2.ipynb
# Preview five randomly sampled rows of `data`.
data.sample(5)

/content/voice_score


Unnamed: 0,subset,system,utterance,file,file_exists,score_mean,score_std,n_votes,votes1,votes2,votes3,votes4,votes5,fold
5580,train,sysf53fb,utt8395a56,sysf53fb-utt8395a56.wav,True,1.125,0.3536,8,7,1,0,0,0,4
4618,train,sysc276f,utt0699af9,sysc276f-utt0699af9.wav,True,2.5,0.5345,8,0,4,4,0,0,3
4368,train,sysba29c,utt1638187,sysba29c-utt1638187.wav,True,2.125,0.6409,8,1,5,2,0,0,3
6195,val,sys64e2f,utt0c4d719,sys64e2f-utt0c4d719.wav,False,3.75,0.7071,8,0,0,3,4,1,2
6841,val,sysfd5c5,utt64abfa3,sysfd5c5-utt64abfa3.wav,False,3.75,1.2817,8,1,0,1,4,2,4


In [None]:
import time

# Device string handed to the model loader here and to every feature-extraction call in later cells.
device = 'cuda'
# get_wave2vec2_model is not defined in this part of the notebook (presumably it comes from utils2.ipynb).
# NOTE(review): .eval() presumably switches the model to inference mode -- confirm.
wave2vec2_model = get_wave2vec2_model(device).eval()

## Источники данных

In [None]:
def _single_layer_source(layer):
  """Return a `file -> features` function that extracts one wave2vec2 layer.

  `layer` is taken as a parameter so that every returned function owns its
  own binding. A lambda written directly inside a comprehension reads the
  comprehension variable when it is *called*, not when it is created, so all
  such lambdas would end up extracting the last layer of the list.

  Args:
    layer: name of the layer to request via `output_layers`.

  Returns:
    A function taking a file and returning the first (only) element of the
    list produced by `file_to_wave2vec2_outputs`.
  """
  def source(file):
    return file_to_wave2vec2_outputs(wave2vec2_model, file, device, output_layers=[layer])[0]
  return source


def _concat_layers_source(output_layers):
  """Return a `file -> features` function that joins several layers along axis 1.

  Args:
    output_layers: names of the layers to request, in concatenation order.

  Returns:
    A function taking a file and returning `np.concatenate(..., axis=1)` of
    the per-layer outputs.
  """
  output_layers = list(output_layers)
  def source(file):
    # A fresh list is passed on every call, as the inline literals did before.
    outputs = file_to_wave2vec2_outputs(wave2vec2_model, file, device,
                                        output_layers=list(output_layers))
    return np.concatenate(outputs, axis=1)
  return source


# One data source per individual layer, keyed as 'wave2vec2.<layer name>'.
data_sources = {
    f'wave2vec2.{layer}': _single_layer_source(layer)
    for layer in [
          'feature_extractor.conv_layers.5',
          'feature_extractor.conv_layers.6',
          'transformer.layers.0',
          'transformer.layers.2',
          'transformer.layers.4',
          'transformer.layers.6',
          'transformer.layers.8',
          'transformer.layers.10',
    ]
}

# Mixed sources: outputs of several layers concatenated along axis 1.
data_sources['wave2vec2.mix1'] = _concat_layers_source(
    ['feature_extractor.conv_layers.6', 'transformer.layers.6'])
data_sources['wave2vec2.mix2'] = _concat_layers_source(
    ['feature_extractor.conv_layers.6', 'transformer.layers.6', 'transformer.layers.10'])
data_sources['wave2vec2.mix3'] = _concat_layers_source(
    ['transformer.layers.6', 'transformer.layers.10'])

## Источники данных: проверка времени работы

In [None]:
# All five clips are around 3.4 sec long, which is the mean clip length of the whole dataset.
test_files = ['sys02a43-uttd08a661.wav', 'sys05205-utt61fd125.wav', 'sys19236-uttc22d6bc.wav',
              'sys35c79-utt834bb2b.wav', 'sys4017d-uttb8d822a.wav']
# Factor used to extrapolate the per-file time to the 'sec/dataset' figure printed below.
N_DATASET_FILES = 2641

for name, func in data_sources.items():
  # perf_counter is the clock intended for measuring short intervals: it is
  # monotonic and uses the highest available resolution, unlike time.time().
  start_time = time.perf_counter()
  for file in test_files:
    result = func(file)
    assert result.ndim == 2  # every source must yield a 2-D array
  time_per_file = (time.perf_counter() - start_time) / len(test_files)
  print(f'[{time_per_file:g} sec/file, {time_per_file*N_DATASET_FILES:g} sec/dataset] {name}')

[0.0174157 sec/file, 45.9948 sec/dataset] wave2vec2.feature_extractor.conv_layers.5
[0.0166201 sec/file, 43.8936 sec/dataset] wave2vec2.feature_extractor.conv_layers.6
[0.0158499 sec/file, 41.8596 sec/dataset] wave2vec2.transformer.layers.0
[0.0153485 sec/file, 40.5355 sec/dataset] wave2vec2.transformer.layers.2
[0.0152452 sec/file, 40.2626 sec/dataset] wave2vec2.transformer.layers.4
[0.0153266 sec/file, 40.4777 sec/dataset] wave2vec2.transformer.layers.6
[0.0148312 sec/file, 39.1691 sec/dataset] wave2vec2.transformer.layers.8
[0.0150768 sec/file, 39.8178 sec/dataset] wave2vec2.transformer.layers.10
[0.0152872 sec/file, 40.3734 sec/dataset] wave2vec2.mix1
[0.0161923 sec/file, 42.764 sec/dataset] wave2vec2.mix2
[0.0154552 sec/file, 40.8173 sec/dataset] wave2vec2.mix3


## Способы обработки данных

In [None]:
!pip install -q tensorflow_addons

In [None]:
from tensorflow.keras import layers
import tensorflow_addons as tfa

def _pool_mean(x):
  """Average `x` over axis 0."""
  return x.mean(axis=0)


def _pool_mean_std(x):
  """Concatenate the axis-0 mean and the axis-0 standard deviation of `x`."""
  mu = x.mean(axis=0)
  sigma = x.std(axis=0)
  return np.concatenate((mu, sigma))


def _pool_max(x):
  """Take the maximum of `x` over axis 0."""
  return x.max(axis=0)


# Registry of processing methods: name -> function reducing an array over axis 0.
data_processing = {
    'mean': _pool_mean,
    'mean_std': _pool_mean_std,
    'max': _pool_max,
}

# Cache of projection layers, keyed by the number of input features.
random_projection_layers = {}
def random_projection(x):
  """Pass `x` through an untrained 8192-unit tanh Dense layer and average over axis 0.

  A single `layers.Dense(8192, 'tanh')` is created per distinct value of
  `x.shape[1]` and stored in `random_projection_layers`, so later calls with
  the same feature count reuse the same layer object.

  Args:
    x: input whose `shape[1]` is the feature count the layer is built for.

  Returns:
    The layer output converted with `.numpy()` and averaged over axis 0.
  """
  width = x.shape[1]
  projector = random_projection_layers.get(width)
  if projector is None:
    projector = layers.Dense(8192, 'tanh')
    # Stored before building, matching the original statement order.
    random_projection_layers[width] = projector
    projector.build(input_shape=(None, width))
  projected = projector(x).numpy()
  return projected.mean(axis=0)

# Register the random projection as one more entry of the data_processing registry.
data_processing['random_projection'] = random_projection

# Cache of GRU layers, keyed by the number of input features.
random_rnn_layers = {}
def random_rnn(x, reduction='last'):
  """Run `x` through an untrained 512-unit GRU and reduce the per-step outputs.

  One time-major GRU with `return_sequences=True` is created per distinct
  value of `x.shape[1]` and kept in `random_rnn_layers` for reuse.

  Args:
    x: 2-D input indexed as (timesteps, feature).
    reduction: the string 'last' to return the output at the final timestep,
      otherwise a callable invoked as `reduction(outputs, axis=0)`.

  Returns:
    The reduced GRU outputs.
  """
  width = x.shape[1]
  rnn = random_rnn_layers.get(width)
  if rnn is None:
    rnn = layers.GRU(512, 'tanh', return_sequences=True, time_major=True)
    # Stored before building, matching the original statement order.
    random_rnn_layers[width] = rnn
    rnn.build(input_shape=(None, None, width))
  batched = x[:, None, :]  # (timesteps, batch, feature) with a batch of one
  states = rnn(batched)[:, 0, :].numpy()  # (timesteps, feature)
  if reduction == 'last':
    return states[-1]
  return reduction(states, axis=0)

# Register the GRU processor once per reduction: final step, mean over steps, max over steps.
data_processing.update({
    'random_rnn_last_state': lambda x: random_rnn(x, reduction='last'),
    'random_rnn_mean_state': lambda x: random_rnn(x, reduction=np.mean),
    'random_rnn_max_state': lambda x: random_rnn(x, reduction=np.max),
})

# Cache of ESN layers, keyed by the number of input features.
random_esn_layers = {}
def random_esn(x, reduction='last'):
  """Run `x` through an untrained 256-unit `tfa.layers.ESN` and reduce the per-step outputs.

  One ESN layer with `return_sequences=True` is created per distinct value
  of `x.shape[1]` and kept in `random_esn_layers` for reuse.

  Args:
    x: 2-D input indexed as (timesteps, feature).
    reduction: the string 'last' to return the output at the final timestep,
      otherwise a callable invoked as `reduction(outputs, axis=0)`.

  Returns:
    The reduced ESN outputs.
  """
  width = x.shape[1]
  esn = random_esn_layers.get(width)
  if esn is None:
    esn = tfa.layers.ESN(256, return_sequences=True)
    # Stored before building, matching the original statement order.
    random_esn_layers[width] = esn
    esn.build(input_shape=(None, None, width))
  batched = x[None, :, :]  # (batch, timesteps, feature) with a batch of one
  states = esn(batched)[0, :, :].numpy()  # (timesteps, feature)
  if reduction == 'last':
    return states[-1]
  return reduction(states, axis=0)

# Register the ESN processor once per reduction: final step, mean over steps, max over steps.
data_processing.update({
    'random_esn_last_state': lambda x: random_esn(x, reduction='last'),
    'random_esn_mean_state': lambda x: random_esn(x, reduction=np.mean),
    'random_esn_max_state': lambda x: random_esn(x, reduction=np.max),
})

## Способы обработки данных: проверка времени работы

In [None]:
# Factor used to extrapolate the per-call time to the 'sec/dataset' figure printed below.
N_DATASET_FILES = 2641
# Dummy input: 170 frames (3.4 sec, 50 Hz), 768 features. Allocated once, outside
# the timed region, so that only the processing function itself is measured.
dummy_frames = np.zeros((170, 768))

for name, func in data_processing.items():
  # The output shape must not depend on the number of input frames.
  assert func(np.zeros((1, 768))).shape == func(np.zeros((200, 768))).shape
  # perf_counter is the clock intended for measuring short intervals: it is
  # monotonic and uses the highest available resolution, unlike time.time().
  start_time = time.perf_counter()
  for file in test_files:  # only the number of test files matters here
    result = func(dummy_frames)
    assert result.ndim == 1  # every method must yield a flat feature vector
  time_per_file = (time.perf_counter() - start_time) / len(test_files)
  print(f'[{time_per_file:g} sec/file, {time_per_file*N_DATASET_FILES:g} sec/dataset] {name}')

[0.00024395 sec/file, 0.644272 sec/dataset] mean
[0.00112209 sec/file, 2.96345 sec/dataset] mean_std
[0.000265837 sec/file, 0.702075 sec/dataset] max
[0.00638776 sec/file, 16.8701 sec/dataset] random_projection
[0.00685287 sec/file, 18.0984 sec/dataset] random_rnn_last_state
[0.00775075 sec/file, 20.4697 sec/dataset] random_rnn_mean_state
[0.00728836 sec/file, 19.2486 sec/dataset] random_rnn_max_state
[0.153161 sec/file, 404.499 sec/dataset] random_esn_last_state
[0.156729 sec/file, 413.921 sec/dataset] random_esn_mean_state
[0.153354 sec/file, 405.007 sec/dataset] random_esn_max_state
