In [None]:
# !pip install torchaudio PySoundFile -q

In [None]:
# Base URL of the bucket that hosts the files fetched below.
URL = 'https://storage.googleapis.com/oleg-zyablov/misc/VoiceMOS'
# Fetch the two zip archives and the two CSV tables; -q silences wget's output.
!wget -q {URL}/main.zip
!wget -q {URL}/ood.zip
!wget -q {URL}/data_with_annotators.csv
!wget -q {URL}/data.csv
# Unpack quietly, overwriting existing files without prompting (-o); each
# archive is removed only when its unzip succeeded (&&).
!unzip -q -o main.zip && rm main.zip
!unzip -q -o ood.zip && rm ood.zip

In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# Table describing the clips; later cells read its `file`, `file_exists`,
# `score_mean` and `score_std` columns.
data = pd.read_csv('data.csv')
# Show 10 randomly chosen rows as a quick look at the table.
data.sample(10)

Unnamed: 0,subset,system,utterance,file,file_exists,score_mean,score_std,n_scores
4723,train,sysc84fa,uttd39ad61,sysc84fa-uttd39ad61.wav,True,3.75,1.165,8
4043,train,sysa7a61,utt5920dc6,sysa7a61-utt5920dc6.wav,True,2.375,0.5175,8
4108,train,sysac5dd,uttaed5009,sysac5dd-uttaed5009.wav,True,4.25,0.8864,8
1467,train,sys1e139,utte8b03a9,sys1e139-utte8b03a9.wav,True,2.0,0.7559,8
4496,train,sysbce79,utt23c83ce,sysbce79-utt23c83ce.wav,True,3.625,0.9161,8
740,ood_val,sys40775,uttdfaf42e,sys40775-uttdfaf42e.wav,True,3.0,1.2649,16
5086,train,syse0aaa,utt6bdfc63,syse0aaa-utt6bdfc63.wav,True,3.0,0.9258,8
3217,train,sys7b162,utt1f5e8ce,sys7b162-utt1f5e8ce.wav,True,2.375,0.744,8
4964,train,sysd81da,utt800c659,sysd81da-utt800c659.wav,True,3.5,1.4142,8
5918,val,sys1f128,utt222982e,sys1f128-utt222982e.wav,True,1.625,0.744,8


In [None]:
import torchaudio
import os, base64
from IPython import display

def get_waveform(wav_file, wav_dir='wav', expected_sample_rate=16000):
  """Load an audio file and return the first entry along its leading axis.

  Args:
    wav_file: file name relative to `wav_dir`.
    wav_dir: directory that holds the audio files (defaults to `wav`).
    expected_sample_rate: sample rate in Hz every file is required to have.

  Returns:
    `waveform[0]` of the tensor returned by `torchaudio.load`
    (presumably the first channel — confirm against the torchaudio docs).

  Raises:
    AssertionError: if the file's sample rate differs from
      `expected_sample_rate`.
  """
  waveform, sample_rate = torchaudio.load(f'{wav_dir}/{wav_file}')
  # Downstream code converts sample counts to seconds with a fixed rate, so a
  # file with any other rate must be rejected loudly instead of mis-scaled.
  assert sample_rate == expected_sample_rate, (
      f'{wav_file}: expected {expected_sample_rate} Hz, got {sample_rate} Hz')
  return waveform[0]

def get_spectrogram(waveform, **kwargs):
  """Apply a `torchaudio.transforms.Spectrogram` to `waveform`.

  Args:
    waveform: input passed to the transform.
    **kwargs: forwarded unchanged to the `Spectrogram` constructor.

  Returns:
    Whatever the transform returns for `waveform`.
  """
  transform = torchaudio.transforms.Spectrogram(**kwargs)
  return transform(waveform)

def show_audio(file_path, width=300):
  """Embed an HTML5 audio player for `file_path` in the cell output.

  The whole file is read and inlined into the page as a base64 data URL, so
  the player needs no access to the file afterwards.

  Args:
    file_path: path of the audio file to embed.
    width: width of the player in pixels.
  """
  import mimetypes  # local import keeps this definition self-contained

  # Context manager: the handle is closed even if reading raises.
  with open(file_path, 'rb') as audio_file:
    audio = audio_file.read()
  # Label the payload with the audio type implied by the file extension; keep
  # the previously hard-coded 'audio/mp3' when no audio type can be guessed.
  mime_type = mimetypes.guess_type(file_path)[0]
  if not mime_type or not mime_type.startswith('audio/'):
    mime_type = 'audio/mp3'
  data_url = f"data:{mime_type};base64," + base64.b64encode(audio).decode()
  # Hides the current-time and time-remaining displays of the WebKit player.
  style = '''<style>audio::-webkit-media-controls-current-time-display,
    audio::-webkit-media-controls-time-remaining-display {display: none;}</style>'''
  display.display(display.HTML(style + f'<audio controls style="width: {width}px; ">'
                                       f'<source src="{data_url}"></audio>'))

def visualize_wav(waveform_or_filename):
  """Plot a waveform above its log-spectrogram.

  Args:
    waveform_or_filename: either a file name inside `wav/` (a str), in which
      case the clip is loaded and an audio player is shown as well, or an
      already loaded waveform, which is plotted as is.
  """
  # isinstance instead of an exact type comparison, so str subclasses are
  # treated as file names too.
  if isinstance(waveform_or_filename, str):
    waveform = get_waveform(waveform_or_filename)
    show_audio(f'wav/{waveform_or_filename}', width=760)
  else:
    waveform = waveform_or_filename
  # Top panel (1/4 of the height): waveform; bottom panel: spectrogram.
  fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(8, 6), dpi=100,
                                 gridspec_kw={'height_ratios': [1, 3]})
  # Only every 20th sample is drawn.
  ax1.plot(np.arange(len(waveform))[::20], waveform[::20])
  ax1.set_xlim(0, len(waveform))
  # The 1e-6 offset keeps the logarithm finite where the spectrogram is zero.
  ax2.imshow(np.log(1e-6 + get_spectrogram(waveform)), aspect='auto')
  plt.show()

def show_random_waveform_with_score(dataframe, score_range=None, score_std_range=None,
                                    random_state=None):
  """Pick one random clip that matches the filters, show its row and plot it.

  Args:
    dataframe: table with at least the columns `file`, `file_exists`,
      `score_mean`, `score_std`, `subset`, `system` and `utterance`.
    score_range: optional `(low, high)` inclusive bounds on `score_mean`.
    score_std_range: optional `(low, high)` inclusive bounds on `score_std`.
    random_state: optional seed forwarded to `DataFrame.sample`, for a
      reproducible pick; `None` keeps the pick random.

  Returns:
    None. Prints 'No samples' and returns early when no row passes the filters.
  """
  candidates = dataframe[dataframe.file_exists]
  if score_range is not None:
    low, high = score_range[0], score_range[1]
    candidates = candidates[(candidates.score_mean >= low) & (candidates.score_mean <= high)]
  if score_std_range is not None:
    low, high = score_std_range[0], score_std_range[1]
    candidates = candidates[(candidates.score_std >= low) & (candidates.score_std <= high)]
  if len(candidates) == 0:
    print('No samples')
    return
  row = candidates.sample(1, random_state=random_state)
  # Non-mutating drop: no in-place edit of a slice of the caller's frame, and
  # `row` keeps the `file` column needed below.
  display.display(row.drop(columns=['subset', 'system', 'utterance', 'file_exists']))
  visualize_wav(row.file.tolist()[0])
  
#visualize_wav('syse33d4-utt5155cf2.wav')

In [None]:
# show_random_waveform_with_score(data, score_range=(4, 4.5))

In [None]:
import torch

def get_wave2vec2_model(device='cpu'):
  """Build the model of the `WAV2VEC2_ASR_BASE_960H` torchaudio bundle.

  Args:
    device: device the model is moved to before it is returned.

  Returns:
    The bundle's model, placed on `device`.
  """
  wav2vec2 = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H.get_model()
  return wav2vec2.to(device)

def get_wave2vec2_output(model, waveforms, lengths=None, output_layers=['aux']):
  """Run `model` stage by stage and collect the requested intermediate outputs.

  Recognised stage names, in the order they are produced:
  `feature_extractor.conv_layers.{i}`, `feature_extractor`,
  `feature_projection`, `transformer.preprocess`, `transformer.layers.{i}`,
  `transformer`, `aux`.

  Args:
    model: object exposing `feature_extractor.conv_layers`,
      `encoder._preprocess`, `encoder.transformer` and `aux`.
    waveforms: 2D tensor `(batch, time)`.
    lengths: optional per-item lengths, threaded through the conv layers and
      `encoder._preprocess`; `None` means no padding information.
    output_layers: names of the stages to collect. The default list is only
      read, never mutated.

  Returns:
    `(outputs, lengths)`. `outputs` holds the collected tensors in the order
    the stages are computed, NOT in the order of `output_layers`; callers that
    zip the two must list the names in pipeline order. Names matching no stage
    are silently ignored. `lengths` is the value returned by the last conv
    layer.

  Raises:
    ValueError: if `waveforms` is not 2D.
  """
  outputs = []
  x = waveforms
  transformer = model.encoder.transformer
  if x.ndim != 2:
    # f-string, so the offending shape actually appears in the message.
    raise ValueError(f"Expected the input Tensor to be 2D (batch, time), but received {list(x.shape)}")

  # --- feature extractor ---
  x = x[:, None, :]  # (batch, channel==1, frame)
  for i, layer in enumerate(model.feature_extractor.conv_layers):
    x, lengths = layer(x, lengths)  # (batch, feature, frame)
    if f'feature_extractor.conv_layers.{i}' in output_layers:
      outputs.append(x)
  x = x.transpose(1, 2)  # (batch, frame, feature)
  if 'feature_extractor' in output_layers:
    outputs.append(x)

  # --- encoder ---
  x, mask = model.encoder._preprocess(x, lengths)
  if 'feature_projection' in output_layers:
    outputs.append(x)
  x = x + transformer.pos_conv_embed(x)
  # NOTE(review): layer_norm is applied before the transformer layers
  # unconditionally — confirm this matches the loaded model's configuration.
  x = transformer.layer_norm(x)
  x = transformer.dropout(x)
  if 'transformer.preprocess' in output_layers:
    outputs.append(x)
  for i, layer in enumerate(transformer.layers):
    # In training mode a layer is skipped at random according to layer_drop.
    if not (transformer.training and torch.rand(1).item() <= transformer.layer_drop):
      # Forward the mask from _preprocess so that `lengths` also affects
      # attention; it was previously computed and then discarded.
      x = layer(x, attention_mask=mask)
    if f'transformer.layers.{i}' in output_layers:
      outputs.append(x)
  if 'transformer' in output_layers:
    outputs.append(x)

  # --- aux head ---
  x = model.aux(x)
  if 'aux' in output_layers:
    outputs.append(x)
  return outputs, lengths

In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook also runs on
# machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = get_wave2vec2_model(device)
# Inference mode; the trailing semicolon suppresses the model repr.
model.eval();

Downloading: "https://download.pytorch.org/torchaudio/models/wav2vec2_fairseq_base_ls960_asr_ls960.pth" to /root/.cache/torch/hub/checkpoints/wav2vec2_fairseq_base_ls960_asr_ls960.pth


  0%|          | 0.00/360M [00:00<?, ?B/s]

In [None]:
# Stage names requested for the probe. They are listed in the order
# get_wave2vec2_output produces them, because the results are paired with the
# names by zip() below. Only transformer layers 0..10 are requested.
output_layers = [f'feature_extractor.conv_layers.{i}' for i in range(7)] + \
                ['feature_projection', 'transformer.preprocess'] + \
                [f'transformer.layers.{i}' for i in range(11)] + ['aux']

wave2vec_results = {}

# Probe a single clip: the first row whose file exists.
for i, row in tqdm(data.iterrows(), total=len(data)):
  if row.file_exists:
    waveform = get_waveform(row.file)
    with torch.no_grad():
      outputs, lengths = get_wave2vec2_output(model, torch.Tensor(waveform[None]).to(device),
                                              output_layers=output_layers)
    wave2vec_results[row.file] = {x: y.cpu().detach().numpy() for x, y in zip(output_layers, outputs)}
    # Stop only after a clip was actually processed, so a missing first file
    # no longer leaves the results empty.
    break

0it [00:00, ?it/s]

In [None]:
# Duration in seconds of the probed clip; `waveform` is left over from the
# probe loop in the previous cell.
audio_len = len(waveform) / 16000
# NOTE(review): presumably the total duration of the dataset in seconds — confirm.
total_len = 29875

# Use the clip the probe actually stored, so the activations and `audio_len`
# always refer to the same file (instead of a hard-coded file name).
probe_file = next(iter(wave2vec_results))

for layer in output_layers:
  array = wave2vec_results[probe_file][layer][0]
  # Conv-layer outputs are (feature, frame); every later stage is
  # (frame, feature) and is transposed so that axis 1 is always time.
  if not layer.startswith('feature_extractor.conv_layers.'):
    array = array.T
  n_time_steps = array.shape[1]
  # Scale this clip's footprint to the whole dataset by duration.
  total_size = array.nbytes / audio_len * total_len
  print(f'Cлой {layer}: частота дискретизации {n_time_steps / audio_len:.1f} гц, данные займут объем {total_size/10**9:.3f} Гб')
  #plt.figure(figsize=(20, 4))
  # array = np.abs(array)**(1/2) * np.sign(array)
  # absmax = max(abs(array.max()), abs(array.min()))
  # plt.imshow(array, aspect='auto', vmin=-absmax, vmax=absmax);
  # plt.show()

Cлой feature_extractor.conv_layers.0: частота дискретизации 3199.8 гц, данные займут объем 195.776 Гб
Cлой feature_extractor.conv_layers.1: частота дискретизации 1599.8 гц, данные займут объем 97.880 Гб
Cлой feature_extractor.conv_layers.2: частота дискретизации 799.8 гц, данные займут объем 48.936 Гб
Cлой feature_extractor.conv_layers.3: частота дискретизации 399.8 гц, данные займут объем 24.464 Гб
Cлой feature_extractor.conv_layers.4: частота дискретизации 199.8 гц, данные займут объем 12.224 Гб
Cлой feature_extractor.conv_layers.5: частота дискретизации 99.9 гц, данные займут объем 6.112 Гб
Cлой feature_extractor.conv_layers.6: частота дискретизации 49.9 гц, данные займут объем 3.056 Гб
Cлой feature_projection: частота дискретизации 49.9 гц, данные займут объем 4.584 Гб
Cлой transformer.preprocess: частота дискретизации 49.9 гц, данные займут объем 4.584 Гб
Cлой transformer.layers.0: частота дискретизации 49.9 гц, данные займут объем 4.584 Гб
Cлой transformer.layers.1: частота дискр

In [None]:
# Visualization removed; I will bring it back if needed.

In [None]:
# Full pass over the table: keep only the output of transformer layer index 10.
output_layers = ['transformer.layers.10']

wave2vec_results = dict()

for i, row in tqdm(data.iterrows()):
  if not row.file_exists:
    continue  # rows flagged as missing have nothing to load
  waveform = get_waveform(row.file)
  with torch.no_grad():
    outputs, lengths = get_wave2vec2_output(
        model,
        torch.Tensor(waveform[None]).to(device),
        output_layers=output_layers,
    )
  # Results are moved to the CPU and kept as tensors (not numpy arrays).
  wave2vec_results[row.file] = {
      name: activation.cpu().detach()
      for name, activation in zip(output_layers, outputs)
  }

0it [00:00, ?it/s]

In [None]:
!mkdir torch

In [None]:
# Write each clip's 'transformer.layers.10' tensor to torch/<clip file name>.
# The saved files keep the clip's original name (e.g. *.wav) although they
# hold tensors written by torch.save.
# The loop variable is deliberately not called `data`: that name holds the
# DataFrame loaded at the top of the notebook and must not be clobbered.
for file, layer_outputs in tqdm(wave2vec_results.items()):
  torch.save(layer_outputs['transformer.layers.10'], 'torch/' + file)

  0%|          | 0/6667 [00:00<?, ?it/s]

In [None]:
# Pack the torch/ directory into a gzip-compressed tarball; -v lists every
# archived file in the cell output.
!tar -czvf filename.tar.gz torch

In [None]:
# List the working directory (hidden entries included) to check the archive size.
!ls -la

total 4192604
drwxr-xr-x 1 root root       4096 Feb 22 12:05 .
drwxr-xr-x 1 root root       4096 Feb 22 11:49 ..
drwxr-xr-x 4 root root       4096 Feb  1 14:31 .config
-rw-r--r-- 1 root root     490397 Feb 21 19:41 data.csv
-rw-r--r-- 1 root root    5118339 Feb 21 19:30 data_with_annotators.csv
-rw-r--r-- 1 root root 4286989678 Feb 22 12:08 filename.tar.gz
drwxr-xr-x 1 root root       4096 Feb  1 14:32 sample_data
drwxr-xr-x 2 root root     282624 Feb 22 12:04 torch
drwxr-xr-x 2 root root     319488 Feb 22 11:58 wav
