<a href="https://colab.research.google.com/github/sedol1339/voice_score/blob/main/utils2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Base URL of the bucket folder that holds the VoiceMOS files used in this notebook.
URL = 'https://storage.googleapis.com/oleg-zyablov/misc/VoiceMOS'
# Fetch the two archives and the two CSV tables quietly (-q) into the working directory.
!wget -q {URL}/main.zip
!wget -q {URL}/ood.zip
!wget -q {URL}/data_with_annotators.csv
!wget -q {URL}/data.csv
# Unpack quietly, overwriting existing files (-o); each archive is removed only if unzip succeeds.
!unzip -q -o main.zip && rm main.zip
!unzip -q -o ood.zip && rm ood.zip

In [2]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# Read the data.csv table into a DataFrame and preview five random rows.
data = pd.read_csv('data.csv')
data.sample(5)

Unnamed: 0,subset,system,utterance,file,file_exists,score_mean,score_std,n_votes,votes1,votes2,votes3,votes4,votes5
234,ood_unlabeled,sys2dea8,utt776dfcc,sys2dea8-utt776dfcc.wav,False,,,0,0,0,0,0,0
4715,train,sysc84fa,utt9f81089,sysc84fa-utt9f81089.wav,False,3.625,0.744,8,0,0,4,3,1
5211,train,syse975a,uttac3facf,syse975a-uttac3facf.wav,False,3.125,0.3536,8,0,0,7,1,0
533,ood_unlabeled,sysc0232,utt302efa5,sysc0232-utt302efa5.wav,False,,,0,0,0,0,0,0
688,ood_val,sys288fc,utt178b639,sys288fc-utt178b639.wav,False,1.0667,0.2582,15,14,1,0,0,0


In [3]:
# Split the train/val rows into `n_folds` cross-validation folds by *system*:
# every utterance of a given system lands in the same fold, and systems are
# packed into folds in groupby (sorted) order until a fold holds ~fold_size rows.
train_and_val = data[data.subset.isin(['train', 'val'])]
n_folds = 5
# max(1, ...) keeps the integer division below valid when there are fewer rows than folds.
fold_size = max(1, len(train_and_val) // n_folds)
cumsum = train_and_val.groupby('system').file.count().cumsum()

# A system whose cumulative row count c satisfies fold_size*i < c <= fold_size*(i+1)
# belongs to fold i, i.e. i == (c - 1) // fold_size.  When the row count is not a
# multiple of n_folds the last systems would get index n_folds (and previously got
# no fold at all, raising KeyError below), so the index is clipped into range.
system_to_fold = (
    ((cumsum - 1) // fold_size)
    .clip(lower=0, upper=n_folds - 1)
    .astype(int)
    .to_dict()
)

# Rows outside train/val keep fold == None; the column stays object-typed.
data['fold'] = None
in_cv = data.subset.isin(['train', 'val'])
data.loc[in_cv, 'fold'] = data.loc[in_cv, 'system'].map(system_to_fold).astype(object)

data.sample(5)

Unnamed: 0,subset,system,utterance,file,file_exists,score_mean,score_std,n_votes,votes1,votes2,votes3,votes4,votes5,fold
2697,train,sys56eb0,utt221ac38,sys56eb0-utt221ac38.wav,True,2.75,0.7071,8,0,3,4,1,0,1
1535,train,sys20cc3,utt0681c6e,sys20cc3-utt0681c6e.wav,False,3.25,1.0351,8,1,0,3,4,0,0
2320,train,sys433f2,uttab23733,sys433f2-uttab23733.wav,True,3.0,0.7559,8,0,2,4,2,0,1
6141,val,sys4bafa,uttb6e8415,sys4bafa-uttb6e8415.wav,False,2.75,0.8864,8,0,4,2,2,0,1
6682,val,sysd79da,utt8e09e09,sysd79da-utt8e09e09.wav,False,3.0,0.0,8,0,0,8,0,0,4


In [5]:
# from sklearn.model_selection import KFold

# train_and_val = data[data.subset.isin(['train', 'val'])]
# kf = KFold(n_splits=5, shuffle=True, random_state=0)
# data['fold'] = None
# for i, (train, val) in enumerate(kf.split(train_and_val)):
#   indices = train_and_val.index[val]
#   data.loc[indices, 'fold'] = i

# # Baseline: predict the score without using the audio tracks

# for fold in range(5):
#   train = data[~data.fold.isnull() & (data.fold != fold)]
#   val = data[data.fold == fold]

#   system_to_score = {}
#   for system in set(val.system):
#     train_data_for_system = train[train.system == system]
#     if len(train_data_for_system) == 0:
#       score = data.score_mean.mean()
#     else:
#       score = train_data_for_system.score_mean.mean()
#     system_to_score[system] = score
  
#   val_scores = [system_to_score[s] for s in val.system]
#   mse = ((val_scores - val.score_mean) ** 2).mean()

#   #plt.scatter(val_scores, val.score_mean)
#   #plt.show()

#   print(mse)

In [22]:
import torch, torchaudio
import os, base64
from IPython import display

def get_waveform(wav_file):
  """Load `wav/<wav_file>` with torchaudio and return its first channel.

  Fails with an AssertionError unless the file's sample rate is 16000.
  """
  channels, rate = torchaudio.load('wav/{}'.format(wav_file))
  assert rate == 16000
  first_channel = channels[0]
  return first_channel

def get_spectrogram(waveform, **kwargs):
  """Apply a `torchaudio.transforms.Spectrogram` built from `kwargs` to `waveform`."""
  transform = torchaudio.transforms.Spectrogram(**kwargs)
  return transform(waveform)

def get_mel_spectrogram(waveform, **kwargs):
  """Apply a `torchaudio.transforms.MelSpectrogram` built from `kwargs` to `waveform`."""
  transform = torchaudio.transforms.MelSpectrogram(**kwargs)
  return transform(waveform)

def show_audio(file_path, width=300):
  """Display an inline HTML audio player for the file at `file_path`.

  The file's bytes are base64-encoded into a data URL, so the player does not
  depend on the file being reachable from the browser.

  Args:
    file_path: path of the audio file to embed.
    width: width of the player in pixels.
  """
  # Context manager so the handle is closed deterministically instead of leaking.
  with open(file_path, 'rb') as audio_file:
    audio = audio_file.read()
  data_url = "data:audio/mp3;base64," + base64.b64encode(audio).decode()
  # CSS that hides the elapsed/remaining time read-outs of WebKit's audio controls.
  style = '''<style>audio::-webkit-media-controls-current-time-display,
    audio::-webkit-media-controls-time-remaining-display {display: none;}</style>'''
  display.display(display.HTML(style + f'<audio controls style="width: {width}px; ">'
                                       f'<source src="{data_url}"></audio>'))

def visualize_wav(waveform_or_filename, aspect='auto', **kwargs):
  """Plot a waveform and its log mel spectrogram, one above the other.

  Args:
    waveform_or_filename: either a file name (any `str`, including subclasses
      such as `numpy.str_`), which is loaded through `get_waveform` and also
      rendered as an audio player, or an already loaded 1-D waveform.
    aspect: `aspect` argument passed to `imshow` for the spectrogram.
    **kwargs: forwarded to `get_mel_spectrogram`.
  """
  # isinstance (not an exact type comparison) so str subclasses are treated as
  # file names rather than being plotted character by character.
  if isinstance(waveform_or_filename, str):
    waveform = get_waveform(waveform_or_filename)
    show_audio(f'wav/{waveform_or_filename}', width=760)
  else:
    waveform = waveform_or_filename
  fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(8, 6), dpi=100,
                                 gridspec_kw={'height_ratios': [1, 3]})
  # Only every 20th sample is drawn to keep the line plot light.
  ax1.plot(np.arange(len(waveform))[::20], waveform[::20])
  ax1.set_xlim(0, len(waveform))
  # The 1e-6 offset keeps the logarithm finite where the spectrogram is zero.
  ax2.imshow(np.log(1e-6 + get_mel_spectrogram(waveform, **kwargs)), aspect=aspect)
  plt.show()

def show_random_waveform_with_score(dataframe, score_range=None, score_std_range=None, **kwargs):
  """Pick one random row with an existing file, display it and visualize its audio.

  `score_range` / `score_std_range` are optional inclusive (low, high) bounds on
  `score_mean` / `score_std`. Prints 'No samples' and returns when nothing
  matches. Extra keyword arguments go to `visualize_wav`.
  """
  candidates = dataframe[dataframe.file_exists]
  if score_range is not None:
    low, high = score_range[0], score_range[1]
    candidates = candidates[(candidates.score_mean >= low) & (candidates.score_mean <= high)]
  if score_std_range is not None:
    low, high = score_std_range[0], score_std_range[1]
    candidates = candidates[(candidates.score_std >= low) & (candidates.score_std <= high)]
  if not len(candidates):
    print('No samples')
    return
  # Drop the bookkeeping columns from the displayed row only.
  chosen = candidates.sample(1).drop(columns=['subset', 'system', 'utterance', 'file_exists'])
  display.display(chosen)
  file_name = chosen.file.tolist()[0]
  visualize_wav(file_name, **kwargs)
  
def get_wave2vec2_model(device='cpu'):
  """Build the model of torchaudio's WAV2VEC2_ASR_BASE_960H bundle and move it to `device`."""
  pipeline = torchaudio.pipelines.WAV2VEC2_ASR_BASE_960H
  network = pipeline.get_model()
  return network.to(device)

def get_wave2vec2_output(model, waveforms, lengths=None, output_layers=['aux']):
  """Run a wav2vec2 model stage by stage and collect selected intermediate outputs.

  The forward pass stops as soon as every requested layer has been produced, so
  asking only for early layers skips the rest of the network.

  Recognised layer names, in the order they are produced:
    'feature_extractor.conv_layers.<i>', 'feature_extractor', 'feature_projection',
    'transformer.preprocess', 'transformer.layers.<i>', 'transformer', 'aux'.

  Args:
    model: object exposing `feature_extractor.conv_layers`, `encoder._preprocess`,
      `encoder.transformer` and `aux`, as the torchaudio wav2vec2 model does.
    waveforms: 2-D tensor (batch, time).
    lengths: optional lengths passed to the convolutional layers and to
      `encoder._preprocess`.
    output_layers: names of the layers whose outputs should be returned.

  Returns:
    (outputs, lengths): `outputs` lists the captured tensors in the order the
    layers were executed (not the order of `output_layers`); `lengths` is the
    value returned by the last convolutional layer that ran.

  Raises:
    ValueError: if `waveforms` is not 2-D.
  """
  pending = set(output_layers)  # requested layers that have not been produced yet
  outputs = []

  def capture(layer_name, tensor):
    """Store `tensor` if requested; return True once all requested layers are stored."""
    if layer_name not in pending:
      return False
    outputs.append(tensor)
    pending.discard(layer_name)
    return not pending

  x = waveforms
  if x.ndim != 2:
    raise ValueError(f"Expected the input Tensor to be 2D (batch, time), but received {list(x.shape)}")
  transformer = model.encoder.transformer

  # feature_extractor
  x = x[:, None, :]  # (batch, channel==1, frame)
  for i, layer in enumerate(model.feature_extractor.conv_layers):
    x, lengths = layer(x, lengths)  # (batch, feature, frame)
    if capture(f'feature_extractor.conv_layers.{i}', x.transpose(1, 2)):
      return outputs, lengths
  x = x.transpose(1, 2)  # (batch, frame, feature)
  if capture('feature_extractor', x):
    return outputs, lengths

  # encoder
  # NOTE(review): the mask returned here is not forwarded to the transformer
  # layers (attention_mask=None below), as in the original code -- confirm this
  # is intended before passing padded batches with `lengths`.
  x, mask = model.encoder._preprocess(x, lengths)
  if capture('feature_projection', x):
    return outputs, lengths
  x = x + transformer.pos_conv_embed(x)
  x = transformer.layer_norm(x)
  x = transformer.dropout(x)
  if capture('transformer.preprocess', x):
    return outputs, lengths
  for i, layer in enumerate(transformer.layers):
    # In training mode a layer is skipped with probability `layer_drop`.
    if not (transformer.training and torch.rand(1).item() <= transformer.layer_drop):
      result = layer(x, attention_mask=None)
      # Some torchaudio versions appear to return (x, position_bias) -- keep only x.
      x = result[0] if isinstance(result, tuple) else result
    if capture(f'transformer.layers.{i}', x):
      return outputs, lengths
  if capture('transformer', x):
    return outputs, lengths

  # aux
  if model.aux is not None:
    x = model.aux(x)
    capture('aux', x)
  return outputs, lengths

def file_to_wave2vec2_outputs(model, file, device, output_layers=['aux'], numpy=True):
  """Load one wav file and return the requested wav2vec2 layer outputs for it.

  The waveform is sent through `get_wave2vec2_output` as a batch of one with
  gradients disabled. Each returned item is the first batch element of one
  captured layer, moved to the CPU; items are numpy arrays when `numpy` is
  true and tensors otherwise.
  """
  with torch.no_grad():
    batch = torch.Tensor(get_waveform(file)[None]).to(device)
    activations, _ = get_wave2vec2_output(model, batch, output_layers=output_layers)
  results = []
  for activation in activations:
    single = activation[0].cpu().detach()
    results.append(single.numpy() if numpy else single)
  return results

In [None]:
# Instantiate the wav2vec2 model on the CPU and bind it to the notebook-level name `model`.
model = get_wave2vec2_model('cpu')

In [24]:
# Shape check: tap one convolutional layer, one transformer layer, then both at once.
# The clip is decoded a single time and reused for all three passes, and
# torch.no_grad() avoids building an autograd graph for what is only an inspection.
waveform_batch = get_waveform('sys56eb0-utt221ac38.wav')[None]  # add the batch dimension
with torch.no_grad():
  for layers in (['feature_extractor.conv_layers.3'],
                 ['transformer.layers.3'],
                 ['feature_extractor.conv_layers.3', 'transformer.layers.3']):
    arrays, _ = get_wave2vec2_output(model, waveform_batch, output_layers=layers)
    for array in arrays:
      print(array.shape)

torch.Size([1, 1059, 512])
torch.Size([1, 132, 768])
torch.Size([1, 1059, 512])
torch.Size([1, 132, 768])


```
Слой feature_extractor.conv_layers.0: частота дискретизации 3199.8 Гц, данные займут объем 195.776 Гб
Слой feature_extractor.conv_layers.1: частота дискретизации 1599.8 Гц, данные займут объем 97.880 Гб
Слой feature_extractor.conv_layers.2: частота дискретизации 799.8 Гц, данные займут объем 48.936 Гб
Слой feature_extractor.conv_layers.3: частота дискретизации 399.8 Гц, данные займут объем 24.464 Гб
Слой feature_extractor.conv_layers.4: частота дискретизации 199.8 Гц, данные займут объем 12.224 Гб
Слой feature_extractor.conv_layers.5: частота дискретизации 99.9 Гц, данные займут объем 6.112 Гб
Слой feature_extractor.conv_layers.6: частота дискретизации 49.9 Гц, данные займут объем 3.056 Гб
Слой feature_projection: частота дискретизации 49.9 Гц, данные займут объем 4.584 Гб
Слой transformer.preprocess: частота дискретизации 49.9 Гц, данные займут объем 4.584 Гб
Слой transformer.layers.0: частота дискретизации 49.9 Гц, данные займут объем 4.584 Гб
Слой transformer.layers.1: частота дискретизации 49.9 Гц, данные займут объем 4.584 Гб
Слой transformer.layers.2: частота дискретизации 49.9 Гц, данные займут объем 4.584 Гб
Слой transformer.layers.3: частота дискретизации 49.9 Гц, данные займут объем 4.584 Гб
Слой transformer.layers.4: частота дискретизации 49.9 Гц, данные займут объем 4.584 Гб
Слой transformer.layers.5: частота дискретизации 49.9 Гц, данные займут объем 4.584 Гб
Слой transformer.layers.6: частота дискретизации 49.9 Гц, данные займут объем 4.584 Гб
Слой transformer.layers.7: частота дискретизации 49.9 Гц, данные займут объем 4.584 Гб
Слой transformer.layers.8: частота дискретизации 49.9 Гц, данные займут объем 4.584 Гб
Слой transformer.layers.9: частота дискретизации 49.9 Гц, данные займут объем 4.584 Гб
Слой transformer.layers.10: частота дискретизации 49.9 Гц, данные займут объем 4.584 Гб
Слой aux: частота дискретизации 49.9 Гц, данные займут объем 0.191 Гб
```


In [None]:
def get_wave2vec2_emot_model(device='cpu'):
  """Load the 'harshit345/xlsr-wav2vec-speech-emotion-recognition' Wav2Vec2Model.

  Args:
    device: device the loaded model is moved to before it is returned.

  Returns:
    The pretrained `transformers.Wav2Vec2Model`, placed on `device`.
  """
  try:
    from transformers import Wav2Vec2Model
  except ImportError:
    # Install on demand (e.g. on a fresh Colab runtime) instead of running pip
    # on every call; sys.executable targets the running kernel's environment.
    import subprocess, sys
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'transformers'])
    from transformers import Wav2Vec2Model
  model_name = 'harshit345/xlsr-wav2vec-speech-emotion-recognition'
  model = Wav2Vec2Model.from_pretrained(model_name)
  # Honour the `device` argument, which was previously accepted but ignored.
  return model.to(device)