In [114]:
import torch
import wave
import torchaudio
from sympy.codegen.fnodes import dsign
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
from datasets import load_dataset, load_metric
from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor, TrainingArguments, Trainer, \
    DataCollatorForSeq2Seq

In [115]:
model_checkpoint = "facebook/s2t-small-librispeech-asr"

In [116]:
model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")

processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")

ds_russian = load_dataset("bond005/sberdevices_golos_10h_crowd")

Some weights of Speech2TextForConditionalGeneration were not initialized from the model checkpoint at facebook/s2t-small-librispeech-asr and are newly initialized: ['model.encoder.embed_positions.weights', 'model.decoder.embed_positions.weights']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [137]:
processor.tokenizer.save_vocabulary("./")

('./sentencepiece.bpe.model',)

In [138]:
input_features = processor(
    ds_russian['train'][2]["audio"]["array"],
    sampling_rate=16_000,
    return_tensors="pt"
).input_features

AttributeError: 

In [121]:
import IPython.display as ipd
import numpy as np
import random

rand_int = random.randint(0, len(ds_russian["train"]))

print(ds_russian["train"][rand_int]["transcription"])
ipd.Audio(data=np.asarray(ds_russian["train"][rand_int]["audio"]["array"]), autoplay=True, rate=16000)

киноработа статус свободен


In [122]:
rand_int = random.randint(0, len(ds_russian["train"]))

print("Target text:", ds_russian["train"][rand_int]["transcription"])
print("Input array shape:", np.asarray(ds_russian["train"][rand_int]["audio"]["array"]).shape)
print("Sampling rate:", ds_russian["train"][rand_int]["audio"]["sampling_rate"])

Target text: жасура петровича храмова
Input array shape: (40861,)
Sampling rate: 16000


In [123]:
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained(model_checkpoint)

In [124]:
def extract_all_chars(batch):
  all_text = " ".join(filter(lambda x: x is not None, batch["transcription"]))
  vocab = list(set(all_text))
  return {"vocab": [vocab], "all_text": [all_text]}

In [125]:
vocabs = ds_russian.map(
  extract_all_chars,
  batched=True,
  batch_size=-1,
  keep_in_memory=True, 
  remove_columns=ds_russian.column_names["train"]
)

vocab_list = list(set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0]))
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
# vocab_dict = {v: k for k, v in enumerate(['s', 'a', 's', 'a', 's', 'a', 's', 'a', 's', 'a', 's', 'a', 's', 'a', 's', 'a', 's', 'a', 's', 'a', 's', 'a', 's', 'a', 'h', 'm', 'n', 'b', ])}

Map:   0%|          | 0/7993 [00:00<?, ? examples/s]

Map:   0%|          | 0/793 [00:00<?, ? examples/s]

Map:   0%|          | 0/9994 [00:00<?, ? examples/s]

In [126]:
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

In [127]:
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

35

In [128]:
import json
with open('vocab.json', 'w', encoding='utf-8') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [129]:
from transformers import AutoConfig

config = AutoConfig.from_pretrained(model_checkpoint)

tokenizer_type = config.model_type if config.tokenizer_class is None else None
config = config if config.tokenizer_class is not None else None
tokenizer_type

'speech_to_text'

In [132]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
  "./",
  config=config,
  tokenizer_type=tokenizer_type,
  unk_token="[UNK]",
  pad_token="[PAD]",
  word_delimiter_token="|",
)

`use_fast` is set to `True` but the tokenizer class does not have a fast version.  Falling back to the slow version.


In [133]:
from transformers import SpeechEncoderDecoderModel, Wav2Vec2Processor
MODEL_ID = "bond005/wav2vec2-mbart50-ru"

processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)

In [142]:
def prepare_dataset(batch):
    print(batch)
    audio = batch["audio"]

    # batched output is "un-batched" to ensure mapping is correct
    batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
    batch["input_length"] = len(batch["input_values"])
    
    with processor.as_target_processor():
        batch["labels"] = processor(batch["transcription"]).input_ids
    return batch

In [143]:
ds_russian = ds_russian.map(prepare_dataset, remove_columns=ds_russian.column_names["train"], num_proc=4)

Map (num_proc=4):   0%|          | 0/7993 [00:00<?, ? examples/s]

{'audio': {'path': None, 'array': array([ 0.00186157,  0.00714111,  0.01473999, ..., -0.01364136,
       -0.01638794, -0.01467896]), 'sampling_rate': 16000}, 'transcription': 'джой есть свободное время или занят'}
{'audio': {'path': None, 'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00057983,
       -0.00048828, -0.00039673]), 'sampling_rate': 16000}, 'transcription': 'у тебя в запасе есть сериал удивительные странствия геракла второй сезон'}{'audio': {'path': None, 'array': array([ 3.05175781e-05,  3.05175781e-05,  0.00000000e+00, ...,
       -1.09863281e-03, -7.93457031e-04, -1.52587891e-04]), 'sampling_rate': 16000}, 'transcription': 'шестнадцатая часть сезона пять сериала лемони сникет тридцать три несчастья'}

{'audio': {'path': None, 'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
       -3.05175781e-05,  0.00000000e+00,  0.00000000e+00]), 'sampling_rate': 16000}, 'transcription': 'футбольная встреча манчестер сити и арсенал'}







{'audio': {'path': None, 'array': array([-1.22070312e-04, -6.10351562e-05,  0.00000000e+00, ...,
       -1.83105469e-04, -2.74658203e-04, -1.83105469e-04]), 'sampling_rate': 16000}, 'transcription': 'у тебя есть фильм грязь'}
{'audio': {'path': None, 'array': array([-3.05175781e-05,  0.00000000e+00,  0.00000000e+00, ...,
        6.10351562e-05,  6.10351562e-05,  6.10351562e-05]), 'sampling_rate': 16000}, 'transcription': 'три ангела на смотрешке'}{'audio': {'path': None, 'array': array([-3.05175781e-05,  0.00000000e+00, -3.05175781e-05, ...,
        3.05175781e-05,  0.00000000e+00,  0.00000000e+00]), 'sampling_rate': 16000}, 'transcription': 'игра на выживание найти'}

{'audio': {'path': None, 'array': array([0.        , 0.        , 0.        , ..., 0.00030518, 0.00045776,
       0.00012207]), 'sampling_rate': 16000}, 'transcription': 'закажи мне двухлитровую бутылку воды шишкин лес с газом'}{'audio': {'path': None, 'array': array([0.00653076, 0.00671387, 0.00601196, ..., 0.09439087, 0

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



{'audio': {'path': None, 'array': array([ 0.        ,  0.        ,  0.        , ..., -0.0007019 ,
       -0.00082397, -0.00067139]), 'sampling_rate': 16000}, 'transcription': 'отзывы якутск'}

{'audio': {'path': None, 'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00079346,
       -0.00088501, -0.00054932]), 'sampling_rate': 16000}, 'transcription': 'какой курс бальбоа на второе июля'}{'audio': {'path': None, 'array': array([0.        , 0.        , 0.        , ..., 0.01119995, 0.0112915 ,
       0.01083374]), 'sampling_rate': 16000}, 'transcription': 'сереже дворецкому'}

{'audio': {'path': None, 'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00018311,
        0.00015259,  0.00024414]), 'sampling_rate': 16000}, 'transcription': 'сколько стоит обменять двадцать пять фунтов стерлингов в казахских тенге'}{'audio': {'path': None, 'array': array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
       0.00000000e+00, 0.00000000e+00, 3.05175781e-05]), 'sam

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





{'audio': {'path': None, 'array': array([0.        , 0.        , 0.        , ..., 0.00149536, 0.00088501,
       0.00039673]), 'sampling_rate': 16000}, 'transcription': 'первая кровельная'}{'audio': {'path': None, 'array': array([ 0.00018311,  0.00024414,  0.00018311, ..., -0.00201416,
       -0.00210571, -0.00219727]), 'sampling_rate': 16000}, 'transcription': 'хочу прямой эфир апл юнайтед с челси'}

{'audio': {'path': None, 'array': array([ 0.00000000e+00, -3.05175781e-05,  0.00000000e+00, ...,
       -3.16467285e-02, -3.50646973e-02, -2.97546387e-02]), 'sampling_rate': 16000}, 'transcription': 'покажи два эскобара'}{'audio': {'path': None, 'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00021362,
       -0.00036621, -0.00057983]), 'sampling_rate': 16000}, 'transcription': 'сбер что делает в сбербанке сергей мальцев'}

{'audio': {'path': None, 'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00088501,
       -0.00085449, -0.00045776]), 'sampling_rate': 

Map (num_proc=4):   0%|          | 0/9994 [00:00<?, ? examples/s]

{'audio': {'path': None, 'array': array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
       3.05175781e-05, 1.52587891e-04, 1.83105469e-04]), 'sampling_rate': 16000}, 'transcription': 'афина включи холидэй'}{'audio': {'path': None, 'array': array([ 0.        ,  0.        ,  0.        , ...,  0.0012207 ,
        0.00057983, -0.00082397]), 'sampling_rate': 16000}, 'transcription': 'шестьдесят тысяч тенге сколько будет стоить'}
{'audio': {'path': None, 'array': array([0.        , 0.        , 0.        , ..., 0.00265503, 0.00384521,
       0.00271606]), 'sampling_rate': 16000}, 'transcription': 'напиток тархун ноль пять объемом'}





{'audio': {'path': None, 'array': array([-0.00045776, -0.0007019 , -0.0007019 , ...,  0.00558472,
        0.00057983,  0.00192261]), 'sampling_rate': 16000}, 'transcription': 'у тебя будет фильм дайан лэйн'}
{'audio': {'path': None, 'array': array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
       3.05175781e-05, 3.05175781e-05, 3.05175781e-05]), 'sampling_rate': 16000}, 'transcription': 'покажи мне на смотрешке телеканал синергия тв'}{'audio': {'path': None, 'array': array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
       -3.05175781e-05,  6.10351562e-05,  9.15527344e-05]), 'sampling_rate': 16000}, 'transcription': 'четырнадцатый сезон сериал готэм'}

{'audio': {'path': None, 'array': array([0.        , 0.        , 0.        , ..., 0.00213623, 0.00192261,
       0.00183105]), 'sampling_rate': 16000}, 'transcription': 'билет фонд модернизации и развития жкх муниципальных образований новосибирской области'}




{'audio': {'path': None, 'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00189209,
       -0.0015564 , -0.00146484]), 'sampling_rate': 16000}, 'transcription': 'заказать яблоки зеленые'}{'audio': {'path': None, 'array': array([0.        , 0.        , 0.        , ..., 0.01635742, 0.01431274,
       0.01281738]), 'sampling_rate': 16000}, 'transcription': 'картина последний неандерталец'}
{'audio': {'path': None, 'array': array([ 0.00000000e+00, -3.05175781e-05, -3.05175781e-05, ...,
       -9.15527344e-05,  0.00000000e+00,  6.10351562e-05]), 'sampling_rate': 16000}, 'transcription': 'врубай канал нц тв на тв'}

{'audio': {'path': None, 'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00201416,
        0.0007019 ,  0.00549316]), 'sampling_rate': 16000}, 'transcription': 'кино мировая прогулка тунис'}
{'audio': {'path': None, 'array': array([ 0.00088501,  0.00045776,  0.00015259, ..., -0.0005188 ,
       -0.00067139, -0.00054932]), 'sampling_rate': 16000}, 'tr

ValueError: You need to specify either `text` or `text_target`.

In [144]:
ds_russian

DatasetDict({
    train: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 7993
    })
    validation: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 793
    })
    test: Dataset({
        features: ['audio', 'transcription'],
        num_rows: 9994
    })
})

In [145]:
max_input_length_in_sec = 4.0
ds_russian["train"] = ds_russian["train"].filter(lambda x: x < max_input_length_in_sec * processor.feature_extractor.sampling_rate, input_columns=["input_length"])

ValueError: Input column input_length not in the dataset. Current columns in the dataset: ['audio', 'transcription']