In [1]:
import pandas as pd
import numpy as np
import os
import re
import json

import torch
import torch.nn as nn
# from torch.utils.data import DataLoader, Dataset

import soundfile as sf
import torchaudio

from datasets import load_metric, Dataset

import transformers
from transformers import Trainer, Wav2Vec2ForCTC, TrainingArguments, Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor

import wandb

from types import SimpleNamespace
from typing import Any, Dict, List, Optional, Union
from tqdm import tqdm

from audiomentations import Compose, Gain, AddGaussianNoise, PitchShift, AddBackgroundNoise, ApplyImpulseResponse

from sklearn.model_selection import train_test_split

import IPython.display as ipd
import random

from dataclasses import dataclass, field

In [2]:
cfg = SimpleNamespace()
cfg.checkpoint_path = ''
cfg.output_dir = 'DATA/wav2vec2-xls-r-300m-Russian-small'
cfg.data_dir = 'C:/Datasets/RussianOpenSpeechToText/'
cfg.model_name = 'emre/wav2vec2-xls-r-300m-Russian-small'
cfg.audio_augmentation = []
cfg.switch = 1

# audio formats
cfg.sampling_rate = 16000
cfg.chars_to_ignore_regex = '[\,\?\.\!\-\;\:\"\“\%\‘\”\�]'

# model parametrs
cfg.attention_dropout = 0.1
cfg.hidden_dropout = 0.1
cfg.feat_proj_fropout = 0.0
cfg.mask_feature_prob = 0.05
cfg.layerdrop = 0.1
cfg.gradient_checkpointing = True
# cfg.apply_spec_augment
# cfg.mask_time_length
# cfg.mask_feature_length

# # train parametrs
cfg.batch_size = 2
cfg.num_loader_workers = 0
cfg.gradient_accumulation_steps = 2
# cfg.seed
cfg.epochs = 10
# cfg.mixed_precision
cfg.logging_steps = 500
cfg.learning_rate = 2e-5
cfg.warmup_steps = 500
cfg.warmup_ratio = 0.1
# cfg.load_best_model_at_end
cfg.save_total_limit = 2

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
wer_metric = load_metric("wer")
tqdm.pandas()

In [6]:
# transformers.logging.set_verbosity_info()
# wandb.login()

# wer_metric = load_metric("wer")

# Data Load and Preprocessing

In [3]:
data = pd.read_csv(cfg.data_dir + 'df.csv')
data['path'] = cfg.data_dir + 'audio_files/' + data['audio_id'] + '.wav'
data = data[['text', 'path']]
data

Unnamed: 0,text,path
0,"По его словам, на вчерашний вечер у Донбанка «...",C:/Datasets/RussianOpenSpeechToText/audio_file...
1,"Этот процесс так захватывает человека, что он ...",C:/Datasets/RussianOpenSpeechToText/audio_file...
2,Это крупнейшее левое движение Западного полуша...,C:/Datasets/RussianOpenSpeechToText/audio_file...
3,А результаты проекта будут представлены на еже...,C:/Datasets/RussianOpenSpeechToText/audio_file...
4,За последний месяц лидеры двух стран встречают...,C:/Datasets/RussianOpenSpeechToText/audio_file...
...,...,...
117995,В двенадцать лет он сочинил псалом для голоса ...,C:/Datasets/RussianOpenSpeechToText/audio_file...
117996,"Перейти на страницу членства, где вы можете по...",C:/Datasets/RussianOpenSpeechToText/audio_file...
117997,"В сопровождении музыкантов, он отмечает ритм с...",C:/Datasets/RussianOpenSpeechToText/audio_file...
117998,"Адаптированная к ограничениям на рынке США, он...",C:/Datasets/RussianOpenSpeechToText/audio_file...


In [8]:
train_data, test_data = train_test_split(data, test_size=0.008)
train_data = train_data.sample(10000)
train_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

In [9]:
train_ds = Dataset.from_pandas(train_data)
test_ds = Dataset.from_pandas(test_data)

In [10]:
def clean_text(batch):
    batch['text'] = re.sub('\s+', ' ', batch['text'])
    batch['text'] = ' '.join(re.findall(r'[А-яЁё]+', batch['text']))
    batch['text'] = re.sub(cfg.chars_to_ignore_regex, '', batch['text']).lower() + ' '
    return batch

train_ds = train_ds.map(clean_text)
test_ds = test_ds.map(clean_text)

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [19]:
# # load model and function
# model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad',
#                               model='silero_vad',
#                               force_reload=True)

# (get_speech_timestamps,
#  save_audio,
#  read_audio,
#  VADIterator,
#  collect_chunks) = utils

# def clean_speech(path):    
#     wav = read_audio(path, sampling_rate=cfg.sampling_rate)
#     # get speech timestamps from full audio file
#     speech_timestamps = get_speech_timestamps(wav, model, sampling_rate=cfg.sampling_rate)
#     # merge all speech chunks to one audio
#     save_audio(path,
#     collect_chunks(speech_timestamps, wav), sampling_rate=cfg.sampling_rate)

# data['path'].progress_apply(clean_speech)

In [11]:
# def extract_all_chars(batch):
#     all_text = " ".join(batch['text'])
#     vocab = list(set(all_text))
#     return {"vocab": [vocab], "all_text": [all_text]}

# vocab_train = train_ds.map(extract_all_chars, batch_size=-1, batched=True, keep_in_memory=True, remove_columns=train_ds.column_names)
# vocab_test = test_ds.map(extract_all_chars, batch_size=-1, batched=True, keep_in_memory=True, remove_columns=test_ds.column_names)

# vocab_list = list(set(vocab_train['vocab'][0]) | set(vocab_test['vocab'][0]))
# vocab = {v: k for k, v in enumerate(vocab_list)}
# vocab['|'] = vocab[' ']
# del vocab[' ']
# vocab[f'{tokenizer.unk_token}'] = len(vocab)
# vocab[f'{tokenizer.pad_token}'] = len(vocab)

# with open(cfg.output_dir + 'vocab.json', 'w') as vocab_file:
#     json.dump(vocab, vocab_file)
    
# tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

# vocab

In [5]:
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(cfg.model_name)
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                             sampling_rate=cfg.sampling_rate,
                                             padding_value=0.0,
                                             do_normalize=True,
                                             return_attention_mask=True
                                            )

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [13]:
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = sf.read(batch["path"])
    batch["speech"] = speech_array
    batch["sampling_rate"] = sampling_rate
    if cfg.switch:
        batch['target_text'] = batch['text']
    return batch

train_ds = train_ds.map(speech_file_to_array_fn, remove_columns=train_ds.column_names)
test_ds = test_ds.map(speech_file_to_array_fn, remove_columns=test_ds.column_names)

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [14]:
rnd_idx = random.randint(0, len(train_ds) -1)
ipd.Audio(data=np.asarray(train_ds[rnd_idx]['speech']), autoplay=True, rate=16000.)

In [15]:
def prepare_dataset(batch):
    # check that all files have the correct sampling rate
    assert (
        len(set(batch["sampling_rate"])) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."

    batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
    if cfg.switch:
        with processor.as_target_processor():
            batch["labels"] = processor(batch["target_text"]).input_ids
    return batch

train_ds = train_ds.map(prepare_dataset, remove_columns=train_ds.column_names, batch_size=8, batched=True)
test_ds = test_ds.map(prepare_dataset, remove_columns=test_ds.column_names, batch_size=8, batched=True)

  0%|          | 0/1250 [00:00<?, ?ba/s]

  tensor = as_tensor(value)


  0%|          | 0/118 [00:00<?, ?ba/s]

In [16]:
test_ds.save_to_disk(cfg.data_dir + 'test_ds')
train_ds.save_to_disk(cfg.data_dir + 'train_ds')

In [3]:
test_ds = Dataset.load_from_disk(cfg.data_dir + 'test_ds')
train_ds = Dataset.load_from_disk(cfg.data_dir + 'train_ds')

In [6]:
@dataclass
class DataCollatorCTCWithPadding:
    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None
        
    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]
        
        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length, # self.max_length
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        
        with self.processor.as_target_processor():
            labels_batch = self.processor.pad(
                label_features,
                padding=self.padding,
                max_length=self.max_length_labels, # self.max_length_labels
                pad_to_multiple_of=self.pad_to_multiple_of_labels,
                return_tensors='pt',
            )
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
        batch["labels"] = labels
        return batch
    
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [7]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)
    
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
    
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

In [20]:
# class SpeechDataset(Dataset):
#     def __init__(self, vocab, data, tokenizer):
#         self.vocab = vocab
#         self.data = data
        
#     def __len__(self):
#         return len(self.data)
    
#     def __getitem__(self, item):
#         audio_id = self.data.audio_id[item]
#         text = self.data.text[item]
        

In [21]:
# def audio_preprocess_and_prepare_dataset(self):

#     def prepare_dataset(batch):
#         if self.files_path:
#             batch[self.audio_path_column] = os.path.join(self.files_path, batch[self.audio_path_column])
#         batch["input_values"] = batch[self.audio_path_column]
#         with self.processor.as_target_processor():
#             batch["labels"] = self.processor(batch[self.text_column]).input_ids
#         batch["length"] = len(batch["labels"])
#         return batch

#     print("> Prepare dataloader")
#     self.train_dataset = self.train_dataset.map(prepare_dataset, remove_columns=self.train_dataset.column_names, num_proc=self.config['num_loader_workers'], batched=False)
#     self.devel_dataset = self.devel_dataset.map(prepare_dataset, remove_columns=self.devel_dataset.column_names, num_proc=self.config['num_loader_workers'], batched=False)


In [22]:
# # data augmentation block
# def map_data_augmentation(aug_config):
#     aug_name = aug_config['name']
#     del aug_config['name']
#     if aug_name == 'additive':
#         return AddBackgroundNoise(**aug_config)
#     elif aug_name == 'gaussian':
#         return AddGaussianNoise(**aug_config)
#     elif aug_name == 'rir':
#         return ApplyImpulseResponse(**aug_config)
#     elif aug_name == 'gain':
#         return Gain(**aug_config)
#     elif aug_name == 'pitch_shift':
#         return PitchShift(**aug_config)
#     else:
#         raise ValueError("The data augmentation '" + aug_name + "' doesn't exist !!")
        
# audio_augmentator = Compose([map_data_augmentation(aug_config) for aug_config in cfg.audio_augmentation])

In [8]:
# load model to trainer
model = Wav2Vec2ForCTC.from_pretrained(
    cfg.model_name,
    attention_dropout = cfg.attention_dropout,
    hidden_dropout=cfg.hidden_dropout,
    feat_proj_dropout=cfg.feat_proj_fropout,
    mask_feature_prob = cfg.mask_feature_prob,
    layerdrop=cfg.layerdrop,
#     gradient_checkpointing=cfg.gradient_checkpointing,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
#     vocab_size=len(tokenizer),
#     ctc_zero_infinity=True,
#     apply_spec_augmnet=cfg.apply_spec_augment,
#     mask_time_length=cfg.mask_time_length,
#     mask_feature_length=cfg.mask_feature_length
)

model.freeze_feature_extractor()

training_args = TrainingArguments(
    output_dir = cfg.output_dir,
#     logging_dir=os.path.join(cfg.output_dir, "tensorboard"),
#     report_to='all',
#     run_name='RussianOpenSpeech-norm-spontaneous_speech-inf_train',
    group_by_length=True,
#     logging_first_step=True,
    per_device_train_batch_size=cfg.batch_size,
    per_device_eval_batch_size=cfg.batch_size * 2,
#     dataloader_num_workers=cfg.num_loader_workers,
    gradient_accumulation_steps=cfg.gradient_accumulation_steps,
#     seed=cfg.seed,
    num_train_epochs=cfg.epochs,
    fp16=True,
    logging_steps=cfg.logging_steps,
    learning_rate=cfg.learning_rate,
    warmup_steps=cfg.warmup_steps,
    warmup_ratio=cfg.warmup_ratio,
    save_strategy='epoch',
    evaluation_strategy='epoch',
#     load_best_model_at_end=cfg.load_best_model_at_end,
    metric_for_best_model="eval_loss",
#     greater_is_better=False,
    save_total_limit=cfg.save_total_limit
)

trainer = Trainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=processor.feature_extractor,
)

Using amp half precision backend


In [9]:
trainer.train()

***** Running training *****
  Num examples = 10000
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 2
  Total optimization steps = 25000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mresquilleur[0m (use `wandb login --relogin` to force relogin)


Epoch,Training Loss,Validation Loss,Wer
1,0.5079,0.340478,0.492406
2,0.4503,0.311418,0.463824
3,0.4133,0.295762,0.442505
4,0.3761,0.288195,0.428922
5,0.3495,0.283975,0.417508
6,0.3484,0.272899,0.408075
7,0.3173,0.272708,0.399868
8,0.3068,0.268768,0.395246
9,0.3128,0.266803,0.392416
10,0.3,0.267667,0.39119


***** Running Evaluation *****
  Num examples = 944
  Batch size = 4
Saving model checkpoint to DATA/wav2vec2-xls-r-300m-Russian-small\checkpoint-2500
Configuration saved in DATA/wav2vec2-xls-r-300m-Russian-small\checkpoint-2500\config.json
Model weights saved in DATA/wav2vec2-xls-r-300m-Russian-small\checkpoint-2500\pytorch_model.bin
Feature extractor saved in DATA/wav2vec2-xls-r-300m-Russian-small\checkpoint-2500\preprocessor_config.json
***** Running Evaluation *****
  Num examples = 944
  Batch size = 4
Saving model checkpoint to DATA/wav2vec2-xls-r-300m-Russian-small\checkpoint-5000
Configuration saved in DATA/wav2vec2-xls-r-300m-Russian-small\checkpoint-5000\config.json
Model weights saved in DATA/wav2vec2-xls-r-300m-Russian-small\checkpoint-5000\pytorch_model.bin
Feature extractor saved in DATA/wav2vec2-xls-r-300m-Russian-small\checkpoint-5000\preprocessor_config.json
***** Running Evaluation *****
  Num examples = 944
  Batch size = 4
Saving model checkpoint to DATA/wav2vec2-xl

TrainOutput(global_step=25000, training_loss=0.3744827801513672, metrics={'train_runtime': 9640.8417, 'train_samples_per_second': 10.373, 'train_steps_per_second': 2.593, 'total_flos': 2.08480066936002e+19, 'train_loss': 0.3744827801513672, 'epoch': 10.0})

# Inference

In [3]:
transformers.logging.set_verbosity_error()

In [4]:
model = Wav2Vec2ForCTC.from_pretrained(cfg.output_dir + '/checkpoint-25000').to(device)
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(cfg.model_name)
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1,
                                             sampling_rate=cfg.sampling_rate,
                                             padding_value=0.0,
                                             do_normalize=True,
                                             return_attention_mask=True
                                            )

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
test_ds = Dataset.load_from_disk(cfg.data_dir + 'test_ds')

In [17]:
final_pred = []
true_label = []
for i in tqdm(range(test_ds.shape[0])):    
    input_dict = processor(test_ds[i]["input_values"], return_tensors="pt", padding=True)

    logits = model(input_dict.input_values.to("cuda")).logits

    pred_ids = torch.argmax(logits, dim=-1)[0]
    prediction = processor.decode(pred_ids)
    final_pred.append(prediction)
    label = processor.decode(test_ds[i]["labels"])
    true_label.append(label)

100%|██████████| 944/944 [01:13<00:00, 12.91it/s]


In [18]:
d = {'pred':final_pred, 'true':true_label}
pd.DataFrame(d)

Unnamed: 0,pred,true
0,деньги та конечно берут расуших предлагают с т...,деньги то конечно берут раз уж их предлагают с...
1,согласно ведомственным документом диреевия дол...,согласно ведомственым документам деревья должн...
2,нужны станки на которых заключенные будут учит...,нужны станки на которых заключеные будут учить...
3,пока не известно когда цем т возобновит работу...,пока неизвестно когда цмт возобновит работу в ...
4,новый регион навыходных появилось информация ч...,новый регион на выходных появилась информация ...
...,...,...
939,в обращении горов требовал провести проверку и...,в обращени егоров требовал провести проверку и...
940,пожар в подвали детского приюта хорошево мневн...,пожар в подвале детского приюта хорошево мневн...
941,но это понятно потому чты они тренировались на...,но это понятно потому что они тренировались на...
942,порой благодаря сексуальным отношением оказыва...,порой благодаря сексуальным отношениям оказыва...


# Distilation wav2vec2