# Install dependencies

In [1]:
#!pip install -U git+https://github.com/huggingface/accelerate.git

In [2]:
# !pip install --upgrade comet_ml -qq

# Imports

In [1]:
import comet_ml

from accelerate import notebook_launcher
from accelerate.utils import set_seed

import gc
import os
import numpy as np
import pandas as pd

import torch
import torchaudio
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
from transformers import Trainer, TrainingArguments
from tqdm import tqdm

from utils import clean_text

[2023-09-05 23:53:27,899] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


# Constants

WHISPER_MODEL examples:

* 1) openai/whisper-small (suitable for testing functionality, not very accurate on sentences, but capable of recognizing individual words or phrases. Requires low computational resources)
* 2) openai/whisper-medium (recommended medium model)
* 3) openai/whisper-large (sufficiently accurate on large sentences, but requires significant computational resources)
* 4) openai/whisper-large-v2 (sufficiently accurate on large sentences, but requires significant computational resources)
* 5) lorenzoncina/whisper-medium-ru (a model finetuned on the Russian language - recommended for training on Russian)

In [2]:
os.environ["COMET_LOG_ASSETS"] = "True"

WHISPER_MODEL = 'openai/whisper-small'
DATASET_DIR = '/kaggle/input/it-spectrum-dataset/'

# Whisper initializing

In [3]:
processor = AutoProcessor.from_pretrained(WHISPER_MODEL)
model = AutoModelForSpeechSeq2Seq.from_pretrained(WHISPER_MODEL)

In [4]:
# setting the model's language and defining the task of transcription
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="tatar", task="transcribe")

# Data initializing

Training dataset. When indexed, it returns a list containing:

* 1) filepath - path to the audio
* 2) text - transcribed text by annotators
* 3) input_features - audio features for prediction
* 4) labels - transcribed text by annotators converted into tokens
* 5) attention mask - an attention mask where each element indicates whether the model should pay attention to the token corresponding to the same index in the labels list.

In [5]:
class WhisperDataset(Dataset):
    def __init__(self, audio_dir: str, processor, max_length, only_char=True):
        self.audio_dir = audio_dir
        df = pd.read_csv(audio_dir[:-1] + '.csv', index_col='id')
        self.data = {}
        counter = 0
        for row in df.itertuples():
            if not os.path.exists(audio_dir + str(row[0]) + '.txt'):
                print('Отсутствует файл', str(row[0]) + '.txt')
                continue
            if not os.path.exists(audio_dir + str(row[0]) + '.wav'):
                print(f'Отсутствует файл', str(row[0]) + '.wav')
                continue
            self.data[counter] = {
                'text': str(row[0]) + '.txt',
                'audio': str(row[0]) + '.wav'
            }
            counter += 1
        self.len = counter - 1
        self.only_char = only_char
        del counter
        del df
        self.processor = processor
        self.max_length = max_length

    def __len__(self):
        return self.len
    
    def _get_audio_sample_path(self, index):
        return self.audio_dir + self.data[index]['audio']
    
    def _get_audio_sample_label(self, index):
        label_path = self.audio_dir + self.data[index]['text']
        with open(label_path, 'r') as f:
            label = clean_text(f.read()) if self.only_char else f.read() # Не учитывает ошибки в заполнение .txt
        return label

    def __getitem__(self, idx):
        filepath = self._get_audio_sample_path(idx)
        text = self._get_audio_sample_label(idx)
        
        audio, sample_rate = torchaudio.load(filepath)
        audio = torch.reshape(audio, (-1,))
        
        tokenized = self.processor.tokenizer(
            text, return_tensors='pt', padding='max_length', return_attention_mask=True, 
            max_length=self.max_length
        )
        
        labels, attention_mask = tokenized['input_ids'][0], tokenized['attention_mask'][0]
        
        input_features = self.processor(audio, return_tensors="pt", sampling_rate=sample_rate).input_features[0]
        
        return {
            'input_features': input_features, 
            'labels': labels,
            'attention_mask': attention_mask
        }

In [6]:
# create train/val/test datasets
train_dataset = WhisperDataset('../tatar_tts/train/', processor, model.config.max_length)
valid_dataset = WhisperDataset('../tatar_tts/valid/', processor, model.config.max_length)

Отсутствует файл 198.2.txt
Отсутствует файл 241.2.txt
Отсутствует файл 227.2.txt
Отсутствует файл 272.51.txt
Отсутствует файл 228.2.txt
Отсутствует файл 224.2.txt
Отсутствует файл 238.2.txt
Отсутствует файл 202.2.txt


In [7]:
len(train_dataset), len(valid_dataset)

(3171, 1859)

## GPU runtime

In [8]:
torch.cuda.empty_cache()
gc.collect()

148

In [9]:
!nvidia-smi

Tue Sep  5 23:53:34 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 520.61.05    Driver Version: 520.61.05    CUDA Version: 11.8     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  On   | 00000000:01:00.0 Off |                    0 |
| N/A   31C    P0    34W / 250W |   2250MiB / 32768MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  On   | 00000000:02:00.0 Off |                    0 |
| N/A   31C    P0    27W / 250W |      4MiB / 32768MiB |      0%      Default |
|       

# Training

In [10]:
comet_ml.init( project_name = "TatAsr-whisper", experiment_name = "TatAsr-cnn-rnn-accelerator-2gpu")

In [11]:
def training_function():
    global model
    training_args = TrainingArguments(
        output_dir='./whisper', 
        overwrite_output_dir=True, 
        num_train_epochs=1,
        per_device_train_batch_size=6,
        save_steps=500, 
        save_total_limit=2,
        do_train=True,
    )
    
    set_seed(42)
    torch.manual_seed(7)
    
    trainer = Trainer(
        model,
        training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
    )
    trainer.train()

In [12]:
notebook_launcher(training_function, num_processes=2, mixed_precision='fp16')

Launching training on 2 GPUs.


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/gumaonelove/tatasr-whisper/29a11dd378584871ac4d598bec28ab79



Step,Training Loss


Please double-check the directory path and the recursive parameter
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/gumaonelove/tatasr-whisper/29a11dd378584871ac4d598bec28ab79
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     epoch                    : 1.0
[1;38;5;39mCOMET INFO:[0m     total_flos               : 9.177015389677158e+17
[1;38;5;39mCOMET INFO:[0m     train_loss               : 0.01927664594830207
[1;38;5;39mCOMET INFO:[0m     train_runtime            : 415.7209
[1;38;5;39mCOMET INFO:[0m     train_samples_per_second : 7.628
[1;38;5;39mCOMET 

[1;38;5;39mCOMET INFO:[0m     args/optim_args                         : None
[1;38;5;39mCOMET INFO:[0m     args/output_dir                         : ./whisper
[1;38;5;39mCOMET INFO:[0m     args/overwrite_output_dir               : True
[1;38;5;39mCOMET INFO:[0m     args/parallel_mode                      : ParallelMode.DISTRIBUTED
[1;38;5;39mCOMET INFO:[0m     args/past_index                         : -1
[1;38;5;39mCOMET INFO:[0m     args/per_device_eval_batch_size         : 8
[1;38;5;39mCOMET INFO:[0m     args/per_device_train_batch_size        : 6
[1;38;5;39mCOMET INFO:[0m     args/per_gpu_eval_batch_size            : None
[1;38;5;39mCOMET INFO:[0m     args/per_gpu_train_batch_size           : None
[1;38;5;39mCOMET INFO:[0m     args/place_model_on_device              : True
[1;38;5;39mCOMET INFO:[0m     args/prediction_loss_only               : False
[1;38;5;39mCOMET INFO:[0m     args/process_index                      : 0
[1;38;5;39mCOMET INFO:[0m     args/

[1;38;5;39mCOMET INFO:[0m     config/median_filter_width              : 7
[1;38;5;39mCOMET INFO:[0m     config/min_length                       : 0
[1;38;5;39mCOMET INFO:[0m     config/model_type                       : whisper
[1;38;5;39mCOMET INFO:[0m     config/name_or_path                     : openai/whisper-small
[1;38;5;39mCOMET INFO:[0m     config/no_repeat_ngram_size             : 0
[1;38;5;39mCOMET INFO:[0m     config/num_beam_groups                  : 1
[1;38;5;39mCOMET INFO:[0m     config/num_beams                        : 1
[1;38;5;39mCOMET INFO:[0m     config/num_hidden_layers                : 12
[1;38;5;39mCOMET INFO:[0m     config/num_labels                       : 2
[1;38;5;39mCOMET INFO:[0m     config/num_mel_bins                     : 80
[1;38;5;39mCOMET INFO:[0m     config/num_return_sequences             : 1
[1;38;5;39mCOMET INFO:[0m     config/output_attentions                : False
[1;38;5;39mCOMET INFO:[0m     config/output_hidden_stat

In [13]:
experiment.end()

NameError: name 'experiment' is not defined

# Testing

In [17]:
# getting folder of the newest checkpoint
training_args = TrainingArguments(
    output_dir='./whisper', 
    overwrite_output_dir=True, 
    num_train_epochs=1,
    per_device_train_batch_size=6,
    save_steps=500, 
    save_total_limit=2,
    do_train=True,
)
checkpoint_path = max(os.listdir(training_args.output_dir), key=lambda x: int(x.split('-')[-1]) if 'checkpoint-' in x else 0)
checkpoint_path = os.path.join(training_args.output_dir, checkpoint_path)

ValueError: max() arg is an empty sequence

In [None]:
model = AutoModelForSpeechSeq2Seq.from_pretrained(checkpoint_path)

## Test dataset test

In [None]:
def predict(model, dataset: WhisperDataset) -> pd.DataFrame:
    predicted_df = pd.DataFrame([], columns=['filename', 'pred', 'gt'])
    for filepath, text, input_features, _, attention_mask in tqdm(test_dataset):
        filename = filepath.replace('\\', '/').split('/')[-1]
    
        input_features = torch.stack([input_features]).to('cuda')
        generated_ids = model.generate(inputs=input_features, attention_mask=attention_mask)
        transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    
        predicted_df.loc[len(predicted_df)] = [filename, transcription, text]
    return predicted_df

In [None]:
predicted_df = predict(model, valid_dataset)

In [None]:
predicted_df

In [24]:
# metric: accuracy
acc = sum((predicted_df['pred'] == predicted_df['gt'])) / len(predicted_df)
print(f'{acc * 100}%')

100.0%


## Custom sample test:

In [25]:
audio, sample_rate = librosa.load(os.path.join(DATASET_DIR, 'sample/eu.0124f456-13b8-4765-936a-36bfd483683e.wav'), sr=16000)

In [26]:
inputs = processor(audio, return_tensors='pt', sampling_rate=sample_rate)
input_features = inputs.input_features

In [27]:
generated_ids = model.generate(inputs=input_features)



In [28]:
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(transcription)

Новая
