# Install dependencies

In [None]:
#!pip install -U git+https://github.com/huggingface/accelerate.git

# Imports

In [1]:
import os
import numpy as np
import pandas as pd

import torch
import torchaudio
from torch.utils.data import Dataset
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
from transformers import Trainer, TrainingArguments
import librosa # for audio reading
from tqdm import tqdm

from utils import clean_text

comet_ml is installed but `COMET_API_KEY` is not set.


[2023-09-04 19:16:52,898] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


# Constants

WHISPER_MODEL examples:

* 1) openai/whisper-small (suitable for testing functionality, not very accurate on sentences, but capable of recognizing individual words or phrases. Requires low computational resources)
* 2) openai/whisper-medium (recommended medium model)
* 3) openai/whisper-large (sufficiently accurate on large sentences, but requires significant computational resources)
* 4) openai/whisper-large-v2 (sufficiently accurate on large sentences, but requires significant computational resources)
* 5) lorenzoncina/whisper-medium-ru (a model finetuned on the Russian language - recommended for training on Russian)

In [2]:
os.environ['WANDB_DISABLED'] = 'true' # disable logging of wandb

WHISPER_MODEL = 'openai/whisper-small'
DATASET_DIR = '/kaggle/input/it-spectrum-dataset/'
VAL_PERCENT, TEST_PERCENT = 0.05, 0.2 # dataset is divided into train, validation and test according to these values multiplied by 100%. TRAIN_PERCENT = 1 - (VAL_PERCENT + TEST_PERCENT)
TRAINING_ARGS = TrainingArguments(
    output_dir='./whisper', # the directory to save checkpoints
    overwrite_output_dir=True, # overwrite output directory if exists
    num_train_epochs=10, # number of epochs. One epoch is a single pass through the entire dataset. The number of epochs to use depends on the size of the dataset. Too many epochs can lead to overfitting, which can be detected by monitoring the validation loss during training. Too few epochs can result in underfitting, which can be identified by a consistently "sharp" decrease in the loss.
    per_device_train_batch_size=2, # The batch size per iteration on one GPU. It is ideally in the form of a power of two (2, 4, 8), but should not exceed 64 (using larger batch sizes can lead to worse results from the optimizer)
    save_steps=500, # save checkpoint each X iterations
    save_total_limit=2, # maximum number of checkpoints in a folder, where older checkpoints are deleted when new ones are saved
    do_train=True,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


# Whisper initializing

In [3]:
processor = AutoProcessor.from_pretrained(WHISPER_MODEL)
model = AutoModelForSpeechSeq2Seq.from_pretrained(WHISPER_MODEL).to('cuda')

In [4]:
# setting the model's language and defining the task of transcription
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="tatar", task="transcribe")

# Data initializing

Training dataset. When indexed, it returns a list containing:

* 1) filepath - path to the audio
* 2) text - transcribed text by annotators
* 3) input_features - audio features for prediction
* 4) labels - transcribed text by annotators converted into tokens
* 5) attention mask - an attention mask where each element indicates whether the model should pay attention to the token corresponding to the same index in the labels list.

In [5]:
class WhisperDataset(Dataset):
    def __init__(self, audio_dir: str, processor, only_char=True):
        self.audio_dir = audio_dir
        df = pd.read_csv(audio_dir[:-1] + '.csv', index_col='id')
        self.data = {}
        counter = 0
        for row in df.itertuples():
            self.data[counter] = {
                'text': str(row[0]) + '.txt',
                'audio': str(row[0]) + '.wav'
            }
            counter += 1
        self.len = counter
        self.only_char = only_char
        del counter
        del df
        self.processor = processor

    def __len__(self):
        return self.len
    
    def _get_audio_sample_path(self, index):
        return self.audio_dir + self.data[index]['audio']
    
    def _get_audio_sample_label(self, index):
        label_path = self.audio_dir + self.data[index]['text']
        with open(label_path, 'r') as f:
            label = clean_text(f.read()) if self.only_char else f.read() # Не учитывает ошибки в заполнение .txt
        return label

    def __getitem__(self, idx):
        filepath = self._get_audio_sample_path(idx)
        text = self._get_audio_sample_label(idx)
        
        # this is the sample rate, which represents audio frequency. Whisper models are pretrained on a sample rate of 16000, so it's recommended not to change this value.
        #audio, sample_rate = torchaudio.load(filepath)
        audio, _ = librosa.load(filepath, sr=16000)
        
        tokenized = self.processor.tokenizer(
            text, return_tensors='pt', padding='max_length', return_attention_mask=True, 
            max_length=model.config.max_length
        )
        
        labels, attention_mask = tokenized['input_ids'][0], tokenized['attention_mask'][0]
        
        input_features = self.processor(audio, return_tensors="pt", sampling_rate=16000).input_features[0]
        
        return [filepath, text, input_features, labels, attention_mask]

In [6]:
# create train/val/test datasets
train_dataset = WhisperDataset('../tatar_tts/train/', processor)
eval_dataset = WhisperDataset('../tatar_tts/valid/', processor)

In [7]:
len(train_dataset), len(eval_dataset)

(65891, 9691)

# Utils

In [8]:
def predict(model, dataset: WhisperDataset) -> pd.DataFrame:
    '''используется только для составления прогнозов по набору данных Whisper. 
    Он принимает в качестве входных данных модель и набор данных и возвращает прогнозы в виде панд.
    Фрейм данных (т.е. таблица)'''
    predicted_df = pd.DataFrame([], columns=['filename', 'pred', 'gt'])
    for filepath, text, input_features, _, attention_mask in tqdm(dataset):
        filename = filepath.replace('\\', '/').split('/')[-1]
    
        input_features = torch.stack([input_features]).to('cuda')
        generated_ids = model.generate(inputs=input_features, attention_mask=attention_mask)
        transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    
        predicted_df.loc[len(predicted_df)] = [filename, transcription, text]
    return predicted_df

# Calculating current metric (not trained model)

In [None]:
predicted_df = predict(model, eval_dataset)

  1%|█▉                                                                                                                                | 145/9691 [00:58<57:38,  2.76it/s]

In [15]:
predicted_df

Unnamed: 0,filename,pred,gt
0,eu.c7125b33-46dd-4dcd-b0dd-b4370a7d3272.wav,Садовая.,Садовая
1,eu.518ad001-1e7f-4b1f-a03a-cb0e7ab213c0.wav,Центральное.,Центральная
2,eu.a610a47a-e238-44a9-bc51-a547f608ae12.wav,Центральное,Центральная
3,eu.33d50dea-014e-4ced-8eca-fda21dcba219.wav,Революции.,Революции
4,eu.1d1ff299-f454-4772-bfd9-680c2da6acd0.wav,Лесная,Лесная
5,eu.9a813c5e-4035-4170-b7fe-6324b76a38f7.wav,Зеленая,Зеленая
6,eu.98a6a38a-07b0-4b00-bebb-cfa853a60186.wav,Молодежная,Молодежная
7,eu.e907120a-a234-41cf-853d-5c53897f094b.wav,Советская.,Советская
8,eu.46de6638-3634-43f0-894d-03db84c2f763.wav,Советская.,Советская
9,eu.ab9ecaf2-0185-4679-9327-3673d59bcc69.wav,Молодежная.,Молодежная


In [16]:
acc = sum((predicted_df['pred'] == predicted_df['gt'])) / len(predicted_df)
print(f'{acc * 100}%')

0.0%


# Training

In [17]:
# function that transforms data after extracting it from the dataset. Here, the data_collate_fn simply reshapes the data.
def data_collate_fn(data_list):
    batch = len(data_list)
    data_numpy = np.array(data_list)
    input_features, labels, attention_mask = data_numpy[:, -3], data_numpy[:, -2], data_numpy[:, -1]
    return {'input_features': torch.stack(input_features.tolist()),
            'labels': torch.stack(labels.tolist()),
            'attention_mask': torch.stack(attention_mask.tolist())}

In [18]:
trainer = Trainer(
    model=model,
    args=TRAINING_ARGS,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collate_fn
)

In [19]:
trainer.train()

  data_numpy = np.array(data_list)


Step,Training Loss
500,0.0046


  data_numpy = np.array(data_list)


TrainOutput(global_step=830, training_loss=0.002763278443290549, metrics={'train_runtime': 716.1745, 'train_samples_per_second': 2.304, 'train_steps_per_second': 1.159, 'total_flos': 4.76165910528e+17, 'train_loss': 0.002763278443290549, 'epoch': 10.0})

# Testing

In [20]:
# getting folder of the newest checkpoint
checkpoint_path = max(os.listdir(TRAINING_ARGS.output_dir), key=lambda x: int(x.split('-')[-1]) if 'checkpoint-' in x else 0)
checkpoint_path = os.path.join(TRAINING_ARGS.output_dir, checkpoint_path)

In [21]:
model = AutoModelForSpeechSeq2Seq.from_pretrained(checkpoint_path).to('cuda')

## Test dataset test

In [22]:
predicted_df = predict(model, test_dataset)

100%|██████████| 11/11 [00:02<00:00,  3.82it/s]


In [23]:
predicted_df

Unnamed: 0,filename,pred,gt
0,eu.c7125b33-46dd-4dcd-b0dd-b4370a7d3272.wav,Садовая,Садовая
1,eu.518ad001-1e7f-4b1f-a03a-cb0e7ab213c0.wav,Центральная,Центральная
2,eu.a610a47a-e238-44a9-bc51-a547f608ae12.wav,Центральная,Центральная
3,eu.33d50dea-014e-4ced-8eca-fda21dcba219.wav,Революции,Революции
4,eu.1d1ff299-f454-4772-bfd9-680c2da6acd0.wav,Лесная,Лесная
5,eu.9a813c5e-4035-4170-b7fe-6324b76a38f7.wav,Зеленая,Зеленая
6,eu.98a6a38a-07b0-4b00-bebb-cfa853a60186.wav,Молодежная,Молодежная
7,eu.e907120a-a234-41cf-853d-5c53897f094b.wav,Советская,Советская
8,eu.46de6638-3634-43f0-894d-03db84c2f763.wav,Советская,Советская
9,eu.ab9ecaf2-0185-4679-9327-3673d59bcc69.wav,Молодежная,Молодежная


In [24]:
# metric: accuracy
acc = sum((predicted_df['pred'] == predicted_df['gt'])) / len(predicted_df)
print(f'{acc * 100}%')

100.0%


## Custom sample test:

In [25]:
audio, sample_rate = librosa.load(os.path.join(DATASET_DIR, 'sample/eu.0124f456-13b8-4765-936a-36bfd483683e.wav'), sr=16000)

In [26]:
inputs = processor(audio, return_tensors='pt', sampling_rate=sample_rate)
input_features = inputs.input_features.to('cuda')

In [27]:
generated_ids = model.generate(inputs=input_features)



In [28]:
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(transcription)

Новая
