In [1]:
import re
import json
import pandas as pd
from scipy.io import wavfile

import librosa
# libs para preparar customs datasets
from data_processor.dataset import MLS, CommonVoice
from data_processor.cleaner import CreateTidyDataset

# libs para conectar o custom dataset com a pipeline
import torch
from torch.utils.data import DataLoader

# Especifico de wav2vec
from datasets import load_metric
from transformers import (Wav2Vec2CTCTokenizer, 
                          Wav2Vec2FeatureExtractor, 
                          Wav2Vec2Processor, 
                          Wav2Vec2ForCTC,
                          TrainingArguments,
                          Trainer)

from core.utils import DataCollatorCTCWithPadding

In [2]:
mls = MLS(data_train_dir = "data/mls_portuguese/train", 
          data_test_dir  = "data/mls_portuguese/test",
          data_dev_dir   = "data/mls_portuguese/dev")

cov = CommonVoice(main_path = "data/common_voice/cv-corpus-7.0-2021-07-21/pt")

# databases = [(cov, False), (mls, True)]
databases = [(mls, False)]
tidy_dataset = CreateTidyDataset(databases)

In [3]:
tidy_dataset.converter_audio()

Base 0:


In [4]:
# pega dataset do modelo
train_df, test_df = tidy_dataset.parse_datasets()

# seleciona o vocabolario do modelo
regex = '[\,\?\.\!\-\;\:\"\'\“\&\«\´\»\”\ü\-]'
vocab = set(re.sub(regex, ' ', train_df["text"].str.cat(sep='').lower(), count=0, flags=0))
vocab.update({"[UNK]","[PAD]"})
vocab_dict = {v: k for k, v in enumerate(vocab)}
with open('vocab.json', 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)
    
train_df = train_df.assign(text = train_df.text.str.replace(regex, ' ', regex=True))
test_df = test_df.assign(text = test_df.text.str.replace(regex, ' ', regex=True))

# Mapeia os caracters para números
train_df = train_df.assign(mapped_text = train_df.text.apply(lambda text: list(map(vocab_dict.get, 
                                                                               list(text)))),
                           audio_time = train_df.file.apply(lambda path: librosa.get_duration(filename=path)))

test_df = test_df.assign(mapped_text = test_df.text.apply(lambda text: list(map(vocab_dict.get, 
                                                                            list(text)))),
                         audio_time = test_df.file.apply(lambda path: librosa.get_duration(filename=path)))

In [5]:
class Wav2vecDataset(torch.utils.data.Dataset):
    
    def __init__(self, df:pd.DataFrame):
        self.df = df
        self.max_size = len(self.df)
        
    def __getitem__(self, idx):
        
        if idx >= self.max_size:
            raise IndexError
            
        temp = self.df.loc[idx,["file", "mapped_text"]].to_dict()
        sr, audio_data = wavfile.read(temp["file"])
        return {"input_values": torch.tensor(audio_data).to(device), "labels": torch.tensor(temp["mapped_text"]).to(device)}

    def __len__(self):
        return self.max_size

    
train_dataset = Wav2vecDataset(train_df.nsmallest(20,"audio_time").reset_index(drop=True))
test_dataset = Wav2vecDataset(test_df.nsmallest(20,"audio_time").reset_index(drop=True))

In [6]:
tokenizer = Wav2Vec2CTCTokenizer("./vocab.json", 
                                 unk_token="[UNK]", 
                                 pad_token="[PAD]", 
                                 word_delimiter_token="|")

"""
É importante saber o sampling_rate do embedding onde os embeddings foram pré treinados.
"""
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, 
                                             sampling_rate=16000, 
                                             padding_value=0.0, 
                                             do_normalize=True, 
                                             return_attention_mask=False)

processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
wer_metric = load_metric("wer")

In [7]:
def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id

    pred_str = processor.batch_decode(pred_ids)
    # we do not want to group tokens when computing the metrics
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    wer = wer_metric.compute(predictions=pred_str, references=label_str)

    return {"wer": wer}

In [8]:
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-xlsr-53", 
                                       gradient_checkpointing=True, 
                                       ctc_loss_reduction="mean", 
                                       pad_token_id=processor.tokenizer.pad_token_id,)

device = "cuda" if torch.cuda.is_available() else "cpu"
"""
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)
"""

model.to(device)
# To not finetunning the CNN layers which extract acoustic features
model.freeze_feature_extractor()

Some weights of the model checkpoint at facebook/wav2vec2-large-xlsr-53 were not used when initializing Wav2Vec2ForCTC: ['quantizer.codevectors', 'project_hid.weight', 'project_q.weight', 'quantizer.weight_proj.weight', 'project_q.bias', 'quantizer.weight_proj.bias', 'project_hid.bias']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to u

In [9]:
training_args = TrainingArguments(
  output_dir="./results",
  gradient_checkpointing=True,
  group_by_length=True,
  per_device_train_batch_size=1,
  gradient_accumulation_steps=2,
  evaluation_strategy="steps",
  num_train_epochs=30,
  fp16=True,
  save_steps=500,
  eval_steps=500,
  logging_steps=500,
  learning_rate=1e-4,
  weight_decay=0.005,
  warmup_steps=1000,
  save_total_limit=2,)

In [10]:
trainer = Trainer(model=model,
                  data_collator=data_collator,
                  args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=train_dataset,
                  eval_dataset=test_dataset,
                  tokenizer=processor.feature_extractor,)

Using amp fp16 backend


In [11]:
trainer.train()

***** Running training *****
  Num examples = 20
  Num Epochs = 30
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 2
  Total optimization steps = 300
  tensor = as_tensor(value)


RuntimeError: Input type (torch.cuda.ShortTensor) and weight type (torch.cuda.HalfTensor) should be the same