In [1]:
%%capture
! pip install transformers
! pip install jiwer
! pip install --upgrade librosa

In [2]:
import os
import json
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

from kaggle_secrets import UserSecretsClient
from datetime import datetime

import librosa

import warnings
warnings.simplefilter('ignore')

In [3]:
from transformers import (
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor
) 

In [4]:
Config = {
    'audio_dir': '/kaggle/input/bengaliai-speech/train_mp3s',
    'model_name': 'facebook/wav2vec2-base',
    'lr': 3e-4,
    'wd': 1e-5,
    'T_0': 10,
    'T_mult': 2,
    'eta_min': 1e-6,
    'nb_epochs': 5,
    'train_bs': 16,
    'valid_bs': 16,
    'sampling_rate': 16000,
}

In [5]:
def read_audio(mp3_path, target_sr=16000):
    audio, sr = librosa.load(mp3_path, sr=32000)
    audio_array = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
    return audio_array

def construct_vocab(texts):
    all_text = " ".join(texts)
    vocab = list(set(all_text))
    return vocab


def save_vocab(dataframe):
    vocab = construct_vocab(dataframe['sentence'].tolist())
    vocab_dict = {v: k for k, v in enumerate(vocab)}
    vocab_dict["__"] = vocab_dict[" "]
    _ = vocab_dict.pop(" ")
    vocab_dict["[UNK]"] = len(vocab_dict)
    vocab_dict["[PAD]"] = len(vocab_dict)

    with open('vocab.json', 'w') as fl:
        json.dump(vocab_dict, fl)

    print("Created Vocab file!")

In [6]:
class ASRDataset(Dataset):
    def __init__(self, df, config, is_test=False):
        self.df = df
        self.config = config
        self.is_test = is_test
    
    def __getitem__(self, idx):
        # First read and pre-process the audio file
        audio = read_audio(self.df.loc[idx]['path'])
        audio = processor(
            audio, 
            sampling_rate=self.config['sampling_rate']
        ).input_values[0]
        
        if self.is_test:
            return {'audio': audio, 'label': -1}
        else:
            # If we are training/validating, also process the labels (actual sentences)
            with processor.as_target_processor():
                labels = processor(self.df.loc[idx]['sentence']).input_ids
            return {'audio': audio, 'label': labels}
        
    def __len__(self):
        return len(self.df)
    
def ctc_data_collator(batch):
    input_features = [{"input_values": sample["audio"]} for sample in batch]
    label_features = [{"input_ids": sample["label"]} for sample in batch]
    batch = processor.pad(
        input_features,
        padding=True,
        return_tensors="pt",
    )
    with processor.as_target_processor():
        labels_batch = processor.pad(
            label_features,
            padding=True,
            return_tensors="pt",
        )
        
    labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
    batch["labels"] = labels
    return batch

In [7]:
def train_one_epoch(model, train_loader, optimizer, device='cuda:0'):
    model.train()
    pbar = tqdm(train_loader, total=len(train_loader))
    avg_loss = 0
    for data in pbar:
        data = {k: v.to(device) for k, v in data.items()}
        loss = model(**data).loss
        loss_itm = loss.item()
        
        avg_loss += loss_itm
        pbar.set_description(f"loss: {loss_itm:.4f}")
        
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        
    return avg_loss / len(train_loader)

@torch.no_grad()
def valid_one_epoch(model, valid_loader, device='cuda:0'):
    pbar = tqdm(valid_loader, total=len(valid_loader))
    avg_loss = 0
    for data in pbar:
        data = {k: v.to(device) for k, v in data.items()}
        loss = model(**data).loss
        loss_itm = loss.item()
        
        avg_loss += loss_itm
        pbar.set_description(f"val_loss: {loss_itm:.4f}")

    return avg_loss / len(valid_loader)

In [8]:
from transformers import SEWForCTC
df = pd.read_csv("/kaggle/input/bengaliai-speech/train.csv")

# Get a paths feature for reading in during dataloading
df['path'] = df['id'].apply(lambda x: os.path.join(Config['audio_dir'], x+'.mp3'))
train_df = df[df['split'] == 'train'].sample(frac=.001).reset_index(drop=True)
valid_df = df[df['split'] == 'valid'].sample(frac=.001).reset_index(drop=True)
print(f"Training on samples: {len(train_df)}, Validation on samples: {len(valid_df)}")

# Construct and save the vocab file
save_vocab(df)

# Init the tokenizer, feature_extractor, processor and model
tokenizer = Wav2Vec2CTCTokenizer(
    "./vocab.json", 
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="__"
)
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1, 
    sampling_rate=Config['sampling_rate'], 
    padding_value=0.0, 
    do_normalize=True, 
    return_attention_mask=False
)
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor, 
    tokenizer=tokenizer
)

model = SEWForCTC.from_pretrained("asapp/sew-tiny-100k-ft-ls100h",
    ctc_loss_reduction="mean", 
    ignore_mismatched_sizes=True,
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size = len(tokenizer),
)

# Freeze the feature encoder part since we won't be training it
model.to('cuda')
model.freeze_feature_encoder()
optimizer = torch.optim.AdamW(
    model.parameters(), 
    lr=Config['lr'], 
    weight_decay=Config['wd']
)

# Construct training and validation dataloaders
train_ds = ASRDataset(train_df, Config)
valid_ds = ASRDataset(valid_df, Config)

train_loader = DataLoader(
    train_ds, 
    batch_size=Config['train_bs'], 
    collate_fn=ctc_data_collator, 
)
valid_loader = DataLoader(
    valid_ds,
    batch_size=Config['valid_bs'],
    collate_fn=ctc_data_collator,
)

# Train the model
best_loss = float('inf')
for epoch in range(Config['nb_epochs']):
    print(f"{'='*40} Epoch: {epoch+1} / {Config['nb_epochs']} {'='*40}")
    train_loss = train_one_epoch(model, train_loader, optimizer)
    valid_loss = valid_one_epoch(model, valid_loader)
    print(f"train_loss: {train_loss:.4f}, valid_loss: {valid_loss:.4f}")

    if valid_loss < best_loss:
        best_loss = valid_loss
        torch.save(model.state_dict(), f"sew_base_bengaliAI.pt")
        print(f"Saved the best model so far with val_loss: {valid_loss:.4f}")

Training on samples: 934, Validation on samples: 30
Created Vocab file!


Downloading config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/163M [00:00<?, ?B/s]

Some weights of SEWForCTC were not initialized from the model checkpoint at asapp/sew-tiny-100k-ft-ls100h and are newly initialized because the shapes did not match:
- lm_head.bias: found shape torch.Size([32]) in the checkpoint and torch.Size([89]) in the model instantiated
- lm_head.weight: found shape torch.Size([32, 512]) in the checkpoint and torch.Size([89, 512]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

train_loss: 5.7335, valid_loss: 4.8140
Saved the best model so far with val_loss: 4.8140


  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

train_loss: 4.8638, valid_loss: 4.7757
Saved the best model so far with val_loss: 4.7757


  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

train_loss: 4.8257, valid_loss: 4.7656
Saved the best model so far with val_loss: 4.7656


  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

train_loss: 4.9194, valid_loss: 4.8257


  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

train_loss: 4.8222, valid_loss: 4.7693
