In [1]:
import os
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
import json
import torch
import torchaudio
import random
from torch.utils.data import IterableDataset, DataLoader
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
from tqdm import tqdm
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift
import pytorch_lightning as pl
from omegaconf import OmegaConf
import torchaudio.transforms as T
import jsonlines
from datasets import Dataset

print("CUDA available:", torch.cuda.is_available())
print("Number of GPUs:", torch.cuda.device_count())
print("GPU Name:", torch.cuda.get_device_name(0))

CUDA available: True
Number of GPUs: 1
GPU Name: Tesla T4


### Defining Directories

In [2]:
cur_dir = os.getcwd()
src_dir = os.path.dirname(cur_dir)
til_dir = os.path.dirname(os.path.dirname(src_dir))
home_dir = os.path.dirname(til_dir)
test_dir = os.path.join(home_dir, 'novice')
audio_dir = os.path.join(test_dir, 'audio')
data_dir = os.path.join(cur_dir, 'data')
model_path = os.path.join(src_dir, "models", "whisper")
config_path = os.path.join(cur_dir, "config.yaml")

# paths for converting datasets to manifest files
train_manifest_path = os.path.join(data_dir, 'train_final.json')
val_manifest_path = os.path.join(data_dir, 'val_final.json')
test_manifest_path = os.path.join(data_dir, 'test_final.json')

test_dir

'/home/jupyter/novice'

In [3]:
# # import json

# def validate_jsonl(file_path):
#     count = 0
#     with open(file_path, 'r') as f:
#         for i, line in enumerate(f, 1):
#             try:
#                 json.loads(line)
#                 count += 1
#             except json.JSONDecodeError as e:
#                 print(f"Error decoding JSON on line {i}: {e}")
#                 break
#     print(f"{count} files found")

# validate_jsonl('./data/train_final.json')
# validate_jsonl('./data/val_final.json')
# validate_jsonl('./data/test_final.json')

### Select subsets (if any) and find max length

In [4]:
def load_manifest_in_chunks(manifest_path, chunk_size):
    with open(manifest_path, 'r') as f:
        chunk = []
        for line in f:
            chunk.append(json.loads(line))
            if len(chunk) >= chunk_size:
                yield chunk
                chunk = []
        if chunk:
            yield chunk

### Define Augmentations

In [5]:
augmentations = Compose([
    AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
    TimeStretch(min_rate=0.8, max_rate=1.25, p=0.5),
    PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
])

## Define Dataset Class and Create Datasets

In [6]:
# class AudioDataset(IterableDataset):
#     def __init__(self, chunk, augmentations=None):
#         self.chunk = chunk
#         self.augmentations = augmentations

#     def __iter__(self):
#         for entry in self.chunk:
#             audio, sample_rate = torchaudio.load(entry['audio'])
#             audio = audio.squeeze(0).numpy()  # Convert tensor to numpy array and remove channel dimension
#             if self.augmentations:
#                 audio = self.augmentations(samples=audio, sample_rate=sample_rate)
#             audio = torch.tensor(audio).unsqueeze(0)  # Convert back to tensor and add channel dimension
#             yield {
#                 'input_values': audio,
#                 'labels': entry['transcription']
#             }

### Define Collate Function and Create Dataloaders

In [7]:
def collate_fn(batch):
    input_values = [chunk for item in batch for chunk in item['input_values']]
    labels = [item['labels'] for item in batch]
    input_values = torch.stack(input_values)
    labels = torch.tensor([processor.tokenizer.encode(label) for label in labels], dtype=torch.long)
    return {'input_values': input_values, 'labels': labels}

## Define Model Class and Load Model

In [8]:
class WhisperASRModel(pl.LightningModule):
    def __init__(self, model, processor, lr):
        super().__init__()
        self.model = model
        self.processor = processor
        self.lr = lr

    def forward(self, input_values, labels):
        return self.model(input_features=input_values, labels=labels)

    def training_step(self, batch, batch_idx):
        outputs = self(batch['input_values'], batch['labels'])
        loss = outputs.loss
        self.log('train_loss', loss)
        torch.cuda.empty_cache()
        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self(batch['input_values'], batch['labels'])
        val_loss = outputs.loss
        self.log('val_loss', val_loss)
        torch.cuda.empty_cache()
        return val_loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.lr)

    def save_model(self, save_path):
        self.model.save_pretrained(save_path)
        self.processor.save_pretrained(save_path)

### Setup and Train Model

In [9]:
# def get_dataloader(manifest_chunk, batch_size, shuffle, num_workers):
#     dataset = AudioDataset(manifest_chunk)
#     return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, collate_fn=custom_collate_fn)

# def train_and_evaluate(train_manifest_path, val_manifest_path, test_manifest_path, processor, model, config, chunk_size=50):
#     whisper_asr_model = WhisperASRModel(model, processor, config.optim.lr)
#     trainer = pl.Trainer(**config.trainer)

#     for epoch in range(config.trainer.max_epochs):
#         print(f"Epoch {epoch + 1}/{config.trainer.max_epochs}")

#         # Load training data in chunks
#         for train_chunk in load_manifest_in_chunks(train_manifest_path, chunk_size):
#             train_loader = get_dataloader(train_chunk, config.model.train_ds.batch_size, config.model.train_ds.shuffle, config.model.train_ds.num_workers)
#             trainer.fit(whisper_asr_model, train_dataloaders=train_loader)

#         # Validate
#         for val_chunk in load_manifest_in_chunks(val_manifest_path, chunk_size):
#             val_loader = get_dataloader(val_chunk, config.model.validation_ds.batch_size, config.model.validation_ds.shuffle, config.model.validation_ds.num_workers)
#             trainer.validate(whisper_asr_model, val_dataloaders=val_loader)

#     # Test
#     for test_chunk in load_manifest_in_chunks(test_manifest_path, chunk_size):
#         test_loader = get_dataloader(test_chunk, config.model.test_ds.batch_size, config.model.test_ds.shuffle, config.model.test_ds.num_workers)
#         trainer.test(whisper_asr_model, test_dataloaders=test_loader)

In [10]:
def load_and_preprocess_audio(file_path, chunk_size, target_length=3000, augmentations=None):
    waveform, sample_rate = torchaudio.load(os.path.join(audio_dir, file_path))
    waveform = waveform.numpy()  # Convert to NumPy array
    if augmentations:
        waveform = augmentations(waveform, sample_rate=sample_rate)
    waveform = torch.tensor(waveform)  # Convert back to tensor

    transform = T.MelSpectrogram(sample_rate=sample_rate, n_mels=80)
    mel_spec = transform(waveform)

    # Calculate the number of chunks
    num_chunks = (mel_spec.shape[-1] + chunk_size - 1) // chunk_size

    chunks = []
    for i in range(num_chunks):
        start_idx = i * chunk_size
        end_idx = start_idx + chunk_size
        chunk = mel_spec[:, :, start_idx:end_idx]

        # Pad the chunk to the target length
        if chunk.shape[-1] < target_length:
            pad = target_length - chunk.shape[-1]
            chunk = torch.nn.functional.pad(chunk, (0, pad))

        chunks.append(chunk[:, :, :target_length])

    return chunks

class AudioDataset(Dataset):
    def __init__(self, audio_data, chunk_size, target_length=3000, augmentations=None):
        self.audio_data = audio_data
        self.chunk_size = chunk_size
        self.target_length = target_length
        self.augmentations = augmentations

    def __len__(self):
        return len(self.audio_data)

    def __getitem__(self, idx):
        item = self.audio_data[idx]
        file_path = item['audio'][0]
        transcription = item['transcript'][0]
        chunks = load_and_preprocess_audio(file_path, self.chunk_size, self.target_length, self.augmentations)
        return {'input_values': chunks, 'labels': transcription}

In [11]:
def get_dataloader(audio_data, processor, batch_size, num_workers, chunk_size, target_length, augmentations=None):
    dataset = AudioDataset(audio_data, chunk_size, target_length, augmentations)
    return DataLoader(dataset, batch_size=batch_size, num_workers=num_workers, collate_fn=collate_fn)


def train_and_evaluate(train_dataset, val_dataset, test_dataset, processor, model, config, chunk_size=100, target_length=3000, augmentations=None):
    whisper_asr_model = WhisperASRModel(model, processor, config.optim.lr)
    
    trainer = pl.Trainer(**config.trainer)

    train_loader = get_dataloader(train_dataset, processor, config.model.train_ds.batch_size, num_workers=config.model.train_ds.num_workers, chunk_size=chunk_size, target_length=target_length, augmentations=augmentations)
    val_loader = get_dataloader(val_dataset, processor, config.model.validation_ds.batch_size, num_workers=config.model.validation_ds.num_workers, chunk_size=chunk_size, target_length=target_length, augmentations=augmentations)
    test_loader = get_dataloader(test_dataset, processor, config.model.test_ds.batch_size, num_workers=config.model.test_ds.num_workers, chunk_size=chunk_size, target_length=target_length, augmentations=augmentations)

    trainer.fit(whisper_asr_model, train_dataloaders=train_loader, val_dataloaders=val_loader)
    trainer.test(whisper_asr_model, dataloaders=test_loader)

    # Save the model after training
    save_path = "../models/whisper_trained"
    whisper_asr_model.save_model(save_path)

In [12]:
# Load model

device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_path, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True)
model.to(device)
processor = AutoProcessor.from_pretrained(model_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
max_length = 220000

MAX_FILE_COUNT = None # Set if only want max files

data = {'key': [], 'audio': [], 'transcript': []}
data_path = os.path.join(test_dir, "asr.jsonl")
with jsonlines.open(data_path) as reader:
    for obj in reader:
        # for key, value in obj.items():
        #     print(value)
        if MAX_FILE_COUNT and len(data['key']) >= MAX_FILE_COUNT:
            break
        for key, value in obj.items():
            data[key].append(value)

# Convert to a Hugging Face dataset
dataset = Dataset.from_dict(data)

# Shuffle the dataset
dataset = dataset.shuffle(seed=42)

# Split the dataset into training, validation, and test sets
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset = dataset.select(range(train_size))
val_dataset = dataset.select(range(train_size, train_size + val_size))
test_dataset = dataset.select(range(train_size + val_size, train_size + val_size + test_size))

In [14]:
### Update config
config = OmegaConf.load(config_path)

config.model.train_ds.manifest_filepath = train_manifest_path
config.model.validation_ds.manifest_filepath = val_manifest_path
config.model.test_ds.manifest_filepath = test_manifest_path

train_and_evaluate(train_dataset, val_dataset, test_dataset, processor, model, config, chunk_size=10, augmentations=augmentations)

  rank_zero_warn(
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                            | Params
----------------------------------------------------------
0 | model | WhisperForConditionalGeneration | 394 M 
----------------------------------------------------------
392 M     Trainable params
1.5 M     Non-trainable params
394 M     Total params
1,577.501 Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


OutOfMemoryError: CUDA out of memory. Tried to allocate 8.85 GiB (GPU 0; 14.57 GiB total capacity; 6.54 GiB already allocated; 7.35 GiB free; 6.97 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF