In [7]:
import os
import torch
import jsonlines
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from datasets import DatasetDict, Audio, load_from_disk, concatenate_datasets
from transformers.models.whisper.english_normalizer import BasicTextNormalizer
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [2]:
# Resolve the directory layout by walking upward from the notebook's working directory.
cur_dir = os.getcwd()
src_dir = os.path.split(cur_dir)[0]
# til_dir is two levels above src_dir; home_dir is one level above til_dir.
til_dir = os.path.split(os.path.split(src_dir)[0])[0]
home_dir = os.path.split(til_dir)[0]

# test_dir is <home_dir>/novice and audio_dir is its 'audio' subfolder.
test_dir = os.path.join(home_dir, 'novice')
audio_dir = os.path.join(test_dir, 'audio')

# Locations next to the notebook (data, config) and the model folder under src_dir.
data_dir = os.path.join(cur_dir, 'data')
model_path = os.path.join(src_dir, "models", "whisper")
config_path = os.path.join(cur_dir, "config.yaml")

# paths for converting datasets to manifest files
train_path, val_path, test_path = (
    os.path.join(data_dir, split_name)
    for split_name in ('train_data', 'val_data', 'test_data')
)

In [3]:
# # Define variables directly instead of parsing from command line
# model_name = 'openai/whisper-small'
# language = 'English'
# sampling_rate = 16000
# num_proc = 2
# train_strategy = 'steps'
# learning_rate = 1.75e-5
# warmup = 20000
# train_batchsize = 48
# eval_batchsize = 32
# num_epochs = 20
# num_steps = 100000
# resume_from_ckpt = None
# output_dir = './test'
# train_datasets = [train_path]  # Add your paths
# eval_datasets = [val_path]  # Add your paths

# # Check train strategy validity
# if train_strategy not in ['steps', 'epoch']:
#     raise ValueError('The train strategy should be either steps or epoch.')

# print('\n\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n\n')
# print('ARGUMENTS OF INTEREST:')
# args = {
#     'model_name': model_name,
#     'language': language,
#     'sampling_rate': sampling_rate,
#     'num_proc': num_proc,
#     'train_strategy': train_strategy,
#     'learning_rate': learning_rate,
#     'warmup': warmup,
#     'train_batchsize': train_batchsize,
#     'eval_batchsize': eval_batchsize,
#     'num_epochs': num_epochs,
#     'num_steps': num_steps,
#     'resume_from_ckpt': resume_from_ckpt,
#     'output_dir': output_dir,
#     'train_datasets': train_datasets,
#     'eval_datasets': eval_datasets,
# }
# print(args)
# print('\n\n+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n\n')

# gradient_checkpointing = True
# freeze_feature_encoder = False
# freeze_encoder = False

# do_normalize_eval = True
# do_lower_case = False
# do_remove_punctuation = False
# normalizer = BasicTextNormalizer()

# #############################       MODEL LOADING       #####################################

# feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)
# tokenizer = WhisperTokenizer.from_pretrained(model_name, language=language, task="transcribe")
# processor = WhisperProcessor.from_pretrained(model_name, language=language, task="transcribe")
# model = WhisperForConditionalGeneration.from_pretrained(model_name)

# if model.config.decoder_start_token_id is None:
#     raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")

# if freeze_feature_encoder:
#     model.freeze_feature_encoder()

# if freeze_encoder:
#     model.freeze_encoder()
#     model.model.encoder.gradient_checkpointing = False

# model.config.forced_decoder_ids = None
# model.config.suppress_tokens = []

# if gradient_checkpointing:
#     model.config.use_cache = False

# ############################        DATASET LOADING AND PREP        ##########################

# def load_custom_dataset(split):
#     ds = []
#     if split == 'train':
#         for dset in train_datasets:
#             ds.append(load_from_disk(dset))
#     elif split == 'eval':
#         for dset in eval_datasets:
#             ds.append(load_from_disk(dset))

#     ds_to_return = concatenate_datasets(ds)
#     ds_to_return = ds_to_return.shuffle(seed=22)
#     return ds_to_return

# def prepare_dataset(batch):
#     # load and (possibly) resample audio data to 16kHz
#     audio = batch["audio"]

#     # compute log-Mel input features from input audio array 
#     batch["input_features"] = processor.feature_extractor(audio["array"], sampling_rate=audio["sampling_rate"]).input_features[0]
#     # compute input length of audio sample in seconds
#     batch["input_length"] = len(audio["array"]) / audio["sampling_rate"]
    
#     # optional pre-processing steps
#     transcription = batch["sentence"]
#     if do_lower_case:
#         transcription = transcription.lower()
#     if do_remove_punctuation:
#         transcription = normalizer(transcription).strip()
    
#     # encode target text to label ids
#     batch["labels"] = processor.tokenizer(transcription).input_ids
#     return batch

# max_label_length = model.config.max_length
# min_input_length = 0.0
# max_input_length = 30.0

# def is_in_length_range(length, labels):
#     return min_input_length < length < max_input_length and 0 < len(labels) < max_label_length

# print('DATASET PREPARATION IN PROGRESS...')
# raw_dataset = DatasetDict()
# raw_dataset["train"] = load_custom_dataset('train')
# raw_dataset["eval"] = load_custom_dataset('eval')

# raw_dataset = raw_dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))
# raw_dataset = raw_dataset.map(prepare_dataset, num_proc=num_proc)

# raw_dataset = raw_dataset.filter(
#     is_in_length_range,
#     input_columns=["input_length", "labels"],
#     num_proc=num_proc,
# )

# ###############################     DATA COLLATOR AND METRIC DEFINITION     ########################

# @dataclass
# class DataCollatorSpeechSeq2SeqWithPadding:
#     processor: Any

#     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
#         # split inputs and labels since they have to be of different lengths and need different padding methods
#         # first treat the audio inputs by simply returning torch tensors
#         input_features = [{"input_features": feature["input_features"]} for feature in features]
#         batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

#         # get the tokenized label sequences
#         label_features = [{"input_ids": feature["labels"]} for feature in features]
#         # pad the labels to max length
#         labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

#         # replace padding with -100 to ignore loss correctly
#         labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

#         # if the bos token was prepended in the previous tokenization step,
#         # cut it here, as it is prepended again later anyway
#         if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
#             labels = labels[:, 1:]

#         batch["labels"] = labels

#         return batch

# data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
# print('DATASET PREPARATION COMPLETED')

# metric = evaluate.load("wer")

# def compute_metrics(pred):
#     pred_ids = pred.predictions
#     label_ids = pred.label_ids

#     # replace -100 with the pad_token_id
#     label_ids[label_ids == -100] = processor.tokenizer.pad_token_id

#     # we do not want to group tokens when computing the metrics
#     pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
#     label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

#     if do_normalize_eval:
#         pred_str = [normalizer(pred) for pred in pred_str]
#         label_str = [normalizer(label) for label in label_str]

#     wer = 100 * metric.compute(predictions=pred_str, references=label_str)
#     return {"wer": wer}

# ###############################     TRAINING ARGS AND TRAINING      ############################

# if train_strategy == 'epoch':
#     training_args = Seq2SeqTrainingArguments(
#         output_dir=output_dir,
#         per_device_train_batch_size=train_batchsize,
#         gradient_accumulation_steps=1,
#         learning_rate=learning_rate,
#         warmup_steps=warmup,
#         gradient_checkpointing=gradient_checkpointing,
#         fp16=True,
#         evaluation_strategy="epoch",
#         save_strategy="epoch",
#         num_train_epochs=num_epochs,
#         save_total_limit=10,
#         per_device_eval_batch_size=eval_batchsize,
#         predict_with_generate=True,
#         generation_max_length=225,
#         logging_steps=500,
#         report_to=["tensorboard"],
#         load_best_model_at_end=True,
#         metric_for_best_model="wer",
#         greater_is_better=False,
#         optim="adamw_bnb_8bit",
#         resume_from_checkpoint=resume_from_ckpt,
#     )

# elif train_strategy == 'steps':
#     training_args = Seq2SeqTrainingArguments(
#         output_dir=output_dir,
#         per_device_train_batch_size=train_batchsize,
#         gradient_accumulation_steps=1,
#         learning_rate=learning_rate,
#         warmup_steps=warmup,
#         gradient_checkpointing=gradient_checkpointing,
#         fp16=True,
#         evaluation_strategy="steps",
#         eval_steps=1000,
#         save_strategy="steps",
#         save_steps=1000,
#         max_steps=num_steps,
#         save_total_limit=10,
#         per_device_eval_batch_size=eval_batchsize,
#         predict_with_generate=True,
#         generation_max_length=225,
#         logging_steps=500,
#         report_to=["tensorboard"],
#         load_best_model_at_end=True,
#         metric_for_best_model="wer",
#         greater_is_better=False,
#         optim="adamw_bnb_8bit",
#         resume_from_checkpoint=resume_from_ckpt,
#     )

# trainer = Seq2SeqTrainer(
#     args=training_args,
#     model=model,
#     train_dataset=raw_dataset["train"],
#     eval_dataset=raw_dataset["eval"],
#     data_collator=data_collator,
#     compute_metrics=compute_metrics,
#     tokenizer=processor.feature_extractor,
# )

# processor.save_pretrained(training_args.output_dir)

# print('TRAINING IN PROGRESS...')
# trainer.train()
# print('DONE TRAINING')


In [5]:
# Checkpoint identifier shared by every from_pretrained call in this cell.
model_name = 'openai/whisper-small'

# Preprocessing components: the feature extractor on its own, then the tokenizer
# and the combined processor, both configured for English transcription.
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_name)
tokenizer = WhisperTokenizer.from_pretrained(
    model_name,
    language="English",
    task="transcribe",
)
processor = WhisperProcessor.from_pretrained(
    model_name,
    language="English",
    task="transcribe",
)

# The pretrained sequence-to-sequence model itself.
model = WhisperForConditionalGeneration.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# Directory passed to the training arguments as `output_dir`. It is assigned
# here so that this cell runs on a fresh kernel instead of relying on a cell
# further down the notebook having been executed first.
output_dir = './test'

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,  # Set batch size to 1 to train on individual examples
    gradient_accumulation_steps=1,  # Set gradient accumulation steps to 1
    logging_steps=500,
    save_steps=1000,
    evaluation_strategy="steps",
    eval_steps=1000,
    max_steps=100000,  # takes precedence over num_train_epochs (see the message printed below)
)

# Create trainer
# NOTE(review): no datasets, data collator or metric function are supplied here;
# the only use visible in this notebook is the later `trainer.evaluate(...)` call.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
)

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Define paths to your batch files.
# os.listdir returns entries in arbitrary order, so the list is sorted to make
# the choice of held-out file below reproducible from run to run.
batch_files = sorted(
    os.path.join(data_dir, file) for file in os.listdir(data_dir) if file.endswith(".jsonl")
)
output_dir = './test'

# Hold out the last batch file (in sorted order) for testing; pop() removes it
# from batch_files and raises IndexError if no .jsonl file was found.
test_file = batch_files.pop()

In [19]:
import os
import jsonlines
import torch
from torch.utils.data import Dataset, DataLoader

# Define your custom dataset class
class CustomDataset(Dataset):
    """Dataset whose items are the entries of ``batch_files`` themselves.

    Nothing is opened or parsed here: indexing hands back the stored entry
    unchanged, and the caller decides how to read it.
    """

    def __init__(self, batch_files):
        # Keep a reference to the caller's sequence (no copy is made).
        self.batch_files = batch_files

    def __len__(self):
        """Return how many entries the dataset holds."""
        return len(self.batch_files)

    def __getitem__(self, idx):
        """Return the entry stored at position ``idx``."""
        return self.batch_files[idx]

# Define hyperparameters and settings
batch_size = 1  # One batch-file path per DataLoader step
num_epochs = 5
# NOTE(review): far larger than the 1.75e-5 used in the commented-out
# configuration earlier in this notebook - confirm this is intended.
learning_rate = 0.001

# Define paths to your batch files.
# The held-out `test_file` (chosen in the earlier hold-out cell) is excluded so
# the model is never trained on the file it is later evaluated on, and the list
# is sorted because os.listdir returns entries in arbitrary order.
batch_files = sorted(
    os.path.join(data_dir, file)
    for file in os.listdir(data_dir)
    if file.endswith(".jsonl") and os.path.join(data_dir, file) != test_file
)

# Create your custom dataset and data loader
train_dataset = CustomDataset(batch_files)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Define your loss function and optimizer
# NOTE(review): MSELoss compares two tensors element-wise; it is presumably not
# the right objective for token-level transcription targets - confirm before use.
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
# Each record's precomputed `input_features` and `labels` are fed to the model,
# which returns the training loss itself when labels are supplied, so no
# separate criterion is applied here.
model.train()
# Build tensors on whatever device the model parameters currently live on.
device = next(model.parameters()).device
# The model prepends this token itself when deriving decoder inputs from labels.
start_token_id = model.config.decoder_start_token_id

for epoch in range(num_epochs):
    epoch_loss = 0.0
    num_examples = 0

    # The DataLoader collates string items into a list of paths per step
    # (length `batch_size`), so unpack it before opening any file.
    for batch_paths in train_loader:
        for batch_file in batch_paths:
            # Load data from JSONL file
            with jsonlines.open(batch_file) as reader:
                for obj in reader:
                    # presumably the labels came straight from the tokenizer and so
                    # already begin with the start token - drop it to avoid a duplicate
                    # (TODO confirm against how the batch files were written)
                    label_ids = obj['labels']
                    if label_ids and label_ids[0] == start_token_id:
                        label_ids = label_ids[1:]

                    # Convert data to tensors and add a leading batch dimension
                    # (assumes one example per record - TODO confirm)
                    input_features = torch.tensor(
                        obj['input_features'], dtype=torch.float32, device=device
                    ).unsqueeze(0)
                    labels = torch.tensor(label_ids, dtype=torch.long, device=device).unsqueeze(0)

                    # Forward pass; the loss is computed inside the model from `labels`
                    outputs = model(input_features=input_features, labels=labels)
                    loss = outputs.loss

                    # Backpropagation
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                    epoch_loss += loss.item()
                    num_examples += 1

    # Print the mean loss after each epoch (guarding against an epoch with no examples)
    if num_examples:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss / num_examples:.4f}')
    else:
        print(f'Epoch [{epoch+1}/{num_epochs}], no training examples found')

KeyboardInterrupt: 

In [18]:
import jsonlines

def load_file(file_path):
    """Return the first record of the JSON Lines file at ``file_path``.

    Only the first record is read. ``StopIteration`` propagates to the caller
    when the file contains no records.
    """
    reader = jsonlines.open(file_path, 'r')
    try:
        return next(iter(reader))
    finally:
        # Always release the file handle, whether or not a record was read.
        reader.close()

# Peek at a single record to see which fields each example carries.
data = load_file('./data/batch_17.jsonl')
print(data.keys())

dict_keys(['audio', 'transcript', 'input_features', 'input_length', 'labels'])


In [None]:
# `load_jsonl` is not defined anywhere in this notebook, so the held-out file
# is read directly here, keeping only the two fields passed to the model.
start_token_id = model.config.decoder_start_token_id
test_records = []
with jsonlines.open(test_file) as reader:
    for record in reader:
        # presumably the labels already begin with the start token, which the
        # model prepends again itself - drop it if present (TODO confirm)
        label_ids = record['labels']
        if label_ids and label_ids[0] == start_token_id:
            label_ids = label_ids[1:]
        test_records.append({'input_features': record['input_features'], 'labels': label_ids})

# Pad every label sequence to a common length with -100 (the value ignored by
# the loss) so that examples can be stacked into batches.
# NOTE(review): assumes the trainer's default collation turns these lists into
# tensors and that all `input_features` share one shape - confirm.
longest = max((len(r['labels']) for r in test_records), default=0)
test_data = [
    {
        'input_features': r['input_features'],
        'labels': r['labels'] + [-100] * (longest - len(r['labels'])),
    }
    for r in test_records
]

test_results = trainer.evaluate(test_data)
print("Test Results:", test_results)