In [10]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from datasets import load_dataset
import librosa
import torch
from transformers import TrainingArguments, Trainer

In [13]:
model_name = "facebook/wav2vec2-large-xlsr-53"

# Load the Wav2Vec2 model for CTC (Connectionist Temporal Classification)
try:
    model = Wav2Vec2ForCTC.from_pretrained(model_name)
except Exception as e:
    print("Error loading model:", e)

# Load processor that combines feature extractor and tokenizer
try:
    processor = Wav2Vec2Processor.from_pretrained(model_name)
except Exception as e:
    print("Error loading processor:", e)

# Load the dataset (replace with the actual path or dataset name if needed)
# This assumes you have access to the "Sandalwood" dataset or similar
try:
    dataset = load_dataset("Sandalwood")
except Exception as e:
    print("Error loading dataset:", e)

# Example for checking the dataset structure
print(dataset)

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-xlsr-53 and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Error loading processor: Can't load tokenizer for 'facebook/wav2vec2-large-xlsr-53'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'facebook/wav2vec2-large-xlsr-53' is the correct path to a directory containing all relevant files for a Wav2Vec2CTCTokenizer tokenizer.
DatasetDict({
    train: Dataset({
        features: ['audio'],
        num_rows: 71
    })
})


In [16]:
def preprocess(batch):
    try:
        audio = batch["audio"]
        batch["input_values"] = processor(audio["array"], sampling_rate=16000).input_values[0]
        
        # Check if 'text' exists in batch
        if "text" in batch:
            batch["labels"] = processor.tokenizer(batch["text"]).input_ids  # Convert transcription to tokens
        else:
            print("Warning: 'text' not found in batch.")
            batch["labels"] = []  # Handle missing text case
        
    except Exception as e:
        print(f"Error in preprocessing: {e}")
    
    return batch

processed_dataset = dataset.map(preprocess, remove_columns=["audio", "text"])

ValueError: Column to remove ['text'] not in the dataset. Current columns in the dataset: ['audio']

In [None]:
training_args = TrainingArguments(
    output_dir="./wav2vec2-kannada",
    per_device_train_batch_size=4,
    evaluation_strategy="epoch",
    learning_rate=3e-4,
    weight_decay=0.005,
    logging_steps=10,
    save_steps=100,
    num_train_epochs=10,
    save_total_limit=2,
    fp16=True,
)

# Define the Trainer
trainer = Trainer(
    model=model,
    data_collator=lambda data: {"input_values": torch.tensor([f["input_values"] for f in data]),
                                "labels": torch.tensor([f["labels"] for f in data])},
    args=training_args,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["test"],
)

# Fine-tune the model
trainer.train()