In [None]:
# ## Step 1: Install Dependencies
# Run this in your terminal (inside a notebook code cell, use `%pip install ...` instead):
# ```bash
# pip install transformers datasets torchaudio librosa jiwer
# ```

In [None]:
# ## Step 2: Import Libraries
import json
import re

import librosa
import numpy as np
import torch
import torchaudio
from datasets import load_dataset, Dataset
from jiwer import wer
from transformers import (
    Trainer,
    TrainingArguments,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
)

from utils.log import logger  # Assuming you have a logger setup


In [None]:
# ## Step 3: Define Text Normalization for Turkish
# Turkish has specific characters and needs normalization for ASR.
# Punctuation and stray symbols stripped from transcripts.
chars_to_ignore = r'[,\?\.\!\-\;:"“%‘”�]'
# Turkish-specific lowercase letters folded to their ASCII look-alikes.
chars_to_mapping = {"ğ": "g", "ı": "i", "ö": "o", "ü": "u", "ş": "s", "ç": "c"}

# str.lower() turns "İ" (U+0130) into "i" followed by this combining mark,
# which would otherwise survive as a separate character in the transcript.
_COMBINING_DOT_ABOVE = "\u0307"

def normalize_text(text):
    """Normalize Turkish text for ASR.

    Lowercases the input, folds Turkish-specific letters to ASCII via
    ``chars_to_mapping``, removes the characters matched by
    ``chars_to_ignore`` and collapses whitespace runs to single spaces.

    Args:
        text: Raw transcription string.

    Returns:
        The normalized transcription with no leading/trailing whitespace.
    """
    # Drop the combining dot left behind by lowercasing dotted capital "İ",
    # so that "İstanbul" becomes "istanbul" rather than "i\u0307stanbul".
    text = text.lower().replace(_COMBINING_DOT_ABOVE, "").strip()
    for src, dst in chars_to_mapping.items():
        text = text.replace(src, dst)
    text = re.sub(chars_to_ignore, '', text)
    # Removing punctuation can leave doubled spaces; squeeze them.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [None]:
# ## Step 4: Prepare Dataset

# Load the Common Voice Turkish dataset and preprocess it.
logger.info("Loading Turkish dataset...")
# Train and validation splits are concatenated here and re-split later on.
dataset = load_dataset(
    "mozilla-foundation/common_voice_11_0",
    "tr",
    split="train+validation",
)

# Preprocess dataset
def prepare_dataset(batch):
    """Load one example's audio as 16 kHz mono and normalize its transcript.

    Args:
        batch: A single dataset example (dict-like) whose "path" entry points
            at an audio file and whose "sentence" entry holds the raw
            transcription.

    Returns:
        The same mapping with "speech" set to the resampled 1-D waveform and
        "sentence" replaced by its normalized form.
    """
    # torchaudio.load returns a (channels, frames) tensor by default.
    waveform, sampling_rate = torchaudio.load(batch["path"])
    # Average over the channel axis rather than squeeze(): squeeze() leaves
    # multi-channel audio 2-D, while the mean is a no-op for mono input.
    mono = waveform.mean(dim=0).numpy()
    batch["speech"] = librosa.resample(mono, orig_sr=sampling_rate, target_sr=16000)
    # Normalize transcription
    batch["sentence"] = normalize_text(batch["sentence"])
    return batch

logger.info("Preprocessing dataset...")
# Metadata fields that are dropped while mapping.
unused_columns = ["accent", "age", "client_id", "down_votes", "gender", "locale", "segment", "up_votes"]
dataset = dataset.map(prepare_dataset, remove_columns=unused_columns)

# 90% train / 10% validation, both sliced from the same seeded permutation.
shuffled_dataset = dataset.shuffle(seed=42)
split_index = int(len(dataset) * 0.9)
train_dataset = shuffled_dataset.select(range(split_index))
eval_dataset = shuffled_dataset.select(range(split_index, len(dataset)))

In [None]:
# ## Step 5: Create Vocabulary
# Create a vocabulary based on the training dataset.
# Every distinct character that occurs in the training transcripts.
vocab = set("".join(train_dataset["sentence"]))

# The word boundary is represented by "|" (the CTC tokenizer's word delimiter),
# so a literal " " must not get an id of its own; "|" is likewise excluded here
# so it is only added once, below. Ids stay contiguous from 0.
vocab_dict = {ch: idx for idx, ch in enumerate(sorted(vocab - {" ", "|"}))}
vocab_dict["|"] = len(vocab_dict)  # Space / word-delimiter token
vocab_dict["[UNK]"] = len(vocab_dict)  # Unknown token
vocab_dict["[PAD]"] = len(vocab_dict)  # Padding token

# Save vocab to a file (explicit encoding so the result is platform-independent)
with open("vocab.json", "w", encoding="utf-8") as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [None]:

# ## Step 6: Initialize Model and Processor
logger.info("Initializing Wav2Vec2 model and processor...")

# Build the tokenizer straight from the vocabulary written in Step 5, naming
# the special tokens exactly as they appear in vocab.json.
tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)
# The audio front-end settings still come from the pretrained checkpoint.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base")
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

# Size the CTC head to the custom vocabulary and tell the model which id is
# the padding/blank token; otherwise the checkpoint's own config values apply.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-base",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer),
)

In [None]:
# ## Step 7: Define Data Collator
def data_collator(batch):
    """Collate data for training.

    Args:
        batch: List of dataset examples, each providing "speech" (waveform)
            and "sentence" (normalized transcription).

    Returns:
        Dict with "input_values" (padded audio tensor) and "labels" (padded
        token-id tensor in which padding positions are set to -100).
    """
    speech = [item["speech"] for item in batch]
    transcripts = [item["sentence"] for item in batch]

    input_values = processor(speech, sampling_rate=16000, return_tensors="pt", padding=True).input_values

    encoded = processor.tokenizer(transcripts, return_tensors="pt", padding=True)
    # Mark padded positions with -100 so they are excluded from the CTC loss
    # instead of being scored as real pad-token targets.
    labels = encoded.input_ids.masked_fill(encoded.attention_mask.ne(1), -100)

    return {"input_values": input_values, "labels": labels}

In [None]:
# ## Step 8: Define Evaluation Metric (WER)
def compute_metrics(pred):
    """Compute Word Error Rate (WER) for evaluation.

    Args:
        pred: Evaluation output exposing ``predictions`` (logits) and
            ``label_ids`` (reference token ids).

    Returns:
        Dict with a single "wer" entry.
    """
    pred_ids = np.argmax(pred.predictions, axis=-1)

    # -100 marks ignored/padded label positions and is not a valid token id;
    # map it to the pad token so decoding drops it. np.where builds a new
    # array, leaving pred.label_ids untouched.
    label_ids = np.where(
        pred.label_ids == -100,
        processor.tokenizer.pad_token_id,
        pred.label_ids,
    )

    pred_str = processor.batch_decode(pred_ids)
    # References are plain token sequences: do not collapse repeated tokens.
    label_str = processor.batch_decode(label_ids, group_tokens=False)

    wer_score = wer(label_str, pred_str)
    return {"wer": wer_score}

In [None]:
# ## Step 9: Set Training Arguments
training_args = TrainingArguments(
    output_dir="./model/wav2vec",
    evaluation_strategy="steps",
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    gradient_accumulation_steps=2,
    save_steps=500,
    eval_steps=500,
    logging_steps=100,
    save_total_limit=2,
    # Mixed precision needs a CUDA device; fall back to fp32 when none is present.
    fp16=torch.cuda.is_available(),
    # The collator reads the "speech" and "sentence" columns, which are not
    # arguments of the model's forward(); keep the Trainer from dropping them.
    remove_unused_columns=False,
)

In [None]:
# ## Step 10: Initialize Trainer
trainer = Trainer(
    # What to train and how.
    model=model,
    args=training_args,
    # Data and batching.
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    # Evaluation metric and the tokenizer handed to the Trainer.
    compute_metrics=compute_metrics,
    tokenizer=processor.tokenizer,
)

In [None]:
# ## Step 11: Train the Model
# Log the start, then run the Trainer's training loop with the settings from Step 9.
logger.info("Starting fine-tuning...")
trainer.train()

In [None]:
# ## Step 12: Save the Fine-Tuned Model and Processor
logger.info("Saving fine-tuned model and processor...")
# Model weights and processor files go to the same directory.
for artifact in (model, processor):
    artifact.save_pretrained("./model/wav2vec")

logger.info("Fine-tuning complete!")