In [1]:
pip install faster-whisper pandas tqdm


Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import pandas as pd
from tqdm import tqdm
from faster_whisper import WhisperModel


In [4]:
# Root folder of the local Common Voice download
base_path = r"C:\Users\rakes\Downloads\commonvoice"

# Walk the whole tree and print each .tsv file found, to confirm the layout
for folder, _subfolders, names in os.walk(base_path):
    tsv_names = [name for name in names if name.endswith(".tsv")]
    for tsv_name in tsv_names:
        print(os.path.join(folder, tsv_name))


C:\Users\rakes\Downloads\commonvoice\test\test.tsv
C:\Users\rakes\Downloads\commonvoice\train\train.tsv
C:\Users\rakes\Downloads\commonvoice\validation\validation.tsv


In [6]:
import pandas as pd

# Locations of the per-split metadata files
train_tsv = r"C:\Users\rakes\Downloads\commonvoice\train\train.tsv"
test_tsv = r"C:\Users\rakes\Downloads\commonvoice\test\test.tsv"
val_tsv = r"C:\Users\rakes\Downloads\commonvoice\validation\validation.tsv"

# Read the three tab-separated files (train, then test, then validation)
train_df, test_df, val_df = (
    pd.read_csv(tsv_file, sep="\t")
    for tsv_file in (train_tsv, test_tsv, val_tsv)
)

# Report each frame's shape as a quick sanity check that loading worked
for caption, frame in (
    ("Train set shape:", train_df),
    ("Test set shape:", test_df),
    ("Validation set shape:", val_df),
):
    print(caption, frame.shape)

train_df.head()


Train set shape: (2000, 8)
Test set shape: (400, 8)
Validation set shape: (400, 8)


Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent
0,ac5fea9cacdfa4a2d6291c780b0a0ee1c0f2c5d2389cc0...,common_voice_en_10110,I really liked the film we saw last week.,4,0,sixties,male,us
1,ac5fea9cacdfa4a2d6291c780b0a0ee1c0f2c5d2389cc0...,common_voice_en_10153,Please put maimi yajima's song onto Operación ...,3,0,sixties,male,us
2,0e7bca7f3243636599bd8e7bbe03b4f09ae8898bb0e16e...,common_voice_en_101622,Three men are painting a metal wall white.,3,0,twenties,male,indian
3,ac5fea9cacdfa4a2d6291c780b0a0ee1c0f2c5d2389cc0...,common_voice_en_10187,"Though this be madness, yet there is method in it",4,0,sixties,male,us
4,ac5fea9cacdfa4a2d6291c780b0a0ee1c0f2c5d2389cc0...,common_voice_en_10199,"As she watched, the cat washed his ears and th...",4,0,sixties,male,us


In [9]:
from datasets import Dataset
from transformers import Wav2Vec2Processor
import librosa
import numpy as np
import torch
import os

# Wrap each pandas DataFrame in a Hugging Face Dataset (train, test, validation)
train_dataset, test_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df, val_df)
)

# Feature extractor + tokenizer bundle for the wav2vec2 checkpoint
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

# Folder holding the audio clips of each split
train_clips_dir = r"C:\Users\rakes\Downloads\commonvoice\train\clips"
test_clips_dir = r"C:\Users\rakes\Downloads\commonvoice\test\clips"
val_clips_dir = r"C:\Users\rakes\Downloads\commonvoice\validation\clips"

def preprocess_audio(batch, split="train"):
    """Attach model input features and the transcript label to one example.

    Args:
        batch: A single dataset row; its "path" (clip name without extension)
            and "sentence" entries are read.
        split: "train" or "test" select the matching clips folder; any other
            value falls back to the validation clips folder.

    Returns:
        The same ``batch`` with "input_values" and "labels" filled in. Both
        are set to ``None`` when the clip's .wav file does not exist.
    """
    # Choose the clips folder for the requested split (validation is the fallback)
    clips_dir = (
        train_clips_dir if split == "train"
        else test_clips_dir if split == "test"
        else val_clips_dir
    )

    # The "path" column holds bare clip names, so the .wav extension is added here
    file_path = os.path.join(clips_dir, batch["path"] + ".wav")

    if os.path.exists(file_path):
        # Decode the clip at a 16 kHz sampling rate
        waveform, _ = librosa.load(file_path, sr=16000)

        # Run the processor and keep the single example from the returned batch
        features = processor(
            waveform, sampling_rate=16000, return_tensors="pt", padding="longest"
        )
        batch["input_values"] = features.input_values[0]
        batch["labels"] = batch["sentence"]
    else:
        # Missing clip: warn and mark the example as empty instead of failing
        print(f"⚠️ File not found: {file_path}")
        batch["input_values"] = None
        batch["labels"] = None

    return batch

# Smoke-test the preprocessing on the first ten training rows
def _preprocess_train_example(batch):
    """Run preprocess_audio on one row using the train clips folder."""
    return preprocess_audio(batch, split="train")

first_ten_rows = train_dataset.select(range(10))
small_train_dataset = first_ten_rows.map(
    _preprocess_train_example,
    remove_columns=train_dataset.column_names,
)

print("✅ Preprocessing complete! Example keys:", small_train_dataset[0].keys())


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

✅ Preprocessing complete! Example keys: dict_keys(['input_values', 'labels'])


In [11]:
# Show the feature length and transcript of the first three processed examples
for idx in range(3):
    example = small_train_dataset[idx]
    # input_values comes back as a plain list here, so len() is used rather than .shape
    print("Audio features length:", len(example["input_values"]))
    print("Transcript:", example["labels"])


Audio features length: 93904
Transcript: I really liked the film we saw last week.
Audio features length: 96592
Transcript: Please put maimi yajima's song onto Operación Bikini.
Audio features length: 83920
Transcript: Three men are painting a metal wall white.


In [34]:
# Make sure Whisper is installed
!pip install git+https://github.com/openai/whisper.git
!pip install ffmpeg-python

# Imports
import whisper
import os


Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to c:\users\rakes\appdata\local\temp\pip-req-build-yj5ufhel
  Resolved https://github.com/openai/whisper.git to commit c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'


  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git 'C:\Users\rakes\AppData\Local\Temp\pip-req-build-yj5ufhel'




In [2]:
import os
import whisper
import pandas as pd
from tqdm import tqdm

# --------------------------
# Whisper checkpoint used for transcription
# --------------------------
model = whisper.load_model("small")  # other sizes include "base", "medium", "large"

# --------------------------
# Common Voice clip folders, keyed by split name
# --------------------------
folders = dict(
    train=r"C:\Users\rakes\Downloads\commonvoice\train\clips",
    test=r"C:\Users\rakes\Downloads\commonvoice\test\clips",
    validation=r"C:\Users\rakes\Downloads\commonvoice\validation\clips",
)

# --------------------------
# Function to transcribe a folder
# --------------------------
def transcribe_folder(folder_path):
    """Transcribe every ``.wav`` file directly inside ``folder_path``.

    Files are processed in sorted name order so the resulting rows are
    reproducible regardless of the order ``os.listdir`` returns them in.

    Args:
        folder_path: Directory containing the audio clips.

    Returns:
        A DataFrame with one row per clip and the columns ``file_name``,
        ``detected_language`` and ``transcript``. The columns are present
        even when the folder contains no ``.wav`` files.
    """
    columns = ["file_name", "detected_language", "transcript"]
    files = sorted(f for f in os.listdir(folder_path) if f.endswith(".wav"))
    data = []

    for f in tqdm(files, desc=f"Transcribing {os.path.basename(folder_path)}"):
        audio_path = os.path.join(folder_path, f)
        result = model.transcribe(audio_path)
        data.append({
            "file_name": f,
            "detected_language": result["language"],
            "transcript": result["text"]
        })

    # Passing the columns explicitly keeps the schema stable for an empty folder
    return pd.DataFrame(data, columns=columns)

# --------------------------
# Run transcription over every split
# --------------------------
dfs = [
    transcribe_folder(clips_dir).assign(split=split_name)
    for split_name, clips_dir in folders.items()
]
all_transcriptions = pd.concat(dfs, ignore_index=True)

# --------------------------
# Persist the combined table as CSV
# --------------------------
output_csv = r"C:\Users\rakes\Downloads\commonvoice_transcriptions.csv"
all_transcriptions.to_csv(output_csv, index=False)
print(f"✅ All transcriptions saved to: {output_csv}")

# --------------------------
# Preview the first 5 rows
# --------------------------
all_transcriptions.head()


Transcribing clips: 100%|████████████████████████████████████████████████████████| 2000/2000 [5:35:54<00:00, 10.08s/it]
Transcribing clips: 100%|██████████████| 400/400 [1:00:06<00:00,  9.02s/it]
Transcribing clips: 100%|████████████████| 400/400 [56:06<00:00,  8.42s/it]


✅ All transcriptions saved to: C:\Users\rakes\Downloads\commonvoice_transcriptions.csv


Unnamed: 0,file_name,detected_language,transcript,split
0,common_voice_en_10110.wav,en,I really like the film we saw last week.,train
1,common_voice_en_10153.wav,en,Please put Mamie Yahima's song onto Operation...,train
2,common_voice_en_101622.wav,en,3. Painting a metal ball white,train
3,common_voice_en_10187.wav,en,"Though this be madness, yet there is method i...",train
4,common_voice_en_10199.wav,en,"As she watched, the cat washed his ears and t...",train


In [1]:
# Cell 1: Import translation libraries
from transformers import MarianMTModel, MarianTokenizer

# Already-loaded (tokenizer, model) pairs keyed by checkpoint name, so each
# language pair is instantiated only once per kernel session.
_TRANSLATION_MODELS = {}


def load_translation_model(src_lang="en", tgt_lang="fr"):
    """
    Load (and cache) the MarianMT tokenizer and model for a language pair.

    The checkpoint name is built as
    ``Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}``. Repeated calls for the same
    pair return the same objects instead of re-instantiating them; this matters
    because ``translate_text`` calls this function on every translation.

    Args:
        src_lang: Source language code used in the checkpoint name.
        tgt_lang: Target language code used in the checkpoint name.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
    if model_name not in _TRANSLATION_MODELS:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        # Cache only after both loads succeeded, so a failure is retried next call
        _TRANSLATION_MODELS[model_name] = (tokenizer, model)
    return _TRANSLATION_MODELS[model_name]


In [2]:
# Cell 2: Function to translate text
def translate_text(text, tgt_lang="hi"):
    """
    Translate a single English string into ``tgt_lang``.

    Loads the "en" -> ``tgt_lang`` tokenizer/model pair via
    ``load_translation_model``, generates a translation for the one-item batch
    and returns the decoded string with special tokens stripped.
    """
    tok, mt_model = load_translation_model("en", tgt_lang)

    # Encode the text as a batch of one, returned as PyTorch tensors
    encoded = tok([text], return_tensors="pt", padding=True)
    generated = mt_model.generate(**encoded)

    # Only one sentence went in, so only the first decoded entry is returned
    decoded = tok.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]


In [1]:
import whisper
from pydub import AudioSegment
from pathlib import Path
import tempfile
import os

# Point pydub's AudioSegment at the local ffmpeg executable
ffmpeg_path = r"C:\ffmpeg\bin\ffmpeg.exe"  # machine-specific; update if needed
AudioSegment.converter = ffmpeg_path

# Load the Whisper model once at module level; transcribe_video below reuses it
model = whisper.load_model("small")  # or "medium", "large" for better accuracy

def transcribe_video(video_path: str, translate_to_english: bool = False):
    """
    Extract the audio track from a video and run it through Whisper.

    The audio is converted to 16 kHz mono WAV inside a private temporary
    directory, which is removed again once transcription finishes. This avoids
    leaving ``<stem>.wav`` files behind in the shared temp folder and avoids
    collisions between videos that share a file stem.

    Args:
        video_path: Path to the input media file.
        translate_to_english: When True the Whisper task is "translate",
            otherwise "transcribe".

    Returns:
        Whatever ``model.transcribe`` returns, or ``None`` if any step raised
        (the error is printed rather than propagated).
    """
    try:
        video_path = Path(video_path)

        with tempfile.TemporaryDirectory() as tmp_dir:
            wav_path = Path(tmp_dir) / f"{video_path.stem}.wav"

            # Extract audio as 16 kHz mono
            audio = AudioSegment.from_file(video_path)
            audio = audio.set_frame_rate(16000).set_channels(1)
            # export() hands back the open output file; close it so the data is
            # flushed and the directory can be deleted afterwards
            audio.export(wav_path, format="wav").close()

            # Transcribe while the temporary WAV still exists
            task_type = "translate" if translate_to_english else "transcribe"
            result = model.transcribe(str(wav_path), task=task_type)

        return result

    except Exception as e:
        # Best-effort helper: report the problem and signal failure with None
        print(f"❌ Error in transcription: {e}")
        return None


In [3]:
# Cell 1: Setup Translation Functionality
from transformers import MarianMTModel, MarianTokenizer

# Already-loaded (tokenizer, model) pairs keyed by checkpoint name, so each
# language pair is instantiated only once per kernel session.
_TRANSLATION_MODELS = {}


def load_translation_model(src_lang="en", tgt_lang="hi"):
    """Load (and cache) the MarianMT tokenizer and model for a language pair.

    The checkpoint name is built as
    ``Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}``. Repeated calls for the same
    pair return the same objects instead of re-instantiating them; this matters
    because ``translate_text`` calls this function on every translation.

    Args:
        src_lang: Source language code used in the checkpoint name.
        tgt_lang: Target language code used in the checkpoint name.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    model_name = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
    if model_name not in _TRANSLATION_MODELS:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        # Cache only after both loads succeeded, so a failure is retried next call
        _TRANSLATION_MODELS[model_name] = (tokenizer, model)
    return _TRANSLATION_MODELS[model_name]

# Function to translate text
def translate_text(text, tgt_lang="hi"):
    """Translate one English string into ``tgt_lang`` and return the result.

    The "en" -> ``tgt_lang`` tokenizer/model pair comes from
    ``load_translation_model``; special tokens are stripped from the output.
    """
    tok, mt_model = load_translation_model("en", tgt_lang)

    # Encode as a one-item batch of PyTorch tensors, then generate
    encoded = tok([text], return_tensors="pt", padding=True)
    generated = mt_model.generate(**encoded)

    # A single sentence went in, so the first decoded entry is the translation
    decoded = tok.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]


In [4]:
from pydub import AudioSegment
from pathlib import Path
import tempfile
import whisper
import os

# Path to the local ffmpeg executable, handed to pydub's AudioSegment
ffmpeg_path = r"C:\ffmpeg\bin\ffmpeg.exe"
AudioSegment.converter = ffmpeg_path

# Load the Whisper model once at module level (choose: tiny, base, small, medium, large);
# transcribe_video below reuses this global
model = whisper.load_model("small")

def transcribe_video(video_path: str, translate_to_english: bool = False):
    """
    Extract the audio track from a video and transcribe it with Whisper.

    The audio is converted to 16 kHz mono WAV inside a private temporary
    directory, which is removed again once transcription finishes. This avoids
    leaving ``<stem>.wav`` files behind in the shared temp folder and avoids
    collisions between videos that share a file stem.

    Args:
        video_path: Path to the input media file.
        translate_to_english: When True the Whisper task is "translate",
            otherwise "transcribe".

    Returns:
        Whatever ``model.transcribe`` returns, or ``None`` if any step raised
        (the error is printed rather than propagated).
    """
    try:
        video_path = Path(video_path)

        with tempfile.TemporaryDirectory() as tmp_dir:
            wav_path = Path(tmp_dir) / f"{video_path.stem}.wav"

            # Extract audio as 16 kHz mono
            audio = AudioSegment.from_file(video_path)
            audio = audio.set_frame_rate(16000).set_channels(1)
            # export() hands back the open output file; close it so the data is
            # flushed and the directory can be deleted afterwards
            audio.export(wav_path, format="wav").close()

            # Transcribe while the temporary WAV still exists
            task_type = "translate" if translate_to_english else "transcribe"
            result = model.transcribe(str(wav_path), task=task_type)

        return result

    except Exception as e:
        # Best-effort helper: report the problem and signal failure with None
        print(f"❌ Error: {e}")
        return None
