In [1]:
!pip install transformers torchaudio soundfile




In [2]:
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import os


  from .autonotebook import tqdm as notebook_tqdm


In [9]:
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Pre-trained checkpoint: the CTC model produces per-frame logits and the processor
# prepares raw audio for it and decodes predicted ids back to text.
# .eval() returns the module itself, so it can be chained onto the load call.
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").eval()
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

# Define transcription function
def transcribe_audio(audio_path):
    """Transcribe a single audio file and return the text in lowercase.

    Relies on the module-level ``processor`` and ``model`` objects.

    Steps:
      1. Load the file with ``torchaudio.load``.
      2. Resample to 16 kHz when the file's rate differs.
      3. Average over the first dimension when it is larger than one
         (assumed to be the channel axis -- TODO confirm for all inputs).
      4. Run the model without gradient tracking and take the arg-max id
         at every frame.
      5. Decode the ids with ``processor.batch_decode`` and lowercase the
         first (only) result.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The decoded transcription, lowercased.
    """
    target_rate = 16000

    waveform, source_rate = torchaudio.load(audio_path)

    # Bring the signal to the rate that is passed to the processor below.
    if source_rate != target_rate:
        to_target = torchaudio.transforms.Resample(orig_freq=source_rate, new_freq=target_rate)
        waveform = to_target(waveform)

    # Collapse multi-channel audio into one channel by averaging.
    channel_count = waveform.shape[0]
    if channel_count > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    features = processor(waveform.squeeze(), sampling_rate=target_rate, return_tensors="pt")

    # Greedy decoding: most likely token id per frame, no gradients needed.
    with torch.no_grad():
        frame_logits = model(features.input_values).logits
    token_ids = torch.argmax(frame_logits, dim=-1)

    decoded = processor.batch_decode(token_ids)
    return decoded[0].lower()


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Smoke test: transcribe one file and print the result.
# NOTE(review): the rename cell further down renames every .wav in this folder to
# sample_<n>.wav, so this hard-coded filename no longer exists once that cell has
# run -- on a fresh Restart & Run All after renaming, point this at a sample_<n>.wav.
audio_path = "../03_audio_samples/common_voice_en_1.wav"
text = transcribe_audio(audio_path)
print("Transcription:", text)


Transcription: his mother is a casting director


In [11]:
import os

folder = "../03_audio_samples"  # adjust if needed
ext = ".wav"

# List all wav files, sorted alphabetically so the numbering is reproducible.
files = [f for f in os.listdir(folder) if f.endswith(ext)]
files.sort()

# Target names: sample_1.wav ... sample_N.wav
new_names = [f"sample_{i}{ext}" for i in range(1, len(files) + 1)]

if set(files) == set(new_names):
    # Already renamed (e.g. the cell is being re-run). Renaming again would shuffle
    # the numbering, because alphabetically "sample_10" sorts before "sample_2".
    print("Files are already named sample_<n>; nothing to rename.")
else:
    # Rename in two phases so no file is ever moved onto a name that another,
    # not-yet-renamed file still occupies. A direct in-place rename can silently
    # overwrite such a file on POSIX (os.rename replaces the destination) or
    # fail on Windows.
    staged = []
    for i, filename in enumerate(files, start=1):
        # The ".tmp" suffix keeps staged files out of the `ext` filter above if
        # the cell is interrupted and re-run.
        tmp_name = f".renaming_{os.getpid()}_{i}{ext}.tmp"
        os.rename(os.path.join(folder, filename), os.path.join(folder, tmp_name))
        staged.append(tmp_name)

    for tmp_name, new_name in zip(staged, new_names):
        os.rename(os.path.join(folder, tmp_name), os.path.join(folder, new_name))

    print("Renamed files:")
    print(new_names)


Renamed files:
['sample_1.wav', 'sample_2.wav', 'sample_3.wav', 'sample_4.wav', 'sample_5.wav', 'sample_6.wav', 'sample_7.wav', 'sample_8.wav', 'sample_9.wav', 'sample_10.wav', 'sample_11.wav', 'sample_12.wav', 'sample_13.wav', 'sample_14.wav', 'sample_15.wav', 'sample_16.wav', 'sample_17.wav', 'sample_18.wav', 'sample_19.wav', 'sample_20.wav', 'sample_21.wav', 'sample_22.wav', 'sample_23.wav', 'sample_24.wav', 'sample_25.wav', 'sample_26.wav', 'sample_27.wav', 'sample_28.wav', 'sample_29.wav', 'sample_30.wav', 'sample_31.wav', 'sample_32.wav', 'sample_33.wav', 'sample_34.wav', 'sample_35.wav', 'sample_36.wav', 'sample_37.wav', 'sample_38.wav', 'sample_39.wav', 'sample_40.wav', 'sample_41.wav', 'sample_42.wav', 'sample_43.wav', 'sample_44.wav', 'sample_45.wav', 'sample_46.wav', 'sample_47.wav', 'sample_48.wav', 'sample_49.wav', 'sample_50.wav', 'sample_51.wav', 'sample_52.wav', 'sample_53.wav', 'sample_54.wav', 'sample_55.wav', 'sample_56.wav', 'sample_57.wav', 'sample_58.wav', 'sample

In [13]:
import os

folder = "../03_audio_samples"
ext = ".wav"
output_file = "../02_inference/transcriptions.txt"


def _sample_order(name):
    """Sort key: numeric order for ``<prefix>_<int>.<ext>`` names, others last.

    Names whose second "_"-separated field is not an integer used to abort the
    whole cell with IndexError/ValueError; they are now placed after the
    numbered files, in alphabetical order.
    """
    try:
        return (0, int(name.split("_")[1].split(".")[0]), name)
    except (IndexError, ValueError):
        return (1, 0, name)


files = [f for f in os.listdir(folder) if f.endswith(ext)]
files.sort(key=_sample_order)

# open(..., "w") does not create missing parent directories, so make sure the
# output folder exists before writing.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Transcribe every file in order, echoing progress and writing one
# "<filename>: <text>" line per file.
with open(output_file, "w", encoding="utf-8") as f_out:
    for f in files:
        path = os.path.join(folder, f)
        print(f"\n Transcribing {f}...")
        text = transcribe_audio(path)
        print(f" Transcription: {text}")
        f_out.write(f"{f}: {text}\n")

print(f"\nTranscriptions saved to {output_file}")



 Transcribing sample_1.wav...
 Transcription: his mother is a casting director

 Transcribing sample_2.wav...
 Transcription: its species were formerly placed in the genus vernonia

 Transcribing sample_3.wav...
 Transcription: his mother is a casting director

 Transcribing sample_4.wav...
 Transcription: often coast guardsmen would judge landings by raising placards with large numbers

 Transcribing sample_5.wav...
 Transcription: each of us would have in two ways affirmed himself and the other person

 Transcribing sample_6.wav...
 Transcription: in his address colonel gare referred to the late surgent alex

 Transcribing sample_7.wav...
 Transcription: often coast guardsmen would judge landings by raising placards with large numbers

 Transcribing sample_8.wav...
 Transcription: each of us would have in two ways affirmed himself and the other person

 Transcribing sample_9.wav...
 Transcription: in his address colonel gare referred to the late surgent alex

 Transcribing sample_10