In [1]:
# -*- coding: utf-8 -*-
"""Audio to Text Generation in Google Colab.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/... (Your Colab Notebook Link Here)
"""

# Install necessary libraries
!pip install -U torchaudio librosa datasets transformers

import torch
import torchaudio
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import librosa
import numpy as np

# Check if CUDA is available and set the device
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using CUDA: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("Using CPU")

# Load pre-trained Wav2Vec2 model and processor
model_name = "facebook/wav2vec2-base-960h"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)

# --- Option 1: Transcribe an audio file from Google Drive ---
from google.colab import files

print("\n--- Option 1: Transcribe an audio file from Google Drive ---")
print("Please upload your audio file (.wav, .mp3, etc.)")
uploaded = files.upload()

if uploaded:
    audio_file_path = list(uploaded.keys())[0]
    print(f"Uploaded audio file: {audio_file_path}")

    try:
        # Load the audio file
        speech, sample_rate = torchaudio.load(audio_file_path)
        speech = speech.squeeze().numpy()

        # Resample the audio if the sample rate doesn't match the model's expected rate
        if sample_rate != processor.feature_extractor.sampling_rate:
            speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=processor.feature_extractor.sampling_rate)

        # Process the audio
        input_values = processor(speech, return_tensors="pt", padding="longest").input_values.to(device)

        # Perform inference
        with torch.no_grad():
            logits = model(input_values).logits

        # Decode the predictions
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = processor.batch_decode(predicted_ids)[0]

        print("\nTranscription (from uploaded file):")
        print(transcription)

    except Exception as e:
        print(f"Error processing audio file: {e}")
else:
    print("No audio file uploaded.")

# --- Option 2: Transcribe audio directly from your microphone (requires browser permissions) ---
print("\n--- Option 2: Transcribe audio directly from your microphone ---")
print("Click the button below to record audio (allow microphone access if prompted).")

try:
    from IPython.display import Javascript
    from google.colab import output
    from base64 import b64decode

    RECORD = """
    const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
    const b2text = blob => new Promise(resolve => {
      const reader = new FileReader();
      reader.onloadend = function() {
        resolve(reader.result);
      }
      reader.readAsDataURL(blob);
    });

    var recordButton = document.querySelector('#recordButton');
    var stopButton = document.querySelector('#stopButton');
    var audioChunks = [];
    var recorder;

    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });

    const mimeType = 'audio/webm';
    recorder = new MediaRecorder(stream, { mimeType });

    recorder.ondataavailable = event => {
      if (typeof event.data == 'undefined') return;
      if (event.data.size > 0) {
        audioChunks.push(event.data);
      }
    };

    recorder.onstop = async () => {
      const blob = new Blob(audioChunks, {
        type: mimeType
      });
      const audioURL = URL.createObjectURL(blob);
      const base64 = await b2text(blob);
      google.colab.output.setIframeHeight(document.querySelector('#outputFrame').scrollHeight + 20);
      const data = base64.substr(base64.indexOf(',') + 1);
      const audio = document.createElement('audio');
      audio.controls = true;
      audio.src = audioURL;
      document.body.appendChild(audio);
      resolve(data);
    };

    recordButton.disabled = false;
    """.format()

    RECORD_BUTTON = """
    <button id="recordButton" disabled>Record</button>
    <button id="stopButton" disabled>Stop</button>
    <script>
    const recordButton = document.querySelector('#recordButton');
    const stopButton = document.querySelector('#stopButton');
    let audioData;
    recordButton.addEventListener('click', () => {
      audioChunks = [];
      recorder.start();
      recordButton.disabled = true;
      stopButton.disabled = false;
    });

    stopButton.addEventListener('click', async () => {
      recorder.stop();
      recordButton.disabled = false;
      stopButton.disabled = true;
      audioData = await new Promise(resolve => {
        recorder.onstop = async () => {
          const blob = new Blob(audioChunks, { type: 'audio/webm' });
          const reader = new FileReader();
          reader.onloadend = function() {
            resolve(reader.result.split(',')[1]);
          }
          reader.readAsDataURL(blob);
        }
      });
      google.colab.kernel.invoke_function('notebook.transcribe_audio', { audio_data: audioData }, {});
    });
    </script>
    """

    @output.register_callback('notebook.transcribe_audio')
    def transcribe_audio(audio_data):
        audio_bytes = b64decode(audio_data)
        try:
            # Load audio from bytes
            speech, sample_rate = torchaudio.load(io.BytesIO(audio_bytes))
            speech = speech.squeeze().numpy()

            # Resample if necessary
            if sample_rate != processor.feature_extractor.sampling_rate:
                speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=processor.feature_extractor.sampling_rate)

            # Process the audio
            input_values = processor(speech, return_tensors="pt", padding="longest").input_values.to(device)

            # Perform inference
            with torch.no_grad():
                logits = model(input_values).logits

            # Decode the predictions
            predicted_ids = torch.argmax(logits, dim=-1)
            transcription = processor.batch_decode(predicted_ids)[0]

            print("\nTranscription (from microphone):")
            print(transcription)

        except Exception as e:
            print(f"Error processing recorded audio: {e}")

    import io
    display(Javascript(RECORD))
    display(HTML(RECORD_BUTTON))

except Exception as e:
    print(f"Error setting up microphone recording: {e}")
    print("Microphone recording might not be supported in this environment.")
    print("Please try Option 1 with an uploaded audio file.")

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchaudio)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux201

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Option 1: Transcribe an audio file from Google Drive ---
Please upload your audio file (.wav, .mp3, etc.)


Saving file_example_MP3_5MG.mp3 to file_example_MP3_5MG.mp3
Uploaded audio file: file_example_MP3_5MG.mp3


It is strongly recommended to pass the `sampling_rate` argument to `Wav2Vec2FeatureExtractor()`. Failing to do so can result in silent errors that might be hard to debug.



Transcription (from uploaded file):
TAA  TAA  A  A  AATETEAATO AAT EA EATAT E EEA  EEAT E  E A EAEADE EEAEE ADEDE  EEA EAEAD EA TEE ADEDE E HAT  TEAAT E AAAE EAAT EAD A TE DD EE EA HEE  E EAEAD  TEAAT EAEA EATEE DDO E EAEAD  TEAAATOE EADEA EA TEE DADO EA EEEAEA A TEE ADD  E A TA  TA  TAT TAT TAA  TEAAATO  TAA EA TE AAT E TEEA E T EA E   E   AAEE  EE EA ADADO  EAEEAEA EEATEE AEDO  EAAAHATTEAAATO  A E EAAT E AEDA TE ADD EEA EAD EE  E EAEAD  TEAAATO E EEAE ETEE AEDO EA EAEA

--- Option 2: Transcribe audio directly from your microphone ---
Click the button below to record audio (allow microphone access if prompted).
Error setting up microphone recording: unexpected '{' in field name
Microphone recording might not be supported in this environment.
Please try Option 1 with an uploaded audio file.
