<a href="https://colab.research.google.com/github/soheilpaper/-tft-2.4-ili9341-STM32/blob/master/youtube_subtitle/lowcode_whisper2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
!pip install --upgrade pip
!pip install --upgrade git+https://github.com/huggingface/transformers.git accelerate datasets[audio] soundfile

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1
Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-hxoekf7d
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-hxoekf7d
  Resolved https://github.com/huggingface/transformers.git to commit 06c16de3d3971e125232c2682ec99d282bb1a27d
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml

In [None]:


import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset
import urllib.request
import os

# Fetch an audio file from a URL and store it on local disk
def download_audio(url, filename="audio.mp3"):
    """Download the resource at ``url`` into ``filename``.

    Args:
        url: Location of the audio file to fetch.
        filename: Local path the downloaded bytes are written to.

    Returns:
        The ``filename`` argument, unchanged, so the call can be used
        directly as the audio path.
    """
    target = filename
    urllib.request.urlretrieve(url, target)
    return target

# Function to load audio from Google Drive path (already mounted)
def load_audio_from_drive(drive_path, local_path="audio_from_drive.mp3"):
    """Copy an audio file from a mounted Drive path to a local path.

    The copy is done with ``shutil.copy`` rather than a shell ``cp`` command,
    so paths containing quotes, spaces or other shell metacharacters are
    handled literally, and a failed copy raises instead of being ignored.

    Args:
        drive_path: Path of the source file (e.g. under ``/content/drive``).
        local_path: Destination path for the copy.

    Returns:
        The ``local_path`` argument, unchanged.

    Raises:
        OSError: If the source cannot be read or the destination cannot be
            written (e.g. ``FileNotFoundError`` for a missing source).
    """
    import shutil  # local import keeps this helper self-contained

    shutil.copy(drive_path, local_path)
    return local_path

# Choose device: first CUDA GPU with half precision when available,
# otherwise CPU with full precision.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Load Whisper large-v3 model and processor (best for Persian)
model_id = "openai/whisper-large-v3"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True
)
model.to(device)
processor = AutoProcessor.from_pretrained(model_id)

# Whisper's decoder can hold at most `max_target_positions` tokens per chunk,
# and that budget also has to fit the forced start/language/task prompt tokens.
# Asking for 512 new tokens exceeds it and is rejected at generation time, so
# the cap is derived from the model config with a small margin for the prompt.
whisper_max_new_tokens = model.config.max_target_positions - 8

# Create ASR pipeline: audio is split into 30 s chunks, decoded 8 chunks at a
# time, and returned as plain text without timestamps.
asr = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=whisper_max_new_tokens,
    chunk_length_s=30,
    batch_size=8,
    return_timestamps=False,
    torch_dtype=torch_dtype,
    device=device,
)

# --- User input section ---

# Option 1: Upload audio file manually in Colab
from google.colab import files
print("Upload your audio file (mp3, wav, m4a, etc.):")
uploaded = files.upload()
# Fail with a clear message instead of a bare StopIteration when nothing was uploaded.
if not uploaded:
    raise ValueError("No audio file was uploaded.")
audio_file = next(iter(uploaded.keys()))

# Option 2: Use audio from URL (uncomment and set your URL)
# audio_url = "https://example.com/path/to/audio.mp3"
# audio_file = download_audio(audio_url)

# Option 3: Use audio file from Google Drive (uncomment and set your drive path)
# from google.colab import drive
# drive.mount('/content/drive')
# drive_audio_path = "/content/drive/MyDrive/path/to/audio.mp3"
# audio_file = load_audio_from_drive(drive_audio_path)

# Transcribe audio to Persian text.
# The pipeline call does not accept `language`/`task` as direct keyword
# arguments; generation options have to be passed through `generate_kwargs`.
print("Transcribing audio...")
result = asr(audio_file, generate_kwargs={"language": "fa", "task": "transcribe"})

print("\n--- Transcription Result ---\n")
print(result["text"])

# The cells below contain the code suggested by Perplexity:

In [None]:
# 1. Install required libraries
!pip install --upgrade pip
!pip install --upgrade git+https://github.com/huggingface/transformers.git accelerate datasets[audio] soundfile
!pip install speechbrain

# Restart runtime after this cell before running next cells!

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1
Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-uyzgttzp
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-uyzgttzp
  Resolved https://github.com/huggingface/transformers.git to commit 9f8fffed3cbe6c322d6d15735d06d4a2c27ae16f
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml

In [None]:
# 2. Import libraries and upload audio file
from transformers import pipeline
from google.colab import files
import IPython
import torch

print("Upload your audio file (wav, mp3, m4a, etc.):")
uploaded = files.upload()
# Fail with a clear message instead of a bare StopIteration when nothing was uploaded.
if not uploaded:
    raise ValueError("No audio file was uploaded.")
# Use the first uploaded file name as the audio path for the cells below.
audio_file = next(iter(uploaded))
print(f"Uploaded file: {audio_file}")

# Optional: play audio in notebook (must stay the last expression to be rendered)
IPython.display.Audio(audio_file)

In [None]:
# 3. Run ASR with Wav2Vec2 Large XLSR Persian V3 (Hugging Face)
print("Running Wav2Vec2 Persian ASR...")

model_name_wav2vec = "m3hrdadfi/wav2vec2-large-xlsr-persian-v3"
# device=0 selects the first GPU; -1 runs the pipeline on CPU.
asr_wav2vec = pipeline("automatic-speech-recognition", model=model_name_wav2vec, device=0 if torch.cuda.is_available() else -1)

# No `task=` argument here: the pipeline call rejects keyword arguments it does
# not define, and this checkpoint only transcribes Persian speech anyway.
result_wav2vec = asr_wav2vec(audio_file)
print("\n--- Wav2Vec2 Persian ASR Result ---\n")
print(result_wav2vec['text'])

In [None]:
# 4. Run ASR with SpeechBrain Whisper Large V2 Persian
print("Running SpeechBrain Whisper Large V2 Persian ASR...")

# This checkpoint is a fine-tuned Whisper model, so it is loaded through
# SpeechBrain's Whisper interface rather than the generic EncoderDecoderASR one.
# `speechbrain.inference` is the current home of the pretrained interfaces
# (`speechbrain.pretrained` is the deprecated location).
from speechbrain.inference.ASR import WhisperASR

device = "cuda" if torch.cuda.is_available() else "cpu"

# The model files are downloaded once and cached under `savedir`.
asr_speechbrain = WhisperASR.from_hparams(
    source="speechbrain/asr-whisper-large-v2-commonvoice-fa",
    savedir="pretrained_models/asr-whisper-large-v2-commonvoice-fa",
    run_opts={"device": device}
)

transcription_speechbrain = asr_speechbrain.transcribe_file(audio_file)
print("\n--- SpeechBrain Whisper Large V2 Persian ASR Result ---\n")
print(transcription_speechbrain)