<a href="https://colab.research.google.com/github/qvieth/PhoTransciptor/blob/main/PhoTranscriptor_Copy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook setup cell: installs the Python packages and the FFmpeg system
# package used by the rest of this notebook. The lines starting with "!" are
# IPython shell escapes, so this cell only runs inside a notebook/IPython
# session, not as a plain Python script.

# --- Install the PyTorch stack ---
print("Installing torch, torchvision, torchaudio...")
!pip install torch torchvision torchaudio

# --- Install other required Python libraries, UPGRADING Gradio ---
# NOTE: --upgrade applies to every package named on this pip command line,
# not only to gradio.
print("Installing transformers, accelerate, soundfile, librosa, gradio...")
!pip install transformers accelerate soundfile librosa gradio --upgrade

# --- Install FFmpeg ---
# -qq keeps apt-get output to a minimum.
print("Installing FFmpeg...")
!apt-get update -qq
!apt-get install -qq ffmpeg

print("\nInstallation steps complete.")

In [None]:
import gradio as gr
import torch
from transformers import pipeline
import os
import sys
import librosa
import soundfile as sf
from functools import lru_cache
import time

# --- Runtime configuration ---

# Model identifier handed to the transformers pipeline.
MODEL_NAME = "vinai/PhoWhisper-large"
# Value passed as `model=` when the pipeline is built; currently the same
# identifier as MODEL_NAME.
LOCAL_MODEL_PATH = MODEL_NAME

# Value forwarded to the pipeline as `chunk_length_s`.
CHUNK_LENGTH_S = 30

# Use the GPU when torch reports CUDA as available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Running on: {DEVICE}")

@lru_cache(maxsize=None)
def load_model(model_source):
    """Build the automatic-speech-recognition pipeline for ``model_source``.

    The result is memoized per ``model_source`` by ``lru_cache``. Because the
    cache stores whatever this function returns, a failed load (``None``) is
    cached as well and is not retried for the same argument.

    Args:
        model_source: Value passed to ``pipeline`` as ``model``.

    Returns:
        The constructed pipeline, or ``None`` if any exception was raised
        while building it (the error is printed and reported via the
        progress object).
    """
    print(f"Loading model: {model_source}...")
    tracker = gr.Progress()
    tracker(0, desc="Loading model...")
    try:
        pipeline_options = {
            "model": model_source,
            "chunk_length_s": CHUNK_LENGTH_S,
            "device": DEVICE,
        }
        asr = pipeline("automatic-speech-recognition", **pipeline_options)
        tracker(1.0, desc="Model loaded successfully.")
        print("Model loaded successfully.")
    except Exception as exc:
        # Best effort: report the problem and let callers check for None.
        failure_text = f"Error loading model: {exc}"
        print(failure_text)
        tracker(1.0, desc=failure_text)
        return None
    return asr

# Build the ASR pipeline once, when this cell/module is executed.
# load_model returns None when loading fails, so later code checks
# `asr_pipeline is None` before using it.
asr_pipeline = load_model(LOCAL_MODEL_PATH)

def get_audio_duration(audio_path):
    """Return the duration of an audio file, or 0 if it cannot be determined.

    ``librosa.get_duration`` is tried first; if it raises, the duration is
    computed with ``soundfile`` as frame count divided by sample rate.

    Args:
        audio_path: Path to the audio file. A falsy value or a path that does
            not exist yields 0 without touching either library.

    Returns:
        The duration reported by librosa or soundfile, or 0 when the path is
        missing or both readers fail (a warning is printed for each failure).
    """
    if audio_path and os.path.exists(audio_path):
        try:
            return librosa.get_duration(path=audio_path)
        except Exception as librosa_err:
            print(f"Warning: Could not get duration for {audio_path}: {librosa_err}")

        # librosa failed; fall back to reading the header with soundfile.
        try:
            with sf.SoundFile(audio_path) as handle:
                return len(handle) / handle.samplerate
        except Exception as sf_err:
            print(f"Warning: Soundfile also failed for {audio_path}: {sf_err}")

    return 0

def transcribe_audio_for_blocks(audio_file_obj, progress=gr.Progress()):
    """Transcribe an uploaded audio file and report timing information.

    Args:
        audio_file_obj: Value received from the audio input component; it is
            used directly as the file path given to the ASR pipeline.
        progress: Gradio progress object used to report start and completion.

    Returns:
        A ``(text, status)`` tuple. ``text`` is the transcription on success
        or a user-facing message otherwise; ``status`` describes the outcome,
        including elapsed processing time when a transcription was attempted.
    """
    if asr_pipeline is None:
        return "Error: ASR model is not loaded. Check console output.", "Status: Error - Model not loaded."
    if audio_file_obj is None:
        return "Please upload an audio file.", "Status: Waiting for audio."

    source_path = audio_file_obj
    print(f"Received audio input, temp file at: {source_path}")

    # A non-positive duration means it could not be determined.
    seconds = get_audio_duration(source_path)
    if seconds > 0:
        length_label = f"{seconds:.2f} seconds"
    else:
        length_label = "unknown duration"
    print(f"Audio duration: {length_label}")

    progress(0, desc=f"Audio Duration: {length_label}. Transcribing...")

    started_at = time.time()
    try:
        result = asr_pipeline(source_path)
        elapsed = time.time() - started_at
        progress(1.0, desc="Transcription Complete.")
        return (
            result["text"],
            f"Status: Transcription completed in {elapsed:.2f} seconds (Audio duration: {length_label})",
        )
    except Exception as exc:
        # Surface the failure in the UI instead of raising into Gradio.
        elapsed = time.time() - started_at
        failure_text = f"Transcription failed: {exc}"
        print(f"An error occurred during transcription: {exc}")
        progress(1.0, desc=failure_text)
        return failure_text, f"Status: Transcription failed after {elapsed:.2f} seconds. Error: {exc}"

# ✅ NEW: Save transcript to file
def save_transcript_to_file(transcript_text, filename="transcript.txt"):
    """Write transcript text to a UTF-8 text file and return its path.

    Args:
        transcript_text: Text to save. ``None`` is treated as an empty
            transcript so an empty textbox still produces a valid (empty)
            file instead of raising ``TypeError``; other non-string values
            are converted with ``str``.
        filename: Destination path. An existing file is overwritten.

    Returns:
        The ``filename`` that was written.
    """
    text = "" if transcript_text is None else str(transcript_text)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(text)
    return filename


# Prepare download
def prepare_download_file(transcript_text):
    """Save the transcript to a fresh file and return its path for download.

    Each call writes into its own newly created temporary directory, so
    simultaneous users of the shared app cannot overwrite each other's
    transcript. The file itself is still named ``transcript.txt``.

    Args:
        transcript_text: Current content of the transcription textbox.

    Returns:
        Path of the written ``transcript.txt`` file.
    """
    import tempfile  # local import: the file-level import block is unchanged

    # NOTE: the directory is not removed afterwards; it lives in the system
    # temp location until the OS/runtime cleans it up.
    download_dir = tempfile.mkdtemp(prefix="transcript_")
    return save_transcript_to_file(
        transcript_text, os.path.join(download_dir, "transcript.txt")
    )

# UI strings
# Page title: used both as the Blocks title and as the <h1> heading.
title = "PhoWhisper ASR"
# Introductory text rendered under the heading. It is an f-string so the model
# name and the selected device are filled in when this cell runs. The
# Vietnamese note tells users how to download/copy the transcript to avoid lag
# when pasting into Word.
description_markdown = f"""
Upload a Vietnamese audio file to transcribe it using the PhoWhisper-large model ({MODEL_NAME}).
<b>Running on:</b> {DEVICE}
<b>Note:</b> Sau khi tạo transcript xong, một số người dùng phản ánh tình trạng lag khi dán vào Word nên mình đã bổ sung thêm 1 tính năng Download Transcript.
<br>Khi transcript tạo xong, click chuột phải chỗ vị trí #KB ⇣ , chọn Lưu liên kết thành.../Save link as... để lưu transcript thành 1 file txt, từ file này copy vào word thì sẽ không bị lag.
<br>HOẶC đơn giản hơn, sau khi bấm nút copy transcript, dán nội dung vào Notepad/GoogleDocs để xoá format ẩn, sau đó copy lại và dán vào Word thì sẽ không bị lag nữa.
"""

# BibTeX entry for the PhoWhisper paper. This is a plain (non-f) string, so
# the braces are literal; it is interpolated into the credits text below.
citation_bibtex = """
@inproceedings{PhoWhisper,
   title     = {{PhoWhisper: Automatic Speech Recognition for Vietnamese}},
   author    = {Thanh-Thien Le and Linh The Nguyen and Dat Quoc Nguyen},
   booktitle = {Proceedings of the ICLR 2024 Tiny Papers track},
   year      = {2024}
}
"""

# Credits/citation section rendered at the bottom of the page; embeds
# citation_bibtex via the f-string placeholder.
credits_and_citation_markdown = f"""
# Credits & Citation

## User Interface Development
This interface was developed by Mr. Le Nguyen Nhu Anh (<a href="mailto:leyny036@mymail.unisa.edu.au">leyny036@mymail.unisa.edu.au</a>), PhD Candidate at the University of South Australia.

## ASR Model Citation
If you use the underlying PhoWhisper model for your work, please cite the original authors:

{citation_bibtex}

Links:
- Paper: <a href="https://openreview.net/pdf?id=qsif2awK2L" target="_blank">PhoWhisper (ICLR 2024)</a>
- Model: <a href="https://huggingface.co/vinai/PhoWhisper-large" target="_blank">vinai/PhoWhisper-large</a>
"""

# --- Interface ---
# Build the Gradio UI: heading and description on top, a two-column row
# (audio input on the left, transcription/status/download on the right),
# then the credits section, followed by the event wiring.
with gr.Blocks(title=title, theme=gr.themes.Soft()) as interface:
    gr.Markdown(f"<h1>{title}</h1>")
    gr.Markdown(description_markdown)

    with gr.Row():
        # Left column: audio upload and the button that starts transcription.
        with gr.Column():
            # type="filepath" so the handler receives a path it can pass to
            # the ASR pipeline.
            audio_input = gr.Audio(type="filepath", label="Upload Vietnamese Audio File")
            transcribe_button = gr.Button("Transcribe Audio")

        # Right column: results and download controls.
        with gr.Column():
            transcription_output_textbox = gr.Textbox(
                label="Transcription",
                lines=5,  # Adjusted to reduce interface lag
                interactive=True,
                placeholder="Transcription will appear here...",
                show_copy_button=True
            )
            status_textbox = gr.Textbox(label="Status", interactive=False)

            # Download file button and output
            download_button = gr.Button("Download Transcript")
            download_file_output = gr.File(label="Download your transcript")

    gr.Markdown(credits_and_citation_markdown)

    # Transcribe action: the handler returns (text, status), matching the
    # two output components in order.
    transcribe_button.click(
        fn=transcribe_audio_for_blocks,
        inputs=[audio_input],
        outputs=[transcription_output_textbox, status_textbox],
        show_progress=True
    )

    # Download action: writes the current textbox content to a file and hands
    # its path to the File component.
    download_button.click(
        fn=prepare_download_file,
        inputs=[transcription_output_textbox],
        outputs=[download_file_output]
    )

if __name__ == "__main__":
    # Start the web UI only when the ASR pipeline loaded; otherwise explain
    # why nothing is launched.
    if asr_pipeline is not None:
        interface.launch(share=True, debug=False)
    else:
        print("\nApplication will not launch because the ASR Pipeline failed to load.")