# 🎙️ YouTube Persian ASR, Summarization & Translation
Welcome to this interactive Google Colab notebook!  
You can:
✅ Enter a YouTube link  
✅ Transcribe Persian speech into text  
✅ Translate, Summarize, and Punctuate it using Gemini models

👇 Run all cells below to launch a Gradio app. You can use it directly in this notebook or open the public URL.

‼️ If you are detected as a bot and receive a PO_TOKEN-related error, delete the runtime and re-run all cells.

---

📜 This application is licensed under Creative Commons Attribution-NonCommercial 4.0 (CC BY-NC 4.0).

If you're interested in commercial applications, please contact us at:

✉️ Email: saeedzou2012@gmail.com

---

In [1]:
# @title Installation requirements
%%capture
!pip install gradio gradio_client pytubefix pydub

import gradio as gr
import os
import requests
from pytubefix import YouTube
from pytubefix.cli import on_progress
from gradio_client import Client, handle_file

# Hugging Face Space Name
# The helper functions below call this Space's "/transcribe", "/translate",
# "/summarize" and "/punctuate" endpoints.
HF_SPACE = "saeedzou/Persian_ASR_Text_Summarization"
# Single gradio_client connection, created when this cell runs and shared by
# every helper function.
client = Client(HF_SPACE)

# Function to download YouTube audio
def download_audio(youtube_url):
    """Download the audio track of a YouTube video into the working directory.

    Args:
        youtube_url: URL of the video whose audio should be fetched.

    Returns:
        On success, the path that pytubefix reports for the saved
        ``youtube_audio.mp4`` file. On failure, a string of the form
        ``"Error: <reason>"``. Failures are returned rather than raised
        because the caller shows the message directly in the UI.
    """
    # Reject a blank link up front so the user sees a clear message rather
    # than an internal URL-parsing error from the downloader.
    if not youtube_url or not youtube_url.strip():
        return "Error: no YouTube link was provided"

    try:
        yt = YouTube(youtube_url, on_progress_callback=on_progress)
        audio_stream = yt.streams.get_audio_only()
        if audio_stream is None:
            # get_audio_only() yields None when no matching stream exists;
            # report that plainly instead of tripping an AttributeError.
            return "Error: no audio-only stream is available for this video"
        return audio_stream.download(filename="youtube_audio.mp4")  # Save as mp4
    except Exception as e:
        # Deliberately broad: any download failure (network, bot detection,
        # invalid URL, ...) is surfaced to the UI as text.
        return f"Error: {e}"

# Function to transcribe using Hugging Face API
def transcribe_audio(youtube_url):
    """Download a video's audio and transcribe it via the Space's ``/transcribe`` endpoint.

    Args:
        youtube_url: URL of the YouTube video to transcribe.

    Returns:
        A 3-tuple ``(transcript, translation, summary)`` matching the three
        output components this handler is bound to in the UI. ``transcript``
        is the endpoint's result, or the ``"Error: ..."`` message when the
        download failed. ``translation`` and ``summary`` are always empty
        strings so stale text from a previous run is cleared.
    """
    audio_path = download_audio(youtube_url)

    # download_audio reports failures as "Error: ..." strings. Match the
    # prefix rather than a substring so a valid file path that happens to
    # contain "Error" is not mistaken for a failure.
    if audio_path.startswith("Error:"):
        return audio_path, "", ""

    transcript = client.predict(audio=handle_file(audio_path), api_name="/transcribe")

    # Exactly three values: the click handler lists three output components.
    return transcript, "", ""  # Empty translation & summary initially

# Function to translate
def translate_text(text, target_language, model_sel):
    """Send ``text`` to the Space's ``/translate`` endpoint.

    Args:
        text: Text to translate.
        target_language: Language to translate into.
        model_sel: Identifier of the model the endpoint should use.

    Returns:
        Whatever the ``/translate`` endpoint returns.
    """
    request = {
        "text": text,
        "target_language": target_language,
        "model_sel": model_sel,
    }
    return client.predict(api_name="/translate", **request)

# Function to summarize
def summarize_text(text, word_count, model_sel, lang_sel):
    """Send ``text`` to the Space's ``/summarize`` endpoint.

    Args:
        text: Transcript to summarize (forwarded as ``transcript_text``).
        word_count: Requested summary length.
        model_sel: Identifier of the model the endpoint should use.
        lang_sel: Language selection forwarded to the endpoint.

    Returns:
        Whatever the ``/summarize`` endpoint returns.
    """
    return client.predict(
        transcript_text=text,
        word_count=word_count,
        model_sel=model_sel,
        lang_sel=lang_sel,
        api_name="/summarize",
    )

# Function to punctuate
def punctuate_text(text, model_sel):
    """Send ``text`` to the Space's ``/punctuate`` endpoint.

    Args:
        text: Transcript to punctuate (forwarded as ``transcript``).
        model_sel: Identifier of the model the endpoint should use.

    Returns:
        Whatever the ``/punctuate`` endpoint returns.
    """
    payload = dict(transcript=text, model_sel=model_sel)
    return client.predict(api_name="/punctuate", **payload)

In [None]:
# @title App
# Languages offered in the target-language dropdown. The literal is written in
# no particular order and sorted alphabetically once, here.
languages = sorted((
    "English", "Persian", "French", "Spanish", "German", "Italian",
    "Portuguese", "Dutch", "Swedish", "Danish", "Finnish", "Norwegian",
    "Russian", "Polish", "Turkish", "Arabic", "Hindi", "Chinese",
    "Japanese", "Korean", "Thai", "Vietnamese", "Indonesian", "Hebrew",
    "Greek", "Czech", "Hungarian", "Romanian", "Bulgarian", "Serbian",
    "Croatian", "Slovak", "Slovenian", "Ukrainian", "Lithuanian", "Latvian",
    "Estonian", "Macedonian", "Albanian", "Basque", "Catalan", "Maltese",
    "Icelandic", "Georgian", "Armenian", "Belarusian", "Yiddish", "Pashto",
    "Urdu", "Bengali", "Punjabi", "Tamil", "Telugu", "Malayalam",
    "Sinhala", "Burmese", "Lao", "Khmer", "Mongolian", "Nepali",
    "Marathi", "Gujarati", "Kannada", "Odia", "Assamese", "Maithili",
    "Kurdish", "Azerbaijani", "Kazakh", "Uzbek", "Turkmen", "Tajik",
    "Kyrgyz", "Uighur", "Tatar", "Haitian Creole", "Swahili", "Hausa",
    "Yoruba", "Zulu", "Xhosa", "Amharic", "Somali", "Tigrinya",
    "Shona", "Igbo", "Malagasy", "Quechua", "Aymara", "Guarani",
    "Sundanese", "Javanese", "Filipino", "Hmong", "Fijian", "Tongan",
    "Samoan", "Chamorro", "Hawaiian",
))

# Defaults pre-selected in the dropdowns.
lang = 'English'
model_sel = 'gemini-2.0-flash'

# Model identifiers offered in the model dropdown.
model_selections = ["gemini-2.0-flash", "gemini-2.0-pro-exp-02-05", "gemini-2.0-flash-lite"]

# Sanity check that the default model is one of the offered choices.
assert model_sel in model_selections, "Invalid model selection"

# Build the Gradio UI. Components are created in the order they should appear,
# so the statement order inside this block determines the page layout.
with gr.Blocks() as demo:
    gr.Markdown("# 🎙️ YouTube Persian ASR & NLP via Hugging Face API")

    # Input row: video link plus the button that starts download + transcription.
    with gr.Row():
        youtube_input = gr.Textbox(label="📺 Enter YouTube Link")
        transcribe_button = gr.Button("🎵 Download & Transcribe")

    # Result boxes. Only the transcript is editable; it is also the input to
    # the translate / summarize / punctuate actions below.
    transcript_output = gr.Textbox(label="📝 Transcription", interactive=True)
    translation_output = gr.Textbox(label="🌍 Translation", interactive=False)
    summarized_output = gr.Textbox(label="📖 Summarized Text", interactive=False)

    # Action row: operations applied to the current transcript text.
    with gr.Row():
        translate_button = gr.Button("🌐 Translate")
        summarize_button = gr.Button("✂️ Summarize")
        punctuate_button = gr.Button("🔤 Restore Punctuation")

    # Options row: parameters forwarded to the action handlers.
    with gr.Row():
        word_count_input = gr.Number(value=50, label="📏 Summary Length")
        lang_selection = gr.Dropdown(choices=languages, value=lang, label="🌎 Target Language")
        model_selection = gr.Dropdown(choices=model_selections, value=model_sel, label="🤖 AI Model")

    # Link functions to buttons
    # Transcription writes to all three result boxes (transcript, translation, summary).
    transcribe_button.click(transcribe_audio, inputs=youtube_input, outputs=[transcript_output, translation_output, summarized_output])
    translate_button.click(translate_text, inputs=[transcript_output, lang_selection, model_selection], outputs=translation_output)
    summarize_button.click(summarize_text, inputs=[transcript_output, word_count_input, model_selection, lang_selection], outputs=summarized_output)
    # Punctuation reads the transcript box and overwrites it with the result.
    punctuate_button.click(punctuate_text, inputs=[transcript_output, model_selection], outputs=transcript_output)

    # Footer: model and training-data credits, then the license notice.
    # NOTE(review): the "No Derivatives" bullet describes an ND restriction,
    # but the license named and linked here is plain CC BY-NC 4.0, which has
    # no such clause — confirm which license is intended.
    gr.Markdown(
        """
        \n\n
        ---

        Powered by NVIDIA’s **NeMo Fast Conformer**, this tool is optimized for high-quality **Persian ASR (Automatic Speech Recognition)**.

        **📚 Trained on 800+ Hours of Speech Data:**
        - Common Voice 17 (~300 hours)
        - YouTube (~400 hours)
        - NasleMana (~90 hours)
        - In-house dataset (~70 hours)

        ---

        ## 📜 License & Business Inquiries

        This application is licensed under **Creative Commons Attribution-NonCommercial 4.0 (CC BY-NC 4.0)**.
        - **🛑 Non-Commercial Use Only** – Commercial use is not permitted without prior approval.
        - **🔗 Attribution Required** – Credit must be given to FAIM Group, Sharif University of Technology.
        - **❌ No Derivatives** – Modifications or adaptations of this work are not allowed.

        📜 Full License Details: [CC BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/)

        📩 **Business Inquiries:**
        If you're interested in commercial applications, please contact us at:
        ✉️ **Email:** [saeedzou2012@gmail.com](mailto:saeedzou2012@gmail.com)

        ---
        """
    )
# Start the app once the layout and event wiring above are complete.
demo.launch(share=True, debug=True)  # Enables public link