<a href="https://colab.research.google.com/github/priscacare20/audio_to_text_hf_space/blob/main/Audio_text_summarizer_app_with_GPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install Gradio, the UI toolkit this notebook imports as `gr` to build the app.
!pip install gradio

Collecting gradio
  Downloading gradio-5.30.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.10-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.

In [2]:
!pip install bitsandbytes
# Install all required dependencies including bitsandbytes correctly
!pip install -U pip setuptools wheel
!pip install -q \
  bitsandbytes \
  transformers \
  accelerate \
  torchaudio \
  soundfile \
  gradio \
  sentencepiece \
  requests

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

# Audio Summarizer:
This notebook demonstrates how to build an audio summarizer app. The app takes an audio file, or a downloadable link to one, transcribes the audio, and creates a summary of the content.
The purpose of the app is to help learners digest audio content quickly and decide whether they need to listen to the full recording. It can be used to summarize YouTube content and podcasts, or to write the minutes of a meeting.

In [1]:
import os
import torch
import gradio as gr
import requests
import soundfile as sf
import gdown
from urllib.parse import urlparse, parse_qs

from IPython.display import Markdown, display, update_display
from google.colab import drive
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig, AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Half precision is only used on the GPU; the CPU path stays in float32.
torch_dtype = torch.float32 if device.type != "cuda" else torch.float16

In [2]:
# Write the name and version of every package installed in the current
# environment to requirements.txt (overwrites the file if it already exists).
!pip freeze > requirements.txt

### Audio collector: function to download the audio from a URL

In [3]:
def download_audio(audio_url):
    """
    Downloads audio from a direct link or a Google Drive share link.

    Google Drive links are reduced to their file ID and fetched with gdown;
    any other URL is fetched with requests. The audio is always written to
    "temp_url_audio.mp3" in the current working directory, overwriting any
    previous download.

    Args:
        audio_url (str): Direct audio URL or Google Drive share link.

    Returns:
        Tuple[str or None, str or None]: (file path, None) on success, or
        (None, error message) on failure. Errors are returned, not raised.
    """
    file_path = "temp_url_audio.mp3"

    # Reject None / empty / whitespace-only input with a readable message
    # instead of letting it surface as an opaque exception string.
    if not isinstance(audio_url, str) or not audio_url.strip():
        return None, "No audio URL provided."
    # URLs pasted into a text box often carry stray whitespace.
    audio_url = audio_url.strip()

    try:
        if "drive.google.com" in audio_url:
            if "file/d/" in audio_url:
                # Handles both ".../file/d/<id>/view?..." and ".../file/d/<id>?...".
                file_id = audio_url.split("file/d/")[1].split("/")[0].split("?")[0]
            elif "id=" in audio_url:
                file_id = parse_qs(urlparse(audio_url).query).get("id", [""])[0]
            else:
                return None, "Unsupported Google Drive link format."

            # An empty ID would make gdown request a meaningless URL.
            if not file_id:
                return None, "Could not find a file ID in the Google Drive link."

            download_url = f"https://drive.google.com/uc?id={file_id}"
            # Depending on the gdown version, a failed download may return
            # None rather than raise, so treat None as a failure.
            if gdown.download(download_url, file_path, quiet=False) is None:
                return None, "Failed to download audio from Google Drive."
        else:
            # The timeout keeps an unresponsive server from hanging the app;
            # streaming avoids holding the whole file in memory.
            with requests.get(audio_url, stream=True, timeout=30) as response:
                if response.status_code != 200:
                    return None, f"Failed to download audio. Status code: {response.status_code}"
                with open(file_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=1 << 20):
                        f.write(chunk)

        return file_path, None
    except Exception as e:
        return None, str(e)


In [4]:
# Try the downloader on a Google Drive share link; this URL takes the
# "file/d/<id>" branch of download_audio.
audio_url = "https://drive.google.com/file/d/1-0HVuVe4I_nU5cwqV72f0mDuDmj22MT2/view?usp=sharing"
file_path, error = download_audio(audio_url)

Downloading...
From: https://drive.google.com/uc?id=1-0HVuVe4I_nU5cwqV72f0mDuDmj22MT2
To: /content/temp_url_audio.mp3
100%|██████████| 14.4M/14.4M [00:00<00:00, 37.7MB/s]


In [5]:
# Path the audio was saved to (None if the download failed).
print(file_path)

temp_url_audio.mp3


In [6]:
# Error message returned by download_audio (None when the download succeeded).
print(error)

None


### Convert audio to text using OpenAI's Whisper model

In [7]:
def is_valid_audio(file_path):
    """
    Reports whether soundfile is able to open the given file as audio.

    Args:
        file_path (str): Location of the file to probe.

    Returns:
        bool: True when the file opens (and closes) cleanly, False when
        soundfile raises a RuntimeError while doing so.
    """
    try:
        # Opening is the whole check; close straight away to release the handle.
        sf.SoundFile(file_path).close()
    except RuntimeError:
        return False
    return True

In [8]:
# Ready-to-use speech-recognition pipelines, keyed by model id, so the model
# weights are loaded once per kernel instead of on every request. The cached
# pipeline keeps its model resident in memory between calls.
_ASR_PIPELINES = {}


def _get_asr_pipeline(model_id):
    """Builds (on first use) and returns the speech-recognition pipeline for model_id."""
    if model_id not in _ASR_PIPELINES:
        speech_model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
        )
        speech_model.to(device)  # move the model to the GPU when one is available

        # The processor bundles the feature extractor (audio -> model inputs)
        # and the tokenizer (token ids -> text) that the pipeline needs.
        processor = AutoProcessor.from_pretrained(model_id)

        _ASR_PIPELINES[model_id] = pipeline(
            "automatic-speech-recognition",
            model=speech_model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            torch_dtype=torch_dtype,
            device=device,
        )
    return _ASR_PIPELINES[model_id]


def transcribe(audio_file=None, audio_url=""):
    """
    Transcribes spoken audio from either an uploaded file or a URL using the Whisper model.

    An uploaded file takes precedence over the URL. A file downloaded from
    the URL is treated as temporary and is removed before returning, whether
    or not transcription succeeds; an uploaded file is never deleted.

    Args:
        audio_file (str or None): Local path to uploaded audio file (optional).
        audio_url (str): URL or Google Drive share link to an audio file (optional).

    Returns:
        str: Transcribed text or error message.
    """
    # Only set when this call downloaded the file itself, so cleanup can never
    # touch a file the caller supplied.
    downloaded_path = None
    try:
        if audio_file is not None:
            file_path = audio_file
        elif audio_url:
            file_path, error = download_audio(audio_url)
            if error:
                return f"Error: {error} invalid url"
            downloaded_path = file_path
        else:
            return "Please upload a file or enter an audio URL."

        if not is_valid_audio(file_path):
            return "The audio file is invalid or not supported. Try uploading a different file."

        # Pre-trained Whisper Medium speech-to-text model hosted on Hugging Face.
        pipe = _get_asr_pipeline("openai/whisper-medium")

        # return_timestamps=True lets Whisper handle audio longer than its
        # 30-second (3000 mel frame) window.
        result = pipe(file_path, return_timestamps=True)
        return result["text"]
    except Exception as e:
        return f"Transcription error: {str(e)}"
    finally:
        # Best-effort cleanup: a failure to delete the temp file must not
        # replace the transcription (or error message) being returned.
        if downloaded_path and os.path.exists(downloaded_path):
            try:
                os.remove(downloaded_path)
            except OSError:
                pass


In [9]:
#transcription = transcribe(file_path)

In [10]:
#transcription

### Summarize audio content using Llama

In [11]:
# (tokenizer, model) pairs keyed by model id, so the quantized LLM is loaded
# once per kernel instead of on every request. The cached model stays
# resident in memory between calls.
_SUMMARY_LLMS = {}


def _get_summary_llm(model_id):
    """Loads (on first use) and returns the tokenizer and 4-bit quantized model for model_id."""
    if model_id not in _SUMMARY_LLMS:
        # Model quantization configuration
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_quant_type="nf4",
        )
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        # Reuse the end-of-sequence token for padding.
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            quantization_config=quant_config,
        )
        _SUMMARY_LLMS[model_id] = (tokenizer, model)
    return _SUMMARY_LLMS[model_id]


def generate_summary_from_transcript(transcription: str, user_prompt: str = "") -> str:
    """
    Generates structured discussion summary in Markdown format from a transcript using a quantized LLaMA model.

    Args:
        transcription (str): The transcript text to summarize.
        user_prompt (str): Optional instruction for the model. When empty, a
            default prompt asking for summary, key discussion points,
            takeaways and action items is used.

    Returns:
        str: Markdown-formatted summary (only the model's reply; the prompt
        and transcript are not included).
    """
    if not user_prompt:
        user_prompt = "Please provide highlights including summary, key discussion points, takeaways and action items in markdown from the transcript"

    # Prompt configuration for LLM
    system_message = (
        "You are an assistant that produces highlights of a discussion from transcripts, "
        "with summary, key discussion points, takeaways and action items with owners, in markdown. "
    )
    user_message = (
        f"{user_prompt}\n"
        f"Here is the discussion transcript: {transcription} "
    )
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message}
    ]

    # Load tokenizer and model (cached after the first call)
    tokenizer, model = _get_summary_llm("meta-llama/Meta-Llama-3.1-8B-Instruct")

    # add_generation_prompt=True appends the assistant header so the model
    # starts its reply instead of continuing the user turn. return_dict=True
    # also yields the attention mask, which generate() receives via **inputs.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    # Stream the reply to stdout as it is generated, without echoing the prompt.
    streamer = TextStreamer(tokenizer, skip_prompt=True)
    outputs = model.generate(**inputs, max_new_tokens=2000, streamer=streamer)

    # generate() returns prompt + reply; slice off the prompt tokens so the
    # transcript never leaks into the returned summary.
    prompt_length = inputs["input_ids"].shape[-1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    return response

In [12]:
#summary = generate_summary_from_transcript(transcription, '')

In [13]:
def transcribe_and_generate(audio_file, audio_url, user_prompt=''):
    """
    Transcribes the audio and turns the transcript into a Markdown summary.

    Args:
        audio_file (str or None): Local path to an uploaded audio file.
        audio_url (str): URL or Google Drive share link to an audio file.
        user_prompt (str): Optional custom instruction for the summarizer.

    Returns:
        Tuple[str, str or None]: (summary markdown, path of the saved
        "audio_summary.md" file) on success, or (error message, None) when
        transcription or generation fails.
    """
    # get audio transcript
    transcript = transcribe(audio_file, audio_url)

    # transcribe() reports failures as plain strings. Surface them to the
    # user as-is instead of asking the model to "summarize" an error message.
    fixed_errors = (
        "Please upload a file or enter an audio URL.",
        "The audio file is invalid or not supported. Try uploading a different file.",
    )
    error_prefixes = ("Error: ", "Transcription error: ")
    if not isinstance(transcript, str) or not transcript.strip():
        return "No speech could be transcribed from the audio.", None
    if transcript in fixed_errors or transcript.startswith(error_prefixes):
        return transcript, None

    # generate summary from transcript
    try:
        markdown_summary = generate_summary_from_transcript(transcript, user_prompt)

        # Unpack if it's returned as a tuple
        if isinstance(markdown_summary, tuple):
            markdown_summary = markdown_summary[0]

        # Save to file for download; explicit UTF-8 so non-ASCII characters in
        # the summary are written the same way on every platform.
        output_path = "audio_summary.md"
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(markdown_summary)
        return markdown_summary, output_path
    except Exception as e:
        return f"Generation error: {str(e)}", None

# ----------- Gradio UI -----------

# Declare the app's interface; `demo` is the object launched in the next cell.
with gr.Blocks() as demo:
    gr.Markdown("## 📝 Audio to Text Summary Generator")

    # Two alternative audio sources: an uploaded file or a link.
    with gr.Row():
        # type="filepath" hands the handler a path on disk rather than raw audio data.
        audio_file = gr.Audio(label="Upload audio file", type="filepath")
        audio_url = gr.Textbox(label="Or paste audio URL (e.g. Google Drive)", placeholder="https://...")

    prompt = gr.Textbox(label="Custom prompt (optional)", lines=4, placeholder="Leave empty to use default audio summarizer prompt")

    with gr.Row():
        transcribe_btn = gr.Button("Generate Summary")
        download_file = gr.File(label="Download .md file")

    # Rendered summary appears below the controls.
    markdown_output = gr.Markdown()

    # Inputs map positionally onto transcribe_and_generate(audio_file, audio_url, user_prompt);
    # its two return values (summary text, saved file path) fill the two outputs in order.
    transcribe_btn.click(
        transcribe_and_generate,
        inputs=[audio_file, audio_url, prompt],
        outputs=[markdown_output, download_file]
    )



In [14]:
# ----------- Launch App -----------

# share=True asks Gradio for a temporary public URL so the app can be opened
# from outside the notebook host.
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://dbc2fef7046f002f91.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


