In [None]:
pip install transformers torch soundfile librosa

In [None]:
pip install transformers torch torchaudio datasets

In [1]:
import torch
from transformers import pipeline
import soundfile as sf
import librosa

# Define the audio file path
AUDIO_FILE_PATH = "/kaggle/input/eng-hinbi-marathi-mix-audio/New Recording 220.m4a"

# --- Step 1: Transcribe the Audio using a Hugging Face ASR model ---
try:
    # Use a robust ASR model like 'openai/whisper-base'
    # 'whisper-large-v3' provides better accuracy but requires more memory/VRAM
    print(f"Loading ASR model (whisper-base)... This may take a moment the first time.")
    speech_to_text_pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-base", chunk_length_s=30) # <-- Add this line to enable long-form transcription)

    # Load and process the audio file
    # Whisper models expect a 16kHz sampling rate
    # librosa helps ensure the correct format
    print(f"Transcribing audio file: {AUDIO_FILE_PATH}")
    audio_input, sampling_rate = librosa.load(AUDIO_FILE_PATH, sr=16000)
    
    # Perform transcription
    transcription_result = speech_to_text_pipeline(audio_input,return_timestamps=True)
    meeting_transcript = transcription_result['text']
    
    
    print("\n--- Full Meeting Transcript ---")
    print(meeting_transcript)
    print("="*50)

    # --- Step 2: Summarize the Transcript using a Hugging Face Summarization model ---

    # Use a summarization model like 'facebook/bart-large-cnn'
    print("Loading Summarization model (bart-large-cnn)...")
    summarizer_pipeline = pipeline("summarization", model="facebook/bart-large-cnn")

    # Generate the summary from the transcribed text
    # The max_length and min_length control the size of the generated summary
    summary_result = summarizer_pipeline(
        meeting_transcript,
        max_length=150,
        min_length=40,
        do_sample=False
    )

    generated_summary = summary_result[0]['summary_text']

    print("\n--- AI-Generated Meeting Summary ---")
    print(generated_summary)
    print("="*50 + "\n")

except FileNotFoundError:
    print(f"Error: Audio file not found at '{AUDIO_FILE_PATH}'")
    print("Please make sure you have an audio file named meeting_audio.wav in the current directory.")
except Exception as e:
    print(f"An error occurred: {e}")
    print("Ensure all required libraries are installed and you have sufficient memory (RAM/VRAM) to run the models locally.")

2025-11-27 07:06:49.868450: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764227210.119218      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764227210.184177      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Loading ASR model (whisper-base)... This may take a moment the first time.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/290M [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

preprocessor_config.json: 0.00B [00:00, ?B/s]

Device set to use cpu


Transcribing audio file: /kaggle/input/eng-hinbi-marathi-mix-audio/New Recording 220.m4a


  audio_input, sampling_rate = librosa.load(AUDIO_FILE_PATH, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



--- Full Meeting Transcript ---
 So what was I saying is that tomorrow at 9 a.m., I have to go to the hospital ani mok tyanantar mala havde havai, dupari baaravajapar antamikorat hi hain. So, agi dho bhaji ki baad dho se agi me apna office ka kankar sakti hum.
Loading Summarization model (bart-large-cnn)...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu
Your max_length is set to 150, but your input_length is only 88. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)



--- AI-Generated Meeting Summary ---
Tomorrow at 9 a.m., I have to go to the hospital ani mok tyanantar mala havde havai. So, agi dho bhaji ki baad dho se agi me apna office ka kankar sakti hum.

