In [1]:
# Install into the running kernel's environment (%pip rather than !pip); -q keeps the log short.
# soundfile is listed explicitly because the preprocessing cell imports it directly.
# openai-whisper and ffmpeg-python are pinned to the versions this notebook was last run with.
%pip install -q librosa soundfile openai-whisper==20240930 ffmpeg-python==0.2.0

Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/800.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━[0m [32m665.6/800.5 kB[0m [31m20.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting ffmpeg-python
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl.metadata (1.7 kB)
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting triton>=2.0.0 (from openai-whisper)
  Downloading triton-3.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_

In [2]:
import librosa
import numpy as np
import soundfile as sf

def preprocess_audio(input_file, output_file, noise_threshold=0.02, top_db=20):
    """
    Preprocess the audio: simple noise gating followed by silence trimming.

    The "noise reduction" here is a hard amplitude gate: every sample whose
    absolute value is not strictly greater than ``noise_threshold`` is
    replaced by zero. Leading and trailing silence is then trimmed and the
    result is written to ``output_file`` at the input's sampling rate.

    Args:
        input_file (str): Path to the input audio file.
        output_file (str): Path to save the processed audio.
        noise_threshold (float): Amplitude at or below which a sample is
            zeroed. Tune this per dataset; the default of 0.02 matches the
            previously hard-coded value.
        top_db (float): Threshold in decibels passed to
            ``librosa.effects.trim``; audio this far below the reference
            level is treated as silence. Defaults to 20.
    Returns:
        None. The processed audio is written to ``output_file``.
    """
    # sr=None keeps the file's native sampling rate instead of resampling.
    audio, sr = librosa.load(input_file, sr=None)

    # Amplitude gate: keep samples louder than the threshold, zero the rest.
    # A separate name is used so the raw signal is not overwritten.
    gated_audio = np.where(np.abs(audio) > noise_threshold, audio, 0)

    # Trim leading/trailing silence; the returned index interval is not needed.
    trimmed_audio, _ = librosa.effects.trim(gated_audio, top_db=top_db)

    # Save the processed audio at the original sampling rate.
    sf.write(output_file, trimmed_audio, sr)
    print(f"Processed audio saved to: {output_file}")

# Example usage (uncomment to run this step on its own):
# preprocess_audio("/content/Video_Demo_Intro_Audio.wav", "/content/meeting_audio_processed.wav")

Processed audio saved to: /content/meeting_audio_processed.wav


In [3]:
import whisper

def transcribe_audio(processed_audio_file, model_name="base"):
    """
    Transcribe audio using OpenAI's Whisper model.

    Args:
        processed_audio_file (str): Path to the preprocessed audio file.
        model_name (str): Whisper checkpoint to load. Options: 'tiny',
            'base', 'small', 'medium', 'large'. Defaults to 'base'.
    Returns:
        str: Transcribed text from the audio.
    """
    # Let Whisper pick the device: load_model() uses CUDA when it is
    # available and falls back to the CPU otherwise. Forcing
    # model.to('cuda') made this function fail on machines without a GPU.
    model = whisper.load_model(model_name)

    # Transcribe audio
    result = model.transcribe(processed_audio_file)

    # Extract and return the transcribed text
    transcribed_text = result["text"]
    print("Transcription completed.")
    return transcribed_text

# Example usage (uncomment to run this step on its own):
# transcription = transcribe_audio("/content/meeting_audio_processed.wav")
# print("Transcribed Text:\n", transcription)


100%|███████████████████████████████████████| 139M/139M [00:02<00:00, 68.8MiB/s]
  checkpoint = torch.load(fp, map_location=device)


Transcription completed.
Transcribed Text:
  Hello, my name is Shreya Patip and I am part of the Vastavik team. We are developing an innovative algorithm that converts 2D images of variable spectacles into 3D models.


In [None]:
def audio_transcription_pipeline(input_audio, output_audio):
    """
    Run the two-stage pipeline: clean up the audio, then transcribe it.

    Args:
        input_audio (str): Path of the raw audio file to read.
        output_audio (str): Path where the preprocessed audio is written;
            this file is what gets transcribed.
    Returns:
        str: Text transcribed from the preprocessed audio.
    """
    # Stage 1: write the cleaned-up audio to `output_audio`.
    preprocess_audio(input_audio, output_audio)

    # Stage 2: transcribe the file produced by stage 1 and hand back its text.
    return transcribe_audio(output_audio)

# Example usage: run the full pipeline on one recording.
# NOTE(review): both paths are absolute and look like a Colab-style /content
# workspace; adjust them when running elsewhere.
# Raw recording to read.
input_audio = "/content/Video_Demo_Intro_Audio.wav"
# Destination for the preprocessed audio; this is the file that gets transcribed.
output_audio = "/content/meeting_audio_processed.wav"
# Preprocess, then transcribe; the returned value is the transcribed text.
transcription_result = audio_transcription_pipeline(input_audio, output_audio)
print("Final Transcription:\n", transcription_result)
