<a href="https://colab.research.google.com/github/siddhamapple/Projects/blob/main/Meeting_audio_to_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install moviepy vosk pydub soundfile
# Importing required libraries
import os
import subprocess
import wave
import json
from vosk import Model, KaldiRecognizer
import moviepy.editor as mp

# Function to locate the FFmpeg executables (ffmpeg, ffprobe, ffplay) in a given directory
def setup_ffmpeg(ffmpeg_dir):
    """Locate the FFmpeg executables inside ``ffmpeg_dir``.

    The Windows file name (``ffmpeg.exe``) is looked up first. If it is not
    present, the extensionless name (``ffmpeg``) is tried so the same code
    also works where the binaries carry no ``.exe`` suffix (e.g. Linux).
    Whichever naming scheme matches is applied to ffprobe and ffplay too.

    Args:
        ffmpeg_dir: Directory expected to contain the FFmpeg binaries.

    Returns:
        A tuple ``(ffmpeg_path, ffprobe_path, ffplay_path)``. Only the
        ffmpeg path is checked for existence; the other two are derived
        from the same directory and suffix without being verified.

    Raises:
        FileNotFoundError: If neither ``ffmpeg.exe`` nor ``ffmpeg`` is found
            in ``ffmpeg_dir``.
    """
    exe_path = os.path.join(ffmpeg_dir, "ffmpeg.exe")
    plain_path = os.path.join(ffmpeg_dir, "ffmpeg")

    if os.path.exists(exe_path):
        suffix = ".exe"
    elif os.path.isfile(plain_path):
        # isfile (not exists) so that a sub-directory called "ffmpeg" is not
        # mistaken for the executable.
        suffix = ""
    else:
        raise FileNotFoundError(f"FFmpeg not found at {exe_path} or {plain_path}")

    # Return paths for ffmpeg executables, all using the detected suffix
    return tuple(
        os.path.join(ffmpeg_dir, tool + suffix)
        for tool in ("ffmpeg", "ffprobe", "ffplay")
    )
# Directory that holds the FFmpeg executables. `ffmpeg_directory` was used
# below without being defined anywhere in the visible code, which raises
# NameError. It is now read from the FFMPEG_DIR environment variable.
# NOTE(review): the fallback path is only a placeholder - confirm the real
# install location for this machine.
ffmpeg_directory = os.environ.get("FFMPEG_DIR", r"C:\ffmpeg\bin")

# Setup FFmpeg and get the paths to executables
ffmpeg_path, ffprobe_path, ffplay_path = setup_ffmpeg(ffmpeg_directory)
print(f"FFmpeg configured at: {ffmpeg_path}")

# Locations of the Vosk model directory, the input video, and the WAV file
# that the extraction step writes.
model_path = r"C:\Users\Ritika.Gupta\Downloads\vosk"
video_path = r"C:\Users\Ritika.Gupta\Downloads\MIDAS_UiPath_Process_20240603.mp4"
audio_output_path = r"C:\Users\Ritika.Gupta\Downloads\MIDAS_UiPath_Process_20240603.wav"
# Function to extract audio from video using ffmpeg
def extract_audio_from_video(ffmpeg_path, video_path, audio_output_path):
    """Run FFmpeg to write the video's audio track to a WAV file.

    The output is requested as 16-bit little-endian PCM, mono, 16 kHz, and an
    existing output file is overwritten. Failures are reported with ``print``
    and are not re-raised.

    Args:
        ffmpeg_path: Path to the ffmpeg executable.
        video_path: Path of the input video file.
        audio_output_path: Path of the WAV file to create.

    Returns:
        None, whether or not the extraction succeeded.
    """
    output_options = [
        "-vn",                   # drop the video stream
        "-acodec", "pcm_s16le",  # PCM signed 16-bit little-endian
        "-ac", "1",              # mono
        "-ar", "16000",          # 16 kHz sampling rate
        "-y",                    # overwrite the output file if it exists
    ]
    cmd = [ffmpeg_path, "-i", video_path, *output_options, audio_output_path]

    try:
        # check=True turns a non-zero ffmpeg exit status into CalledProcessError
        subprocess.run(cmd, check=True)
        print(f"Audio extracted to {audio_output_path}")
    except subprocess.CalledProcessError as err:
        print(f"Error during audio extraction with ffmpeg: {err}")
    except Exception as err:
        print(f"Unexpected error during audio extraction: {err}")

# Function to transcribe audio to text using Vosk
def transcribe_audio(audio_path, model_path):
    """Transcribe a WAV file to text with Vosk.

    The audio must be mono, 16-bit PCM, sampled at 16 kHz. It is fed to the
    recognizer in chunks of 4000 frames; each completed result and the final
    result are collected and joined with single spaces. Empty results are
    skipped so the transcript carries no stray or trailing spaces.

    Args:
        audio_path: Path of the WAV file to transcribe.
        model_path: Path of the Vosk model directory.

    Returns:
        The transcribed text (possibly an empty string), or ``None`` when the
        model path does not exist, the audio format is not accepted, or any
        error occurs. In the error case the message is printed.
    """
    try:
        if not os.path.exists(model_path):
            print("Please download the Vosk model for English with Indian accent from https://alphacephei.com/vosk/models.")
            return None

        model = Model(model_path)
        recognizer = KaldiRecognizer(model, 16000)

        segments = []
        # The context manager closes the WAV file on every exit path,
        # including the early return below and any recognizer exception.
        with wave.open(audio_path, "rb") as wf:
            if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getframerate() != 16000:
                print("Audio file must be WAV format mono PCM at 16kHz")
                return None

            # readframes() returns empty bytes at end of file, which stops iter()
            for data in iter(lambda: wf.readframes(4000), b""):
                if recognizer.AcceptWaveform(data):
                    result = json.loads(recognizer.Result())
                    segments.append(result.get("text", ""))

        # Collect whatever the recognizer still holds after the last chunk
        final_result = json.loads(recognizer.FinalResult())
        segments.append(final_result.get("text", ""))

        return " ".join(text for text in segments if text)
    except Exception as e:
        print(f"Error during transcription: {e}")
        return None
# Run the pipeline: pull the audio track out of the video, then transcribe it
extract_audio_from_video(ffmpeg_path, video_path, audio_output_path)
transcription_text = transcribe_audio(audio_output_path, model_path)

# Both None and an empty string count as "nothing transcribed"
if not transcription_text:
    print("No transcription generated.")
else:
    print("Transcription Text:")
    print(transcription_text)



In [None]:
"""
MTSL internship project
This code converts a meeting video into text. It first extracts the audio using FFmpeg, a powerful multimedia framework,
converting the video’s audio track to a mono WAV file in a specific format (16 kHz, 16-bit PCM).
The Vosk library is then employed to transcribe the audio: it processes the audio in chunks, recognizing speech and converting it into text.
This is achieved by reading audio data, running it through Vosk’s speech recognition model, and appending the recognized text to a transcription.
The result is the transcribed dialogue from the video, which is printed as text.
"""