## Transformer Model

In [None]:
! pip install -q transformers moviepy

In [None]:
import librosa
import torch
import moviepy.editor as mp
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

In [None]:
# Load the video using MoviePy. The context manager closes the clip (and the
# ffmpeg reader processes behind it) once the audio has been exported.
with mp.VideoFileClip("sample.mp4") as clip:
    # Separate audio track from the video. clip.audio is None when the file
    # has no audio stream; fail with a clear message instead of an
    # AttributeError on None.
    audio = clip.audio
    if audio is None:
        raise ValueError("sample.mp4 does not contain an audio track")
    # write_audiofile() has no useful return value, so `audio` keeps referring
    # to the audio clip rather than to the result of the call.
    audio.write_audiofile("temp_audio.wav")


In [None]:
# Load the pre-trained tokenizer and CTC model from the same checkpoint.
checkpoint = "facebook/wav2vec2-base-960h"
tokenizer = Wav2Vec2Tokenizer.from_pretrained(checkpoint)
model = Wav2Vec2ForCTC.from_pretrained(checkpoint)

In [None]:
# Read the extracted audio track (any audio file works here), resampled to
# 16 kHz while loading.
speech, rate = librosa.load(path="temp_audio.wav", sr=16_000)

In [None]:
# Embed an audio player for the extracted track as this cell's output;
# autoplay=True asks the player to start playback automatically.
import IPython.display as display
display.Audio("temp_audio.wav", autoplay=True)

In [None]:
# Encode the waveform as PyTorch tensors and keep only the model input.
input_values = tokenizer(speech, return_tensors="pt")["input_values"]

In [None]:
# Echo the encoded input as the cell output for inspection.
input_values

In [None]:
#Store logits (non-normalized predictions)
# This is inference only: running the forward pass under no_grad() stops
# autograd from recording a graph, so the logits carry no gradient history.
with torch.no_grad():
    logits = model(input_values).logits

In [None]:
# Echo the logits as the cell output for inspection.
logits

In [None]:
# Pick the highest-scoring vocabulary id along the last axis of the logits.
predicted_ids = logits.argmax(dim=-1)

In [None]:
# Turn the predicted ids of the first batch entry back into text.
transcriptions = tokenizer.decode(predicted_ids[0])

In [None]:
# Show the decoded transcript.
print(transcriptions)

## OpenAI Whisper

In [None]:
!pip install git+https://github.com/openai/whisper.git 

In [None]:
# import
import whisper

model = whisper.load_model("base")

# load audio and pad/trim it to fit 30 seconds (anything beyond the first
# 30 seconds of the file is discarded by pad_or_trim)
audio = whisper.load_audio("Pipeline.mp4")
audio = whisper.pad_or_trim(audio)

# make log-Mel spectrogram and move to the same device as the model; the
# number of mel bins is taken from the model so the cell keeps working if a
# checkpoint with a different input size is loaded above
mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)

# detect the spoken language
_, probs = model.detect_language(mel)
detected_language = max(probs, key=probs.get)
print(f"Detected language: {detected_language}")

# decode the audio
# - language: reuse the detection above so decode() does not detect it again
# - fp16: half precision is only enabled off-CPU; on a CPU-only runtime the
#   default fp16=True can fail or fall back with warnings
options = whisper.DecodingOptions(
    language=detected_language,
    fp16=model.device.type != "cpu",
)
result = whisper.decode(model, mel, options)

# print the recognized text
print(result.text)

In [None]:
import whisper
import glob, os

# Define chunk duration in seconds (adjust as needed). Whisper decodes windows
# of at most 30 seconds, so larger values are clamped by process_large_video.
chunk_duration = 30

def process_large_video(video_path, model_name="base"):
    """
    Transcribe a long video by decoding its audio with Whisper chunk by chunk.

    The audio track is decoded once with whisper.load_audio() and then sliced
    in memory into consecutive chunks of ``chunk_duration`` seconds (clamped to
    Whisper's 30-second window). Slicing the samples directly gives exact chunk
    boundaries, so no audio is dropped between chunks, and no intermediate
    files are written to the working directory.

    Note that the whole decoded audio track is held in memory (16 kHz mono
    float32 samples).

    Args:
        video_path: Path to the large video file.
        model_name: Name of the Whisper model to use (default: base).
    Returns:
        str: The combined transcribed text from all chunks, one line per
        chunk (each chunk's text is followed by a newline).
    Raises:
        ValueError: If the module-level ``chunk_duration`` is not positive.
        RuntimeError: Raised by whisper.load_audio() when ffmpeg cannot decode
            the file.
    """
    # pad_or_trim() cuts its input to Whisper's fixed window, so a chunk longer
    # than that window would silently lose its tail; clamp instead.
    window_seconds = min(chunk_duration, whisper.audio.CHUNK_LENGTH)
    samples_per_chunk = int(window_seconds * whisper.audio.SAMPLE_RATE)
    if samples_per_chunk <= 0:
        raise ValueError(f"chunk_duration must be positive, got {chunk_duration!r}")

    model = whisper.load_model(model_name)

    # Half precision is only enabled off-CPU; with the default fp16=True a
    # CPU-only runtime can fail or fall back with warnings.
    options = whisper.DecodingOptions(fp16=model.device.type != "cpu")

    # Decode the complete audio track once (ffmpeg is invoked by Whisper with
    # an argument list, so paths containing spaces are handled correctly).
    audio = whisper.load_audio(str(video_path))

    chunk_texts = []
    for start in range(0, len(audio), samples_per_chunk):
        # The final chunk is usually shorter; pad_or_trim() zero-pads it to
        # the window length the model expects.
        chunk = whisper.pad_or_trim(audio[start:start + samples_per_chunk])

        # Take the mel bin count from the model so that checkpoints with a
        # different input size work with the model_name parameter.
        mel = whisper.log_mel_spectrogram(chunk, n_mels=model.dims.n_mels).to(model.device)

        result = whisper.decode(model, mel, options)
        chunk_texts.append(result.text)

    # One line per chunk, including a trailing newline after the last one.
    return "".join(f"{text}\n" for text in chunk_texts)

In [None]:
# Example usage: transcribe "Pipeline.mp4" with the default model and print
# the combined text returned by process_large_video.
video_path = "Pipeline.mp4"
transcribed_text = process_large_video(video_path)
print(transcribed_text)