In [None]:
# Video to Text Converter
# Based on https://github.com/Carleslc/AudioToText with updates for video processing

# Install necessary dependencies first (for Google Colab)
import sys
import subprocess

# Install required packages first
print("Installing required packages...")
!pip install SpeechRecognition pydub ffmpeg-python -q
!apt-get update -qq
!apt-get install -y ffmpeg -qq
print("Dependencies installed successfully!")

# Now import the required modules
import os
import speech_recognition as sr
import tempfile
from pydub import AudioSegment
from pydub.silence import split_on_silence
from google.colab import files
import time
import matplotlib.pyplot as plt
import numpy as np
import IPython.display as ipd
from IPython.display import display, HTML

# Extract audio from video file
def extract_audio_from_video(video_path, output_audio_path=None):
    """
    Extract the audio track of a video into a mono, 44.1 kHz, 16-bit PCM WAV file.

    Args:
        video_path: Path of the input video, passed to ffmpeg's ``-i`` option.
        output_audio_path: Destination WAV path. When ``None`` a new temporary
            ``.wav`` file is created; the caller is responsible for deleting it.

    Returns:
        The path of the WAV file that was written.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
        OSError: If the ffmpeg executable cannot be started.
    """
    owns_output = output_audio_path is None
    if owns_output:
        # mkstemp creates the file atomically; tempfile.mktemp is deprecated
        # because another process can claim the name before it is used.
        fd, output_audio_path = tempfile.mkstemp(suffix='.wav')
        os.close(fd)

    cmd = [
        'ffmpeg',
        '-y',  # Overwrite the output (it may already exist) without prompting
        '-i', video_path,
        '-vn',  # No video
        '-acodec', 'pcm_s16le',  # Audio codec
        '-ar', '44100',  # Sample rate
        '-ac', '1',  # Mono
        output_audio_path
    ]

    try:
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if result.returncode != 0:
            # Surface the tail of ffmpeg's log instead of silently returning
            # the path of a missing or empty file.
            details = result.stderr.decode(errors='replace')[-500:]
            raise RuntimeError(
                f"ffmpeg failed to extract audio from {video_path!r}: {details}"
            )
    except Exception:
        # Do not leave behind the temporary file this function created
        if owns_output and os.path.exists(output_audio_path):
            os.remove(output_audio_path)
        raise
    return output_audio_path

def get_large_audio_transcription(audio_path, language="en-US", chunk_size_ms=60000):
    """
    Split a WAV file into chunks and transcribe each one with Google Speech Recognition.

    The audio is first split on silence; when that yields fewer than two
    chunks it is cut into fixed windows of ``chunk_size_ms`` milliseconds.

    Args:
        audio_path: Path of the WAV file to transcribe.
        language: Language code passed to ``recognize_google``.
        chunk_size_ms: Window length used by the time-based fallback split.

    Returns:
        ``(whole_text, timestamps)``. ``whole_text`` is the text of every
        recognized chunk, each followed by one space. ``timestamps`` is a list
        of ``{"start", "end", "text"}`` dicts (seconds) for recognized chunks.

    NOTE(review): times are the running sum of chunk lengths, so any silence
    dropped by ``split_on_silence`` is not reflected in them - confirm this is
    acceptable for subtitle sync.
    """
    r = sr.Recognizer()
    sound = AudioSegment.from_wav(audio_path)

    # Split audio where silence is detected
    chunks = split_on_silence(
        sound,
        min_silence_len=500,  # minimum silence length in ms
        silence_thresh=sound.dBFS-14,  # silence threshold
        keep_silence=500  # keep some silence at the beginning and end
    )

    # If chunks are too few (or silence detection didn't work well), use time-based chunking
    if len(chunks) < 2:
        chunks = [sound[i:i+chunk_size_ms] for i in range(0, len(sound), chunk_size_ms)]

    print(f"Audio split into {len(chunks)} chunks")

    recognized = []
    timestamps = []
    current_time = 0

    # TemporaryDirectory deletes the exported chunk files when the block exits
    with tempfile.TemporaryDirectory() as folder_name:
        for i, chunk in enumerate(chunks):
            chunk_filename = os.path.join(folder_name, f"chunk{i}.wav")
            # export() returns the open file handle; close it right away
            chunk.export(chunk_filename, format="wav").close()
            chunk_duration = len(chunk) / 1000.0  # Convert ms to seconds

            with sr.AudioFile(chunk_filename) as source:
                audio_data = r.record(source)

            try:
                # Use Google Speech Recognition
                text = r.recognize_google(audio_data, language=language)
            except sr.UnknownValueError:
                print(f"Speech not recognized in chunk {i+1}")
            except sr.RequestError as e:
                print(f"API error in chunk {i+1}: {e}")
            else:
                timestamps.append({
                    "start": current_time,
                    "end": current_time + chunk_duration,
                    "text": text
                })
                recognized.append(text)
                print(f"Chunk {i+1}/{len(chunks)} processed")

            # The clock advances whether or not the chunk was recognized
            current_time += chunk_duration

    whole_text = "".join(f"{text} " for text in recognized)
    return whole_text, timestamps

def generate_srt(timestamps, filename="subtitles.srt"):
    """
    Write an SRT subtitle file from a list of timed text entries.

    Args:
        timestamps: Iterable of dicts with ``"start"`` and ``"end"`` (seconds,
            may be fractional) and ``"text"`` keys.
        filename: Path of the SRT file to create or overwrite.

    Returns:
        ``filename``.
    """
    def _srt_time(seconds):
        # Integer milliseconds keep the fractional part and do not wrap at
        # 24 hours the way time.gmtime() does.
        total_ms = int(round(seconds * 1000))
        hours, rest = divmod(total_ms, 3_600_000)
        minutes, rest = divmod(rest, 60_000)
        secs, millis = divmod(rest, 1000)
        return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"

    # UTF-8 so non-ASCII transcripts are written regardless of the locale
    with open(filename, "w", encoding="utf-8") as srt_file:
        for index, item in enumerate(timestamps, start=1):
            start = _srt_time(item["start"])
            end = _srt_time(item["end"])
            srt_file.write(f"{index}\n{start} --> {end}\n{item['text']}\n\n")

    return filename

def visualize_audio(audio_path):
    """
    Plot the waveform of a WAV file and show an inline audio player for it.
    """
    segment = AudioSegment.from_wav(audio_path)
    amplitudes = np.array(segment.get_array_of_samples())

    # len(segment) is in milliseconds; spread the samples over the duration in seconds
    duration_seconds = len(segment) / 1000
    time_axis = np.linspace(0, duration_seconds, len(amplitudes))

    plt.figure(figsize=(14, 4))
    plt.plot(time_axis, amplitudes)
    plt.title('Audio Waveform')
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.show()

    # Inline player for the same file
    player = ipd.Audio(audio_path)
    display(player)
# Main function to run the entire process
def main():
    """
    Run the interactive Colab workflow: upload a video, extract and plot its
    audio, transcribe it with Google Speech Recognition and download the SRT.

    Returns:
        ``(text, timestamps, srt_filename)`` on success, or ``None`` when no
        file was uploaded.
    """
    print("Upload a video file to transcribe:")
    uploaded = files.upload()

    if not uploaded:
        print("No file was uploaded.")
        return None

    # Only the first uploaded file is processed
    video_filename = next(iter(uploaded))
    print(f"Processing video: {video_filename}")

    # Extract audio from video
    audio_path = extract_audio_from_video(video_filename)
    try:
        print(f"Audio extracted to: {audio_path}")

        # Visualize the audio
        print("Audio visualization:")
        visualize_audio(audio_path)

        # Select language
        print("\nSelect language for transcription:")
        print("1. English (en-US)")
        print("2. Japanese (ja-JP)")
        print("3. Spanish (es-ES)")
        print("4. French (fr-FR)")
        print("5. German (de-DE)")

        language_codes = {
            "1": "en-US",
            "2": "ja-JP",
            "3": "es-ES",
            "4": "fr-FR",
            "5": "de-DE"
        }

        # Strip so that input such as "2 " still selects an entry;
        # anything unrecognized falls back to English.
        choice = input("Enter your choice (1-5): ").strip()
        language = language_codes.get(choice, "en-US")

        # Transcribe audio
        print(f"\nTranscribing audio in {language}...")
        start_time = time.time()
        text, timestamps = get_large_audio_transcription(audio_path, language=language)
        end_time = time.time()

        print(f"\nTranscription completed in {end_time - start_time:.2f} seconds")
        print("\nTranscribed text:")
        print(text)

        # Generate SRT file
        srt_filename = generate_srt(timestamps)
        print(f"\nGenerated SRT file: {srt_filename}")
        files.download(srt_filename)

        return text, timestamps, srt_filename
    finally:
        # Remove the temporary WAV even when a step above fails
        if os.path.exists(audio_path):
            os.remove(audio_path)

# Run the main function
if __name__ == "__main__":
    main()

Installing required packages...
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m32.9/32.9 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hW: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Dependencies installed successfully!
Upload a video file to transcribe:


In [None]:

# Install necessary packages (for Google Colab)
!pip install openai-whisper ffmpeg-python pydub -q
!apt-get update -qq
!apt-get install -y ffmpeg -qq

import os
import subprocess
import tempfile
import whisper
from pydub import AudioSegment
from google.colab import files
import matplotlib.pyplot as plt
import numpy as np
import IPython.display as ipd
from IPython.display import display

# Extract audio from video
def extract_audio_from_video(video_path, output_audio_path=None):
    """
    Extract the audio track of a video into a mono, 16 kHz, 16-bit PCM WAV file.

    Args:
        video_path: Path of the input video, passed to ffmpeg's ``-i`` option.
        output_audio_path: Destination WAV path. When ``None`` a new temporary
            ``.wav`` file is created; the caller is responsible for deleting it.

    Returns:
        The path of the WAV file that was written.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
        OSError: If the ffmpeg executable cannot be started.
    """
    owns_output = output_audio_path is None
    if owns_output:
        # mkstemp creates the file atomically; tempfile.mktemp is deprecated and racy
        fd, output_audio_path = tempfile.mkstemp(suffix='.wav')
        os.close(fd)

    cmd = [
        'ffmpeg',
        '-y',  # overwrite the output (it may already exist) without prompting
        '-i', video_path,
        '-vn',  # disable video
        '-acodec', 'pcm_s16le',
        '-ar', '16000',
        '-ac', '1',
        output_audio_path
    ]
    try:
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if result.returncode != 0:
            # Report the tail of ffmpeg's log rather than returning a bad path
            details = result.stderr.decode(errors='replace')[-500:]
            raise RuntimeError(
                f"ffmpeg failed to extract audio from {video_path!r}: {details}"
            )
    except Exception:
        # Do not leave behind the temporary file this function created
        if owns_output and os.path.exists(output_audio_path):
            os.remove(output_audio_path)
        raise
    return output_audio_path

# Transcribe using Whisper
def transcribe_with_whisper(audio_path, language="en", model_name="base"):
    """
    Transcribe an audio file with OpenAI Whisper.

    Args:
        audio_path: Path of the audio file handed to ``model.transcribe``.
        language: Language code passed through to Whisper.
        model_name: Whisper checkpoint to load ("base", "small", "medium",
            "large"). Defaults to "base", the value that used to be hard-coded.

    Returns:
        ``(text, segments)`` taken from the ``"text"`` and ``"segments"``
        entries of Whisper's result.
    """
    model = whisper.load_model(model_name)
    print("Transcribing with Whisper...")
    result = model.transcribe(audio_path, language=language)
    return result["text"], result["segments"]

# Generate SRT
def generate_srt_from_whisper_segments(segments, filename="subtitles.srt"):
    """
    Write Whisper segments to an SRT subtitle file.

    Args:
        segments: Iterable of dicts with ``"start"``, ``"end"`` (seconds) and
            ``"text"`` keys.
        filename: Path of the SRT file to create or overwrite.

    Returns:
        ``filename``.
    """
    # UTF-8 so non-ASCII transcripts (e.g. Japanese) are written regardless of the locale
    with open(filename, "w", encoding="utf-8") as f:
        for i, segment in enumerate(segments, start=1):
            start = format_timestamp(segment["start"])
            end = format_timestamp(segment["end"])
            # Segment text may carry surrounding whitespace; keep cues clean
            text = segment["text"].strip()
            f.write(f"{i}\n{start} --> {end}\n{text}\n\n")
    return filename

# Format seconds to SRT time format
def format_timestamp(seconds):
    """
    Convert a time offset in seconds to the SRT ``HH:MM:SS,mmm`` format.

    Args:
        seconds: Non-negative offset in seconds; may be fractional.

    Returns:
        The formatted timestamp string.
    """
    # Work in integer milliseconds. Subtracting int(seconds) from the float
    # and truncating turns values such as 2.3 into 299 ms instead of 300 ms.
    total_ms = int(round(seconds * 1000))
    hrs, remainder = divmod(total_ms, 3_600_000)
    mins, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hrs:02}:{mins:02}:{secs:02},{millis:03}"

# Visualize Audio
def visualize_audio(audio_path):
    """Draw the waveform of a WAV file, then show an inline audio player."""
    clip = AudioSegment.from_wav(audio_path)
    sample_values = np.array(clip.get_array_of_samples())
    # len(clip) is milliseconds, so the x axis runs from 0 to the duration in seconds
    seconds_axis = np.linspace(0, len(clip) / 1000, len(sample_values))

    plt.figure(figsize=(14, 4))
    plt.plot(seconds_axis, sample_values)
    plt.title('Audio Waveform')
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.show()

    audio_widget = ipd.Audio(audio_path)
    display(audio_widget)

# Main function
def main():
    """
    Run the interactive Colab workflow: upload a video, extract and plot its
    audio, transcribe it with Whisper and download the resulting SRT file.

    Returns:
        ``(text, segments, srt_filename)`` on success, or ``None`` when no
        file was uploaded.
    """
    print("Upload a video file to transcribe:")
    uploaded = files.upload()
    if not uploaded:
        print("No file uploaded.")
        return None

    # Only the first uploaded file is processed
    video_filename = next(iter(uploaded))
    print(f"Processing video: {video_filename}")

    audio_path = extract_audio_from_video(video_filename)
    try:
        print(f"Extracted audio: {audio_path}")

        visualize_audio(audio_path)

        print("\nSelect language:")
        print("1. English\n2. Japanese\n3. Spanish\n4. French\n5. German")
        lang_map = {
            "1": "en", "2": "ja", "3": "es", "4": "fr", "5": "de"
        }
        # Unrecognized input falls back to English
        choice = input("Enter choice (1-5): ").strip()
        language = lang_map.get(choice, "en")

        text, segments = transcribe_with_whisper(audio_path, language=language)
        print("\nTranscription complete:")
        print(text)

        srt_filename = generate_srt_from_whisper_segments(segments)
        print(f"\nGenerated subtitle file: {srt_filename}")
        files.download(srt_filename)

        return text, segments, srt_filename
    finally:
        # Remove the temporary WAV even when a step above fails
        if os.path.exists(audio_path):
            os.remove(audio_path)

# Run the main
if __name__ == "__main__":
    main()


In [None]:
# Install necessary packages
!pip install openai-whisper ffmpeg-python pydub -q
!apt-get update -qq
!apt-get install -y ffmpeg -qq

import os
import subprocess
import tempfile
import whisper
from pydub import AudioSegment
from google.colab import files
import matplotlib.pyplot as plt
import numpy as np
import IPython.display as ipd
from IPython.display import display

# Extract audio from video
def extract_audio_from_video(video_path, output_audio_path=None):
    """
    Extract the audio track of a video into a mono, 16 kHz, 16-bit PCM WAV file.

    Args:
        video_path: Path of the input video, passed to ffmpeg's ``-i`` option.
        output_audio_path: Destination WAV path. When ``None`` a new temporary
            ``.wav`` file is created; the caller is responsible for deleting it.

    Returns:
        The path of the WAV file that was written.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
        OSError: If the ffmpeg executable cannot be started.
    """
    owns_output = output_audio_path is None
    if owns_output:
        # mkstemp creates the file atomically; tempfile.mktemp is deprecated and racy
        fd, output_audio_path = tempfile.mkstemp(suffix='.wav')
        os.close(fd)
    cmd = [
        'ffmpeg',
        '-y',  # overwrite the output (it may already exist) without prompting
        '-i', video_path,
        '-vn',  # no video
        '-acodec', 'pcm_s16le',
        '-ar', '16000',
        '-ac', '1',
        output_audio_path
    ]
    try:
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if result.returncode != 0:
            # Report the tail of ffmpeg's log rather than returning a bad path
            details = result.stderr.decode(errors='replace')[-500:]
            raise RuntimeError(
                f"ffmpeg failed to extract audio from {video_path!r}: {details}"
            )
    except Exception:
        # Do not leave behind the temporary file this function created
        if owns_output and os.path.exists(output_audio_path):
            os.remove(output_audio_path)
        raise
    return output_audio_path

# Format timestamp for SRT
def format_timestamp(seconds):
    """
    Convert a time offset in seconds to the SRT ``HH:MM:SS,mmm`` format.

    Args:
        seconds: Non-negative offset in seconds; may be fractional.

    Returns:
        The formatted timestamp string.
    """
    # Round once to whole milliseconds. Truncating the float remainder
    # loses a millisecond for inputs such as 2.3 (299 instead of 300).
    total_ms = int(round(seconds * 1000))
    hrs, remainder = divmod(total_ms, 3_600_000)
    mins, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hrs:02}:{mins:02}:{secs:02},{millis:03}"

# Generate SRT file
def generate_srt_from_whisper_segments(segments, filename="subtitles.srt"):
    """
    Write Whisper segments to an SRT subtitle file.

    Args:
        segments: Iterable of dicts with ``"start"``, ``"end"`` (seconds) and
            ``"text"`` keys.
        filename: Path of the SRT file to create or overwrite.

    Returns:
        ``filename``.
    """
    # UTF-8 so non-ASCII transcripts (e.g. Japanese) are written regardless of the locale
    with open(filename, "w", encoding="utf-8") as f:
        for i, segment in enumerate(segments, start=1):
            start = format_timestamp(segment["start"])
            end = format_timestamp(segment["end"])
            # Segment text may carry surrounding whitespace; keep cues clean
            text = segment["text"].strip()
            f.write(f"{i}\n{start} --> {end}\n{text}\n\n")
    return filename

# Visualize Audio
def visualize_audio(audio_path):
    """Show the waveform plot and an inline player for a WAV file."""
    wav = AudioSegment.from_wav(audio_path)
    waveform = np.array(wav.get_array_of_samples())
    n_samples = len(waveform)
    length_s = len(wav) / 1000  # pydub lengths are in milliseconds
    x_values = np.linspace(0, length_s, n_samples)

    plt.figure(figsize=(14, 4))
    plt.plot(x_values, waveform)
    plt.title('Audio Waveform')
    plt.xlabel('Time (s)')
    plt.ylabel('Amplitude')
    plt.show()

    display(ipd.Audio(audio_path))

# Transcribe with Whisper
def transcribe_with_whisper(audio_path, language="en", model_name="base"):
    """
    Transcribe an audio file with OpenAI Whisper.

    Args:
        audio_path: Path of the audio file handed to ``model.transcribe``.
        language: Language code passed through to Whisper.
        model_name: Whisper checkpoint to load (e.g. "base", "medium",
            "large"). Defaults to "base", the value that used to be hard-coded.

    Returns:
        ``(text, segments)`` taken from the ``"text"`` and ``"segments"``
        entries of Whisper's result.
    """
    model = whisper.load_model(model_name)
    print("Transcribing...")
    result = model.transcribe(audio_path, language=language)
    return result["text"], result["segments"]

# Burn subtitles into video using ffmpeg
def burn_subtitles_into_video(video_path, srt_path, output_path=None):
    """
    Render an SRT file onto a video with ffmpeg's ``subtitles`` filter.

    The audio stream is copied unchanged (``-c:a copy``).

    Args:
        video_path: Path of the input video.
        srt_path: Path of the SRT file to burn in.
        output_path: Destination path. When ``None`` a new temporary ``.mp4``
            file is created; the caller is responsible for deleting it.

    Returns:
        The path of the subtitled video.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
        OSError: If the ffmpeg executable cannot be started.
    """
    owns_output = output_path is None
    if owns_output:
        # mkstemp creates the file atomically; tempfile.mktemp is deprecated and racy
        fd, output_path = tempfile.mkstemp(suffix='.mp4')
        os.close(fd)
    # NOTE(review): srt_path is inserted into the filter expression verbatim;
    # paths containing ':' or quotes would need ffmpeg filter escaping.
    cmd = [
        'ffmpeg',
        '-y',  # overwrite the output (it may already exist) without prompting
        '-i', video_path,
        '-vf', f"subtitles={srt_path}",
        '-c:a', 'copy',
        output_path
    ]
    try:
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if result.returncode != 0:
            # Report the tail of ffmpeg's log rather than returning a bad path
            details = result.stderr.decode(errors='replace')[-500:]
            raise RuntimeError(
                f"ffmpeg failed to burn subtitles into {video_path!r}: {details}"
            )
    except Exception:
        # Do not leave behind the temporary file this function created
        if owns_output and os.path.exists(output_path):
            os.remove(output_path)
        raise
    return output_path

# Main flow
def main():
    """
    Run the interactive Colab workflow: upload a video, transcribe it with
    Whisper, burn the subtitles into the video and download the result.

    Returns:
        The path of the subtitled video, or ``None`` when no file was uploaded.
    """
    print("Upload a video file:")
    uploaded = files.upload()
    if not uploaded:
        print("No file uploaded.")
        return None

    # Only the first uploaded file is processed
    video_filename = next(iter(uploaded))
    print(f"Processing: {video_filename}")

    audio_path = extract_audio_from_video(video_filename)
    try:
        print(f"Extracted audio to: {audio_path}")
        visualize_audio(audio_path)

        print("\nChoose Language:")
        print("1. English\n2. Japanese\n3. Spanish\n4. French\n5. German")
        lang_map = {
            "1": "en", "2": "ja", "3": "es", "4": "fr", "5": "de"
        }
        # Unrecognized input falls back to English
        choice = input("Enter choice (1-5): ").strip()
        language = lang_map.get(choice, "en")

        text, segments = transcribe_with_whisper(audio_path, language=language)
    finally:
        # The WAV is only needed for transcription; remove it even on failure
        if os.path.exists(audio_path):
            os.remove(audio_path)

    print("\nTranscription:")
    print(text)

    srt_path = generate_srt_from_whisper_segments(segments)
    print(f"\nSRT file created: {srt_path}")

    output_video_path = burn_subtitles_into_video(video_filename, srt_path)
    print(f"\nVideo with captions created: {output_video_path}")

    files.download(output_video_path)
    return output_video_path

# Run the main
if __name__ == "__main__":
    main()


In [None]:
!pip install openai-whisper ffmpeg-python pydub argostranslate -q
!apt-get install -y ffmpeg -qq


In [None]:
import os
import subprocess
import tempfile
import whisper
from pydub import AudioSegment
from google.colab import files
from argostranslate import package, translate

# Setup Argos Translate
def setup_translation(source_lang_code, target_lang_code):
    """
    Return an Argos Translate translation object for a language pair.

    The matching language package is downloaded and installed first when the
    package index offers one.

    Args:
        source_lang_code: Code of the language to translate from.
        target_lang_code: Code of the language to translate into.

    Returns:
        An object exposing ``translate(text)``, or ``None`` when source and
        target are the same language (no translation needed).

    Raises:
        RuntimeError: If no translation for the pair is installed afterwards.
    """
    if source_lang_code == target_lang_code:
        return None

    # The available-package list is read from a local index, which has to be
    # refreshed before it can be searched on a fresh install.
    package.update_package_index()
    available_packages = package.get_available_packages()
    matching = next(
        (p for p in available_packages if p.from_code == source_lang_code and p.to_code == target_lang_code),
        None
    )

    if matching:
        download_path = matching.download()
        package.install_from_path(download_path)

    # Translations are obtained from installed Language objects, not from the
    # installed Package objects.
    languages_by_code = {lang.code: lang for lang in translate.get_installed_languages()}
    source_lang = languages_by_code.get(source_lang_code)
    target_lang = languages_by_code.get(target_lang_code)
    if source_lang is not None and target_lang is not None:
        translation = source_lang.get_translation(target_lang)
        if translation is not None:
            return translation

    raise RuntimeError(f"No translator found for {source_lang_code} → {target_lang_code}")

# Extract audio from video
def extract_audio_from_video(video_path):
    """
    Extract the audio track of a video into a temporary mono, 16 kHz, 16-bit PCM WAV file.

    Args:
        video_path: Path of the input video, passed to ffmpeg's ``-i`` option.

    Returns:
        The path of the temporary WAV file; the caller must delete it.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
        OSError: If the ffmpeg executable cannot be started.
    """
    # mkstemp creates the file atomically; tempfile.mktemp is deprecated and racy
    fd, output_audio_path = tempfile.mkstemp(suffix='.wav')
    os.close(fd)
    # '-y' lets ffmpeg overwrite the placeholder file created above
    cmd = ['ffmpeg', '-y', '-i', video_path, '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', output_audio_path]
    try:
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if result.returncode != 0:
            details = result.stderr.decode(errors='replace')[-500:]
            raise RuntimeError(f"ffmpeg failed to extract audio from {video_path!r}: {details}")
    except Exception:
        # Do not leave the temporary file behind on failure
        if os.path.exists(output_audio_path):
            os.remove(output_audio_path)
        raise
    return output_audio_path

# Transcribe audio using Whisper
def transcribe_with_whisper(audio_path):
    """
    Transcribe an audio file with the Whisper "base" model.

    Returns:
        ``(text, segments)`` from the ``"text"`` and ``"segments"`` entries
        of Whisper's result.
    """
    whisper_model = whisper.load_model("base")
    transcription = whisper_model.transcribe(audio_path)
    full_text = transcription["text"]
    timed_segments = transcription["segments"]
    return full_text, timed_segments

# Format timestamp for SRT
def format_timestamp(seconds):
    """
    Convert a time offset in seconds to the SRT ``HH:MM:SS,mmm`` format.

    Args:
        seconds: Non-negative offset in seconds; may be fractional.

    Returns:
        The formatted timestamp string.
    """
    # Round once to whole milliseconds. Truncating the float remainder
    # loses a millisecond for inputs such as 2.3 (299 instead of 300).
    total_ms = int(round(seconds * 1000))
    hrs, remainder = divmod(total_ms, 3_600_000)
    mins, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hrs:02}:{mins:02}:{secs:02},{millis:03}"

# Generate translated SRT
def generate_translated_srt(segments, translator, filename="translated_subtitles.srt"):
    """
    Write Whisper segments to an SRT file, translating each cue when possible.

    Args:
        segments: Iterable of dicts with ``"start"``, ``"end"`` (seconds) and
            ``"text"`` keys.
        translator: Object with a ``translate(text)`` method, or a falsy value
            to write the original text unchanged.
        filename: Path of the SRT file to create or overwrite.

    Returns:
        ``filename``.
    """
    # UTF-8 so translated text in any script is written regardless of the locale
    with open(filename, "w", encoding="utf-8") as f:
        for i, segment in enumerate(segments, start=1):
            start = format_timestamp(segment["start"])
            end = format_timestamp(segment["end"])
            # Segment text may carry surrounding whitespace; keep cues clean
            source_text = segment["text"].strip()
            translated_text = translator.translate(source_text) if translator else source_text
            f.write(f"{i}\n{start} --> {end}\n{translated_text}\n\n")
    return filename

# Main
def main():
    """
    Run the interactive Colab workflow: upload a video, transcribe it with
    Whisper, translate the subtitles and download the resulting SRT file.
    """
    print("Upload a video file to transcribe and translate:")
    uploaded = files.upload()
    if not uploaded:
        print("No file uploaded.")
        return

    # Only the first uploaded file is processed
    video_filename = next(iter(uploaded))
    audio_path = extract_audio_from_video(video_filename)
    try:
        text, segments = transcribe_with_whisper(audio_path)
    finally:
        # The WAV is only needed for transcription; remove it even on failure
        if os.path.exists(audio_path):
            os.remove(audio_path)

    print("\nChoose target language:")
    print("1. English\n2. Japanese\n3. Spanish\n4. French\n5. German")
    lang_map = {"1": "en", "2": "ja", "3": "es", "4": "fr", "5": "de"}
    # Unrecognized input falls back to English
    choice = input("Enter choice (1-5): ").strip()
    target_lang = lang_map.get(choice, "en")

    # NOTE(review): the source language is fixed to "en" although Whisper
    # auto-detects the spoken language; non-English videos will be translated
    # with the wrong source model. transcribe_with_whisper would need to
    # expose the detected language to fix this.
    translator = setup_translation("en", target_lang)

    # Generate and save translated subtitles
    srt_file = generate_translated_srt(segments, translator)
    print(f"Subtitles saved as: {srt_file}")

    # Download the file
    files.download(srt_file)

main()


In [None]:
!pip install git+https://github.com/openai/whisper.git
!pip install pydub
!pip install requests
!sudo apt-get install ffmpeg

In [None]:
import os
import subprocess
import tempfile
import whisper
from google.colab import files

# Step 1: Extract audio from video
def extract_audio_from_video(video_path):
    """
    Extract the audio track of a video into a temporary mono, 16 kHz, 16-bit PCM WAV file.

    Args:
        video_path: Path of the input video, passed to ffmpeg's ``-i`` option.

    Returns:
        The path of the temporary WAV file; the caller must delete it.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
        OSError: If the ffmpeg executable cannot be started.
    """
    # mkstemp creates the file atomically; tempfile.mktemp is deprecated and racy
    fd, output_audio_path = tempfile.mkstemp(suffix='.wav')
    os.close(fd)
    # '-y' lets ffmpeg overwrite the placeholder file created above
    cmd = ['ffmpeg', '-y', '-i', video_path, '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', output_audio_path]
    try:
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if result.returncode != 0:
            details = result.stderr.decode(errors='replace')[-500:]
            raise RuntimeError(f"ffmpeg failed to extract audio from {video_path!r}: {details}")
    except Exception:
        # Do not leave the temporary file behind on failure
        if os.path.exists(output_audio_path):
            os.remove(output_audio_path)
        raise
    return output_audio_path

# Step 2: Transcribe with Whisper (auto language detection)
def transcribe_with_whisper(audio_path):
    """
    Transcribe an audio file with the Whisper "base" model.

    No language is passed to Whisper, so the language in the result is the
    one Whisper reports for the audio.

    Returns:
        ``(text, language)`` from the ``"text"`` and ``"language"`` entries
        of Whisper's result.
    """
    whisper_model = whisper.load_model("base")
    outcome = whisper_model.transcribe(audio_path)
    transcript = outcome["text"]
    language_code = outcome["language"]
    return transcript, language_code

# Step 3: Main
def main():
    """
    Run the interactive Colab workflow: upload a video, transcribe it with
    Whisper and print the detected language and the transcript.
    """
    print("Upload a video file:")
    uploaded = files.upload()
    if not uploaded:
        print("No file uploaded.")
        return

    # Only the first uploaded file is processed
    video_filename = next(iter(uploaded))
    audio_path = extract_audio_from_video(video_filename)
    try:
        text, detected_lang = transcribe_with_whisper(audio_path)
    finally:
        # Remove the temporary WAV even when transcription fails
        if os.path.exists(audio_path):
            os.remove(audio_path)

    # Status icons restored from mis-encoded (mojibake) text
    print(f"\n✅ Detected Language: {detected_lang}")
    print("\n📝 Transcription in Original Language:\n")
    print(text)

main()


In [None]:
pip install openai-whisper transformers sentencepiece ffmpeg-python

In [None]:
import os
import tempfile
import subprocess
import whisper
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from google.colab import files  # Only if using Google Colab

# Whisper to NLLB mapping
# Lookup table: Whisper language code -> NLLB language code
WHISPER_TO_NLLB = dict(
    en="eng_Latn",
    es="spa_Latn",
    fr="fra_Latn",
    de="deu_Latn",
    ja="jpn_Jpan",
    hi="hin_Deva",
    ta="tam_Taml",
    te="tel_Telu",
    zh="zho_Hans",
    ko="kor_Hang",
)

def extract_audio_from_video(video_path):
    """
    Extract the audio track of a video into a temporary mono, 16 kHz, 16-bit PCM WAV file.

    Args:
        video_path: Path of the input video, passed to ffmpeg's ``-i`` option.

    Returns:
        The path of the temporary WAV file; the caller must delete it.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
        OSError: If the ffmpeg executable cannot be started.
    """
    # mkstemp creates the file atomically; tempfile.mktemp is deprecated and racy
    fd, output_audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    # '-y' lets ffmpeg overwrite the placeholder file created above
    cmd = ['ffmpeg', '-y', '-i', video_path, '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', output_audio_path]
    try:
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if result.returncode != 0:
            details = result.stderr.decode(errors='replace')[-500:]
            raise RuntimeError(f"ffmpeg failed to extract audio from {video_path!r}: {details}")
    except Exception:
        # Do not leave the temporary file behind on failure
        if os.path.exists(output_audio_path):
            os.remove(output_audio_path)
        raise
    return output_audio_path

def transcribe_audio(audio_path):
    """
    Transcribe an audio file with the Whisper "base" model.

    Returns:
        ``(text, language)`` from the ``"text"`` and ``"language"`` entries
        of Whisper's result.
    """
    asr_model = whisper.load_model("base")
    asr_output = asr_model.transcribe(audio_path)
    return asr_output["text"], asr_output["language"]

def translate_to_english(native_text, whisper_lang_code):
    """
    Translate text into English with the NLLB-200 distilled 600M model.

    The text is translated sentence by sentence so that no single input
    exceeds the 512-token truncation limit.

    Args:
        native_text: Source-language text (e.g. a Whisper transcript).
        whisper_lang_code: Language code as reported by Whisper; mapped to an
            NLLB code through ``WHISPER_TO_NLLB``.

    Returns:
        The translated sentences joined with single spaces. Sentences whose
        translation raised an error are reported and left out.

    Raises:
        ValueError: If ``whisper_lang_code`` has no entry in ``WHISPER_TO_NLLB``.
    """
    import re
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

    # Validate the language before paying for the model download/load
    nllb_lang_code = WHISPER_TO_NLLB.get(whisper_lang_code)
    if not nllb_lang_code:
        raise ValueError(f"Unsupported language code: {whisper_lang_code}")

    model_name = "facebook/nllb-200-distilled-600M"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    # Get forced_bos_token_id safely
    tokenizer.src_lang = nllb_lang_code
    tokenizer.tgt_lang = "eng_Latn"
    forced_bos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.tgt_lang)

    # Split into sentences while keeping each sentence's own terminator:
    # directly after CJK/Devanagari full stops, and after '.', '!' or '?'
    # when followed by whitespace (so decimals such as "3.5" stay intact).
    # Appending a Japanese full stop to every chunk corrupted other languages.
    pieces = re.split(r"(?<=[。！？।])|(?<=[.!?])\s+", native_text.strip())
    chunks = [piece.strip() for piece in pieces if piece and piece.strip()]

    translated_parts = []
    for chunk in chunks:
        try:
            inputs = tokenizer(chunk, return_tensors="pt", padding=True,
                               truncation=True, max_length=512)
            translated_tokens = model.generate(
                **inputs,
                forced_bos_token_id=forced_bos_token_id,
                max_length=512,
                num_beams=4,
                early_stopping=True
            )
            translated_text = tokenizer.decode(translated_tokens[0],
                                               skip_special_tokens=True)
            translated_parts.append(translated_text)
        except Exception as e:
            # Best effort: report the failing sentence and keep going
            print(f"⚠️ Error translating chunk: {chunk[:30]}... -> {e}")

    return " ".join(translated_parts)



def main():
    """
    Run the interactive Colab workflow: upload a video, transcribe it with
    Whisper and print both the native transcript and its English translation.
    """
    # Status icons below are restored from mis-encoded (mojibake) text
    print("📂 Please upload a video file (.mp4, .mkv, etc.):")
    uploaded = files.upload()
    if not uploaded:
        print("No file uploaded.")
        return

    # Only the first uploaded file is processed
    video_path = next(iter(uploaded))

    print("\n🔊 Extracting audio...")
    audio_path = extract_audio_from_video(video_path)
    try:
        print("📝 Transcribing with Whisper...")
        native_text, detected_lang = transcribe_audio(audio_path)
    finally:
        # The WAV is only needed for transcription; remove it even on failure
        if os.path.exists(audio_path):
            os.remove(audio_path)

    print(f"\n🌍 Detected Language: {detected_lang}")
    print(f"\n🗣️ Native Transcription:\n{native_text.strip()}")

    print("\n🌐 Translating to English...")
    translated_text = translate_to_english(native_text, detected_lang)
    print(f"\n✅ English Translation:\n{translated_text.strip()}")

main()


In [None]:
pip install -q sentence-transformers faiss-cpu transformers whisper

In [None]:
!pip uninstall -y whisper


In [None]:
!pip install -U openai-whisper


In [None]:
import os
import tempfile
import subprocess
import whisper
import json
import faiss
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from google.colab import files, drive

# Whisper to NLLB language code mapping
# (Whisper language code, NLLB language code) pairs, in lookup-table order
WHISPER_TO_NLLB = dict([
    ("en", "eng_Latn"),
    ("es", "spa_Latn"),
    ("fr", "fra_Latn"),
    ("de", "deu_Latn"),
    ("ja", "jpn_Jpan"),
    ("hi", "hin_Deva"),
    ("ta", "tam_Taml"),
    ("te", "tel_Telu"),
    ("zh", "zho_Hans"),
    ("ko", "kor_Hang"),
])

# Step 1: Mount Google Drive
drive.mount('/content/drive')

# Paths for FAISS index and documents
docs_path = "/content/drive/MyDrive/news_json/sample_news.json"
index_path = "/content/drive/MyDrive/news_json/faiss_index.index"


# List contents of the folder to confirm that the Drive mount worked and the
# news JSON is where docs_path expects it (folder icon restored from mojibake)
folder = "/content/drive/MyDrive/news_json"
print("📁 Files in your folder:", os.listdir(folder))

# Step 2: Load or Create FAISS Index
def load_or_create_faiss_index(docs_path, model, index_path="faiss_index.index"):
    """
    Load a cached FAISS index with its documents, or build both from a news JSON file.

    Args:
        docs_path: JSON file whose top-level ``"articles"`` list holds dicts
            with a ``"content"`` field.
        model: Embedding model exposing ``encode(texts, convert_to_numpy=True)``.
        index_path: Where the FAISS index is stored. The document cache
            (``docs.json``) is kept in the same directory.

    Returns:
        ``(faiss_index, documents)`` where ``documents[i]`` is the text of
        vector ``i`` in the index.

    Raises:
        ValueError: If the JSON file contains no non-empty ``"content"`` field.
    """
    # Keep the document cache next to the index so the pair is always stored
    # (and found again) together. With the default index_path this is still
    # "docs.json" in the working directory.
    documents_cache_path = os.path.join(os.path.dirname(index_path), "docs.json")

    if os.path.exists(index_path) and os.path.exists(documents_cache_path):
        print("📦 Loading existing FAISS index and documents...")
        faiss_index = faiss.read_index(index_path)
        with open(documents_cache_path, "r", encoding="utf-8") as f:
            documents = json.load(f)
        # Search results are positions into `documents`, so sizes must agree
        if faiss_index.ntotal == len(documents):
            return faiss_index, documents
        print("⚠️ Cached FAISS index and documents are out of sync. Creating new index...")
    else:
        print("⚠️ FAISS index or documents not found. Creating new index...")

    # Read the articles and keep only non-empty content
    with open(docs_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    documents = [
        article.get("content")
        for article in data.get("articles", [])
        if article.get("content")
    ]

    if not documents:
        raise ValueError("No valid 'content' fields found in the JSON file.")

    # Generate embeddings
    doc_embeddings = model.encode(documents, convert_to_numpy=True)

    # Build FAISS index
    faiss_index = faiss.IndexFlatL2(doc_embeddings.shape[1])
    faiss_index.add(doc_embeddings)

    # Save index and documents
    faiss.write_index(faiss_index, index_path)
    with open(documents_cache_path, "w", encoding="utf-8") as f:
        json.dump(documents, f)

    return faiss_index, documents



# Step 3: Extract Audio
def extract_audio_from_video(video_path):
    """
    Extract the audio track of a video into a temporary mono, 16 kHz, 16-bit PCM WAV file.

    Args:
        video_path: Path of the input video, passed to ffmpeg's ``-i`` option.

    Returns:
        The path of the temporary WAV file; the caller must delete it.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
        OSError: If the ffmpeg executable cannot be started.
    """
    # mkstemp creates the file atomically; tempfile.mktemp is deprecated and racy
    fd, output_audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    # '-y' lets ffmpeg overwrite the placeholder file created above
    cmd = ['ffmpeg', '-y', '-i', video_path, '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', output_audio_path]
    try:
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if result.returncode != 0:
            details = result.stderr.decode(errors='replace')[-500:]
            raise RuntimeError(f"ffmpeg failed to extract audio from {video_path!r}: {details}")
    except Exception:
        # Do not leave the temporary file behind on failure
        if os.path.exists(output_audio_path):
            os.remove(output_audio_path)
        raise
    return output_audio_path

# Step 4: Whisper Transcription
def transcribe_audio(audio_path):
    """
    Transcribe an audio file with the Whisper "base" model.

    Returns:
        ``(text, language)`` from the ``"text"`` and ``"language"`` entries
        of Whisper's result.
    """
    speech_model = whisper.load_model("base")
    transcription = speech_model.transcribe(audio_path)
    spoken_text = transcription["text"]
    spoken_language = transcription["language"]
    return spoken_text, spoken_language

# Step 5: Translate to English
def translate_to_english(native_text, whisper_lang_code):
    """
    Translate text into English with the NLLB-200 distilled 600M model.

    The text is translated sentence by sentence so that no single input
    exceeds the 512-token truncation limit.

    Args:
        native_text: Source-language text (e.g. a Whisper transcript).
        whisper_lang_code: Language code as reported by Whisper; mapped to an
            NLLB code through ``WHISPER_TO_NLLB``.

    Returns:
        The translated sentences joined with single spaces. Sentences whose
        translation raised an error are reported and left out.

    Raises:
        ValueError: If ``whisper_lang_code`` has no entry in ``WHISPER_TO_NLLB``.
    """
    import re
    from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

    # Validate the language before paying for the model download/load
    nllb_lang_code = WHISPER_TO_NLLB.get(whisper_lang_code)
    if not nllb_lang_code:
        raise ValueError(f"Unsupported language code: {whisper_lang_code}")

    model_name = "facebook/nllb-200-distilled-600M"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    tokenizer.src_lang = nllb_lang_code
    tokenizer.tgt_lang = "eng_Latn"
    forced_bos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.tgt_lang)

    # Split into sentences while keeping each sentence's own terminator:
    # directly after CJK/Devanagari full stops, and after '.', '!' or '?'
    # when followed by whitespace (so decimals such as "3.5" stay intact).
    # Appending a Japanese full stop to every chunk corrupted other languages.
    pieces = re.split(r"(?<=[。！？।])|(?<=[.!?])\s+", native_text.strip())
    chunks = [piece.strip() for piece in pieces if piece and piece.strip()]

    translated_parts = []
    for chunk in chunks:
        try:
            inputs = tokenizer(chunk, return_tensors="pt", padding=True,
                               truncation=True, max_length=512)
            translated_tokens = model.generate(
                **inputs,
                forced_bos_token_id=forced_bos_token_id,
                max_length=512,
                num_beams=4,
                early_stopping=True
            )
            translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
            translated_parts.append(translated_text)
        except Exception as e:
            # Best effort: report the failing sentence and keep going
            print(f"⚠️ Error translating chunk: {chunk[:30]}... -> {e}")

    return " ".join(translated_parts)

# Step 6: RAG Relevancy Check
def is_relevant(text, index, documents, threshold=0.5, model=None):
    """
    Check whether ``text`` is similar enough to its nearest indexed document.

    Args:
        text: Query text.
        index: FAISS index whose vector positions correspond to ``documents``.
        documents: List of document texts.
        threshold: Minimum cosine similarity for the text to count as relevant.
        model: Optional embedding model with an ``encode`` method. Pass the
            model that built ``index`` to avoid reloading it on every call;
            when ``None`` the "all-MiniLM-L6-v2" SentenceTransformer is loaded.

    Returns:
        ``(relevant, matched_document)``. ``matched_document`` is ``None`` when
        the index returned no neighbour.
    """
    if model is None:
        model = SentenceTransformer("all-MiniLM-L6-v2")
    query_embedding = model.encode([text])
    _, neighbours = index.search(query_embedding, k=1)
    best = int(neighbours[0][0])
    if best < 0:
        # FAISS reports -1 when it has no result; documents[-1] would be wrong
        return False, None

    matched_doc = documents[best]
    score = cosine_similarity(query_embedding, model.encode([matched_doc]))[0][0]

    return bool(score >= threshold), matched_doc

# Step 7: Generate SRT file
def generate_srt(translated_text):
    """
    Write ``translated_text`` to a temporary SRT file, one cue per sentence.

    Sentences are obtained by splitting on ``'. '`` and each one is shown for
    a fixed five-second slot, because no real timing information is available.

    Args:
        translated_text: Text to turn into subtitles.

    Returns:
        The path of the temporary ``.srt`` file; the caller must delete it.
    """
    def _stamp(total_seconds):
        # Carry into minutes/hours; formatting the raw count as seconds
        # produced invalid stamps such as "00:00:60,000" after the 12th cue.
        hours, rest = divmod(total_seconds, 3600)
        minutes, secs = divmod(rest, 60)
        return f"{hours:02}:{minutes:02}:{secs:02},000"

    # Skip empty fragments so no blank cues are written
    lines = [part.strip() for part in translated_text.split('. ') if part.strip()]

    # mkstemp creates the file atomically; tempfile.mktemp is deprecated and racy
    fd, srt_path = tempfile.mkstemp(suffix=".srt")
    with os.fdopen(fd, "w", encoding="utf-8") as f:
        for i, line in enumerate(lines, 1):
            start = _stamp((i - 1) * 5)
            end = _stamp(i * 5)
            f.write(f"{i}\n{start} --> {end}\n{line}\n\n")
    return srt_path

# Step 8: Embed subtitle into video
def embed_subtitle(video_path, srt_path):
    """
    Burn an SRT file into a video with ffmpeg's ``subtitles`` filter.

    The result is written next to the input as ``<name>_subtitled.mp4`` and
    the audio stream is copied unchanged.

    Args:
        video_path: Path of the input video.
        srt_path: Path of the SRT file to burn in.

    Returns:
        The path of the subtitled video.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
        OSError: If the ffmpeg executable cannot be started.
    """
    output_video = video_path.rsplit('.', 1)[0] + "_subtitled.mp4"
    # '-y' overwrites the output of an earlier run instead of waiting for a prompt
    cmd = ['ffmpeg', '-y', '-i', video_path, '-vf', f"subtitles={srt_path}", '-c:a', 'copy', output_video]
    result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if result.returncode != 0:
        # Report the tail of ffmpeg's log rather than returning a bad path
        details = result.stderr.decode(errors='replace')[-500:]
        raise RuntimeError(f"ffmpeg failed to embed subtitles into {video_path!r}: {details}")
    return output_video

def main():
    """
    Run the interactive Colab workflow: upload a video, transcribe and
    translate it, check the translation against the news index and, when it
    is relevant, burn English subtitles into the video and download it.
    """
    # Status icons below are restored from mis-encoded (mojibake) text
    print("📂 Please upload a video file (.mp4, .mkv, etc.):")
    uploaded = files.upload()
    if not uploaded:
        print("No file uploaded.")
        return
    # Only the first uploaded file is processed
    video_path = next(iter(uploaded))

    print("\n🔊 Extracting audio...")
    audio_path = extract_audio_from_video(video_path)
    try:
        print("📝 Transcribing with Whisper...")
        native_text, detected_lang = transcribe_audio(audio_path)
    finally:
        # The WAV is only needed for transcription; it was never removed before
        if os.path.exists(audio_path):
            os.remove(audio_path)

    print(f"\n🌍 Detected Language: {detected_lang}")
    print(f"\n🗣️ Native Transcription:\n{native_text.strip()}")

    print("\n🌐 Translating to English...")
    translated_text = translate_to_english(native_text, detected_lang)
    print(f"\n✅ English Translation:\n{translated_text.strip()}")

    print("\n📦 Loading RAG index...")
    sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
    index, documents = load_or_create_faiss_index(docs_path, sentence_model, index_path=index_path)

    print("\n🔍 Checking relevancy with RAG...")
    relevant, _ = is_relevant(translated_text, index, documents)
    if not relevant:
        print("⚠️ Content is NOT relevant to the RAG context. Skipping subtitle embedding.")
        # Actually skip, as the message says
        return
    print("✅ Content is relevant to the RAG context.")

    print("\n📝 Generating subtitle file...")
    srt_path = generate_srt(translated_text)
    try:
        print("🎞️ Embedding subtitles into video...")
        output_video = embed_subtitle(video_path, srt_path)
    finally:
        # The temporary SRT is no longer needed once ffmpeg has run
        if os.path.exists(srt_path):
            os.remove(srt_path)

    print(f"\n✅ Done! Subtitled video saved as: {output_video}")

    # Optionally offer download
    files.download(output_video)

main()


In [None]:
!pip install gradio==4.18.0
!pip install gradio==4.18.0 ffmpeg-python openai-whisper sentence-transformers faiss-cpu transformers
!pip install "pydantic<2.0"


In [None]:
import gradio as gr
import tempfile
import subprocess
import whisper
import json
import os
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Language Mapping
# Whisper language code -> NLLB language code, one entry per line
WHISPER_TO_NLLB = {
    "en": "eng_Latn",
    "es": "spa_Latn",
    "fr": "fra_Latn",
    "de": "deu_Latn",
    "ja": "jpn_Jpan",
    "hi": "hin_Deva",
    "ta": "tam_Taml",
    "te": "tel_Telu",
    "zh": "zho_Hans",
    "ko": "kor_Hang",
}

# RAG index loading
docs_path = "/content/sample_news.json"  # update your file path
index_path = "/content/faiss_index.index"

def load_or_create_faiss_index(model):
    """
    Load the cached FAISS index with its documents, or build both from ``docs_path``.

    Reads the module-level ``docs_path`` and ``index_path`` settings. The
    document cache (``docs.json``) is kept in the same directory as the index.

    Args:
        model: Embedding model exposing ``encode(texts, convert_to_numpy=True)``.

    Returns:
        ``(faiss_index, documents)`` where ``documents[i]`` is the text of
        vector ``i`` in the index.

    Raises:
        ValueError: If the JSON file contains no non-empty ``"content"`` field.
    """
    # Keep the document cache next to the index so the pair is always stored
    # (and found again) together, whatever the working directory is.
    documents_cache_path = os.path.join(os.path.dirname(index_path), "docs.json")

    if os.path.exists(index_path) and os.path.exists(documents_cache_path):
        faiss_index = faiss.read_index(index_path)
        with open(documents_cache_path, "r", encoding="utf-8") as f:
            documents = json.load(f)
        # Search results are positions into `documents`, so sizes must agree;
        # otherwise fall through and rebuild both.
        if faiss_index.ntotal == len(documents):
            return faiss_index, documents

    with open(docs_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    documents = [a.get("content") for a in data.get("articles", []) if a.get("content")]
    if not documents:
        # Encoding an empty list would only fail later with an obscure shape error
        raise ValueError("No valid 'content' fields found in the JSON file.")

    doc_embeddings = model.encode(documents, convert_to_numpy=True)
    faiss_index = faiss.IndexFlatL2(doc_embeddings.shape[1])
    faiss_index.add(doc_embeddings)
    faiss.write_index(faiss_index, index_path)
    with open(documents_cache_path, "w", encoding="utf-8") as f:
        json.dump(documents, f)
    return faiss_index, documents

def extract_audio(video_path):
    """
    Extract the audio track of a video into a temporary mono, 16 kHz, 16-bit PCM WAV file.

    Args:
        video_path: Path of the input video, passed to ffmpeg's ``-i`` option.

    Returns:
        The path of the temporary WAV file; the caller must delete it.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
        OSError: If the ffmpeg executable cannot be started.
    """
    # mkstemp creates the file atomically; tempfile.mktemp is deprecated and racy
    fd, audio_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    # '-y' lets ffmpeg overwrite the placeholder file created above
    cmd = ['ffmpeg', '-y', '-i', video_path, '-vn', '-acodec', 'pcm_s16le', '-ar', '16000', '-ac', '1', audio_path]
    try:
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if result.returncode != 0:
            details = result.stderr.decode(errors='replace')[-500:]
            raise RuntimeError(f"ffmpeg failed to extract audio from {video_path!r}: {details}")
    except Exception:
        # Do not leave the temporary file behind on failure
        if os.path.exists(audio_path):
            os.remove(audio_path)
        raise
    return audio_path

def transcribe(audio_path):
    """
    Transcribe an audio file with the Whisper "base" model.

    Returns:
        ``(text, language)`` from the ``"text"`` and ``"language"`` entries
        of Whisper's result.
    """
    recognizer = whisper.load_model("base")
    output = recognizer.transcribe(audio_path)
    text = output["text"]
    language = output["language"]
    return text, language

def translate_to_english(text, lang):
    """
    Translate text into English with the NLLB-200 distilled 600M model.

    The text is translated sentence by sentence so that no single input
    exceeds the 512-token truncation limit.

    Args:
        text: Source-language text (e.g. a Whisper transcript).
        lang: Language code as reported by Whisper; mapped to an NLLB code
            through ``WHISPER_TO_NLLB``.

    Returns:
        The translated sentences joined with single spaces.

    Raises:
        ValueError: If ``lang`` has no entry in ``WHISPER_TO_NLLB``.
    """
    import re

    # Validate the language before paying for the model download/load
    src_lang = WHISPER_TO_NLLB.get(lang)
    if not src_lang:
        raise ValueError(f"Unsupported language code: {lang}")

    model_name = "facebook/nllb-200-distilled-600M"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = "eng_Latn"
    forced_bos_token_id = tokenizer.convert_tokens_to_ids(tokenizer.tgt_lang)

    # Split into sentences while keeping each sentence's own terminator:
    # directly after CJK/Devanagari full stops, and after '.', '!' or '?'
    # when followed by whitespace (so decimals such as "3.5" stay intact).
    # Appending a Japanese full stop to every chunk corrupted other languages.
    pieces = re.split(r"(?<=[。！？।])|(?<=[.!?])\s+", text.strip())
    chunks = [piece.strip() for piece in pieces if piece and piece.strip()]
    translated = []

    for chunk in chunks:
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True, max_length=512)
        outputs = model.generate(**inputs, forced_bos_token_id=forced_bos_token_id, max_length=512)
        translated.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

    return " ".join(translated)

def generate_srt(text):
    """
    Write ``text`` to a temporary SRT file, one cue per sentence.

    Sentences are obtained by splitting on ``'. '`` and each one is shown for
    a fixed five-second slot, because no real timing information is available.

    Args:
        text: Text to turn into subtitles.

    Returns:
        The path of the temporary ``.srt`` file; the caller must delete it.
    """
    def _stamp(total_seconds):
        # Carry into minutes/hours; formatting the raw count as seconds
        # produced invalid stamps such as "00:00:60,000" after the 12th cue.
        hours, rest = divmod(total_seconds, 3600)
        minutes, secs = divmod(rest, 60)
        return f"{hours:02}:{minutes:02}:{secs:02},000"

    # Skip empty fragments so no blank cues are written
    lines = [part.strip() for part in text.split('. ') if part.strip()]

    # mkstemp creates the file atomically; tempfile.mktemp is deprecated and racy
    fd, srt_path = tempfile.mkstemp(suffix=".srt")
    with os.fdopen(fd, "w", encoding="utf-8") as f:
        for i, line in enumerate(lines, 1):
            start = _stamp((i - 1) * 5)
            end = _stamp(i * 5)
            f.write(f"{i}\n{start} --> {end}\n{line}\n\n")
    return srt_path

def embed_subtitle(video_path, srt_path):
    """
    Burn an SRT file into a video with ffmpeg's ``subtitles`` filter.

    The result is written to a new temporary ``.mp4`` file and the audio
    stream is copied unchanged.

    Args:
        video_path: Path of the input video.
        srt_path: Path of the SRT file to burn in.

    Returns:
        The path of the subtitled video.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status.
        OSError: If the ffmpeg executable cannot be started.
    """
    # mkstemp creates the file atomically; tempfile.mktemp is deprecated and racy
    fd, output_video = tempfile.mkstemp(suffix=".mp4")
    os.close(fd)
    # '-y' lets ffmpeg overwrite the placeholder file created above
    cmd = ['ffmpeg', '-y', '-i', video_path, '-vf', f"subtitles={srt_path}", '-c:a', 'copy', output_video]
    try:
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        if result.returncode != 0:
            details = result.stderr.decode(errors='replace')[-500:]
            raise RuntimeError(f"ffmpeg failed to embed subtitles into {video_path!r}: {details}")
    except Exception:
        # Do not leave the temporary file behind on failure
        if os.path.exists(output_video):
            os.remove(output_video)
        raise
    return output_video

def process_video(video):
    """
    Gradio handler: transcribe, translate and subtitle an uploaded video.

    Args:
        video: The uploaded file as delivered by ``gr.File``. A file path
            (str / path-like), an object whose ``name`` is an existing file,
            raw bytes, or a readable file-like object are all accepted.

    Returns:
        ``(status_message, subtitled_video_path)``; the path is ``None`` when
        nothing was uploaded or the content is not relevant to the news index.
    """
    if video is None:
        return "❌ Please upload a video file.", None

    temp_paths = []  # intermediate files removed in the finally block

    # Gradio 4 hands gr.File uploads over as a file path, which has no
    # .read(); only copy the data when no usable path is available.
    name = getattr(video, "name", None)
    if isinstance(video, (str, os.PathLike)):
        video_path = os.fspath(video)
    elif isinstance(name, str) and os.path.isfile(name):
        video_path = name
    else:
        data = video if isinstance(video, (bytes, bytearray)) else video.read()
        fd, video_path = tempfile.mkstemp(suffix=".mp4")
        with os.fdopen(fd, "wb") as f:
            f.write(data)
        temp_paths.append(video_path)

    try:
        # Step-by-step
        audio = extract_audio(video_path)
        temp_paths.append(audio)
        native_text, lang = transcribe(audio)
        translated = translate_to_english(native_text, lang)

        rag_model = SentenceTransformer("all-MiniLM-L6-v2")
        index, docs = load_or_create_faiss_index(rag_model)
        query_embedding = rag_model.encode([translated])
        _, neighbours = index.search(query_embedding, k=1)
        best = int(neighbours[0][0])

        # FAISS reports -1 when it has no result; treat that as "not relevant"
        similarity = -1.0
        if best >= 0:
            matched_doc = docs[best]
            similarity = cosine_similarity(query_embedding, rag_model.encode([matched_doc]))[0][0]

        if similarity < 0.5:
            return "❌ The video content is not relevant to the RAG context.", None

        srt_path = generate_srt(translated)
        temp_paths.append(srt_path)
        result_video = embed_subtitle(video_path, srt_path)
        return "✅ Video translated and subtitled!", result_video
    finally:
        for path in temp_paths:
            if os.path.exists(path):
                os.remove(path)

# Gradio UI
# One file input; the handler returns a status message and the subtitled video.
# (Title icon restored from mis-encoded text.)
demo = gr.Interface(
    fn=process_video,
    inputs=gr.File(label="Upload Video"),
    outputs=[gr.Text(label="Status"), gr.Video(label="Subtitled Video")],
    title="📽️ Subtitle Translator",
    description="Upload a video and get back a subtitled version translated to English using Whisper and NLLB."
)

# share=True requests a public link; debug=True keeps the cell attached to the server
demo.launch(debug=True, share=True)
