In [None]:
# Clone the IndicTrans2 repository and change the notebook's working directory
# to its huggingface_interface folder; the next cell runs install.sh from there.
!git clone https://github.com/AI4Bharat/IndicTrans2
%cd IndicTrans2/huggingface_interface

In [None]:
# Run the install script found in the current working directory
# (IndicTrans2/huggingface_interface after the previous cell's %cd).
!bash install.sh

In [None]:
import os
import torch
from pathlib import Path
from google.colab import drive
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from IndicTransToolkit.IndicTransToolkit import IndicProcessor

# ========================
# Utility Functions
# ========================

def format_timestamp(seconds: float) -> str:
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond *before* being split into
    fields. Truncating the fractional part instead loses a millisecond to
    floating-point error (e.g. ``2.3`` would come out as ``00:00:02,299``).

    Args:
        seconds: Offset from the start of the media, in seconds. Negative
            values are clamped to zero because SRT cannot represent them.

    Returns:
        The timestamp string, e.g. ``"01:01:01,500"`` for ``3661.5``.
    """
    # Work in integer milliseconds so carries (999.6 ms -> next second)
    # propagate correctly through the divmod chain.
    total_ms = max(0, round(seconds * 1000))
    total_sec, millis = divmod(total_ms, 1000)
    mins, sec = divmod(total_sec, 60)
    hrs, mins = divmod(mins, 60)
    return f"{hrs:02d}:{mins:02d}:{sec:02d},{millis:03d}"

# ========================
# Environment Setup
# ========================

# Mount Google Drive so the input/output folders below are reachable.
drive.mount('/content/drive')

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Translation direction (language tags passed to IndicProcessor) and checkpoint.
src_lang = "eng_Latn"
tgt_lang = "hin_Deva"
model_name = "ai4bharat/indictrans2-en-indic-1B"

# Load model + tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# FlashAttention-2 needs a CUDA device and half precision, so these options
# are applied only when a GPU was selected. On the CPU fallback the model is
# loaded with the library defaults instead of failing at load/generate time.
_model_kwargs = {"trust_remote_code": True}
if DEVICE == "cuda":
    _model_kwargs["torch_dtype"] = torch.float16
    _model_kwargs["attn_implementation"] = "flash_attention_2"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name, **_model_kwargs).to(DEVICE)

ip = IndicProcessor(inference=True)

# Input / Output paths
input_dir = "/content/drive/MyDrive/Commercial Horticulture English ASR"
output_dir = "/content/drive/MyDrive/Commercial Horticulture English MThi"

# ========================
# Translation Logic
# ========================

def _translate_lines(texts, batch_size=8):
    """Translate a list of non-empty strings, returning results in the same order.

    Uses the module-level ``ip``, ``tokenizer``, ``model``, ``DEVICE``,
    ``src_lang`` and ``tgt_lang``. The input is processed in chunks of
    ``batch_size`` so several lines share one ``generate`` call.
    """
    translations = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        # preprocess/postprocess are called once per chunk, as a matched pair.
        batch = ip.preprocess_batch(chunk, src_lang=src_lang, tgt_lang=tgt_lang)
        inputs = tokenizer(
            batch, truncation=True, padding="longest", return_tensors="pt"
        ).to(DEVICE)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        translations.extend(ip.postprocess_batch(decoded, lang=tgt_lang))
    return translations


def translate_srt(input_path, output_srt, output_txt):
    """Translate a single SRT file and save both .srt and .txt outputs.

    Lines are classified by structure rather than by position: a blank line
    ends a cue, a line containing ``-->`` is the timestamp line, and every
    non-empty line after the timestamp (until the next blank line) is
    subtitle text. This keeps multi-line cues and irregular blank lines from
    shifting which lines get translated. Each text line is translated on its
    own, so the output keeps the input's line layout.

    Args:
        input_path: Path of the source ``.srt`` file (UTF-8, BOM tolerated).
        output_srt: Path to write the translated subtitle file; index,
            timestamp and blank lines are copied through (whitespace-stripped).
        output_txt: Path to write the translated text lines only, one per line.
    """
    # utf-8-sig drops a leading BOM so it does not end up glued to the first line.
    with open(input_path, 'r', encoding='utf-8-sig') as f:
        srt_lines = [line.strip() for line in f]

    translated_srt = []
    text_positions = []  # indices into translated_srt that hold subtitle text
    in_cue_text = False
    for line in srt_lines:
        if not line:
            in_cue_text = False
        elif "-->" in line:
            in_cue_text = True
        elif in_cue_text:
            text_positions.append(len(translated_srt))
        # Every input line keeps its slot; text slots are overwritten below.
        translated_srt.append(line)

    translated_txt = _translate_lines([translated_srt[pos] for pos in text_positions])
    for pos, final_text in zip(text_positions, translated_txt):
        translated_srt[pos] = final_text

    # Save outputs
    with open(output_srt, 'w', encoding='utf-8') as f:
        f.write("\n".join(translated_srt))

    with open(output_txt, 'w', encoding='utf-8') as f:
        f.write("\n".join(translated_txt))

    print(f"✅ Saved: {output_srt}")
    print(f"✅ Saved: {output_txt}")

# ========================
# Batch Processing
# ========================

# Walk the input tree, mirror its folder layout under output_dir, and
# translate every .srt file into a same-named .srt plus a .txt companion.
for root, _, files in os.walk(input_dir):
    for file in files:
        if not file.endswith(".srt"):
            continue

        input_path = os.path.join(root, file)
        relative = os.path.relpath(root, input_dir)
        save_dir = os.path.join(output_dir, relative)
        os.makedirs(save_dir, exist_ok=True)

        output_srt = os.path.join(save_dir, file)
        # Swap only the trailing extension; str.replace would also rewrite
        # any ".srt" that happens to appear earlier in the file name.
        output_txt = os.path.join(save_dir, file[:-len(".srt")] + ".txt")

        print(f"🔄 Translating: {input_path}")
        translate_srt(input_path, output_srt, output_txt)
