# Parallel translation without rate limiting

In [1]:
from google.colab import drive, userdata
import os, re, time
from concurrent.futures import ThreadPoolExecutor, as_completed
from google import genai

# --- Google Drive + Gemini client ---
# Mount Drive first (the folders below live on it), then build the Gemini
# client from the GOOGLE_API_KEY value returned by Colab's `userdata`.
drive.mount('/content/drive')
os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")
client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])

# --- Target languages (11) ---
languages = [
    "Hindi",
    "Tamil",
    "Telugu",
    "Bengali",
    "Kannada",
    "Malayalam",
    "Marathi",
    "Gujarati",
    "Punjabi",
    "Odia",
    "Urdu",
]

# --- Input / output folders ---
# asr_dir holds the source .srt files; out_dir receives one sub-folder per language.
base_dir = "/content/drive/My Drive/Test_28_Adnew_mp3/Test 1A ASRgemini/"
asr_dir, out_dir = (
    os.path.join(base_dir, sub) for sub in ("ASR_whisper1", "MT_gemini_parallel")
)
os.makedirs(out_dir, exist_ok=True)

print("‚ö° Parallel MT Started...")

# --- SRT block regex ---
# Groups: (1) segment number, (2) "start --> end" timestamp line, (3) subtitle text.
# The text runs lazily up to the next line that is only a number, or to the end
# of the input (used with re.DOTALL so text may span several lines).
pattern = r"(\d+)\s+([\d:,]+ --> [\d:,]+)\s+(.+?)(?=\n\d+\n|$)"

# === Translation Function ===
def _align_translations(batch, lines):
    """Map model output lines back onto the segments of ``batch``.

    Args:
        batch: list of ``(number, timestamp, text)`` tuples that were sent.
        lines: non-empty, stripped lines from the model response.

    Returns:
        A list of ``len(batch)`` strings in batch order: ``"<number> <text>"``
        for every segment found in ``lines``, ``""`` for the rest.
    """
    by_number = {}
    for line in lines:
        # Accept "12 text", "12. text", "12) text", "12: text", "12 - text".
        m = re.match(r"\s*(\d+)\s*[.):\-]?\s+(.*)", line)
        if m and m.group(2).strip():
            # int() normalises zero padding and non-ASCII decimal digits.
            by_number.setdefault(int(m.group(1)), m.group(2).strip())

    wanted = [int(n) for n, _, _ in batch]
    if not any(n in by_number for n in wanted) and len(lines) == len(batch):
        # The model dropped the numbering but returned the right number of
        # lines: fall back to positional matching.
        return [f"{n} {line}" for (n, _, _), line in zip(batch, lines)]

    return [f"{n} {by_number[k]}" if k in by_number else "" for (n, _, _), k in zip(batch, wanted)]


def translate_batch(batch, lang):
    """Translate one batch of subtitle segments into ``lang`` with Gemini.

    Args:
        batch: list of ``(number, timestamp, text)`` tuples. Only the number
            and the text are sent to the model.
        lang: target language name, inserted into the prompt.

    Returns:
        A list with exactly ``len(batch)`` strings, in batch order. Each entry
        is ``"<number> <translation>"``, or ``""`` when the response held
        nothing usable for that segment (and for every segment when all three
        attempts fail).
    """
    if not batch:
        return []

    # Collapse internal whitespace/newlines so every segment occupies exactly
    # one prompt line; multi-line subtitles would otherwise break the
    # "one line per subtitle" contract the prompt asks for.
    joined = "\n".join(f"{n} {' '.join(t.split())}" for n, _, t in batch)

    prompt = f"""
Translate to {lang}.
Keep meaning natural. DO NOT translate timestamps/numbers.
Return one line per subtitle with segment number included.

{joined}
"""

    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            r = client.models.generate_content(
                model="gemini-2.5-pro",
                contents=prompt
            )
            # A missing or empty text is treated as a failed attempt.
            lines = [ln.strip() for ln in (r.text or "").splitlines() if ln.strip()]
            if not lines:
                raise ValueError("empty response")

            aligned = _align_translations(batch, lines)
            missing = sum(1 for line in aligned if not line)
            if missing:
                print(f"{lang}: {missing} of {len(batch)} segments missing from response")
            return aligned
        except Exception as e:
            print(f"Retrying batch in {lang} ‚Äî {e}")
            # Only pause when another attempt will follow.
            if attempt < max_attempts - 1:
                time.sleep(2)

    return [""] * len(batch)

# === Parallel Batch Worker ===
def _strip_segment_number(line, num):
    """Remove a leading ``num`` label (e.g. ``"12 "``, ``"12. "``) from ``line``.

    Only a prefix whose numeric value equals ``num`` is removed; digits that
    appear later in the translation are left untouched.
    """
    m = re.match(r"\s*(\d+)\s*[.):\-]?\s+", line)
    if m and int(m.group(1)) == int(num):
        return line[m.end():]
    return line


def process_language(lang):
    """Translate every ``.srt`` file in ``asr_dir`` into ``lang`` and save it.

    For each file the subtitles are parsed with the module-level ``pattern``,
    translated in batches of 10 through ``translate_batch`` on a thread pool,
    and written as ``<base>_<lang>.srt`` and ``<base>_<lang>.txt`` inside
    ``out_dir/<lang>``.

    Args:
        lang: target language name; also used as the output sub-folder name.

    Returns:
        The string ``"<lang> completed."``.
    """

    print(f"\nüåê Language: {lang}")

    lang_folder = os.path.join(out_dir, lang)
    os.makedirs(lang_folder, exist_ok=True)

    # sorted(): os.listdir returns entries in arbitrary order.
    for fname in sorted(os.listdir(asr_dir)):
        if not fname.endswith(".srt"):
            continue

        in_path = os.path.join(asr_dir, fname)
        with open(in_path, "r", encoding="utf-8") as src:
            srt_text = src.read()

        entries = re.findall(pattern, srt_text, flags=re.DOTALL)
        print(f"  {lang}: {fname} ‚Üí {len(entries)} blocks")

        # Prepare batches of 10 (number, timestamp, stripped text) tuples.
        batches = [
            [(n, t, s.strip()) for n, t, s in entries[i:i + 10]]
            for i in range(0, len(entries), 10)
        ]

        results = [None] * len(batches)

        # === Run batches in parallel ===
        with ThreadPoolExecutor(max_workers=10) as ex:
            future_map = {
                ex.submit(translate_batch, chunk, lang): i
                for i, chunk in enumerate(batches)
            }
            for f in as_completed(future_map):
                idx = future_map[f]
                results[idx] = f.result()
                print(f"  {lang}: Batch {idx+1}/{len(batches)} done")

        # === Build final SRT + TXT ===
        srt_out = []
        txt_out = []

        for batch, translated in zip(batches, results):
            # Pad short responses so zip() cannot silently drop segments;
            # a missing translation becomes an empty subtitle instead.
            translated = list(translated)
            translated += [""] * (len(batch) - len(translated))
            for (num, ts, _), line in zip(batch, translated):
                line = _strip_segment_number(line, num)
                srt_out.append(f"{num}\n{ts}\n{line}\n")
                txt_out.append(line)

        base = os.path.splitext(fname)[0]
        srt_file = os.path.join(lang_folder, f"{base}_{lang}.srt")
        txt_file = os.path.join(lang_folder, f"{base}_{lang}.txt")

        with open(srt_file, "w", encoding="utf-8") as dst:
            dst.write("\n".join(srt_out))
        with open(txt_file, "w", encoding="utf-8") as dst:
            dst.write("\n".join(txt_out))

        print(f"  ‚úî Saved: {lang}/{fname}")

    return f"{lang} completed."

# === Run 11 languages in parallel ===
# Fan out: submit one process_language job per target language, then report
# each language's summary string as soon as its job finishes.
with ThreadPoolExecutor(max_workers=11) as pool:
    jobs = []
    for language in languages:
        jobs.append(pool.submit(process_language, language))

    for finished in as_completed(jobs):
        summary = finished.result()
        print("‚úÖ", summary)

print("\nüéâ ALL LANGUAGES PROCESSED IN PARALLEL SUCCESSFULLY!")


Mounted at /content/drive
‚ö° Parallel MT Started...

üåê Language: Hindi

üåê Language: Tamil

üåê Language: Telugu

üåê Language: Bengali

üåê Language: Kannada

üåê Language: Malayalam

üåê Language: Marathi

üåê Language: Gujarati

üåê Language: Punjabi

üåê Language: Odia

üåê Language: Urdu
  Marathi: Copy of Chapter 1A - Concept of Basic Electricity Voltage, Currents, Resistance, Impedance & Power Factor_Whisper1_eng_eng.srt ‚Üí 304 blocks
  Gujarati: Copy of Chapter 1A - Concept of Basic Electricity Voltage, Currents, Resistance, Impedance & Power Factor_Whisper1_eng_eng.srt ‚Üí 304 blocks
  Bengali: Copy of Chapter 1A - Concept of Basic Electricity Voltage, Currents, Resistance, Impedance & Power Factor_Whisper1_eng_eng.srt ‚Üí 304 blocks
  Punjabi: Copy of Chapter 1A - Concept of Basic Electricity Voltage, Currents, Resistance, Impedance & Power Factor_Whisper1_eng_eng.srt ‚Üí 304 blocks
  Urdu: Copy of Chapter 1A - Concept of Basic Electricity Voltage, Currents, R

# Parallel translation with rate limiting

In [3]:
from google.colab import drive, userdata
import os, re, time, threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from google import genai

# --- Google Drive + Gemini client ---
# Mount Drive first (the folders below live on it), then build the Gemini
# client from the GOOGLE_API_KEY value returned by Colab's `userdata`.
drive.mount('/content/drive')
os.environ["GOOGLE_API_KEY"] = userdata.get("GOOGLE_API_KEY")
client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])

# --- Target languages (11) ---
languages = [
    "Hindi",
    "Tamil",
    "Telugu",
    "Bengali",
    "Kannada",
    "Malayalam",
    "Marathi",
    "Gujarati",
    "Punjabi",
    "Odia",
    "Urdu",
]

# --- Input / output folders ---
# asr_dir holds the source .srt files; out_dir receives one sub-folder per language.
base_dir = "/content/drive/My Drive/Test_28_Adnew_mp3/Test 1A ASRgemini/"
asr_dir, out_dir = (
    os.path.join(base_dir, sub) for sub in ("ASR_whisper1", "MT_gemini_parallel_safe")
)
os.makedirs(out_dir, exist_ok=True)

# SRT block regex. Groups: (1) segment number, (2) "start --> end" timestamp
# line, (3) subtitle text, which runs lazily up to the next number-only line
# or the end of the input (used with re.DOTALL).
pattern = r"(\d+)\s+([\d:,]+ --> [\d:,]+)\s+(.+?)(?=\n\d+\n|$)"

# --- Global rate limiter state (shared by every worker thread) ---
# Minimum spacing between request starts: 0.5 s -> 2 requests/second
# (120 per minute).
RATE_LIMIT = 0.5
# Timestamp of the most recent request start; 0 means "no request sent yet".
last_request_time = 0
# Guards last_request_time.
rate_lock = threading.Lock()

def rate_limited_request(prompt):
    """Send ``prompt`` to Gemini, spacing request starts by ``RATE_LIMIT`` seconds.

    The spacing is enforced across all threads through ``rate_lock`` and the
    module-level ``last_request_time``. Only the pacing happens under the
    lock; the API call itself runs after the lock is released, so several
    requests may be in flight at once.

    Args:
        prompt: text passed as ``contents`` to ``generate_content``.

    Returns:
        Whatever ``client.models.generate_content`` returns. Exceptions from
        that call propagate to the caller.
    """
    global last_request_time

    with rate_lock:
        # Use the monotonic clock: a wall-clock adjustment must never turn
        # into a huge sleep while every other worker waits on the lock.
        wait_time = RATE_LIMIT - (time.monotonic() - last_request_time)
        # Clamp as well, so a stale timestamp taken from another clock
        # cannot stall the pipeline for longer than one interval.
        wait_time = min(wait_time, RATE_LIMIT)
        if wait_time > 0:
            time.sleep(wait_time)
        last_request_time = time.monotonic()

    # Safe API request
    return client.models.generate_content(
        model="gemini-2.5-pro",
        contents=prompt
    )

# === Translation Batch ===
def _align_translations(batch, lines):
    """Map model output lines back onto the segments of ``batch``.

    Args:
        batch: list of ``(number, timestamp, text)`` tuples that were sent.
        lines: non-empty, stripped lines from the model response.

    Returns:
        A list of ``len(batch)`` strings in batch order: ``"<number> <text>"``
        for every segment found in ``lines``, ``""`` for the rest.
    """
    by_number = {}
    for line in lines:
        # Accept "12 text", "12. text", "12) text", "12: text", "12 - text".
        m = re.match(r"\s*(\d+)\s*[.):\-]?\s+(.*)", line)
        if m and m.group(2).strip():
            # int() normalises zero padding and non-ASCII decimal digits.
            by_number.setdefault(int(m.group(1)), m.group(2).strip())

    wanted = [int(n) for n, _, _ in batch]
    if not any(n in by_number for n in wanted) and len(lines) == len(batch):
        # The model dropped the numbering but returned the right number of
        # lines: fall back to positional matching.
        return [f"{n} {line}" for (n, _, _), line in zip(batch, lines)]

    return [f"{n} {by_number[k]}" if k in by_number else "" for (n, _, _), k in zip(batch, wanted)]


def translate_batch(batch, lang):
    """Translate one batch of subtitle segments into ``lang`` (rate limited).

    Args:
        batch: list of ``(number, timestamp, text)`` tuples. Only the number
            and the text are sent to the model.
        lang: target language name, inserted into the prompt.

    Returns:
        A list with exactly ``len(batch)`` strings, in batch order. Each entry
        is ``"<number> <translation>"``, or ``""`` when the response held
        nothing usable for that segment (and for every segment when all three
        attempts fail).
    """
    if not batch:
        return []

    # Collapse internal whitespace/newlines so every segment occupies exactly
    # one prompt line; multi-line subtitles would otherwise break the
    # "one line per subtitle" contract the prompt asks for.
    joined = "\n".join(f"{n} {' '.join(t.split())}" for n, _, t in batch)

    prompt = f"""
Translate to {lang}. Keep meaning natural.
Do NOT translate timestamps or numbers.
Return one line per subtitle, starting with the number.

{joined}
"""

    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            r = rate_limited_request(prompt)
            # A missing or empty text is treated as a failed attempt.
            lines = [ln.strip() for ln in (r.text or "").splitlines() if ln.strip()]
            if not lines:
                raise ValueError("empty response")

            aligned = _align_translations(batch, lines)
            missing = sum(1 for line in aligned if not line)
            if missing:
                print(f"{lang}: {missing} of {len(batch)} segments missing from response")
            return aligned
        except Exception as e:
            print(f"{lang} retry due to: {e}")
            # Only pause when another attempt will follow.
            if attempt < max_attempts - 1:
                time.sleep(2)

    return [""] * len(batch)

# === Process One Language ===
def _strip_segment_number(line, num):
    """Remove a leading ``num`` label (e.g. ``"12 "``, ``"12. "``) from ``line``.

    Only a prefix whose numeric value equals ``num`` is removed; digits that
    appear later in the translation are left untouched.
    """
    m = re.match(r"\s*(\d+)\s*[.):\-]?\s+", line)
    if m and int(m.group(1)) == int(num):
        return line[m.end():]
    return line


def process_language(lang):
    """Translate every ``.srt`` file in ``asr_dir`` into ``lang`` and save it.

    For each file the subtitles are parsed with the module-level ``pattern``,
    translated in batches of 10 through ``translate_batch`` on a 3-worker
    thread pool, and written as ``<base>_<lang>.srt`` and
    ``<base>_<lang>.txt`` inside ``out_dir/<lang>``.

    Args:
        lang: target language name; also used as the output sub-folder name.

    Returns:
        The string ``"<lang> completed."``.
    """

    print(f"\nüåê Starting: {lang}")

    lang_folder = os.path.join(out_dir, lang)
    os.makedirs(lang_folder, exist_ok=True)

    # sorted(): os.listdir returns entries in arbitrary order.
    for fname in sorted(os.listdir(asr_dir)):
        if not fname.endswith(".srt"):
            continue

        with open(os.path.join(asr_dir, fname), "r", encoding="utf-8") as src:
            srt_text = src.read()
        entries = re.findall(pattern, srt_text, flags=re.DOTALL)

        # Batches of 10 (number, timestamp, stripped text) tuples.
        batches = [
            [(n, t, s.strip()) for n, t, s in entries[i:i + 10]]
            for i in range(0, len(entries), 10)
        ]

        results = [None] * len(batches)

        # Only 3 parallel workers per language
        with ThreadPoolExecutor(max_workers=3) as ex:
            future_map = {
                ex.submit(translate_batch, chunk, lang): i
                for i, chunk in enumerate(batches)
            }
            for f in as_completed(future_map):
                idx = future_map[f]
                results[idx] = f.result()
                print(f"{lang} ‚Üí batch {idx+1}/{len(batches)}")

        # Build SRT + TXT outputs
        srt_out, txt_out = [], []
        for batch, translated in zip(batches, results):
            # Pad short responses so zip() cannot silently drop segments;
            # a missing translation becomes an empty subtitle instead.
            translated = list(translated)
            translated += [""] * (len(batch) - len(translated))
            for (num, ts, _), line in zip(batch, translated):
                line = _strip_segment_number(line, num)
                srt_out.append(f"{num}\n{ts}\n{line}\n")
                txt_out.append(line)

        base = os.path.splitext(fname)[0]
        with open(os.path.join(lang_folder, f"{base}_{lang}.srt"), "w", encoding="utf-8") as dst:
            dst.write("\n".join(srt_out))
        with open(os.path.join(lang_folder, f"{base}_{lang}.txt"), "w", encoding="utf-8") as dst:
            dst.write("\n".join(txt_out))

    return f"{lang} completed."

# === Run 11 languages in parallel ===
# Fan out: submit one process_language job per target language, then report
# each language's summary string as soon as its job finishes.
with ThreadPoolExecutor(max_workers=11) as pool:
    jobs = []
    for language in languages:
        jobs.append(pool.submit(process_language, language))

    for finished in as_completed(jobs):
        summary = finished.result()
        print("‚úî", summary)

print("\nüéâ ALL LANGUAGES DONE (RATE SAFE)!")


Tamil ‚Üí batch 2/31
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

üåê Starting: Hindi

üåê Starting: Tamil

üåê Starting: Telugu

üåê Starting: Bengali

üåê Starting: Kannada

üåê Starting: Malayalam

üåê Starting: Marathi

üåê Starting: Gujarati

üåê Starting: Punjabi

üåê Starting: Odia

üåê Starting: Urdu
Telugu retry due to: Server disconnected without sending a response.
Bengali retry due to: Server disconnected without sending a response.
Bengali ‚Üí batch 2/31
Tamil ‚Üí batch 2/31
Telugu retry due to: [SSL: DECRYPTION_FAILED_OR_BAD_RECORD_MAC] decryption failed or bad record mac (_ssl.c:2580)Tamil retry due to: [SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:2580)

Bengali ‚Üí batch 1/31
Bengali ‚Üí batch 2/31
Marathi retry due to: [SSL: DECRYPTION_FAILED_OR_BAD_RECORD_MAC] decryption failed or bad record mac (_ssl.c:2580)
Urdu retry due to: [SSL: WRONG_VERSION_NUMBER] wrong 

KeyboardInterrupt: 