new logic


In [None]:
from google.colab import drive, userdata
import os
import re
import time
from google import genai

# === Mount Google Drive and API ===
drive.mount('/content/drive')

# Read the key from Colab secrets and stop with a clear message when it is
# missing or empty; os.environ only accepts str values, so storing a missing
# value would otherwise fail with an opaque TypeError.
api_key = userdata.get('GOOGLE_API_KEY')
if not api_key:
    raise ValueError("No GOOGLE_API_KEY found in Colab secrets.")
os.environ["GOOGLE_API_KEY"] = api_key
client = genai.Client(api_key=api_key)

# === Paths ===
base_dir = "/content/drive/My Drive/Test_quality/gemini25pro/"
asr_dir = os.path.join(base_dir, "Abhishek")  # input SRTs
mt_dir = os.path.join(base_dir, "mt")  # translated output
os.makedirs(mt_dir, exist_ok=True)

target_language = "English"
print("üü¢ Ready ‚Äî Processing all .srt files...")

# === SRT parsing pattern ===
pattern = r"(\d+)\s+([\d:,]+ --> [\d:,]+)\s+(.+?)(?=\n\d+\n|$)"

def translate_batch(lines):
    """Translate a batch of subtitle texts with Gemini, keeping them aligned.

    Args:
        lines: List of subtitle text chunks; a chunk may span several lines.

    Returns:
        A list with exactly ``len(lines)`` items. Item ``i`` is the translation
        of ``lines[i]``, or ``""`` when the model returned nothing usable for
        that subtitle or every attempt failed.
    """
    if not lines:
        return []  # nothing to translate; skip the API call

    # Put every subtitle on exactly one line and tag it with its position, so
    # the reply can be matched back even if the model drops or adds lines.
    tagged_lines = []
    for idx, text in enumerate(lines, start=1):
        tagged_lines.append(f"[{idx}] " + " ".join(text.split()))
    joined_text = "\n".join(tagged_lines)

    prompt = f"""
You are a professional subtitle translator for Indic languages.

Translate the following subtitle dialogue into {target_language}.
Preserve meaning. Keep subtitles short and natural.
Do NOT translate numbers or timestamps.
Every subtitle starts with a marker such as [1]. Return exactly one line per
subtitle, in order, and keep its [n] marker unchanged at the start of the line.

Text:
{joined_text}
"""
    marker = re.compile(r"^\[(\d+)\]\s*(.*)$")
    max_attempts = 3

    for attempt in range(1, max_attempts + 1):  # retry logic
        try:
            response = client.models.generate_content(
                model="gemini-2.5-pro",  # or gemini-2.0-pro if you have access
                contents=prompt
            )
            reply_lines = [ln.strip() for ln in (response.text or "").splitlines()]
            reply_lines = [ln for ln in reply_lines if ln]
            if not reply_lines:
                raise ValueError("empty response from model")
        except Exception as e:
            print("Retrying batch due to error:", e)
            if attempt < max_attempts:
                time.sleep(3)
            continue

        translated = [""] * len(lines)
        found_marker = False
        for ln in reply_lines:
            hit = marker.match(ln)
            if hit:
                pos = int(hit.group(1))
                if 1 <= pos <= len(lines):
                    translated[pos - 1] = hit.group(2).strip()
                    found_marker = True
        if not found_marker:
            # The model ignored the markers: fall back to positional order,
            # discarding any surplus lines.
            for pos, ln in enumerate(reply_lines[:len(lines)]):
                translated[pos] = ln
        return translated

    return [""] * len(lines)

# === Loop over all SRT files ===
# For every .srt in asr_dir: parse the cues, translate the text in batches,
# then write a translated .srt and a text-only file into mt_dir.
for f_name in os.listdir(asr_dir):
    if f_name.lower().endswith(".srt"):
        input_file = os.path.join(asr_dir, f_name)
        print(f"\nüé¨ Processing: {f_name}")

        with open(input_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # Each match is a (cue number, timestamp line, cue text) tuple.
        entries = re.findall(pattern, content, flags=re.DOTALL)
        print(f"   ‚Üí {len(entries)} subtitles detected")

        translated_entries, translated_text_only = [], []
        batch_size = 15

        for i in range(0, len(entries), batch_size):
            batch = entries[i:i+batch_size]
            orig_texts = [cue[2].strip() for cue in batch]
            translated_batch = translate_batch(orig_texts)

            # zip() pairs cues with translations and stops at the shorter list.
            for (num, ts, _), trans in zip(batch, translated_batch):
                translated_entries.append(f"{num}\n{ts}\n{trans}\n")
                translated_text_only.append(trans)

            print(f"   ‚úÖ Translated segments {i+1}‚Äì{min(i+batch_size,len(entries))}")

        # Save outputs: "<input stem>_<language>.srt" and ".txt"
        base = os.path.splitext(f_name)[0]
        srt_out = os.path.join(mt_dir, f"{base}_{target_language}.srt")
        txt_out = os.path.join(mt_dir, f"{base}_{target_language}.txt")

        with open(srt_out, "w", encoding='utf-8') as f:
            f.write("\n".join(translated_entries))

        with open(txt_out, "w", encoding='utf-8') as f:
            f.write("\n".join(translated_text_only))

        print(f"   üìÅ Saved ‚Üí {srt_out}")
        print(f"   üìÑ Saved ‚Üí {txt_out}")

print("\n‚úÖ All files translated successfully!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
üü¢ Ready ‚Äî Processing all .srt files...

üé¨ Processing: Chapter 10B - Packaging, Labeling & Branding_Gemini25pro_hi_hi.srt
   ‚Üí 108 subtitles detected
   ‚úÖ Translated segments 1‚Äì15
   ‚úÖ Translated segments 16‚Äì30
   ‚úÖ Translated segments 31‚Äì45
   ‚úÖ Translated segments 46‚Äì60
   ‚úÖ Translated segments 61‚Äì75
   ‚úÖ Translated segments 76‚Äì90
   ‚úÖ Translated segments 91‚Äì105
   ‚úÖ Translated segments 106‚Äì108
   üìÅ Saved ‚Üí /content/drive/My Drive/Test_quality/gemini25pro/mt/Chapter 10B - Packaging, Labeling & Branding_Gemini25pro_hi_hi_English.srt
   üìÑ Saved ‚Üí /content/drive/My Drive/Test_quality/gemini25pro/mt/Chapter 10B - Packaging, Labeling & Branding_Gemini25pro_hi_hi_English.txt

‚úÖ All files translated successfully!


In [None]:
# ========================
# INSTALL & IMPORTS
# ========================
!pip install -q google-generativeai pydub tqdm librosa

import os
import io
from google.colab import drive, userdata
import google.generativeai as genai
from pydub import AudioSegment
from tqdm import tqdm

# ========================
# SETUP
# ========================

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Securely load your Gemini API key from Colab secrets
api_key = userdata.get("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("‚ùå No GOOGLE_API_KEY found in Colab secrets! Add it under 'More ‚Üí Secrets'.")

genai.configure(api_key=api_key)

# Choose your model
model = genai.GenerativeModel("models/gemini-3-pro-preview")

# Input/output folders in Google Drive
base_dir = "/content/drive/MyDrive/Test_28_Adnew_wav/"
input_dir = "/content/drive/MyDrive/Test_28_Adnew_wav/Tamil/"
output_dir = os.path.join(base_dir, "TestGem3","Tamil","Srtformatissue")
os.makedirs(output_dir, exist_ok=True)

# ========================
# HELPER FUNCTIONS
# ========================

def transcribe_audio_file(file_path):
    """Send one WAV file to the configured model and return its reply text.

    The file is loaded with pydub, re-exported to an in-memory WAV buffer and
    passed to ``model.generate_content`` together with the SRT instruction
    prompt below.

    Args:
        file_path: Path of the WAV file to transcribe.

    Returns:
        The stripped response text, or ``""`` if the request or the access to
        ``response.text`` raised (the error is printed, not re-raised).
    """
    audio = AudioSegment.from_wav(file_path)
    buffer = io.BytesIO()
    audio.export(buffer, format="wav")
    audio_bytes = buffer.getvalue()

    # Single user turn with two parts: the raw audio bytes, then the prompt.
    try:
        response = model.generate_content(
        contents=[
            {
                "role": "user",
                "parts": [
                    {"mime_type": "audio/wav", "data": audio_bytes},
                    """
                    You are a Subtitle Generator.

                    Transcribe this audio exactly as spoken (strictly: no extra comments, strictly: no filler words)
                    in valid .srt format.

                    Before outputting, you MUST internally ensure:

                    - Each subtitle segment must contain **exactly 3 sentences**, unless the audio ends and fewer remain.
                    - Maintain natural sentence boundaries.
                    - Combine sentences smoothly while keeping meaning and flow.
                    - Only create a new segment after exactly 3 sentences have been completed (except the final segment).
                    - Timestamp continuity must be correct and must not overlap.
                    - Format must strictly be:

                      <index>
                      HH:MM:SS,SSS --> HH:MM:SS,SSS
                      text

                    Rules:
                    1. Timestamps must be chronological and continuous.
                    2. Every segment contains exactly 3 sentences (except final).
                    3. Never generate timestamps beyond the audio duration.
                    4. If Gemini outputs incorrect timestamps, fix them BEFORE final output.
                    5. No explanations. Only the final SRT.
                    6. Include speaker labels if detectable.
                    7. Silence > 2 seconds ‚Üí include:
                      [Silence]
                      with correct timestamps.


                    """
                ]
            }
        ]
    )


        return response.text.strip()
    except Exception as e:
        # Best-effort: report the failure and let the caller continue with
        # the next file; the caller receives an empty string.
        print("‚ùå Error:", e)
        return ""

# ========================
# MAIN PROCESS
# ========================

for filename in os.listdir(input_dir):
    if filename.lower().endswith(".wav"):
        file_path = os.path.join(input_dir, filename)
        print(f"\nüéß Transcribing full audio: {filename}")

        # Get full transcription
        text = transcribe_audio_file(file_path)
        if not text:
            # transcribe_audio_file returns "" after printing the error;
            # do not save an empty file or report success for it.
            print(f"Skipped (no transcription returned): {filename}")
            continue

        # Build the output name with splitext: the extension filter above is
        # case-insensitive, but str.replace(".wav", ...) is not, so "X.WAV"
        # kept its audio extension and a ".wav" inside the stem was mangled.
        stem = os.path.splitext(filename)[0]
        txt_output = os.path.join(output_dir, stem + ".txt")
        with open(txt_output, "w", encoding="utf-8") as f:
            f.write(text)

        print(f"‚úÖ Done: {filename}")
        print(f"üìÑ TXT saved to: {txt_output}")


Mounted at /content/drive

üéß Transcribing full audio: Chapter 4H - Kachori.wav
‚úÖ Done: Chapter 4H - Kachori.wav
üìÑ TXT saved to: /content/drive/MyDrive/Test_28_Adnew_wav/TestGem3/Tamil/Srtformatissue/Chapter 4H - Kachori.txt

üéß Transcribing full audio: Kadai Kamal Stitch Final.wav
‚úÖ Done: Kadai Kamal Stitch Final.wav
üìÑ TXT saved to: /content/drive/MyDrive/Test_28_Adnew_wav/TestGem3/Tamil/Srtformatissue/Kadai Kamal Stitch Final.txt


# check given asr gemini prompt

In [None]:
# Colab-ready Gemini 2.5 Pro transcription + SRT/TXT/JSON writer
# NOTE: adapt request payload if your installed google-genai SDK has a different signature.
# Docs & Colab Quickstart reference: https://ai.google.dev/api and the Gemini audio quickstart.

# 1) Install SDK (run once)
!pip install --quiet google-genai

# 2) Imports
import os
import json
import base64
from google import genai
from datetime import timedelta

# ---------- User config - fill these ----------
# SECURITY: the key is read from the GEMINI_API_KEY environment variable and
# must never be pasted into the notebook. A literal key was previously embedded
# on this line (as the variable *name*, so it was not even used); treat that
# key as leaked and revoke it.
API_KEY = os.environ.get("GEMINI_API_KEY") or "<PASTE_YOUR_GEMINI_API_KEY_HERE>"
AUDIO_PATH = "/content/testprom.mp3"   # upload your file to this path in Colab
AUDIO_DURATION = "00:17:58,884"        # Fill actual duration (HH:MM:SS,mmm) or compute it
HOTWORDS = ["SmartQP", "EduTrack", "CBSE", "NEP"]
CONTEXT = "educational lecture"
EXPECTED_LANGS = ["auto"]  # or explicit like ["ta","en"]
MODEL = "gemini-2.5-pro"
# ---------------------------------------------

if API_KEY.startswith("<PASTE"):
    raise SystemExit("Please set GEMINI_API_KEY environment variable or paste your key into API_KEY.")

# 3) Initialize client
client = genai.Client(api_key=API_KEY)

# 4) Build the comprehensive prompt (string). We send the JSON schema as text instructions.
# This is an f-string, so the literal braces of the JSON example are doubled
# ({{ and }}); single braces are reserved for interpolated values.
master_prompt = f"""
Task: Full-Fidelity Transcription with Metadata Extraction
Model: {MODEL}
Temperature: 0.0
Audio Duration: {AUDIO_DURATION}
Hotwords: {json.dumps(HOTWORDS)}
Context: {CONTEXT}
Expected Languages: {EXPECTED_LANGS}
Rules:
- Transcribe exactly as spoken. DO NOT translate or summarize.
- Preserve native script for code-mixed speech. Do not transliterate.
- Do not modify hotwords; keep them exactly as provided.
- If unclear audio: mark as [inaudible].
- Mark pauses >2s as [Silence].
- Provide sentence-level segments, each with start & end timestamps (HH:MM:SS,mmm).
- Diarize: label speakers as Speaker 1, Speaker 2, ...
- Generate .srt, .txt and .json outputs.
Output JSON schema expectations:
{{
  "model": "{MODEL}",
  "temperature": 0.0,
  "detected_languages": [],
  "duration": "{AUDIO_DURATION}",
  "segments": [
    {{ "index": 1, "speaker": "Speaker 1", "start": "00:00:00,000", "end": "00:00:07,500", "text": "...", "confidence": 0.0 }},
    ...
  ],
  "metadata": {{
    "beam_width": 5,
    "silence_intervals": [],
    "hotwords_used": {json.dumps(HOTWORDS)},
    "code_mixing_detected": true
  }}
}}
"""

# 5) Attach the audio to the request.
import re  # used only by the JSON-extraction fallback at the end of this step

# MIME types documented for Gemini audio input, keyed by file extension.
AUDIO_MIME_TYPES = {
    ".wav": "audio/wav",
    ".mp3": "audio/mp3",
    ".aiff": "audio/aiff",
    ".aac": "audio/aac",
    ".ogg": "audio/ogg",
    ".flac": "audio/flac",
}
# Inline requests have a total size cap, so larger files go through the
# Files API instead of being embedded in the request body.
INLINE_AUDIO_LIMIT_BYTES = 15 * 1024 * 1024

audio_ext = os.path.splitext(AUDIO_PATH)[1].lower()
audio_mime = AUDIO_MIME_TYPES.get(audio_ext)
if audio_mime is None:
    raise SystemExit(f"Unsupported audio extension {audio_ext!r} for {AUDIO_PATH}")

if os.path.getsize(AUDIO_PATH) > INLINE_AUDIO_LIMIT_BYTES:
    audio_part = client.files.upload(file=AUDIO_PATH)
else:
    with open(AUDIO_PATH, "rb") as f:
        audio_bytes = f.read()
    audio_part = genai.types.Part.from_bytes(data=audio_bytes, mime_type=audio_mime)

# 6-7) Send prompt and audio together. google-genai exposes generation through
# client.models.generate_content; the client has no `responses` attribute.
resp = client.models.generate_content(
    model=MODEL,
    contents=[master_prompt, audio_part],
    config=genai.types.GenerateContentConfig(
        temperature=0.0,
        response_mime_type="application/json",  # the code below expects JSON
    ),
)

# --------- Post-processing: the reply text should be the JSON document -----------
raw_text = resp.text
if not raw_text:
    raise SystemExit("No textual output returned by the model. Inspect 'resp' (candidates / prompt feedback).")

# Parse the JSON; if the model wrapped it in extra text, fall back to the
# outermost {...} span.
try:
    parsed = json.loads(raw_text)
except json.JSONDecodeError:
    m = re.search(r"(\{[\s\S]*\})", raw_text)
    if not m:
        raise SystemExit("Response did not contain JSON. Inspect raw_text:\n\n" + raw_text[:2000])
    try:
        parsed = json.loads(m.group(1))
    except json.JSONDecodeError as err:
        raise SystemExit(f"Could not parse JSON from response ({err}). Inspect raw_text:\n\n" + raw_text[:2000])

if not isinstance(parsed, dict):
    raise SystemExit("Expected a JSON object at the top level. Inspect raw_text:\n\n" + raw_text[:2000])

# -------------- Helper write functions ----------------
def write_json(path, data):
    """Write ``data`` to ``path`` as UTF-8 JSON, 2-space indented, non-ASCII kept as-is."""
    with open(path, "w", encoding="utf-8") as out:
        out.write(json.dumps(data, ensure_ascii=False, indent=2))

def srt_time_str(hms_msec):
    """Return ``hms_msec`` unchanged.

    Pass-through hook used when writing SRT cue lines: the value is returned
    exactly as received; no validation or reformatting happens here.
    """
    # Callers are expected to pass "HH:MM:SS,mmm" strings; nothing is checked.
    return hms_msec

def write_srt(path, segments):
    """Write ``segments`` to ``path`` as an SRT file.

    Args:
        path: Destination file path (overwritten).
        segments: Iterable of dicts with optional keys ``index``, ``start``,
            ``end``, ``text`` and ``speaker``.

    Each cue is written as index line, ``start --> end`` line, text line and a
    blank line. A missing ``index`` falls back to the cue's 1-based position,
    and a missing or null ``text`` is written as an empty line instead of
    raising. When ``speaker`` is set, the text is prefixed with it.
    """
    with open(path, "w", encoding="utf-8") as wf:
        for position, seg in enumerate(segments, start=1):
            idx = seg.get("index")
            if idx is None:
                idx = position  # model omitted the cue number
            start = seg.get("start")
            end = seg.get("end")
            # `or ""` also covers an explicit JSON null, which .get's default does not.
            text = (seg.get("text") or "").strip()
            # prefix speaker if present
            speaker = seg.get("speaker")
            if speaker:
                text = f"{speaker}: {text}"
            wf.write(f"{idx}\n")
            wf.write(f"{srt_time_str(start)} --> {srt_time_str(end)}\n")
            wf.write(f"{text}\n\n")

def write_txt(path, segments):
    """Write a plain-text transcript grouped by speaker.

    A ``<speaker>:`` header (preceded by a blank line) is emitted whenever the
    speaker differs from the previous segment's; segments without a
    ``speaker`` key are labelled ``Speaker``. Each segment's stripped text
    follows on its own line.
    """
    with open(path, "w", encoding="utf-8") as wf:
        previous = None
        for seg in segments:
            label = seg.get("speaker", "Speaker")
            if label != previous:
                print(f"\n{label}:", file=wf)
                previous = label
            print(seg.get("text", "").strip(), file=wf)

# -------------- Save files ----------------
# The three outputs share one prefix derived from the audio file name and are
# written exactly as the model returned them (no timestamp adjustment).
segments = parsed.get("segments")
if segments:
    out_prefix = os.path.splitext(os.path.basename(AUDIO_PATH))[0]
    json_path = f"/content/{out_prefix}_transcript.json"
    srt_path  = f"/content/{out_prefix}_transcript.srt"
    txt_path  = f"/content/{out_prefix}_transcript.txt"
else:
    raise SystemExit("Parsed JSON does not contain 'segments'. Inspect parsed json keys: " + ", ".join(parsed.keys()))

for writer, target, payload in (
    (write_json, json_path, parsed),
    (write_srt, srt_path, segments),
    (write_txt, txt_path, segments),
):
    writer(target, payload)

print("Wrote:", json_path, srt_path, txt_path)




# Mod prompt run

In [None]:
# =====================================================
# INSTALL & IMPORTS
# =====================================================
!pip install -q google-genai pydub tqdm librosa

import os
import io
import json
import base64
from datetime import timedelta
from tqdm import tqdm
from pydub import AudioSegment
from google.colab import drive, userdata
from google import genai

# =====================================================
# SETUP
# =====================================================

# Mount Drive
drive.mount('/content/drive', force_remount=True)

# API Key (from Colab Secrets)
api_key = userdata.get("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("‚ùå No GOOGLE_API_KEY found in Colab secrets! Add it under 'More ‚Üí Secrets'.")

client = genai.Client(api_key=api_key)
MODEL = "gemini-2.5-pro"

# Input/output paths
base_dir = "/content/drive/MyDrive/Test_28_Adnew_wav/"
input_dir = os.path.join(base_dir, "English")
output_dir = os.path.join(base_dir, "Test_28_srtimprv", "newpromt")
os.makedirs(output_dir, exist_ok=True)

# =====================================================
# FUNCTIONS
# =====================================================

def get_audio_duration(file_path):
    """Return the length of an audio file as an ``HH:MM:SS,mmm`` string.

    Args:
        file_path: Path of the audio file, loaded with ``AudioSegment.from_file``.

    Returns:
        The duration formatted like an SRT timestamp. Hours are not capped,
        so very long files yield an hour field above 23.
    """
    audio = AudioSegment.from_file(file_path)
    total_ms = len(audio)  # segment length, treated as milliseconds

    # Integer arithmetic only: parsing str(timedelta) back through float lost
    # a millisecond for some values (1001 ms came out as ",000") and failed
    # for durations of a day or more.
    total_seconds, ms = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}"

def build_prompt(duration, context="english speech", hotwords=None, langs=None):
    """Build the transcription instruction prompt sent alongside the audio.

    Args:
        duration: Audio duration string, inserted verbatim into the prompt.
        context: Short description of the recording.
        hotwords: Terms to keep verbatim; rendered with ``json.dumps``.
            ``None`` or an empty value becomes ``[]``.
        langs: Expected language codes; ``None`` or an empty value becomes
            ``["auto"]``. Rendered with Python's list formatting.

    Returns:
        The prompt text, including an example of the expected JSON output.
    """
    hotwords = hotwords or []
    langs = langs or ["auto"]
    # Doubled braces ({{ }}) are literal braces in the f-string below.
    return f"""
Task: Full-Fidelity Transcription with Metadata Extraction
Model: {MODEL}
Temperature: 0.0
Audio Duration: {duration}
Hotwords: {json.dumps(hotwords)}
Context: {context}
Expected Languages: {langs}
Rules:
- Transcribe exactly as spoken. DO NOT translate or summarize.
- Preserve native script for code-mixed speech.
- Do not modify hotwords; keep them exactly as provided.
- If unclear audio: mark as [inaudible].
- Mark pauses >2s as [Silence].
- Provide sentence-level segments with timestamps (HH:MM:SS,mmm).
- Diarize speakers: Speaker 1, Speaker 2, ...
- Generate .srt, .txt, and .json outputs.
Output JSON schema:
{{
  "model": "{MODEL}",
  "duration": "{duration}",
  "segments": [
    {{ "index": 1, "speaker": "Speaker 1", "start": "00:00:00,000", "end": "00:00:07,500", "text": "...", "confidence": 0.0 }},
    ...
  ]
}}
"""

def write_json(path, data):
    """Dump ``data`` into ``path`` as indented UTF-8 JSON without ASCII escaping."""
    serialized = json.dumps(data, ensure_ascii=False, indent=2)
    with open(path, "w", encoding="utf-8") as handle:
        handle.write(serialized)

def write_srt(path, segments):
    """Write ``segments`` to ``path`` in SRT layout.

    Args:
        path: Destination file path (overwritten).
        segments: Iterable of dicts with optional keys ``index``, ``start``,
            ``end``, ``text`` and ``speaker``.

    A missing ``index`` falls back to the cue's 1-based position, and a
    missing or null ``text`` is written as an empty line instead of raising.
    When ``speaker`` is set, the text is prefixed with it.
    """
    with open(path, "w", encoding="utf-8") as wf:
        for position, seg in enumerate(segments, start=1):
            idx = seg.get("index")
            if idx is None:
                idx = position  # model omitted the cue number
            start = seg.get("start")
            end = seg.get("end")
            # `or ""` also covers an explicit JSON null.
            text = (seg.get("text") or "").strip()
            spk = seg.get("speaker")
            if spk:
                text = f"{spk}: {text}"
            wf.write(f"{idx}\n{start} --> {end}\n{text}\n\n")

def write_txt(path, segments):
    """Write a speaker-grouped plain-text transcript to ``path``.

    A blank line and a ``<speaker>:`` header are written each time the speaker
    changes (default label ``Speaker``), followed by one stripped text line
    per segment.
    """
    with open(path, "w", encoding="utf-8") as wf:
        last_label = None
        for seg in segments:
            label = seg.get("speaker", "Speaker")
            speaker_changed = label != last_label
            if speaker_changed:
                print(f"\n{label}:", file=wf)
                last_label = label
            print(seg.get("text", "").strip(), file=wf)

# =====================================================
# MAIN LOOP
# =====================================================

import re  # JSON-extraction fallback; imported once instead of on every iteration

# Inline requests have a total size cap, so larger files are uploaded through
# the Files API instead of being embedded in the request body.
INLINE_AUDIO_LIMIT_BYTES = 15 * 1024 * 1024

for filename in tqdm(os.listdir(input_dir), desc="Processing audio files"):
    if not filename.lower().endswith(".wav"):
        continue

    file_path = os.path.join(input_dir, filename)
    print(f"\nüéß Transcribing: {filename}")

    # Get audio duration
    duration = get_audio_duration(file_path)

    # Attach the real audio (previously only a placeholder string was sent).
    if os.path.getsize(file_path) > INLINE_AUDIO_LIMIT_BYTES:
        audio_part = client.files.upload(file=file_path)
    else:
        with open(file_path, "rb") as f:
            audio_bytes = f.read()
        audio_part = genai.types.Part.from_bytes(data=audio_bytes, mime_type="audio/wav")

    # Build prompt
    # NOTE(review): input_dir points at the "English" folder while the prompt
    # says Tamil / ["ta"] — confirm which is intended.
    master_prompt = build_prompt(duration, context="Tamil lecture", hotwords=["SmartQP", "EduTrack", "CBSE", "NEP"], langs=["ta"])

    # Send to Gemini. google-genai exposes generation through
    # client.models.generate_content; the client has no `responses` attribute.
    resp = client.models.generate_content(
        model=MODEL,
        contents=[master_prompt, audio_part],
        config=genai.types.GenerateContentConfig(
            temperature=0.0,
            response_mime_type="application/json",  # the parser below expects JSON
        ),
    )

    # Extract text response; an empty reply skips this file, not the whole run.
    raw_text = resp.text
    if not raw_text:
        print(f"‚ö†Ô∏è No textual output returned. Check response format. ({filename})")
        continue

    # Parse JSON output, falling back to the outermost {...} span.
    try:
        parsed = json.loads(raw_text)
    except json.JSONDecodeError:
        m = re.search(r"(\{[\s\S]*\})", raw_text)
        try:
            parsed = json.loads(m.group(1)) if m else {"segments": []}
        except json.JSONDecodeError:
            parsed = {"segments": []}
    if not isinstance(parsed, dict):
        parsed = {"segments": []}

    # Write outputs
    out_prefix = os.path.splitext(filename)[0]
    json_path = os.path.join(output_dir, f"{out_prefix}.json")
    srt_path  = os.path.join(output_dir, f"{out_prefix}.srt")
    txt_path  = os.path.join(output_dir, f"{out_prefix}.txt")

    if parsed.get("segments"):
        write_json(json_path, parsed)
        write_srt(srt_path, parsed["segments"])
        write_txt(txt_path, parsed["segments"])
        print(f"‚úÖ Saved: {out_prefix}.json / .srt / .txt")
    else:
        print(f"‚ö†Ô∏è No segments found in response for {filename}")


  m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)
  m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)
  elif re.match('(flt)p?( \(default\))?$', token):
  elif re.match('(dbl)p?( \(default\))?$', token):


Mounted at /content/drive


Processing audio files:   0%|          | 0/7 [00:00<?, ?it/s]


üéß Transcribing: Chapter 1A - Concept of Basic Electricity Voltage, Currents, Resistance, Impedance & Power Factor.wav


Processing audio files:   0%|          | 0/7 [00:02<?, ?it/s]


AttributeError: 'Client' object has no attribute 'responses'

# Gemini ASR+MT

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# ============================
# INSTALL & IMPORTS
# ============================
!pip install -q google-generativeai pydub tqdm librosa

import os, io, re, time
from google.colab import drive, userdata
import google.generativeai as genai
from pydub import AudioSegment
from tqdm import tqdm

# ============================
# SETUP
# ============================
drive.mount('/content/drive', force_remount=True)

api_key = userdata.get("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("‚ùå No GOOGLE_API_KEY found in Colab secrets! Add it under 'More ‚Üí Secrets'.")
genai.configure(api_key=api_key)
model = genai.GenerativeModel("models/gemini-2.5-pro")

# === Base folders ===
base_dir = "/content/drive/MyDrive/Test_28_Adnew_wav"
input_dir = os.path.join(base_dir, "Tamil_wav")          # Folder of input .wav files
asr_dir = os.path.join(base_dir, "asr_srt_raw")          # Tamil raw SRT
fixed_dir = os.path.join(base_dir, "asr_srt_fixed")      # Tamil fixed SRT
mt_dir = os.path.join(base_dir, "mt_Telugu")             # Telugu translated SRT/TXT
os.makedirs(asr_dir, exist_ok=True)
os.makedirs(fixed_dir, exist_ok=True)
os.makedirs(mt_dir, exist_ok=True)

# ============================
# 1Ô∏è‚É£ ASR: Transcribe Audio to SRT (Tamil)
# ============================
def transcribe_audio_file(file_path):
    """Ask the model for a Tamil SRT transcript of one WAV file.

    The audio is re-exported to an in-memory WAV buffer and sent with the
    instruction prompt. Returns the stripped reply text, or ``""`` after
    printing the error if the request fails.
    """
    wav_stream = io.BytesIO()
    AudioSegment.from_wav(file_path).export(wav_stream, format="wav")

    instructions = """
            You are a Subtitle Generator:
            Transcribe this audio exactly as spoken in Tamil in proper .srt format.
            Keep full timestamps (HH:MM:SS,mmm), sequential numbering, and mark silences as [Silence].
            """
    request_parts = [
        {"mime_type": "audio/wav", "data": wav_stream.getvalue()},
        instructions,
    ]

    try:
        transcript = model.generate_content(request_parts).text.strip()
    except Exception as e:
        print("‚ùå Error:", e)
        transcript = ""
    return transcript

for filename in os.listdir(input_dir):
    if filename.lower().endswith(".wav"):
        file_path = os.path.join(input_dir, filename)
        print(f"\nüéß Transcribing: {filename}")
        text = transcribe_audio_file(file_path)
        if not text:
            # transcribe_audio_file returns "" after printing the error; an
            # empty .srt would only feed empty input to the later stages.
            print(f"Skipped (no transcription returned): {filename}")
            continue
        # splitext instead of str.replace: the filter above ignores case, but
        # replace(".wav", ...) does not, so "X.WAV" kept its audio extension.
        stem = os.path.splitext(filename)[0]
        out_path = os.path.join(asr_dir, stem + ".srt")
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(text)
        print(f"‚úÖ Saved raw Tamil SRT ‚Üí {out_path}")

# ============================
# 2Ô∏è‚É£ Fix SRT timestamps
# ============================
def normalize_timestamp(ts: str) -> str:
    """Coerce a loosely formatted timestamp into ``HH:MM:SS,mmm``.

    Accepts ``SS``, ``MM:SS`` or ``HH:MM:SS`` with an optional fraction
    introduced by ``.`` or ``,``. The fraction is cut or zero-padded to three
    digits. Overflowing fields are carried (``75:30`` becomes ``01:15:30``) so
    the result is always a valid SRT timestamp.

    Returns:
        The normalised timestamp, or ``"00:00:00,000"`` when a field is not a
        non-negative integer.
    """
    ts = ts.strip().replace('.', ',')
    if ',' in ts:
        time_part, ms = ts.split(',', 1)
        ms = re.sub(r'\D', '', ms)[:3].ljust(3, '0')
    else:
        time_part, ms = ts, '000'

    parts = time_part.split(':')
    # Left-pad missing hour/minute fields; keep only the last three otherwise.
    fields = ['0'] * (3 - len(parts)) + parts[-3:]
    try:
        h, m, s = (int(field) for field in fields)
    except ValueError:
        return "00:00:00,000"
    if min(h, m, s) < 0:
        return "00:00:00,000"

    total_seconds = h * 3600 + m * 60 + s
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{ms}"

def fix_srt_file(input_path, output_path):
    """Rewrite the timestamp lines of an SRT file in normalised form.

    A line is treated as a timestamp line only when the *whole* line is
    ``<time> <arrow> <time>``. ``<time>`` is one to three colon-separated
    numbers with an optional ``.``/``,`` fraction, and the arrow is one or
    more dashes followed by ``>``. Such lines are replaced by
    ``<normalised> --> <normalised>``; every other line, including subtitle
    text that contains digit ranges like ``100-200``, is copied unchanged.

    Args:
        input_path: SRT file to read (undecodable bytes are ignored).
        output_path: Destination file (overwritten).
    """
    ts = r'\d+(?::\d+){0,2}(?:[.,]\d+)?'
    # Dash variants are written as escapes (hyphen, en dash, em dash) so the
    # pattern does not depend on the file's encoding.
    ts_pattern = re.compile(
        r'^\s*(' + ts + r')\s*[-\u2013\u2014]+>\s*(' + ts + r')\s*$'
    )
    with open(input_path, 'r', encoding='utf-8', errors='ignore') as f:
        lines = f.readlines()

    new_lines = []
    for line in lines:
        match = ts_pattern.match(line)
        if match:
            start, end = match.groups()
            new_lines.append(f"{normalize_timestamp(start)} --> {normalize_timestamp(end)}\n")
        else:
            new_lines.append(line)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(new_lines)

# Normalise every raw SRT from asr_dir into fixed_dir under the same file name.
for file in os.listdir(asr_dir):
    if not file.endswith(".srt"):
        continue
    inp, out = os.path.join(asr_dir, file), os.path.join(fixed_dir, file)
    print(f"üõ† Fixing timestamps in {file}")
    fix_srt_file(inp, out)
    print(f"‚úÖ Fixed file saved to {out}")

# ============================
# 3Ô∏è‚É£ MT: Translate Tamil SRT ‚Üí Telugu
# ============================
pattern = r"(\d+)\s+([\d:,]+ --> [\d:,]+)\s+(.+?)(?=\n\d+\n|$)"
target_language = "Telugu"

def translate_batch(lines):
    """Translate a batch of Tamil subtitle texts, keeping them aligned.

    Args:
        lines: List of subtitle text chunks; a chunk may span several lines.

    Returns:
        A list with exactly ``len(lines)`` items. Item ``i`` is the translation
        of ``lines[i]``, or ``""`` when the model returned nothing usable for
        that subtitle or every attempt failed.
    """
    if not lines:
        return []  # nothing to translate; skip the API call

    # One line per subtitle, tagged with its position, so the reply can be
    # matched back even if the model drops or adds lines.
    tagged_lines = []
    for idx, text in enumerate(lines, start=1):
        tagged_lines.append(f"[{idx}] " + " ".join(text.split()))
    joined_text = "\n".join(tagged_lines)

    prompt = f"""
    You are a professional subtitle translator.
    Translate the following Tamil subtitle text into {target_language}.
    Preserve timing and style; do not translate timestamps or numbers.
    Every subtitle starts with a marker such as [1]. Return exactly one line
    per subtitle and keep its [n] marker unchanged at the start of the line.
    Text:
    {joined_text}
    """
    marker = re.compile(r"^\[(\d+)\]\s*(.*)$")
    max_attempts = 3

    for attempt in range(1, max_attempts + 1):
        try:
            response = model.generate_content(prompt)
            reply_lines = [ln.strip() for ln in (response.text or "").splitlines()]
            reply_lines = [ln for ln in reply_lines if ln]
            if not reply_lines:
                raise ValueError("empty response from model")
        except Exception as e:
            print("Retrying due to:", e)
            if attempt < max_attempts:
                time.sleep(3)
            continue

        translated = [""] * len(lines)
        found_marker = False
        for ln in reply_lines:
            hit = marker.match(ln)
            if hit:
                pos = int(hit.group(1))
                if 1 <= pos <= len(lines):
                    translated[pos - 1] = hit.group(2).strip()
                    found_marker = True
        if not found_marker:
            # The model ignored the markers: fall back to positional order,
            # discarding any surplus lines.
            for pos, ln in enumerate(reply_lines[:len(lines)]):
                translated[pos] = ln
        return translated

    return [""] * len(lines)

# Translate every fixed SRT: parse the cues, translate the text in batches,
# then write "<stem>_<language>.srt" and ".txt" into mt_dir.
for f_name in os.listdir(fixed_dir):
    if f_name.lower().endswith(".srt"):
        input_file = os.path.join(fixed_dir, f_name)
        with open(input_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # Each match is a (cue number, timestamp line, cue text) tuple.
        entries = re.findall(pattern, content, flags=re.DOTALL)
        print(f"\nüåê Translating {f_name} ‚Üí {target_language} ({len(entries)} lines)")

        translated_entries = []
        translated_texts = []
        batch_size = 15

        for i in range(0, len(entries), batch_size):
            batch = entries[i:i+batch_size]
            orig_texts = [cue[2].strip() for cue in batch]
            translated_batch = translate_batch(orig_texts)

            # zip() pairs cues with translations and stops at the shorter list.
            for (num, ts, _), trans in zip(batch, translated_batch):
                translated_entries.append(f"{num}\n{ts}\n{trans}\n")
                translated_texts.append(trans)

        base = os.path.splitext(f_name)[0]
        srt_out = os.path.join(mt_dir, f"{base}_{target_language}.srt")
        txt_out = os.path.join(mt_dir, f"{base}_{target_language}.txt")

        with open(srt_out, "w", encoding="utf-8") as f:
            f.write("\n".join(translated_entries))
        with open(txt_out, "w", encoding="utf-8") as f:
            f.write("\n".join(translated_texts))

        print(f"‚úÖ Translation saved ‚Üí {srt_out}")
        print(f"üìÑ Text saved ‚Üí {txt_out}")

print("\nüéâ All audio processed ‚Üí Tamil + Telugu outputs complete!")


# Gemini ASR

In [None]:
# ========================
# INSTALL & IMPORTS
# ========================
!pip install -q google-generativeai pydub tqdm librosa

import os
import io
from google.colab import drive, userdata
import google.generativeai as genai
from pydub import AudioSegment
from tqdm import tqdm

# ========================
# SETUP
# ========================

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Securely load your Gemini API key from Colab secrets
api_key = userdata.get("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("‚ùå No GOOGLE_API_KEY found in Colab secrets! Add it under 'More ‚Üí Secrets'.")

genai.configure(api_key=api_key)

# Choose your model
model = genai.GenerativeModel("models/gemini-2.5-pro")

# Input/output folders in Google Drive
base_dir = "/content/drive/MyDrive/Test_28_Adnew_wav/"
input_dir = "/content/drive/MyDrive/Test_28_Adnew_wav/English/a/"
output_dir = os.path.join(base_dir, "e5a","Srtformatissue")
os.makedirs(output_dir, exist_ok=True)

# ========================
# HELPER FUNCTIONS
# ========================

def transcribe_audio_file(file_path, retries=3, delay=5):
    """Transcribe one WAV file into SRT-formatted text with Gemini.

    The file is decoded with pydub, re-exported to an in-memory WAV buffer and
    sent to ``model.generate_content`` in a single request (no chunking).
    Failed requests are retried, because a single transient API error would
    otherwise yield an empty transcript.

    Args:
        file_path: Path to the WAV file to transcribe.
        retries: Maximum number of ``generate_content`` attempts.
        delay: Seconds to wait after the first failed attempt; the wait is
            doubled after every further failure.

    Returns:
        The stripped response text, or "" if every attempt failed.
    """
    import time  # local import: this cell's import block does not provide `time`

    audio = AudioSegment.from_wav(file_path)
    buffer = io.BytesIO()
    audio.export(buffer, format="wav")
    audio_bytes = buffer.getvalue()

    # Instruction text sent together with the audio. Its content (including the
    # leading indentation of every line) is kept exactly as before.
    prompt = """
            Your are a Subtitle Generator:
            Transcribe this audio exactly as spoken (no extra comments, no filler words) in the .srt format(Subtitle Format):

            1
            00:00:15,362 --> 00:00:21,789
            ‡§Ö‡§¨ ‡§π‡§Æ ‡§ú‡§æ‡§®‡•á‡§Ç‡§ó‡•á ‡§ï‡•à‡§Ç‡§°‡§≤‡•ç‡§∏ ‡§Æ‡•á‡§Ç ‡§ï‡•ç‡§Ø‡§æ ‡§ï‡•ç‡§Ø‡§æ ‡§ö‡•Ä‡§ú‡§º‡•ã‡§Ç ‡§ï‡•Ä ‡§ú‡§º‡§∞‡•Ç‡§∞‡§§ ‡§™‡§°‡§º‡§§‡•Ä ‡§π‡•à ‡§î‡§∞ ‡§â‡§®‡§ï‡•ã ‡§π‡§Æ ‡§ï‡§π‡§æ‡§Å ‡§∏‡•á ‡§ñ‡§º‡§∞‡•Ä‡§¶ ‡§∏‡§ï‡§§‡•á ‡§π‡•à‡§Ç

            2
            00:00:21,922 --> 00:00:27,422
            ‡§§‡•ã ‡§∏‡§¨‡§∏‡•á ‡§™‡§π‡§≤‡•á ‡§ï‡•à‡§Ç‡§°‡§≤ ‡§¨‡§®‡§æ‡§®‡•á ‡§ï‡•á ‡§≤‡§ø‡§è ‡§π‡§Æ‡•á‡§Ç ‡§°‡§¨‡§≤ ‡§¨‡•â‡§Ø‡§≤‡§∞ ‡§ï‡•Ä ‡§ú‡§º‡§∞‡•Ç‡§∞‡§§ ‡§™‡§°‡§º‡§§‡•Ä ‡§π‡•à ‡§Ø‡•á

            3
            00:00:27,617 --> 00:00:29,853
            ‡§á‡§∏ ‡§§‡§∞‡§π ‡§ï‡§æ ‡§Ø‡•á ‡§á‡§Ç‡§°‡§ï‡•ç‡§∂‡§® ‡§π‡•à

            and so on...

            The transcription should strictly follow the format above, where:
            - **Timestamps** are in the format of HH:MM:SS,SSS --> HH:MM:SS,SSS (with millisecond precision)(Hours:Minutes:Seconds,milliseconds).
            - Each entry should have a **sequential index** starting from 1 (e.g., 1, 2, 3, ...).
            - Even if Hours are not, Keep the Hours format in timestamp like this: 00:00:29,854 --> 00:00:34,500 not like this 00:29,854 --> 00:34,500 or 29,854 --> 34,500.
            - The spoken text should be captured **exactly as it is spoken**, without adding or removing words(but remove filler words).
            - If there is **silence** or a pause, mark the duration with a timestamp like this:
              ```
              4
              00:00:29,854 --> 00:00:34,500
              [Silence]
              ```
            - Include **Speaker labels** (e.g., Speaker 1, Speaker 2) where relevant if multiple speakers are detected.

            Please ensure the output strictly follows the SRT format. Thank you!
            """

    for attempt in range(1, retries + 1):
        try:
            response = model.generate_content([
                {"mime_type": "audio/wav", "data": audio_bytes},
                prompt,
            ])
            return response.text.strip()
        except Exception as e:
            print("‚ùå Error:", e)
            if attempt < retries:
                # Exponential backoff before the next attempt.
                time.sleep(delay * 2 ** (attempt - 1))
    return ""

# ========================
# MAIN PROCESS
# ========================

# Transcribe every WAV file in input_dir and store one transcript per file in output_dir.
for filename in os.listdir(input_dir):
    if not filename.lower().endswith(".wav"):
        continue

    file_path = os.path.join(input_dir, filename)
    print(f"\nüéß Transcribing full audio: {filename}")

    # Get full transcription
    text = transcribe_audio_file(file_path)
    if not text:
        # transcribe_audio_file returns "" when the request failed; writing that
        # out would leave an empty transcript that is reported as done.
        print(f"Skipped (no transcription returned): {filename}")
        continue

    # Save TXT file. splitext replaces the extension regardless of its case
    # (".WAV" passes the filter above) and never touches ".wav" inside the name.
    txt_output = os.path.join(output_dir, os.path.splitext(filename)[0] + ".txt")
    with open(txt_output, "w", encoding="utf-8") as f:
        f.write(text)

    print(f"‚úÖ Done: {filename}")
    print(f"üìÑ TXT saved to: {txt_output}")


  m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)
  m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)
  elif re.match('(flt)p?( \(default\))?$', token):
  elif re.match('(dbl)p?( \(default\))?$', token):


Mounted at /content/drive

üéß Transcribing full audio: Chapter 5A - Use of Growing and Rooting Media in Floriculture.wav


ERROR:tornado.access:503 POST /v1beta/models/gemini-2.5-pro:generateContent?%24alt=json%3Benum-encoding%3Dint (::1) 25289.57ms


‚úÖ Done: Chapter 5A - Use of Growing and Rooting Media in Floriculture.wav
üìÑ TXT saved to: /content/drive/MyDrive/Test_28_Adnew_wav/e5a/Srtformatissue/Chapter 5A - Use of Growing and Rooting Media in Floriculture.txt


Fixed SRT transcription script for Chapter 5A (the prompt now includes timestamp-validation rules)

In [None]:
# ========================
# INSTALL & IMPORTS
# ========================
!pip install -q google-generativeai pydub tqdm librosa

import os
import io
from google.colab import drive, userdata
import google.generativeai as genai
from pydub import AudioSegment
from tqdm import tqdm

# ========================
# SETUP
# ========================

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Securely load your Gemini API key from Colab secrets
api_key = userdata.get("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("‚ùå No GOOGLE_API_KEY found in Colab secrets! Add it under 'More ‚Üí Secrets'.")

genai.configure(api_key=api_key)

# Choose your model
model = genai.GenerativeModel("models/gemini-2.5-pro")

# Input/output folders in Google Drive
base_dir = "/content/drive/MyDrive/Test_28_Adnew_wav/"
input_dir = "/content/drive/MyDrive/Test_28_Adnew_wav/English/a/"
output_dir = os.path.join(base_dir, "e5a","Srtformatissue")
os.makedirs(output_dir, exist_ok=True)

# ========================
# HELPER FUNCTIONS
# ========================

def transcribe_audio_file(file_path, retries=3, delay=5):
    """Transcribe one WAV file into SRT-formatted text with Gemini.

    The file is decoded with pydub, re-exported to an in-memory WAV buffer and
    sent to ``model.generate_content`` in a single request (no chunking).
    Failed requests are retried, because a single transient API error would
    otherwise yield an empty transcript.

    Args:
        file_path: Path to the WAV file to transcribe.
        retries: Maximum number of ``generate_content`` attempts.
        delay: Seconds to wait after the first failed attempt; the wait is
            doubled after every further failure.

    Returns:
        The stripped response text, or "" if every attempt failed.
    """
    import time  # local import: this cell's import block does not provide `time`

    audio = AudioSegment.from_wav(file_path)
    buffer = io.BytesIO()
    audio.export(buffer, format="wav")
    audio_bytes = buffer.getvalue()

    # Instruction text sent together with the audio. Its content (including the
    # leading indentation of every line) is kept exactly as before.
    prompt = """
            Your are a Subtitle Generator:
            Transcribe this audio exactly as spoken (strictly : no extra comments, strictly : no filler words) in the .srt format(Subtitle Format):


            Before outputting the final subtitles, you MUST internally check and fix:
            - Timestamp continuity (no jumps, no time going backward)
            - No timestamp should exceed the audio duration
            - Format must strictly be:
              <index>
              HH:MM:SS,SSS --> HH:MM:SS,SSS
              text

            1
            00:00:15,362 --> 00:00:21,789
            ‡§Ö‡§¨ ‡§π‡§Æ ‡§ú‡§æ‡§®‡•á‡§Ç‡§ó‡•á ‡§ï‡•à‡§Ç‡§°‡§≤‡•ç‡§∏ ‡§Æ‡•á‡§Ç ‡§ï‡•ç‡§Ø‡§æ ‡§ï‡•ç‡§Ø‡§æ ‡§ö‡•Ä‡§ú‡§º‡•ã‡§Ç ‡§ï‡•Ä ‡§ú‡§º‡§∞‡•Ç‡§∞‡§§ ‡§™‡§°‡§º‡§§‡•Ä ‡§π‡•à ‡§î‡§∞ ‡§â‡§®‡§ï‡•ã ‡§π‡§Æ ‡§ï‡§π‡§æ‡§Å ‡§∏‡•á ‡§ñ‡§º‡§∞‡•Ä‡§¶ ‡§∏‡§ï‡§§‡•á ‡§π‡•à‡§Ç

            2
            00:00:21,922 --> 00:00:27,422
            ‡§§‡•ã ‡§∏‡§¨‡§∏‡•á ‡§™‡§π‡§≤‡•á ‡§ï‡•à‡§Ç‡§°‡§≤ ‡§¨‡§®‡§æ‡§®‡•á ‡§ï‡•á ‡§≤‡§ø‡§è ‡§π‡§Æ‡•á‡§Ç ‡§°‡§¨‡§≤ ‡§¨‡•â‡§Ø‡§≤‡§∞ ‡§ï‡•Ä ‡§ú‡§º‡§∞‡•Ç‡§∞‡§§ ‡§™‡§°‡§º‡§§‡•Ä ‡§π‡•à ‡§Ø‡•á

            3
            00:00:27,617 --> 00:00:29,853
            ‡§á‡§∏ ‡§§‡§∞‡§π ‡§ï‡§æ ‡§Ø‡•á ‡§á‡§Ç‡§°‡§ï‡•ç‡§∂‡§® ‡§π‡•à

            and so on...

            Rules:
            1. Keep timestamps chronological and continuous.
            2. Never generate timestamps like ‚Äú01:00:19,567‚Äù unless the audio is actually 1 hour long.
            3. Split long sentences into multiple subtitle segments with correct timing.
            4. If Gemini produces any incorrect timestamps, recalc and fix them BEFORE producing output.
            5. Do NOT output explanations. Only the corrected final SRT.
            6. Include speaker labels (e.g., Speaker 1, Speaker 2) if you detect multiple voices.
            7. Silence > 2 seconds ‚Üí insert:
              [Silence]
              with correct timestamps.

            """

    for attempt in range(1, retries + 1):
        try:
            response = model.generate_content([
                {"mime_type": "audio/wav", "data": audio_bytes},
                prompt,
            ])
            return response.text.strip()
        except Exception as e:
            print("‚ùå Error:", e)
            if attempt < retries:
                # Exponential backoff before the next attempt.
                time.sleep(delay * 2 ** (attempt - 1))
    return ""

# ========================
# MAIN PROCESS
# ========================

# Transcribe every WAV file in input_dir and store one transcript per file in output_dir.
for filename in os.listdir(input_dir):
    if not filename.lower().endswith(".wav"):
        continue

    file_path = os.path.join(input_dir, filename)
    print(f"\nüéß Transcribing full audio: {filename}")

    # Get full transcription
    text = transcribe_audio_file(file_path)
    if not text:
        # transcribe_audio_file returns "" when the request failed; writing that
        # out would leave an empty transcript that is reported as done.
        print(f"Skipped (no transcription returned): {filename}")
        continue

    # Save TXT file. splitext replaces the extension regardless of its case
    # (".WAV" passes the filter above) and never touches ".wav" inside the name.
    txt_output = os.path.join(output_dir, os.path.splitext(filename)[0] + ".txt")
    with open(txt_output, "w", encoding="utf-8") as f:
        f.write(text)

    print(f"‚úÖ Done: {filename}")
    print(f"üìÑ TXT saved to: {txt_output}")


Mounted at /content/drive

üéß Transcribing full audio: Chapter 5A - Use of Growing and Rooting Media in Floriculture.wav
‚úÖ Done: Chapter 5A - Use of Growing and Rooting Media in Floriculture.wav
üìÑ TXT saved to: /content/drive/MyDrive/Test_28_Adnew_wav/e5a/Srtformatissue/Chapter 5A - Use of Growing and Rooting Media in Floriculture.txt


Filler-word remover for the English transcript .txt file

In [None]:
# Strip common filler words/phrases from one transcript file and save the
# result next to it as "<name>_cleaned<ext>".
import os
import re  # imported here so the cell does not depend on another cell having imported it

# Transcript to clean
file_path = '/content/drive/MyDrive/Test_28_Adnew_wav/e5a/Srtformatissue/Chapter 5A - Use of Growing and Rooting Media in Floriculture.txt'

# List of filler words (customize this)
filler_words = [
    "um", "uh", "like", "you know", "so", "actually", "basically",
    "literally", "right", "i mean", "sort of", "kind of", "okay",
    "well", "hmm"
]

# Load file (UTF-8, the encoding the transcription cell writes with)
with open(file_path, 'r', encoding='utf-8') as f:
    text = f.read()

# Remove each filler as a standalone word/phrase (case-insensitive).
# re.escape keeps an entry containing regex metacharacters literal.
for word in filler_words:
    text = re.sub(rf'\b{re.escape(word)}\b', '', text, flags=re.IGNORECASE)

# Optional: clean extra spaces.
# NOTE: \s also matches newlines, so enabling this line as written collapses
# the whole file onto a single line.
# text = re.sub(r'\s+', ' ', text).strip()

# Save cleaned text. splitext only touches the final extension, so the output
# path can never equal the input path.
root, ext = os.path.splitext(file_path)
cleaned_path = f"{root}_cleaned{ext}"
with open(cleaned_path, 'w', encoding='utf-8') as f:
    f.write(text)

cleaned_path

'/content/drive/MyDrive/Test_28_Adnew_wav/e5a/Srtformatissue/Chapter 5A - Use of Growing and Rooting Media in Floriculture_cleaned.txt'

Text-to-SRT format rectifier (normalizes malformed timestamp lines)

In [None]:
from google.colab import drive, userdata
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import os
import re

def normalize_timestamp(ts: str) -> str:
    """Normalize a loosely formatted timestamp to ``HH:MM:SS,mmm``.

    Accepts ``SS``, ``MM:SS`` or ``HH:MM:SS`` (only the last three
    colon-separated fields are used), with an optional fractional part
    introduced by ``.`` or ``,``. The fraction is cut or zero-padded to exactly
    three digits. Seconds or minutes of 60 and above are carried into the next
    field so the result is always a valid SRT timestamp.

    Args:
        ts: Raw timestamp text, e.g. ``"29.854"``, ``"1:02,5"``, ``"00:01:02,500"``.

    Returns:
        The normalized timestamp, or ``"00:00:00,000"`` when a field is not a
        non-negative integer.
    """
    ts = ts.strip().replace('.', ',')

    # Split off the fractional part and force it to exactly three digits.
    if ',' in ts:
        time_part, ms = ts.split(',', 1)
        ms = re.sub(r'\D', '', ms)[:3].ljust(3, '0')
    else:
        time_part, ms = ts, '000'

    # Left-fill missing hour/minute fields with zero; keep the last three fields.
    fields = (['0', '0'] + time_part.split(':'))[-3:]
    try:
        h, m, s = (int(field) for field in fields)
    except ValueError:
        return "00:00:00,000"
    if min(h, m, s) < 0:
        return "00:00:00,000"

    # Carry overflow (e.g. "75:30" means 1 h 15 min 30 s).
    carry, s = divmod(s, 60)
    carry, m = divmod(m + carry, 60)
    h += carry

    return f"{h:02d}:{m:02d}:{s:02d},{ms}"


def fix_srt_file(input_path, output_path):
    """Rewrite the timestamp lines of one subtitle file in canonical SRT form.

    Every line containing a ``<start> <separator> <end>`` pair is replaced by
    ``"<start> --> <end>"`` with both sides passed through
    ``normalize_timestamp``; all other lines are copied unchanged.

    Note: each timestamp must have at least three digits before its optional
    fractional part, so a bare ``SS,mmm`` value with two-digit seconds is not
    recognised by this pattern.

    Args:
        input_path: File to read (UTF-8; undecodable bytes are ignored).
        output_path: File to write (UTF-8).
    """
    ts_pattern = re.compile(
        r'(\d{1,2}:?\d{1,2}:?\d{1,2}[.,]?\d*)\s*[-‚Äì>]+\s*(\d{1,2}:?\d{1,2}:?\d{1,2}[.,]?\d*)'
    )

    def _repair(raw_line):
        # Lines without a timestamp pair pass through untouched.
        found = ts_pattern.search(raw_line)
        if found is None:
            return raw_line
        begin, finish = (normalize_timestamp(piece) for piece in found.groups())
        return f"{begin} --> {finish}\n"

    # Read everything before the output is opened, so the two paths may coincide.
    with open(input_path, 'r', encoding='utf-8', errors='ignore') as source:
        repaired = [_repair(raw_line) for raw_line in source]

    with open(output_path, 'w', encoding='utf-8') as target:
        target.writelines(repaired)


def process_folder(input_folder, output_folder):
    """Run ``fix_srt_file`` over every .srt/.txt file below ``input_folder``.

    The directory layout is mirrored under ``output_folder`` and every output
    file gets an ``.srt`` extension, whatever the input extension was.

    Args:
        input_folder: Root directory that is walked recursively.
        output_folder: Root directory for the fixed files (created if missing).
    """
    os.makedirs(output_folder, exist_ok=True)
    wanted_suffixes = ('.srt', '.txt')

    for current_dir, _subdirs, names in os.walk(input_folder):
        for name in names:
            if not name.lower().endswith(wanted_suffixes):
                continue

            source = os.path.join(current_dir, name)
            relative = os.path.relpath(source, input_folder)
            stem, _ext = os.path.splitext(relative)
            target = os.path.join(output_folder, stem + '.srt')

            # Mirror nested sub-folders before writing into them.
            os.makedirs(os.path.dirname(target), exist_ok=True)
            print(f"Fixing: {relative}")
            fix_srt_file(source, target)

    print("\n All files processed and saved in:", output_folder)



if __name__ == "__main__":
    input_folder = "/content/drive/My Drive/Test1/Srtformatissue/"
    output_folder = "/content/drive/My Drive/Test1/Fixed_srt/"

    process_folder(input_folder, output_folder)


Fixing: Chapter 1A - Introduction to DTP.txt
Fixing: Copy of Chapter 1A - Introduction to DTP.txt
Fixing: Copy of Chapter 1A - Introduction to DTP (2).txt
Fixing: Copy of Chapter 1A - Introduction to DTP (1).txt

 All files processed and saved in: /content/drive/My Drive/Test1/Fixed_srt/


# Gemini MT (still testing for improvement)

In [None]:
from google import genai
from google.colab import drive, userdata
import os
import re
import time

# === Mount Google Drive and API ===
drive.mount('/content/drive')
os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')
client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])

# === Paths ===
base_dir = "/content/drive/My Drive/OpenAI_API_pipeline"
asr_dir = os.path.join(base_dir, "asr")  # input SRTs
mt_dir = os.path.join(base_dir, "mt","gemini_2.5_pro")   # translated output
os.makedirs(mt_dir, exist_ok=True)

target_language = "Telugu"
print("üü¢ Ready ‚Äî Processing all .srt files...")

# === SRT parsing pattern ===
pattern = r"(\d+)\s+([\d:,]+ --> [\d:,]+)\s+(.+?)(?=\n\d+\n|$)"

def translate_batch(lines, retries=3, delay=3):
    """Translate a batch of subtitle texts with Gemini, one output line per input.

    The caller pairs results with subtitles by position, so this function
    guarantees that the returned list has exactly ``len(lines)`` items:

    * each input is flattened to a single physical line before it is sent, so
      a multi-line cue cannot be mistaken for several cues;
    * blank lines in the reply are ignored;
    * a reply with the wrong number of lines is retried, and as a last resort
      padded with "" or truncated.

    Args:
        lines: Subtitle texts to translate, in order.
        retries: Maximum number of API attempts.
        delay: Base wait in seconds between attempts (multiplied by the attempt number).

    Returns:
        A list of ``len(lines)`` strings; all "" if no attempt produced a reply.
    """
    if not lines:
        return []

    expected = len(lines)
    joined_text = "\n".join(" ".join(str(line).split()) for line in lines)
    prompt = f"""
You are a professional subtitle translator for Indic languages.

Translate the following subtitle dialogue into {target_language}.
Preserve meaning. Keep subtitles short and natural.
Do NOT translate numbers or timestamps.
Return one line per subtitle, in order.

Text:
{joined_text}
"""
    best_effort = None  # last reply whose line count did not match
    for attempt in range(1, retries + 1):  # retry logic
        try:
            response = client.models.generate_content(
                model="gemini-2.5-pro",  # or gemini-2.0-pro if you have access
                contents=prompt
            )
            result_text = (response.text or "").strip()
            translated = [ln.strip() for ln in result_text.split("\n") if ln.strip()]
            if len(translated) == expected:
                return translated
            best_effort = translated
            print(f"Retrying batch: expected {expected} lines, got {len(translated)}")
        except Exception as e:
            print("Retrying batch due to error:", e)
        if attempt < retries:
            time.sleep(delay * attempt)

    if best_effort:
        # Keep what was translated, but never return more or fewer items than requested.
        return (best_effort + [""] * expected)[:expected]
    return [""] * expected


# === Loop over all SRT files ===
# Translate every .srt file in asr_dir and write a translated .srt plus a
# text-only .txt file into mt_dir.
for f_name in os.listdir(asr_dir):
    if not f_name.lower().endswith(".srt"):
        continue

    input_file = os.path.join(asr_dir, f_name)
    print(f"\nüé¨ Processing: {f_name}")

    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()

    # Each entry is an (index, timestamp line, text) tuple captured by `pattern`.
    entries = re.findall(pattern, content, flags=re.DOTALL)
    print(f"   ‚Üí {len(entries)} subtitles detected")

    translated_entries = []
    translated_text_only = []

    batch_size = 15
    for i in range(0, len(entries), batch_size):
        batch = entries[i:i+batch_size]
        orig_texts = [t[2].strip() for t in batch]
        last = min(i + batch_size, len(entries))

        translated_batch = list(translate_batch(orig_texts))

        # zip() below stops at the shorter sequence, so a reply with too few
        # lines would silently drop subtitles. Pad/truncate to one line per cue.
        if len(translated_batch) != len(batch):
            print(f"   WARNING: segments {i+1}-{last}: expected {len(batch)} lines, got {len(translated_batch)}")
            translated_batch = (translated_batch + [""] * len(batch))[:len(batch)]

        for (num, ts, _), trans in zip(batch, translated_batch):
            translated_entries.append(f"{num}\n{ts}\n{trans}\n")
            translated_text_only.append(trans)

        # translate_batch returns only empty strings when every attempt failed;
        # do not report such a batch as translated.
        if any(t.strip() for t in translated_batch):
            print(f"   ‚úÖ Translated segments {i+1}‚Äì{min(i+batch_size,len(entries))}")
        else:
            print(f"   WARNING: segments {i+1}-{last} could not be translated (left blank)")

    # Save outputs
    base = os.path.splitext(f_name)[0]
    srt_out = os.path.join(mt_dir, f"{base}_{target_language}.srt")
    txt_out = os.path.join(mt_dir, f"{base}_{target_language}.txt")

    with open(srt_out, "w", encoding='utf-8') as f:
        f.write("\n".join(translated_entries))

    with open(txt_out, "w", encoding='utf-8') as f:
        f.write("\n".join(translated_text_only))

    print(f"   üìÅ Saved ‚Üí {srt_out}")
    print(f"   üìÑ Saved ‚Üí {txt_out}")

print("\n‚úÖ All files translated successfully!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
üü¢ Ready ‚Äî Processing all .srt files...

üé¨ Processing: Chapter 6A - Sucessful Entreuprenuer Journey.srt
   ‚Üí 139 subtitles detected
   ‚úÖ Translated segments 1‚Äì15
   ‚úÖ Translated segments 16‚Äì30
   ‚úÖ Translated segments 31‚Äì45
   ‚úÖ Translated segments 46‚Äì60
Retrying batch due to error: 503 UNAVAILABLE. {'error': {'code': 503, 'message': 'The model is overloaded. Please try again later.', 'status': 'UNAVAILABLE'}}
Retrying batch due to error: 503 UNAVAILABLE. {'error': {'code': 503, 'message': 'The model is overloaded. Please try again later.', 'status': 'UNAVAILABLE'}}
Retrying batch due to error: 503 UNAVAILABLE. {'error': {'code': 503, 'message': 'The model is overloaded. Please try again later.', 'status': 'UNAVAILABLE'}}
   ‚úÖ Translated segments 61‚Äì75
   ‚úÖ Translated segments 76‚Äì90
   ‚úÖ Translated segments 91‚Äì105
   ‚úÖ T

# Gemini TTS

In [None]:
!pip install -U -q "google-genai>=1.16.1"
# !pip install pysrt

from google.colab import drive, userdata
import io
import json
import re
import wave
import os
import base64
import struct
import shutil
import pysrt, time

from IPython.display import Audio, display, HTML, Markdown
from google import genai
from google.genai import types
from google.genai.types import GenerateContentConfig, Tool

# -------------------------------
# Mount Google Drive
# -------------------------------
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
drive.mount('/content/drive', force_remount=True)

# Initialize client
client = genai.Client(api_key=GOOGLE_API_KEY)


# -------------------------------
# Helper: parse .srt into segments
# -------------------------------
def parse_srt(path):
    """Load an .srt file with pysrt and return its cues as plain tuples.

    Args:
        path: Path of the subtitle file.

    Returns:
        A list of ``(start_seconds, end_seconds, text)`` tuples in file order.
        Line breaks inside a cue are replaced by spaces and the text is stripped.
    """
    def _to_seconds(stamp):
        # Collapse the h/m/s/ms fields of one timestamp into float seconds.
        return stamp.hours*3600 + stamp.minutes*60 + stamp.seconds + stamp.milliseconds/1000

    return [
        (_to_seconds(cue.start), _to_seconds(cue.end), cue.text.replace("\n", " ").strip())
        for cue in pysrt.open(path)
    ]


# -------------------------------
# Helper: write .wav file
# -------------------------------
def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
    """Write raw PCM bytes to ``filename`` as a WAV file and log the parameters used.

    Args:
        filename: Destination path of the WAV file.
        pcm: Raw PCM frame data (bytes).
        channels: Number of audio channels.
        rate: Sample rate in Hz.
        sample_width: Bytes per sample.
    """
    summary = (
        "\nWriting audio file with parameters:",
        f"Channels: {channels}",
        f"Sample rate: {rate}",
        f"Sample width: {sample_width}",
        f"Data length: {len(pcm)} bytes",
    )
    for entry in summary:
        print(entry)

    with wave.open(filename, "wb") as writer:
        # The header fields must be set before the first frame is written.
        writer.setframerate(rate)
        writer.setsampwidth(sample_width)
        writer.setnchannels(channels)
        writer.writeframes(pcm)


# -------------------------------
# NEW Helper: Safe TTS with retry
# -------------------------------
def get_tts_audio(client, prompt, voice, retries=5, delay=5, model="gemini-2.5-pro-preview-tts"):
    """Request speech audio from Gemini TTS, retrying on errors and empty replies.

    Args:
        client: Object exposing ``models.generate_content`` (the genai client).
        prompt: Text sent as ``contents``.
        voice: Name of the prebuilt voice to use.
        retries: Maximum number of attempts.
        delay: Seconds to wait between attempts.
        model: TTS model name.

    Returns:
        The audio payload of the first successful attempt, or ``None`` when
        every attempt failed or came back without audio.
    """

    def _find_audio(response):
        """Return the first non-empty audio payload in ``response``, else None."""
        # Look at every part of every candidate instead of only parts[0]; the
        # getattr chain tolerates missing or empty fields without raising.
        for candidate in getattr(response, "candidates", None) or []:
            content = getattr(candidate, "content", None)
            for part in getattr(content, "parts", None) or []:
                payload = getattr(getattr(part, "inline_data", None), "data", None)
                if payload:
                    return payload
            payload = getattr(getattr(content, "inline_data", None), "data", None)
            if payload:
                return payload
        # Fallback location also checked by the previous implementation.
        return getattr(getattr(response, "audio", None), "data", None)

    for attempt in range(retries):
        try:
            response = client.models.generate_content(
                model=model,
                contents=prompt,
                config=types.GenerateContentConfig(
                    response_modalities=["audio"],
                    speech_config=types.SpeechConfig(
                        voice_config=types.VoiceConfig(
                            prebuilt_voice_config=types.PrebuiltVoiceConfig(
                                voice_name=voice
                            )
                        )
                    ),
                ),
            )

            data = _find_audio(response)
            if data:
                return data  # success
            print(f"‚ö†Ô∏è No audio returned on attempt {attempt+1}. Retrying...")
        except Exception as e:
            print(f"‚ö†Ô∏è TTS error on attempt {attempt+1}: {e}")

        # No point in waiting after the final attempt.
        if attempt < retries - 1:
            time.sleep(delay)
    return None  # all retries failed


# -------------------------------
# Input + setup
# -------------------------------
srt_file_path = '/content/drive/MyDrive/aa/test2_Tamil.srt'  # replace with your path
VOICE = 'Kore'

segments = parse_srt(srt_file_path)
print(f"Found {len(segments)} subtitle segments.")

base_name = os.path.splitext(os.path.basename(srt_file_path))[0]
output_dir = f'/content/drive/MyDrive/aa/{base_name}_segments'
os.makedirs(output_dir, exist_ok=True)

failed_log = os.path.join(output_dir, "failed_segments.txt")

# -------------------------------
# Main processing loop
# -------------------------------
# Synthesize one WAV per subtitle segment, named by its 1-based index, and
# copy it into output_dir. Segments that yield no audio are appended to failed_log.
for idx, (start, end, text) in enumerate(segments, 1):
    # Segments with fewer than 5 characters are not sent to TTS.
    if len(text.strip()) < 5:
        print(f"‚ö†Ô∏è Skipping too-short segment {idx}: '{text}'")
        continue

    PROMPT = f"Speak in Indian female Tamil with an educational tone: {text}"
    print(f"\nProcessing segment {idx} ({start:.2f}s ‚Üí {end:.2f}s): {text[:60]}...")

    data = get_tts_audio(client, PROMPT, VOICE)
    if not data:
        print(f"‚ùå Skipping segment {idx} ‚Äî no audio after retries.")
        # Explicit UTF-8: the logged subtitle text is not ASCII.
        with open(failed_log, "a", encoding="utf-8") as log:
            log.write(f"{idx}: {text}\n")
        continue

    # Save audio
    rate = 24000
    file_name = f"{idx:03d}.wav"
    # Resolve the local path once and reuse it for the copy, instead of
    # assuming the working directory is /content.
    local_path = os.path.abspath(file_name)
    print(f"\nSaving sample rate: {rate}")
    wave_file(local_path, data, rate=rate)

    # Copy to Drive
    destination_path = os.path.join(output_dir, file_name)
    shutil.copy(local_path, destination_path)
    display(Audio(destination_path))

print(f"\n‚úÖ All segments saved in: {output_dir}")
print(f"üìÑ Failed segments (if any) logged to: {failed_log}")


Audio merge: combine the TTS segment files into one time-aligned track with FFmpeg

In [None]:
import subprocess

def merge_segments_ffmpeg_timed(segments, segments_dir, output_path, sample_rate=24000):
    """Merge per-segment WAV files into one time-aligned track with FFmpeg.

    Segment ``i`` (1-based) is read from ``<segments_dir>/<i:03d>.wav``, delayed
    by its SRT start time with ``adelay`` and mixed with the others by ``amix``.
    Missing files are skipped. FFmpeg input indices and filter labels are
    numbered by the files actually passed on the command line, so a skipped
    segment no longer shifts or breaks the stream references.

    Args:
        segments: Iterable of ``(start_seconds, end_seconds, text)`` tuples.
        segments_dir: Directory holding the numbered segment WAV files.
        output_path: Path of the merged WAV file to create.
        sample_rate: Output sample rate in Hz.

    Raises:
        FileNotFoundError: If none of the expected segment files exists.
        subprocess.CalledProcessError: If FFmpeg exits with an error.
    """
    print("\nüéØ Performing precise timeline merge using FFmpeg...")

    inputs = []
    filter_parts = []
    labels = []

    for i, (start, end, text) in enumerate(segments, 1):
        seg_path = os.path.join(segments_dir, f"{i:03d}.wav")
        if not os.path.exists(seg_path):
            print(f"‚ö†Ô∏è Skipping missing segment {i:03d}")
            continue

        # FFmpeg numbers inputs by their position on the command line, not by
        # the segment number.
        input_index = len(labels)
        label = f"[a{input_index}]"
        # round() avoids losing a millisecond to float truncation.
        delay_ms = int(round(start * 1000))

        inputs += ["-i", seg_path]
        # Apply delay via adelay filter
        filter_parts.append(f"[{input_index}:a]adelay={delay_ms}|{delay_ms}{label}")
        labels.append(label)

    if not labels:
        raise FileNotFoundError(f"No segment WAV files found in: {segments_dir}")

    # Combine all delayed audio tracks
    filter_complex = "; ".join(filter_parts) + f"; {''.join(labels)}amix=inputs={len(labels)}:normalize=0[aout]"

    cmd = [
        "ffmpeg", "-y",
        *inputs,
        "-filter_complex", filter_complex,
        "-map", "[aout]",
        "-ar", str(sample_rate),
        "-ac", "1",
        "-c:a", "pcm_s16le",
        output_path
    ]

    print(f"\nRunning FFmpeg command:\n{' '.join(cmd)}\n")
    subprocess.run(cmd, check=True)
    print(f"‚úÖ Final aligned audio saved at: {output_path}")

final_output = f"/content/drive/MyDrive/aa/{base_name}_merged_timed_Tamil.wav"
merge_segments_ffmpeg_timed(segments, output_dir, final_output)

