In [1]:
import os
from pydub import AudioSegment

from google.colab import drive, userdata
# Mount Google Drive at /content/drive; force_remount=True asks Colab to
# remount even when the drive is already mounted.
drive.mount('/content/drive', force_remount=True)

# ========================
# HELPER FUNCTION TO REDUCE FILE SIZE
# ========================

def reduce_wav_size(file_path, target_size_mb=25, output_path=None,
                    start_frame_rate=22050, step_hz=1000, min_frame_rate=1000):
    """
    Shrink a .wav file by converting it to mono and lowering its sample rate.

    The audio is exported as 16-bit PCM WAV at ``start_frame_rate``. While the
    exported file is larger than the target, the sample rate is lowered by
    ``step_hz`` and the file is exported again. Every attempt resamples the
    original (mono) audio, so quality loss does not accumulate across passes.

    Args:
        file_path: Path of the source .wav file.
        target_size_mb: Maximum desired size of the result, in MB (1024 * 1024 bytes).
        output_path: Where to write the result. When omitted, the source file
            itself is overwritten.
        start_frame_rate: Sample rate (Hz) used for the first export.
        step_hz: How much the sample rate is lowered after each oversized export.
        min_frame_rate: Lowest sample rate (Hz) that will be tried. When the
            file is still too large at this floor, a warning is printed and
            the last export is kept; the target is NOT guaranteed in that case.

    Returns:
        The path of the written file (``output_path`` or ``file_path``).

    Raises:
        ValueError: If ``step_hz`` or ``start_frame_rate`` is not positive.
    """
    if step_hz <= 0:
        raise ValueError("step_hz must be positive")
    if start_frame_rate <= 0:
        raise ValueError("start_frame_rate must be positive")

    target_bytes = target_size_mb * 1024 * 1024
    temp_path = output_path if output_path else file_path

    # Load once and keep the mono source in memory; it stays valid even when
    # temp_path is the input file and gets overwritten by the export below.
    source = AudioSegment.from_wav(file_path).set_channels(1)

    frame_rate = start_frame_rate
    while True:
        source.set_frame_rate(frame_rate).export(temp_path, format="wav", codec="pcm_s16le")
        size = os.path.getsize(temp_path)
        if size <= target_bytes:
            break

        next_rate = frame_rate - step_hz
        if next_rate < min_frame_rate:
            # Without this floor the rate would eventually reach zero or a
            # negative value, which cannot be exported.
            print(
                f"Cannot go below {min_frame_rate} Hz; file is still {size} bytes "
                f"(target {target_bytes} bytes)."
            )
            break

        print(f"File too large ({size} bytes). Reducing further...")
        frame_rate = next_rate

    print(f"Final file size: {os.path.getsize(temp_path) / (1024 * 1024):.2f} MB")
    return temp_path

# ========================
# REDUCE FILE SIZE FOR ALL .WAV FILES IN THE INPUT FOLDER
# ========================

# Source folder with the original recordings; reduced copies go into its
# "Reduced" sub-folder, which is created on first use.
input_dir = "/content/drive/MyDrive/Test_28_Adnew_wav/Hindi/a"
output_dir = "/content/drive/MyDrive/Test_28_Adnew_wav/Hindi/a/Reduced"
os.makedirs(output_dir, exist_ok=True)

for filename in os.listdir(input_dir):
    # Only .wav files (any letter case) are processed; everything else is skipped.
    if not filename.lower().endswith(".wav"):
        continue

    file_path = os.path.join(input_dir, filename)
    output_path = os.path.join(output_dir, filename)
    print(f"üéß Reducing size for {filename}...")

    # Write a reduced copy under output_dir, keeping the original file name.
    reduced_file = reduce_wav_size(
        file_path,
        target_size_mb=25,
        output_path=output_path,
    )
    print(f"‚úÖ Reduced file saved at: {reduced_file}")


  m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)
  m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)
  elif re.match('(flt)p?( \(default\))?$', token):
  elif re.match('(dbl)p?( \(default\))?$', token):


Mounted at /content/drive
üéß Reducing size for Chapter 8B - Sewing of Regular Blouse.wav...
File too large (116417612 bytes). Reducing further...
File too large (111137904 bytes). Reducing further...
File too large (105858196 bytes). Reducing further...
File too large (100578488 bytes). Reducing further...
File too large (95298780 bytes). Reducing further...
File too large (90019072 bytes). Reducing further...
File too large (84739364 bytes). Reducing further...
File too large (79459656 bytes). Reducing further...
File too large (74179948 bytes). Reducing further...
File too large (68900242 bytes). Reducing further...
File too large (63620536 bytes). Reducing further...
File too large (58340830 bytes). Reducing further...
File too large (53061124 bytes). Reducing further...
File too large (47781418 bytes). Reducing further...
File too large (42501712 bytes). Reducing further...
File too large (37222006 bytes). Reducing further...
File too large (31942300 bytes). Reducing further...
F

In [None]:
# ========================
# INSTALL & IMPORTS
# ========================
!pip install -q google-generativeai pydub tqdm librosa

import os
import io
from google.colab import drive, userdata
import google.generativeai as genai
from pydub import AudioSegment
from tqdm import tqdm

# ========================
# SETUP
# ========================

# Make Google Drive available under /content/drive
drive.mount('/content/drive', force_remount=True)

# The Gemini key is read from Colab secrets; stop immediately when it is absent
api_key = userdata.get("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("‚ùå No GOOGLE_API_KEY found in Colab secrets! Add it under 'More ‚Üí Secrets'.")
genai.configure(api_key=api_key)

# Model used for every transcription request
model = genai.GenerativeModel("models/gemini-2.5-pro")

# Drive locations: recordings are read from input_dir, transcripts go to output_dir
base_dir = "/content/drive/MyDrive/Test_28_Adnew_wav/"
input_dir = os.path.join(base_dir, "Hindi", "aa")
output_dir = os.path.join(base_dir, "Test_quality")
os.makedirs(output_dir, exist_ok=True)

# ========================
# HELPER FUNCTIONS
# ========================

def transcribe_audio_file(file_path):
    """
    Send one whole WAV file to the module-level Gemini ``model`` in a single
    request and return the text of its reply.

    The file is decoded with pydub and re-encoded as WAV into memory before
    being attached to the request together with the subtitle instructions.

    Returns:
        The stripped response text, or an empty string when the request (or
        reading the response text) raises; the error is printed in that case.
        Failures while loading or re-encoding the audio are not caught.
    """
    # Decode and re-export in memory so the request carries plain WAV bytes.
    wav_buffer = io.BytesIO()
    AudioSegment.from_wav(file_path).export(wav_buffer, format="wav")
    audio_part = {"mime_type": "audio/wav", "data": wav_buffer.getvalue()}

    # Instructions sent alongside the audio (content kept verbatim).
    instructions = """
                    You are a Subtitle Generator.

                    Transcribe this audio exactly as spoken in Hindi (strictly: no extra comments, strictly: no filler words)
                    in valid .srt format.

                    Before outputting, you MUST internally ensure:

                    - Each subtitle segment must contain exactly 3 sentences in same line, unless the audio ends and fewer remain.
                    - Maintain natural sentence boundaries.
                    - Combine sentences smoothly while keeping meaning and flow.
                    - Only create a new segment after exactly 3 sentences have been completed (except the final segment).
                    - Timestamp continuity must be correct and must not overlap.
                    - Format must strictly be:

                      <index>
                      HH:MM:SS,SSS --> HH:MM:SS,SSS
                      text
                      This format should be strictly followed

                    Rules:
                    1. Timestamps must be chronological and continuous.
                    2. Every segment contains exactly 3 sentences (except final).
                    3. Never generate timestamps beyond the audio duration.
                    4. If Gemini outputs incorrect timestamps, fix them BEFORE final output.
                    5. No explanations. Only the final SRT.
                    6. Include speaker labels if detectable.
                    7. Silence > 2 seconds ‚Üí include:
                      [Silence]
                      with correct timestamps.


                    """

    user_message = {"role": "user", "parts": [audio_part, instructions]}
    try:
        reply = model.generate_content(contents=[user_message])
        return reply.text.strip()
    except Exception as e:
        print("‚ùå Error:", e)
        return ""

# ========================
# MAIN PROCESS
# ========================

# Transcribe every .wav file in input_dir and store the result as a .txt file
# with the same base name in output_dir.
for filename in os.listdir(input_dir):
    if not filename.lower().endswith(".wav"):
        continue

    file_path = os.path.join(input_dir, filename)
    print(f"\nüéß Transcribing full audio: {filename}")

    # Get full transcription
    text = transcribe_audio_file(file_path)
    if not text:
        # transcribe_audio_file returns "" on failure; do not write an empty
        # transcript or report success for it.
        print(f"No transcript returned for {filename}; skipping.")
        continue

    # Swap the extension with splitext: the filter above is case-insensitive,
    # so str.replace(".wav", ...) would leave names such as "X.WAV" unchanged
    # and would also touch ".wav" occurring in the middle of a name.
    txt_output = os.path.join(output_dir, os.path.splitext(filename)[0] + ".txt")
    with open(txt_output, "w", encoding="utf-8") as f:
        f.write(text)

    print(f"‚úÖ Done: {filename}")
    print(f"üìÑ TXT saved to: {txt_output}")


ValueError: Mountpoint must not already contain files

In [None]:
import os
import re

def normalize_timestamp(ts: str) -> str:
    """
    Normalize a timestamp to the SRT form 'HH:MM:SS,mmm'.

    Accepts '.' or ',' before the milliseconds, missing hours ('MM:SS') or
    missing hours and minutes ('SS'), and more than three ':'-separated
    fields (only the last three are used). Milliseconds are reduced to their
    digits, truncated to three and right-padded with zeros. Seconds or
    minutes of 60 and above are carried into the next unit so the result is
    always a valid clock value (e.g. '75:30' -> '01:15:30,000').

    Args:
        ts: Raw timestamp text.

    Returns:
        The normalized timestamp, or '00:00:00,000' when any field is not a
        non-negative integer.
    """
    ts = ts.strip().replace('.', ',')

    # Everything after the first comma is the millisecond part.
    if ',' in ts:
        time_part, ms = ts.split(',', 1)
        ms = re.sub(r'\D', '', ms)[:3].ljust(3, '0')
    else:
        time_part, ms = ts, '000'

    # Left-pad with zero fields so there are always hours, minutes, seconds.
    fields = time_part.split(':')
    fields = ['0'] * (3 - len(fields)) + fields[-3:]

    try:
        h, m, s = (int(field) for field in fields)
    except ValueError:
        # Only conversion failures mean "malformed"; anything else propagates.
        return "00:00:00,000"
    if min(h, m, s) < 0:
        return "00:00:00,000"

    # Carry overflow upwards: seconds into minutes, minutes into hours.
    carry, s = divmod(s, 60)
    carry, m = divmod(m + carry, 60)
    h += carry

    return f"{h:02d}:{m:02d}:{s:02d},{ms}"


def fix_srt_file(input_path, output_path):
    """
    Read one .srt/.txt file, rewrite its timestamp lines in the form
    'HH:MM:SS,mmm --> HH:MM:SS,mmm', and save the result to output_path.

    A line is treated as a timestamp line only when it contains two
    colon-separated time values joined by an arrow/dash AND nothing else on
    the line is alphabetic. This keeps ordinary subtitle text such as
    "Use 100 - 200 grams" or "open 10:30 - 11:45 daily" untouched, while
    still repairing timestamp lines wrapped in punctuation or prefixed with
    a number. All other lines are copied unchanged.

    Args:
        input_path: File to read (UTF-8; undecodable bytes are dropped).
        output_path: File to write (UTF-8).
    """
    with open(input_path, 'r', encoding='utf-8', errors='ignore') as f:
        lines = f.readlines()

    # One time value: 1-2 digits, then one or two ':NN' groups (at least one
    # colon is required), then optional fractional digits.
    time_value = r'\d{1,2}(?::\d{1,2}){1,2}[.,]?\d*'
    # Separator: any run of '-', en dash, em dash or '>'.
    ts_pattern = re.compile(
        r'(' + time_value + r')\s*[-\u2013\u2014>]+\s*(' + time_value + r')'
    )

    new_lines = []
    for line in lines:
        match = ts_pattern.search(line)
        if match:
            # Text surrounding the match; letters there mean this is dialogue
            # that merely mentions times, not a timestamp line.
            remainder = line[:match.start()] + line[match.end():]
            if not any(ch.isalpha() for ch in remainder):
                start, end = match.groups()
                new_lines.append(
                    f"{normalize_timestamp(start)} --> {normalize_timestamp(end)}\n"
                )
                continue
        new_lines.append(line)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(new_lines)


def process_folder(input_folder, output_folder):
    """
    Walk input_folder recursively and run fix_srt_file on every file whose
    name ends in .srt or .txt (case-insensitive).

    Each result is written under output_folder at the same relative path,
    with the extension replaced by '.srt'. Missing output directories are
    created as needed.
    """
    os.makedirs(output_folder, exist_ok=True)
    wanted_suffixes = ('.srt', '.txt')

    for current_dir, _subdirs, names in os.walk(input_folder):
        for name in names:
            if not name.lower().endswith(wanted_suffixes):
                continue

            source = os.path.join(current_dir, name)
            relative = os.path.relpath(source, input_folder)

            # Mirror the relative location, always using the .srt extension.
            stem, _ext = os.path.splitext(relative)
            target = os.path.join(output_folder, stem + '.srt')
            os.makedirs(os.path.dirname(target), exist_ok=True)

            print(f"Fixing: {relative}")
            fix_srt_file(source, target)

    print("\n All files processed and saved in:", output_folder)



if __name__ == "__main__":
    # Folder that is read, and folder that receives the fixed .srt files.
    input_folder, output_folder = (
        "/content/drive/My Drive/Test_quality/gemini25pro/Asr_eng/",
        "/content/drive/My Drive/Test_quality/gemini25pro/Asr_eng_f/",
    )
    process_folder(input_folder, output_folder)


Fixing: Chapter 5A - Use of Growing and Rooting Media in Floriculture_gemini_25_pro_eng_eng.txt
Fixing: Chapter 1A - Concept of Basic Electricity Voltage, Currents, Resistance, Impedance & Power Factor.txt

 All files processed and saved in: /content/drive/My Drive/Test_quality/gemini25pro/Asr_eng_f/


Test_quality/gemini25pro/Asr_eng/

In [None]:

from google.colab import drive, userdata
# Mount Google Drive at /content/drive; force_remount=True asks Colab to
# remount even when the drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


MT

In [None]:
from google import genai
from google.colab import drive, userdata
import os
import re
import time

# === Mount Google Drive and API ===
drive.mount('/content/drive')

# Check the secret before using it: assigning a missing (None) value to
# os.environ fails with an unhelpful TypeError, so report the real cause.
api_key = userdata.get('GOOGLE_API_KEY')
if not api_key:
    raise ValueError("No GOOGLE_API_KEY found in Colab secrets. Add it under 'Secrets'.")
os.environ["GOOGLE_API_KEY"] = api_key
client = genai.Client(api_key=api_key)

# === Paths ===
base_dir = "/content/drive/My Drive/Test_quality/gemini25pro/"
asr_dir = os.path.join(base_dir, "Asr_eng")  # input SRTs
mt_dir = os.path.join(base_dir, "mt","Tel")   # translated output
os.makedirs(mt_dir, exist_ok=True)

target_language = "Telugu"
print("üü¢ Ready ‚Äî Processing all .srt files...")

# === SRT parsing pattern ===
# Groups: (1) subtitle index, (2) "start --> end" timestamps, (3) subtitle
# text, which runs up to the next line holding only digits or the end of input.
pattern = r"(\d+)\s+([\d:,]+ --> [\d:,]+)\s+(.+?)(?=\n\d+\n|$)"

def translate_batch(lines):
    """
    Translate a list of subtitle texts into ``target_language`` with one
    Gemini request, returning exactly one translated string per input.

    Each input is collapsed onto a single line before sending, so that
    "one line per subtitle" holds for multi-line subtitles. Blank lines in
    the reply are ignored. A reply whose line count differs from the input is
    treated like a failed attempt and retried, because the caller pairs
    results with subtitles by position.

    Args:
        lines: Subtitle texts, in order.

    Returns:
        A list with ``len(lines)`` entries. After three unsuccessful
        attempts, the closest reply is padded with "" or truncated to that
        length; when no reply was received at all, every entry is "".
    """
    expected = len(lines)
    if expected == 0:
        return []

    # One physical line per subtitle, otherwise the reply cannot be mapped back.
    joined_text = "\n".join(" ".join(text.split()) for text in lines)
    prompt = f"""
You are a professional subtitle translator for Indic languages.

Translate the following subtitle dialogue into {target_language}.
Preserve meaning. Keep subtitles short and natural.
Do NOT translate numbers or timestamps.
Return one line per subtitle, in order.

Text:
{joined_text}
"""
    closest = None  # reply whose line count came nearest to `expected`
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            response = client.models.generate_content(
                model="gemini-2.5-pro",  # or gemini-2.0-pro if you have access
                contents=prompt
            )
            translated = [
                ln.strip() for ln in (response.text or "").splitlines() if ln.strip()
            ]
            if len(translated) == expected:
                return translated
            print(f"Retrying batch: expected {expected} lines, got {len(translated)}")
            if closest is None or abs(len(translated) - expected) < abs(len(closest) - expected):
                closest = translated
        except Exception as e:
            print("Retrying batch due to error:", e)
        if attempt < max_attempts:
            time.sleep(3)

    if closest is None:
        return [""] * expected

    # Keep positions stable for the caller even though alignment is uncertain.
    print(f"Warning: line count still mismatched after {max_attempts} attempts; "
          f"padding/truncating to {expected} lines.")
    return (closest + [""] * expected)[:expected]


# === Loop over all SRT files ===
# Translate every .srt file in asr_dir, 15 subtitles per request, and write
# both a translated .srt and a text-only .txt into mt_dir.
for f_name in os.listdir(asr_dir):
    if f_name.lower().endswith(".srt"):
        input_file = os.path.join(asr_dir, f_name)
        print(f"\nüé¨ Processing: {f_name}")

        with open(input_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # Each entry is an (index, timestamps, text) tuple.
        entries = re.findall(pattern, content, flags=re.DOTALL)
        print(f"   ‚Üí {len(entries)} subtitles detected")

        translated_entries = []
        translated_text_only = []

        batch_size = 15
        for i in range(0, len(entries), batch_size):
            batch = entries[i:i+batch_size]
            orig_texts = [entry[2].strip() for entry in batch]
            translated_batch = translate_batch(orig_texts)

            # Pair each subtitle with its translation by position; zip stops
            # at the shorter of the two sequences.
            pairs = list(zip(batch, translated_batch))
            translated_entries.extend(
                f"{num}\n{ts}\n{trans}\n" for (num, ts, _), trans in pairs
            )
            translated_text_only.extend(trans for _, trans in pairs)

            print(f"   ‚úÖ Translated segments {i+1}‚Äì{min(i+batch_size,len(entries))}")

        # Output names: <input base name>_<language>.srt / .txt
        base = os.path.splitext(f_name)[0]
        srt_out = os.path.join(mt_dir, f"{base}_{target_language}.srt")
        txt_out = os.path.join(mt_dir, f"{base}_{target_language}.txt")

        for out_path, chunks in ((srt_out, translated_entries), (txt_out, translated_text_only)):
            with open(out_path, "w", encoding='utf-8') as f:
                f.write("\n".join(chunks))

        print(f"   üìÅ Saved ‚Üí {srt_out}")
        print(f"   üìÑ Saved ‚Üí {txt_out}")

print("\n‚úÖ All files translated successfully!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
üü¢ Ready ‚Äî Processing all .srt files...

üé¨ Processing: Chapter 5A - Use of Growing and Rooting Media in Floriculture_gemini_25_pro_eng_eng.srt
   ‚Üí 66 subtitles detected
   ‚úÖ Translated segments 1‚Äì15
   ‚úÖ Translated segments 16‚Äì30
   ‚úÖ Translated segments 31‚Äì45
   ‚úÖ Translated segments 46‚Äì60
   ‚úÖ Translated segments 61‚Äì66
   üìÅ Saved ‚Üí /content/drive/My Drive/Test_quality/gemini25pro/mt/Tel/Chapter 5A - Use of Growing and Rooting Media in Floriculture_gemini_25_pro_eng_eng_Telugu.srt
   üìÑ Saved ‚Üí /content/drive/My Drive/Test_quality/gemini25pro/mt/Tel/Chapter 5A - Use of Growing and Rooting Media in Floriculture_gemini_25_pro_eng_eng_Telugu.txt

üé¨ Processing: Chapter 1A - Concept of Basic Electricity Voltage, Currents, Resistance, Impedance & Power Factor.srt
   ‚Üí 129 subtitles detected
   ‚úÖ Translated segments 1‚Ä

KeyboardInterrupt: 

In [None]:
from google.colab import drive, userdata
import os
import re
import time
from google import genai

# === Mount Google Drive and API ===
drive.mount('/content/drive')

# Check the secret before using it: assigning a missing (None) value to
# os.environ fails with an unhelpful TypeError, so report the real cause.
api_key = userdata.get('GOOGLE_API_KEY')
if not api_key:
    raise ValueError("No GOOGLE_API_KEY found in Colab secrets. Add it under 'Secrets'.")
os.environ["GOOGLE_API_KEY"] = api_key
client = genai.Client(api_key=api_key)

# === Paths ===
base_dir = "/content/drive/My Drive/Test_quality/gemini25pro/"
asr_dir = os.path.join(base_dir, "Asr_eng")  # input SRTs
mt_dir = os.path.join(base_dir, "mt", "Tel")  # translated output
os.makedirs(mt_dir, exist_ok=True)

target_language = "Telugu"
print("üü¢ Ready ‚Äî Processing all .srt files...")

# === SRT parsing pattern ===
# Groups: (1) subtitle index, (2) "start --> end" timestamps, (3) subtitle
# text, which runs up to the next line holding only digits or the end of input.
pattern = r"(\d+)\s+([\d:,]+ --> [\d:,]+)\s+(.+?)(?=\n\d+\n|$)"

def translate_segment(segment_text):
    """
    Translate one subtitle segment into ``target_language`` using Gemini.

    The request is attempted up to three times; after each failure the error
    is printed and the function waits three seconds.

    Returns:
        The stripped response text, or "" when all three attempts fail.
    """
    prompt = f"""
You are a professional subtitle translator for Indic languages.

Translate the following subtitle dialogue into {target_language}.
Preserve meaning. Keep subtitles short and natural.
Do NOT translate numbers or timestamps.
Return one line per subtitle, in order.

Text:
{segment_text}
"""
    attempts_left = 3
    while attempts_left > 0:
        attempts_left -= 1
        try:
            reply = client.models.generate_content(
                model="gemini-2.5-pro",  # or gemini-2.0-pro if you have access
                contents=prompt,
            )
            return reply.text.strip()
        except Exception as err:
            print("Retrying due to error:", err)
            time.sleep(3)

    # Every attempt failed.
    return ""

# === Loop over all SRT files ===
# Translate every .srt file in asr_dir one subtitle at a time and write both
# a translated .srt and a text-only .txt into mt_dir.
for f_name in os.listdir(asr_dir):
    if f_name.lower().endswith(".srt"):
        input_file = os.path.join(asr_dir, f_name)
        print(f"\nüé¨ Processing: {f_name}")

        with open(input_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # Each entry is an (index, timestamps, text) tuple.
        entries = re.findall(pattern, content, flags=re.DOTALL)
        print(f"   ‚Üí {len(entries)} subtitles detected")

        translated_entries = []
        translated_text_only = []

        # One request per subtitle; position is 1-based for the progress output.
        for position, (num, ts, original_text) in enumerate(entries, start=1):
            print(f"   ‚Üí Translating segment {position}: {original_text[:50]}...")

            translated_text = translate_segment(original_text.strip())
            translated_entries.append(f"{num}\n{ts}\n{translated_text}\n")
            translated_text_only.append(translated_text)

            print(f"   ‚úÖ Translated segment {position}")

        # Output names: <input base name>_<language>.srt / .txt
        base = os.path.splitext(f_name)[0]
        srt_out = os.path.join(mt_dir, f"{base}_{target_language}.srt")
        txt_out = os.path.join(mt_dir, f"{base}_{target_language}.txt")

        for out_path, chunks in ((srt_out, translated_entries), (txt_out, translated_text_only)):
            with open(out_path, "w", encoding='utf-8') as f:
                f.write("\n".join(chunks))

        print(f"   üìÅ Saved ‚Üí {srt_out}")
        print(f"   üìÑ Saved ‚Üí {txt_out}")

print("\n‚úÖ All files translated successfully!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
üü¢ Ready ‚Äî Processing all .srt files...

üé¨ Processing: Chapter 5A - Use of Growing and Rooting Media in Floriculture_gemini_25_pro_eng_eng.srt
   ‚Üí 66 subtitles detected
   ‚Üí Translating segment 1: Now, let us talk about the use of uh routing media...
   ‚úÖ Translated segment 1
   ‚Üí Translating segment 2: It helps in a water retention and drainage due to ...
   ‚úÖ Translated segment 2
   ‚Üí Translating segment 3: And uh it is it is also beneficial for the um uh m...
   ‚úÖ Translated segment 3
   ‚Üí Translating segment 4: Now, there are different types of growing media, i...
   ‚úÖ Translated segment 4
   ‚Üí Translating segment 5: This which are enriched with organic matter. These...
   ‚úÖ Translated segment 5
   ‚Üí Translating segment 6: This mixtures um it includes uh peat moss, uh coco...
   ‚úÖ Translated segment 6
   ‚Üí Translating s

In [None]:
from google.colab import drive, userdata
import os
import re
import time
from google import genai

# === Mount Google Drive and API ===
drive.mount('/content/drive')

# Check the secret before using it: assigning a missing (None) value to
# os.environ fails with an unhelpful TypeError, so report the real cause.
api_key = userdata.get('GOOGLE_API_KEY')
if not api_key:
    raise ValueError("No GOOGLE_API_KEY found in Colab secrets. Add it under 'Secrets'.")
os.environ["GOOGLE_API_KEY"] = api_key
client = genai.Client(api_key=api_key)

# === Paths ===
base_dir = "/content/drive/My Drive/Test_quality/gemini25pro/"
asr_dir = os.path.join(base_dir, "Asr_eng")  # input SRTs
mt_dir = os.path.join(base_dir, "mt", "Mal")  # translated output
os.makedirs(mt_dir, exist_ok=True)

target_language = "Malayalam"
print("üü¢ Ready ‚Äî Processing all .srt files...")

# === SRT parsing pattern ===
# Groups: (1) subtitle index, (2) "start --> end" timestamps, (3) subtitle
# text, which runs up to the next line holding only digits or the end of input.
pattern = r"(\d+)\s+([\d:,]+ --> [\d:,]+)\s+(.+?)(?=\n\d+\n|$)"

def translate_batch(lines):
    """
    Translate a list of subtitle texts into ``target_language`` with one
    Gemini request, returning exactly one translated string per input.

    Each input is collapsed onto a single line before sending, so that
    "one line per subtitle" holds for multi-line subtitles. Blank lines in
    the reply are ignored. A reply whose line count differs from the input is
    treated like a failed attempt and retried, because the caller pairs
    results with subtitles by position.

    Args:
        lines: Subtitle texts, in order.

    Returns:
        A list with ``len(lines)`` entries. After three unsuccessful
        attempts, the closest reply is padded with "" or truncated to that
        length; when no reply was received at all, every entry is "".
    """
    expected = len(lines)
    if expected == 0:
        return []

    # One physical line per subtitle, otherwise the reply cannot be mapped back.
    joined_text = "\n".join(" ".join(text.split()) for text in lines)
    prompt = f"""
You are a professional subtitle translator for Indic languages.

Translate the following subtitle dialogue into {target_language}.
Preserve meaning. Keep subtitles short and natural.
Do NOT translate numbers or timestamps.
Return one line per subtitle, in order.

Text:
{joined_text}
"""
    closest = None  # reply whose line count came nearest to `expected`
    max_attempts = 3
    for attempt in range(1, max_attempts + 1):
        try:
            response = client.models.generate_content(
                model="gemini-2.5-pro",  # or gemini-2.0-pro if you have access
                contents=prompt
            )
            translated = [
                ln.strip() for ln in (response.text or "").splitlines() if ln.strip()
            ]
            if len(translated) == expected:
                return translated
            print(f"Retrying batch: expected {expected} lines, got {len(translated)}")
            if closest is None or abs(len(translated) - expected) < abs(len(closest) - expected):
                closest = translated
        except Exception as e:
            print("Retrying batch due to error:", e)
        if attempt < max_attempts:
            time.sleep(3)

    if closest is None:
        return [""] * expected

    # Keep positions stable for the caller even though alignment is uncertain.
    print(f"Warning: line count still mismatched after {max_attempts} attempts; "
          f"padding/truncating to {expected} lines.")
    return (closest + [""] * expected)[:expected]

# === Loop over all SRT files ===
# Translate every .srt file in asr_dir, 15 subtitles per request, and write
# both a translated .srt and a text-only .txt into mt_dir.
for f_name in os.listdir(asr_dir):
    if f_name.lower().endswith(".srt"):
        input_file = os.path.join(asr_dir, f_name)
        print(f"\nüé¨ Processing: {f_name}")

        with open(input_file, 'r', encoding='utf-8') as f:
            content = f.read()

        # Each entry is an (index, timestamps, text) tuple.
        entries = re.findall(pattern, content, flags=re.DOTALL)
        print(f"   ‚Üí {len(entries)} subtitles detected")

        translated_entries = []
        translated_text_only = []

        batch_size = 15
        for i in range(0, len(entries), batch_size):
            batch = entries[i:i+batch_size]
            orig_texts = [entry[2].strip() for entry in batch]
            translated_batch = translate_batch(orig_texts)

            # Pair each subtitle with its translation by position; zip stops
            # at the shorter of the two sequences.
            pairs = list(zip(batch, translated_batch))
            translated_entries.extend(
                f"{num}\n{ts}\n{trans}\n" for (num, ts, _), trans in pairs
            )
            translated_text_only.extend(trans for _, trans in pairs)

            print(f"   ‚úÖ Translated segments {i+1}‚Äì{min(i+batch_size,len(entries))}")

        # Output names: <input base name>_<language>.srt / .txt
        base = os.path.splitext(f_name)[0]
        srt_out = os.path.join(mt_dir, f"{base}_{target_language}.srt")
        txt_out = os.path.join(mt_dir, f"{base}_{target_language}.txt")

        for out_path, chunks in ((srt_out, translated_entries), (txt_out, translated_text_only)):
            with open(out_path, "w", encoding='utf-8') as f:
                f.write("\n".join(chunks))

        print(f"   üìÅ Saved ‚Üí {srt_out}")
        print(f"   üìÑ Saved ‚Üí {txt_out}")

print("\n‚úÖ All files translated successfully!")


Mounted at /content/drive
üü¢ Ready ‚Äî Processing all .srt files...

üé¨ Processing: Chapter 1A - Concept of Basic Electricity Voltage, Currents, Resistance, Impedance & Power Factor.srt
   ‚Üí 129 subtitles detected
   ‚úÖ Translated segments 1‚Äì15
   ‚úÖ Translated segments 16‚Äì30
   ‚úÖ Translated segments 31‚Äì45
   ‚úÖ Translated segments 46‚Äì60
   ‚úÖ Translated segments 61‚Äì75
   ‚úÖ Translated segments 76‚Äì90
   ‚úÖ Translated segments 91‚Äì105
   ‚úÖ Translated segments 106‚Äì120
Retrying batch due to error: 503 UNAVAILABLE. {'error': {'code': 503, 'message': 'The model is overloaded. Please try again later.', 'status': 'UNAVAILABLE'}}
   ‚úÖ Translated segments 121‚Äì129
   üìÅ Saved ‚Üí /content/drive/My Drive/Test_quality/gemini25pro/mt/Mal/Chapter 1A - Concept of Basic Electricity Voltage, Currents, Resistance, Impedance & Power Factor_Malayalam.srt
   üìÑ Saved ‚Üí /content/drive/My Drive/Test_quality/gemini25pro/mt/Mal/Chapter 1A - Concept of Basic Electricity 