# GeminiASR

In [None]:
# ========================
# INSTALL & IMPORTS
# ========================
!pip install -q google-generativeai pydub tqdm librosa

import os
import io
from google.colab import drive, userdata
import google.generativeai as genai
from pydub import AudioSegment
from tqdm import tqdm

# ========================
# SETUP
# ========================

# Mount Google Drive so the audio folder configured below is reachable.
drive.mount('/content/drive', force_remount=True)

# Securely load your Gemini API key from Colab secrets.
# userdata.get() raises SecretNotFoundError for a secret that does not exist
# (it does not return None), so that case is folded into the same friendly
# ValueError as an empty value. Other errors (e.g. access not granted to this
# notebook) are left to propagate unchanged.
try:
    api_key = userdata.get("GOOGLE_API_KEY")
except userdata.SecretNotFoundError:
    api_key = None
if not api_key:
    raise ValueError("❌ No GOOGLE_API_KEY found in Colab secrets! Add it under 'More → Secrets'.")

genai.configure(api_key=api_key)

# Choose your model
model = genai.GenerativeModel("models/gemini-2.5-pro")

# Input/output folders in Google Drive; transcripts go in a sub-folder of the
# input folder, created on first run.
input_dir = "/content/drive/MyDrive/a/"
output_dir = os.path.join(input_dir, "transcripts_withsrt_nofill")
os.makedirs(output_dir, exist_ok=True)

# ========================
# HELPER FUNCTIONS
# ========================

def transcribe_audio_file(file_path):
    """Transcribe one WAV file in a single request, without splitting it.

    The file is decoded with pydub, re-encoded to WAV in memory, and sent
    inline to the module-level ``model`` together with the SRT formatting
    prompt.

    Args:
        file_path: Path of the ``.wav`` file to transcribe.

    Returns:
        The response text with surrounding whitespace stripped, or an empty
        string if the request (or reading its text) raised; the error is
        printed in that case.
    """
    srt_prompt = """
            Transcribe this audio exactly as spoken (no extra comments, no filler words) with the following .srt format:

            1
            00:00:15,362 --> 00:00:21,789
            अब हम जानेंगे कैंडल्स में क्या क्या चीज़ों की ज़रूरत पड़ती है और उनको हम कहाँ से ख़रीद सकते हैं

            2
            00:00:21,922 --> 00:00:27,422
            तो सबसे पहले कैंडल बनाने के लिए हमें डबल बॉयलर की ज़रूरत पड़ती है ये

            3
            00:00:27,617 --> 00:00:29,853
            इस तरह का ये इंडक्शन है

            and so on...

            The transcription should strictly follow the format above, where:
            - **Timestamps** are in the format of HH:MM:SS,SSS --> HH:MM:SS,SSS (with millisecond precision).
            - Each entry should have a **sequential index** starting from 1 (e.g., 1, 2, 3, ...).
            - The spoken text should be captured **exactly as it is spoken**, without adding or removing words(but remove filler words).
            - If there is **silence** or a pause, mark the duration with a timestamp like this:
              ```
              4
              00:00:29,854 --> 00:00:34,500
              [Silence]
              ```
            - Include **Speaker labels** (e.g., Speaker 1, Speaker 2) where relevant if multiple speakers are detected.

            Please ensure the output strictly follows this format. Thank you!
            """

    # Decoding/encoding happens outside the try: a bad audio file still raises.
    wav_buffer = io.BytesIO()
    AudioSegment.from_wav(file_path).export(wav_buffer, format="wav")
    audio_part = {"mime_type": "audio/wav", "data": wav_buffer.getvalue()}

    try:
        reply = model.generate_content([audio_part, srt_prompt])
        return reply.text.strip()
    except Exception as e:
        # Best-effort: report the failure and let the caller carry on.
        print("❌ Error:", e)
        return ""

# ========================
# MAIN PROCESS
# ========================

# Transcribe every WAV file directly inside input_dir and save one text file
# per recording in output_dir.
for filename in os.listdir(input_dir):
    stem, ext = os.path.splitext(filename)
    if ext.lower() != ".wav":
        continue

    file_path = os.path.join(input_dir, filename)
    print(f"\n🎧 Transcribing full audio: {filename}")

    # Get full transcription
    text = transcribe_audio_file(file_path)
    if not text:
        # transcribe_audio_file returns "" after printing the error; do not
        # leave an empty transcript behind or report the file as done.
        print(f"⚠️ No transcript returned for {filename}; skipping.")
        continue

    # Save TXT file. The name is built from the stem so that upper-case
    # extensions (".WAV") and names containing ".wav" elsewhere still end up
    # with a single ".txt" suffix.
    txt_output = os.path.join(output_dir, stem + ".txt")
    with open(txt_output, "w", encoding="utf-8") as f:
        f.write(text)

    print(f"✅ Done: {filename}")
    print(f"📄 TXT saved to: {txt_output}")


# GoogleCloudMT

In [None]:
from google.cloud import translate_v2 as translate
import os
from google.colab import drive

# The credentials file referenced below lives on Google Drive, so mount it first.
drive.mount('/content/drive')

# Point the Google Cloud credentials environment variable at the key file
# before the client is constructed (the client is built with no arguments).
os.environ.update(
    GOOGLE_APPLICATION_CREDENTIALS='/content/drive/MyDrive/Apikey/gen-lang-client-05.json'
)

# Shared Translate client used by the helper functions in this cell.
translate_client = translate.Client()

# Function to translate text using Google Translate
def translate_text(text, target_language='te'):
    """Translate a single string with the module-level ``translate_client``.

    Args:
        text: Plain text to translate.
        target_language: Language code of the translation (default ``'te'``).

    Returns:
        The translated string. Empty or whitespace-only input is returned
        unchanged without calling the API.
    """
    if not text or not text.strip():
        return text
    # Subtitles are plain text. Without format_='text' the v2 API treats the
    # input as HTML and returns entities (&#39;, &quot;, ...) in the result.
    result = translate_client.translate(
        text,
        target_language=target_language,
        format_='text',
    )
    return result['translatedText']

# Function to translate a .srt file
def _subtitle_text_indices(srt_lines):
    """Return the indices of the lines in *srt_lines* that hold subtitle text.

    Text is every non-blank line between a timing line (one containing
    ``-->``) and the next blank line. A digit-only line directly followed by a
    timing line is treated as the next cue's counter, so cues that are not
    separated by a blank line are still split correctly.
    """
    indices = []
    in_cue_text = False
    total = len(srt_lines)
    for i, raw in enumerate(srt_lines):
        line = raw.strip()
        if not line:
            in_cue_text = False
        elif "-->" in line:
            in_cue_text = True
        elif in_cue_text:
            following = srt_lines[i + 1].strip() if i + 1 < total else ""
            if line.isdigit() and "-->" in following:
                in_cue_text = False  # counter of the next cue, not text
            else:
                indices.append(i)
    return indices


def translate_srt(input_file_path, output_file_path, target_language='te'):
    """Translate the text of an .srt file, keeping counters and timings as-is.

    Args:
        input_file_path: Path of the source .srt file (read as UTF-8).
        output_file_path: Path of the translated .srt file; missing parent
            directories are created.
        target_language: Language code passed to ``translate_text``.

    Every line is written back stripped and joined with ``"\\n"``; only the
    lines identified as subtitle text are replaced by their translation.
    Multi-line cues are translated line by line.
    """
    # Read the .srt file
    with open(input_file_path, 'r', encoding='utf-8') as file:
        srt_lines = [line.strip() for line in file]

    # Locate subtitle text by cue structure instead of assuming a fixed
    # four-line layout, which breaks on multi-line cues or uneven spacing.
    text_indices = _subtitle_text_indices(srt_lines)

    print(f"🧠 Translating {len(text_indices)} subtitle lines from {os.path.basename(input_file_path)}...")

    # translate_text handles one string per call, so translate line by line
    # and drop each result straight into its original position.
    translated_srt = list(srt_lines)
    for idx in text_indices:
        translated_srt[idx] = translate_text(srt_lines[idx], target_language=target_language)

    # Save the translated .srt file (open() does not create parent folders).
    out_dir = os.path.dirname(output_file_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write("\n".join(translated_srt))

    print(f"✅ Translation complete! The translated SRT is saved at {output_file_path}")

# Example usage:
# Define the path of your input .srt file and output file on Google Drive
input_file_path = '/content/drive/MyDrive/a/transcripts_withsrt_nofill/eng/Chapter 6A - Sucessful Entreuprenuer Journey.srt'  # Path to your input .srt file
output_file_path = '/content/drive/MyDrive/a/transcripts_withsrt_nofill/eng/MTeng/tel.srt'  # Path to save the translated .srt file

# Make sure the destination folder exists: opening a file for writing does not
# create missing parent directories.
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

# Call the function to translate the .srt file
translate_srt(input_file_path, output_file_path, target_language='te')


# SarvamTranslateMT

In [None]:
# ========================
# INSTALL & IMPORTS
# ========================
!pip install transformers accelerate bitsandbytes tqdm -q

import os
import torch
from tqdm import tqdm
from google.colab import drive
from transformers import AutoModelForCausalLM, AutoTokenizer

# ========================
# SETUP
# ========================

drive.mount('/content/drive', force_remount=True)

# Name of the preferred device; model placement itself is delegated to
# device_map="auto" below.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

model_name = "sarvamai/sarvam-translate"
tgt_lang = "Telugu"  # Target language name; e.g. "Hindi", "Tamil", etc.

# Load tokenizer + model (weights are requested in float32)
print("⏳ Loading model...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,  # safer for stability
    device_map="auto",
)
print("✅ Model loaded successfully!")

# Source folder that is searched for .srt files, and the folder that receives
# the translated copies.
input_dir = "/content/drive/MyDrive/aa"
output_dir = "/content/drive/MyDrive/aTelugu"
os.makedirs(output_dir, exist_ok=True)


# ========================
# TRANSLATION HELPERS
# ========================

def batch_translate_sarvam(texts, tgt_lang, batch_size=8):
    """Translate a list of strings with the module-level Sarvam model.

    Args:
        texts: Source strings to translate.
        tgt_lang: Target language name inserted into the system prompt
            (e.g. ``"Telugu"``).
        batch_size: Number of strings generated together per forward pass.

    Returns:
        A list with one translation per input, in input order. When the model
        produces no text for an item, the source string is returned for it.
        An empty *texts* yields an empty list without touching the model.
    """
    if not texts:
        return []

    # Batched generation with a decoder-only model continues from the last
    # position of every row, so padding has to sit on the left; otherwise the
    # shorter prompts would be continued from pad tokens.
    tokenizer.padding_side = "left"
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    translations = []

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]

        # One system + user conversation per source string, rendered to text.
        prompts = [
            tokenizer.apply_chat_template(
                [
                    {"role": "system", "content": f"Translate the text below to {tgt_lang}."},
                    {"role": "user", "content": source},
                ],
                tokenize=False,
                add_generation_prompt=True,
            )
            for source in batch
        ]

        # The rendered template already carries its special tokens, so the
        # tokenizer must not add them (e.g. a second BOS) again.
        inputs = tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=1024,
            add_special_tokens=False,
        ).to(model.device)

        # Deterministic beam search. No temperature is passed: it only
        # applies when sampling and is ignored with do_sample=False.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=256,
                do_sample=False,
                num_beams=4,
                pad_token_id=tokenizer.pad_token_id,
            )

        # Every row shares the same padded prompt length; what follows it is
        # the generated continuation.
        prompt_len = inputs.input_ids.shape[1]
        for source, out in zip(batch, outputs):
            text_out = tokenizer.decode(out[prompt_len:], skip_special_tokens=True).strip()
            translations.append(text_out if text_out else source)

        torch.cuda.empty_cache()  # Free VRAM

    return translations


def _subtitle_text_indices(srt_lines):
    """Return the indices of the lines in *srt_lines* that hold subtitle text.

    Text is every non-blank line between a timing line (one containing
    ``-->``) and the next blank line. A digit-only line directly followed by a
    timing line is treated as the next cue's counter, so cues that are not
    separated by a blank line are still split correctly.
    """
    indices = []
    in_cue_text = False
    total = len(srt_lines)
    for i, raw in enumerate(srt_lines):
        line = raw.strip()
        if not line:
            in_cue_text = False
        elif "-->" in line:
            in_cue_text = True
        elif in_cue_text:
            following = srt_lines[i + 1].strip() if i + 1 < total else ""
            if line.isdigit() and "-->" in following:
                in_cue_text = False  # counter of the next cue, not text
            else:
                indices.append(i)
    return indices


def translate_srt(input_path, output_srt, output_txt, tgt_lang):
    """Translate a single SRT file using batched Sarvam translation.

    Args:
        input_path: Path of the source .srt file (read as UTF-8).
        output_srt: Path of the translated .srt; counters, timings and blank
            lines are kept, text lines are replaced by their translation.
        output_txt: Path of a plain-text file with only the translated lines,
            one per line.
        tgt_lang: Target language name passed to ``batch_translate_sarvam``.

    Missing parent directories of both output paths are created.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        srt_lines = [line.strip() for line in f]

    # Locate subtitle text by cue structure instead of assuming a fixed
    # four-line layout, which breaks on multi-line cues or uneven spacing.
    subtitle_indices = _subtitle_text_indices(srt_lines)
    subtitle_lines = [srt_lines[i] for i in subtitle_indices]

    print(f"🔄 Translating {len(subtitle_lines)} lines from {os.path.basename(input_path)}...")

    # Batch translation; the outer loop exists to drive the progress bar.
    translated_lines = []
    if subtitle_lines:
        for batch_start in tqdm(range(0, len(subtitle_lines), 8), desc="🚀 Translating", ncols=100):
            batch = subtitle_lines[batch_start:batch_start + 8]
            translated_lines.extend(batch_translate_sarvam(batch, tgt_lang, batch_size=8))

    # Merge translations back into the original structure by position.
    translated_srt = list(srt_lines)
    for position, line_index in enumerate(subtitle_indices):
        translated_srt[line_index] = translated_lines[position]

    # Save results (open() does not create parent folders).
    for target in (output_srt, output_txt):
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)

    with open(output_srt, 'w', encoding='utf-8') as f:
        f.write("\n".join(translated_srt))

    with open(output_txt, 'w', encoding='utf-8') as f:
        f.write("\n".join(translated_lines))

    print(f"✅ Saved translated SRT → {output_srt}")
    print(f"✅ Saved plain text → {output_txt}")


# ========================
# BATCH PROCESSING LOOP
# ========================

# Walk input_dir recursively and write a translated copy of every .srt file
# (plus a plain-text companion) to the same relative location in output_dir.
for root, _, files in os.walk(input_dir):
    for file in files:
        stem, ext = os.path.splitext(file)
        if ext.lower() != ".srt":
            continue

        input_path = os.path.join(root, file)
        relative = os.path.relpath(root, input_dir)
        save_dir = os.path.join(output_dir, relative)
        os.makedirs(save_dir, exist_ok=True)

        output_srt = os.path.join(save_dir, file)
        # Built from the stem so only the real extension is swapped, even if
        # ".srt" also appears elsewhere in the name or is upper-case.
        output_txt = os.path.join(save_dir, stem + ".txt")

        print(f"\n🔄 Translating: {input_path}")
        translate_srt(input_path, output_srt, output_txt, tgt_lang)
        torch.cuda.empty_cache()  # clear after each file
