# setup

In [None]:
# Install the Python packages used by the cells below (transformers and
# torchaudio are imported directly in the transcription cell).
# NOTE(review): onnx / onnxruntime are not imported in the visible code —
# presumably they are needed by the remotely loaded model code; confirm.
# NOTE(review): both `onnxruntime` and `onnxruntime-gpu` are installed here;
# ONNX Runtime's install docs advise keeping only one of them per
# environment — confirm which one this runtime actually needs.
!pip install transformers torchaudio onnx onnxruntime onnxruntime-gpu

In [None]:
# Install the ffmpeg command-line tool; the conversion cell below shells out
# to it to turn MP4 files into WAV. `-y` answers apt's prompts automatically.
!apt-get -y install ffmpeg

In [None]:
from google.colab import drive
import os
import shutil

# Disabled fallback, kept for reference only: manually unmount the drive and
# delete a stale, non-empty /content/drive directory before mounting.
# (This is the only code in this cell that uses `os` and `shutil`.)
# if os.path.exists('/content/drive'):
#   try:
#     drive.flush_and_unmount()
#   except ValueError:
#     pass # Drive was not mounted
#   # Remove the directory if it still exists and is not empty
#   if os.path.exists('/content/drive') and os.path.isdir('/content/drive') and os.listdir('/content/drive'):
#       print("Removing existing /content/drive directory...")
#       shutil.rmtree('/content/drive')


# Mount Google Drive at /content/drive. force_remount=True requests a fresh
# mount even if the drive is already mounted in this session.
drive.mount('/content/drive', force_remount=True)

# Convert MP4 videos to WAV (single folder)

In [None]:
# Convert MP4 → mono WAV (16 kHz)
import os

# Path to your Google Drive folder containing MP4 files
input_folder =  "/content/drive/MyDrive/Test Videos/Hindi"
output_folder = "/content/drive/MyDrive/Test Videos Wav/Hindi_wav"

os.makedirs(output_folder, exist_ok=True)

for dirpath, dirnames, filenames in os.walk(input_folder):
    for file in filenames:
        if file.endswith(".mp4"):
            input_path = os.path.join(dirpath, file)

            # Recreate subfolder structure inside output
            relative_path = os.path.relpath(dirpath, input_folder)
            output_dir = os.path.join(output_folder, relative_path)
            os.makedirs(output_dir, exist_ok=True)

            output_path = os.path.join(output_dir, os.path.splitext(file)[0] + ".wav")

            # Convert with ffmpeg
            !ffmpeg -y -i "{input_path}" -ac 1 -ar 16000 "{output_path}"
            print(f"Converted: {output_path}")

# Transcribe one language (Hindi)

In [None]:
import os
import torch
import torchaudio
from transformers import AutoModel
from pathlib import Path

# -------------------------------
# Load IndicConformer model
# -------------------------------
# The checkpoint is fetched by name through transformers' AutoModel.
# trust_remote_code=True lets transformers run the model code shipped in
# that repository, so only use it with a source you trust.
print("🔄 Loading IndicConformer model ...")
model = AutoModel.from_pretrained(
    "ai4bharat/indic-conformer-600m-multilingual", trust_remote_code=True
)
print("✅ Model loaded\n")

# -------------------------------
# Load Silero VAD
# -------------------------------
# torch.hub.load returns the VAD model together with a tuple of helper
# callables; force_reload=False allows a previously downloaded copy of the
# repo to be reused.
print("🔄 Loading VAD model ...")
vad_model, utils = torch.hub.load(
    repo_or_dir="snakers4/silero-vad", model="silero_vad", force_reload=False
)
# Of these helpers, only get_speech_timestamps is used in the visible code.
(get_speech_timestamps, save_audio, read_audio, VADIterator, collect_chunks) = utils
print("✅ VAD loaded\n")

# -------------------------------
# Paths
# -------------------------------
# input_root: folder tree of WAV files to transcribe.
# output_root: folder that receives the .txt / .srt results; created here
# if it does not exist yet.
input_root = "/content/drive/MyDrive/Test Videos Wav/Hindi_wav"
output_root = "/content/drive/MyDrive/Test Videos ASR indcnf/Hindi"
os.makedirs(output_root, exist_ok=True)

# -------------------------------
# Helper: format seconds to SRT time
# -------------------------------
def format_time(seconds: float) -> str:
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond once, up front, and then
    split into fields with integer arithmetic. Truncating the fractional part
    instead loses a millisecond to float error (e.g. 1.001 s would print as
    ``00:00:01,000``). Negative inputs are clamped to zero, since a negative
    timestamp cannot be represented in this format.

    Args:
        seconds: Offset from the start of the audio, in seconds.

    Returns:
        The timestamp string; the hours field grows beyond two digits for
        offsets of 100 hours or more.
    """
    total_ms = max(0, round(seconds * 1000))
    total_s, millis = divmod(total_ms, 1000)
    total_min, s = divmod(total_s, 60)
    h, m = divmod(total_min, 60)
    return f"{h:02}:{m:02}:{s:02},{millis:03}"

# -------------------------------
# Helper: run the ASR model on one chunk of audio
# -------------------------------
def _transcribe(audio, lang="hi", decoder="ctc"):
    """Run the ASR model on ``audio`` and return its transcript, stripped.

    The model's return value is used as-is when it is not a dict (converted
    with ``str``); when it is a dict, the transcript is read from ``"text"``.
    """
    result = model(audio, lang, decoder)
    text = result["text"] if isinstance(result, dict) else str(result)
    return text.strip()


# -------------------------------
# Walk input folder
# -------------------------------
# For every .wav file under input_root, write a .srt (one cue per detected
# speech segment) and a .txt (all segment texts joined by spaces) into the
# mirrored location under output_root.
for root, dirs, files in os.walk(input_root):
    for file in files:
        if not file.lower().endswith(".wav"):
            continue

        input_path = os.path.join(root, file)
        print(f"\n🎤 Processing: {input_path}")

        # Mirror folder structure inside output_root
        rel_path = os.path.relpath(root, input_root)
        output_dir = os.path.join(output_root, rel_path)
        os.makedirs(output_dir, exist_ok=True)

        base_name = Path(file).stem
        txt_path = os.path.join(output_dir, base_name + ".txt")
        srt_path = os.path.join(output_dir, base_name + ".srt")

        # Load and preprocess audio
        wav, sr = torchaudio.load(input_path)
        if wav.numel() == 0:
            # torch.max below raises on an empty tensor; there is nothing to
            # transcribe in a zero-sample file anyway.
            print(f"⚠️ Skipping empty audio file: {input_path}")
            continue

        wav = torch.mean(wav, dim=0, keepdim=True)  # Convert to mono, shape (1, samples)
        if sr != 16000:
            wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(wav)
            sr = 16000

        # Peak-normalize. A completely silent file has a peak of 0, and
        # dividing by it would fill the waveform with NaNs, so leave it as-is.
        peak = torch.max(torch.abs(wav))
        if peak > 0:
            wav = wav / peak

        # -------------------------------
        # Run VAD segmentation
        # -------------------------------
        speech_timestamps = get_speech_timestamps(
            wav[0], vad_model, sampling_rate=sr
        )
        print(f"🧩 Detected {len(speech_timestamps)} speech segments.")

        all_text = []

        # -------------------------------
        # If VAD found segments → process each
        # Else → transcribe full file
        # -------------------------------
        if speech_timestamps:
            with open(srt_path, "w", encoding="utf-8") as f:
                # Number cues by what is actually written, so segments with an
                # empty transcript do not leave gaps in the SRT numbering.
                cue_index = 0
                for seg in speech_timestamps:
                    # seg["start"] / seg["end"] are sample offsets into wav.
                    start_time = seg["start"] / sr
                    end_time = seg["end"] / sr
                    segment_wav = wav[:, seg["start"]:seg["end"]]

                    text = _transcribe(segment_wav)
                    if not text:
                        continue

                    cue_index += 1
                    all_text.append(text)
                    f.write(
                        f"{cue_index}\n"
                        f"{format_time(start_time)} --> {format_time(end_time)}\n"
                        f"{text}\n\n"
                    )

        else:
            print("⚠️ No speech detected — transcribing full audio...")
            text = _transcribe(wav)
            all_text.append(text)

            # Write single block to SRT, spanning the whole file
            with open(srt_path, "w", encoding="utf-8") as f:
                f.write("1\n")
                f.write(f"00:00:00,000 --> {format_time(wav.shape[1] / sr)}\n")
                f.write(f"{text}\n\n")

        # -------------------------------
        # Save full transcript as TXT
        # -------------------------------
        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(" ".join(all_text))

        print(f"✅ Saved TXT → {txt_path}")
        print(f"✅ Saved SRT → {srt_path}")

print("\n🎉 All audio files processed successfully!")
print("📂 Output folder:", output_root)


#