In [2]:
# Upgrade pip first (optional but helpful)
!pip -q install --upgrade pip

# (A) Keep your existing Torch (2.8.0+cu126) — just install the rest from PyPI
!pip -q install "transformers>=4.45" accelerate librosa soundfile jiwer

# (Optional B) If you DO want to reinstall Torch explicitly, use the PyTorch index ONLY for Torch pkgs:
# !pip -q install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
# then install the rest from PyPI (as above).

import torch, platform, importlib
print("Torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available(), "| Python:", platform.python_version())

# quick import checks
for pkg in ["transformers", "librosa", "soundfile", "jiwer"]:
    try:
        importlib.import_module(pkg)
        print(f"import {pkg}: OK")
    except Exception as e:
        print(f"import {pkg}: FAILED -> {e}")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
[?25hTorch: 2.8.0+cu126 | CUDA available: True | Python: 3.12.11
import transformers: OK
import librosa: OK
import soundfile: OK
import jiwer: OK


In [3]:
from google.colab import files
import os, zipfile, glob

# 1) Upload your ZIP from your Ubuntu machine
# uploaded = files.upload()  # choose your local ZIP (e.g., indicwhisper_ta_medium.zip)
# zip_name = next(iter(uploaded.keys()))
zip_name="/content/tamil_models.zip"
print("Uploaded ZIP:", zip_name)

# 2) Extract it
EXTRACT_ROOT = "/content/models_extracted"
os.makedirs(EXTRACT_ROOT, exist_ok=True)
with zipfile.ZipFile(zip_name, 'r') as z:
    z.extractall(EXTRACT_ROOT)

# 3) Find the directory that actually holds config + weights
def find_model_dir(root):
    for r, d, f in os.walk(root):
        has_cfg = "config.json" in f
        has_weights = ("pytorch_model.bin" in f) or any(x.endswith(".safetensors") for x in f)
        if has_cfg and has_weights:
            return r
    return None

MODEL_DIR = find_model_dir(EXTRACT_ROOT)
assert MODEL_DIR is not None, "Could not find a folder with config.json + weights. Check your ZIP."
print("MODEL_DIR:", MODEL_DIR)

# (Optional) peek at files inside
print("Files in MODEL_DIR:", os.listdir(MODEL_DIR)[:10])


Uploaded ZIP: /content/tamil_models.zip
MODEL_DIR: /content/models_extracted/tamil_models/whisper-medium-ta_alldata_multigpu
Files in MODEL_DIR: ['added_tokens.json', 'normalizer.json', 'training_args.bin', 'rng_state_2.pth', 'vocab.json', 'rng_state_1.pth', 'trainer_state.json', 'config.json', 'pytorch_model.bin', 'special_tokens_map.json']


In [6]:
import os, time, math, gc, sys, torch
from pathlib import Path
from transformers import WhisperProcessor, WhisperForConditionalGeneration

assert 'MODEL_DIR' in globals(), "Run Cell 2 first so MODEL_DIR is defined."
MODEL_DIR = Path(MODEL_DIR)

device = 0 if torch.cuda.is_available() else -1
dtype  = torch.float16 if device == 0 else torch.float32

# --- Inspect the folder so we know what we're loading ---
files = os.listdir(MODEL_DIR)
print("MODEL_DIR:", MODEL_DIR)
print("Files:", files)

# Show sizes of relevant files
def human(n):
    for u in ["B","KB","MB","GB","TB"]:
        if n < 1024: return f"{n:.2f}{u}"
        n/=1024
    return f"{n:.2f}PB"

for name in ["pytorch_model.bin", "pytorch_model.bin.index.json"]:
    p = MODEL_DIR / name
    if p.exists():
        print(f"{name}: {human(p.stat().st_size)}")
safetensors = sorted(MODEL_DIR.glob("*.safetensors"))
if safetensors:
    for p in safetensors:
        print(f"{p.name}: {human(p.stat().st_size)}")
else:
    print("No .safetensors found; will load .bin via torch (still ok).")

# --- Load processor: prefer local; fallback to base if missing ---
BASE_PROCESSOR_ID = "openai/whisper-medium"   # change if your finetune base is small/large
try:
    processor = WhisperProcessor.from_pretrained(MODEL_DIR, local_files_only=True)
    print("Processor loaded from ZIP ✅")
except Exception as e:
    print("Processor not in ZIP; falling back to:", BASE_PROCESSOR_ID, "\nReason:", repr(e))
    processor = WhisperProcessor.from_pretrained(BASE_PROCESSOR_ID)

# --- Memory hygiene before big load ---
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# --- Choose best load parameters ---
use_safetensors = len(safetensors) > 0

load_kwargs = dict(
    local_files_only=True,
    low_cpu_mem_usage=True,      # important
    torch_dtype=dtype,
)
# For .bin, ask torch to only read weights (faster/safer on Torch ≥ 2.0)
if not use_safetensors:
    load_kwargs["weights_only"] = True

t0 = time.time()
print(f"\nLoading model ({'safetensors' if use_safetensors else 'bin'})... this can take a minute for large checkpoints.")
model = WhisperForConditionalGeneration.from_pretrained(
    MODEL_DIR,
    **load_kwargs
)
load_secs = time.time() - t0
print(f"Model loaded in {load_secs:.1f}s ✅")

# Move to GPU if available
if device == 0:
    model = model.to("cuda")
print("Device:", "cuda" if device == 0 else "cpu", "| dtype:", dtype)

# --- Force English transcript from Tamil audio ---
model.generation_config.forced_decoder_ids = processor.get_decoder_prompt_ids(
    language="ta", task="translate"   # ta -> en
)
print("Forced source_language='ta', task='translate' (English output) ✅")


MODEL_DIR: /content/models_extracted/tamil_models/whisper-medium-ta_alldata_multigpu
Files: ['added_tokens.json', 'normalizer.json', 'training_args.bin', 'rng_state_2.pth', 'vocab.json', 'rng_state_1.pth', 'trainer_state.json', 'config.json', 'pytorch_model.bin', 'special_tokens_map.json', 'tokenizer_config.json', 'latest', 'rng_state_3.pth', 'rng_state_0.pth', 'merges.txt', 'preprocessor_config.json', 'generation_config.json']
pytorch_model.bin: 1.42GB
No .safetensors found; will load .bin via torch (still ok).
Processor loaded from ZIP ✅

Loading model (bin)... this can take a minute for large checkpoints.
Model loaded in 902.6s ✅
Device: cuda | dtype: torch.float16
Forced source_language='ta', task='translate' (English output) ✅


In [7]:
from transformers import pipeline

pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,        # if OOM, try 20 or 15
    stride_length_s=(4, 2),
    return_timestamps=True,
    device=device,
    batch_size=8
)
print("Pipeline ready ✅ (English transcripts will be produced)")


Device set to use cuda:0


Pipeline ready ✅ (English transcripts will be produced)


In [8]:
from google.colab import files
import os, json

#uploaded = files.upload()  # choose a local file like tamil_test.wav / .mp3 / .m4a
#audio_path = next(iter(uploaded.keys()))
audio_path="/content/call2.wav"
print("Audio:", audio_path)

SAVE_DIR = "/content/outputs"
os.makedirs(SAVE_DIR, exist_ok=True)

# Run inference
result = pipe(audio_path)
text   = result["text"]

# Save TXT + JSON
base = os.path.splitext(os.path.basename(audio_path))[0]
txt_path  = os.path.join(SAVE_DIR, f"{base}.txt")
json_path = os.path.join(SAVE_DIR, f"{base}.json")
with open(txt_path, "w", encoding="utf-8") as f: f.write(text)
with open(json_path, "w", encoding="utf-8") as f: json.dump(result, f, ensure_ascii=False, indent=2)

print("\nTranscript preview (~300 chars):")
print(text[:300])
print("\nSaved:", txt_path)
print("Saved:", json_path)

# Show a quick peek at the first 2 timestamped chunks
chunks = result.get("chunks") or []
print(f"\nChunks detected: {len(chunks)}")
print("First 2 chunks:", chunks[:2])


Audio: /content/call2.wav


Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.



Transcript preview (~300 chars):
 வணக்கம் என் தேருஸ் வாழ்த்தி நாங்கள் லைசென்ஸ் செல்ந்து கால் பண்ணியிருக்கோம் இது என்ன நியமனிக்கு வருங்கள் ஆக்சிஸ் மக்லெஃப் இன்சுரன்சில பாலிசை ட்ரிக்கிங் என்ன விஷயம் பேசுவாங்களா பேசும் முன்னை நோக்கிக்கொள்வோம் டீவின் பட்டமுடையாது பாலிசிக் ஆன்சல் பண்ணா ரெனரேட் அவருக்கு கட்லிஸ்ட் ஒரு மூன்று வருஷமாக பரிவி

Saved: /content/outputs/call2.txt
Saved: /content/outputs/call2.json

Chunks detected: 1
First 2 chunks: [{'timestamp': (None, None), 'text': ' வணக்கம் என் தேருஸ் வாழ்த்தி நாங்கள் லைசென்ஸ் செல்ந்து கால் பண்ணியிருக்கோம் இது என்ன நியமனிக்கு வருங்கள் ஆக்சிஸ் மக்லெஃப் இன்சுரன்சில பாலிசை ட்ரிக்கிங் என்ன விஷயம் பேசுவாங்களா பேசும் முன்னை நோக்கிக்கொள்வோம் டீவின் பட்டமுடையாது பாலிசிக் ஆன்சல் பண்ணா ரெனரேட் அவருக்கு கட்லிஸ்ட் ஒரு மூன்று வருஷமாக பரிவிண்ட் பண்ணியிருக்கணும் சில இவ்வளவு பாத்திங்கப்படும் அதிர்ச்சி மூன்று வருஷ பரிவின் பண்ணி சரண்டர் பண்ணுங்கன்னா சரண்டர் வல்லின்றி அவர் மும்முன் புடிச்சிட்டு உன் கொடுப்பாங்க பஸ்ட்ரஸ்ட்டில் பேமின் பண்ணி சரண்டர் பண்ணும்போது இந்த

In [9]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
import torch, gc

assert 'MODEL_DIR' in globals(), "Run Cell 2 first so MODEL_DIR is defined."

device = 0 if torch.cuda.is_available() else -1
dtype  = torch.float16 if device == 0 else torch.float32

# Load processor/model (you already did this earlier; safe to re-run)
processor = WhisperProcessor.from_pretrained(MODEL_DIR, local_files_only=True)
model = WhisperForConditionalGeneration.from_pretrained(
    MODEL_DIR,
    local_files_only=True,
    low_cpu_mem_usage=True,
    torch_dtype=dtype,
)

# Move to GPU if available
if device == 0:
    model = model.to("cuda")

# ❌ IMPORTANT: stop using model-config forced ids (deprecated path)
model.generation_config.forced_decoder_ids = None

gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# ✅ Build pipeline that ALWAYS translates to EN at call-time
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=30,          # lower to 20/15 if OOM
    stride_length_s=(4, 2),
    return_timestamps=True,     # phrase-level timestamps
    device=device,
    batch_size=8,
)

print("Pipeline ready (translation to EN will be set in generate_kwargs) ✅")


Device set to use cuda:0


Pipeline ready (translation to EN will be set in generate_kwargs) ✅
