In [None]:
!pip install --break-system-packages git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-12wfptm7
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-12wfptm7
  Resolved https://github.com/openai/whisper.git to commit c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_c

In [None]:
!pip install --break-system-packages pyannote.audio torchaudio # pydub not strictly needed if only ffmpeg is used for audio proc

Collecting pyannote.audio
  Downloading pyannote.audio-3.3.2-py2.py3-none-any.whl.metadata (11 kB)
Collecting asteroid-filterbanks>=0.4 (from pyannote.audio)
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl.metadata (3.3 kB)
Collecting lightning>=2.0.1 (from pyannote.audio)
  Downloading lightning-2.5.2-py3-none-any.whl.metadata (38 kB)
Collecting pyannote.core>=5.0.0 (from pyannote.audio)
  Downloading pyannote.core-5.0.0-py3-none-any.whl.metadata (1.4 kB)
Collecting pyannote.database>=5.0.1 (from pyannote.audio)
  Downloading pyannote.database-5.1.3-py3-none-any.whl.metadata (1.1 kB)
Collecting pyannote.metrics>=3.2 (from pyannote.audio)
  Downloading pyannote.metrics-3.2.1-py3-none-any.whl.metadata (1.3 kB)
Collecting pyannote.pipeline>=3.0.1 (from pyannote.audio)
  Downloading pyannote.pipeline-3.0.1-py3-none-any.whl.metadata (897 bytes)
Collecting pytorch-metric-learning>=2.1.0 (from pyannote.audio)
  Downloading pytorch_metric_learning-2.8.1-py3-none-any.whl.metadata (18

In [None]:
import whisper
from pyannote.audio import Pipeline
import torch
import re
import os
import subprocess
import json

In [None]:
# Configuration
# Recording to process (absolute path as used in the original notebook run).
INPUT_AUDIO_PATH = "/content/001_t2.wav"
# Destination of the preprocessed WAV; both transcription and diarization read this file.
CLEAN_AUDIO_PATH = "cleaned_audio_for_asr_and_diarization.wav"
# Hugging Face token used to load the pyannote pipeline. Prefer supplying it via the
# HF_TOKEN environment variable so the secret never lands in the saved notebook;
# the original placeholder is kept as the fallback when the variable is unset or empty.
HUGGING_FACE_ACCESS_TOKEN = os.environ.get("HF_TOKEN") or "hf__"

In [None]:
# Load the Whisper checkpoint once at module scope (multi-GB download on first use);
# enhanced_whisper_transcription() reads this global rather than loading its own copy.
model = whisper.load_model("large")  # or "large-v3" if available

100%|█████████████████████████████████████| 2.88G/2.88G [01:17<00:00, 40.1MiB/s]


In [15]:
model

Whisper(
  (encoder): AudioEncoder(
    (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
    (blocks): ModuleList(
      (0-31): 32 x ResidualAttentionBlock(
        (attn): MultiHeadAttention(
          (query): Linear(in_features=1280, out_features=1280, bias=True)
          (key): Linear(in_features=1280, out_features=1280, bias=False)
          (value): Linear(in_features=1280, out_features=1280, bias=True)
          (out): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (attn_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=1280, out_features=5120, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=5120, out_features=1280, bias=True)
        )
        (mlp_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_post): LayerNorm(

In [24]:
def get_audio_duration(audio_path):
    """Return the duration of ``audio_path`` in seconds, as reported by ffprobe.

    Any failure (ffprobe cannot be launched, exits non-zero, or prints
    something that is not a number) is reported on stdout and signalled by
    returning 0 instead of raising.
    """
    probe_args = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        audio_path,
    ]
    try:
        completed = subprocess.run(probe_args, capture_output=True, text=True, check=True)
        seconds = float(completed.stdout.strip())
    except Exception as err:
        print(f"Could not get duration: {err}")
        return 0
    return seconds

In [25]:
def audio_preprocessing_v1(input_path, output_path):
    """
    Version 1: Advanced filters (your original working approach)

    Runs ffmpeg to convert ``input_path`` into a mono, 16 kHz, ``pcm_s16le``
    file at ``output_path`` (overwriting it), applying
    ``loudnorm=I=-16:TP=-1.5:LRA=11`` followed by a 200 Hz high-pass and a
    3000 Hz low-pass filter.

    Returns:
        bool: True when ffmpeg exits with status 0; False when it exits
        non-zero or cannot be launched at all (e.g. ffmpeg is not installed).
    """
    print("--- Trying Advanced Audio Preprocessing ---")

    ffmpeg_command = [
        "ffmpeg", "-i", input_path,
        "-acodec", "pcm_s16le",
        "-ac", "1",
        "-ar", "16000",
        "-af", "loudnorm=I=-16:TP=-1.5:LRA=11, highpass=f=200, lowpass=f=3000",
        "-y", output_path
    ]

    try:
        subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Advanced preprocessing failed: {e.returncode}")
        # stderr is captured, so surface its last line; otherwise the reason is lost.
        stderr_lines = (e.stderr or "").strip().splitlines()
        if stderr_lines:
            print(f"  ffmpeg: {stderr_lines[-1]}")
        return False
    except OSError as e:
        # Raised when the ffmpeg executable itself cannot be started.
        print(f"Advanced preprocessing failed: {e}")
        return False

    print("Advanced preprocessing successful")
    return True

def audio_preprocessing_v2(input_path, output_path):
    """
    Version 2: Simplified but effective

    Runs ffmpeg to convert ``input_path`` into a mono, 16 kHz, ``pcm_s16le``
    file at ``output_path`` (overwriting it), applying ``loudnorm`` with its
    default settings followed by a 100 Hz high-pass and a 7000 Hz low-pass
    filter.

    Returns:
        bool: True when ffmpeg exits with status 0; False when it exits
        non-zero or cannot be launched at all (e.g. ffmpeg is not installed).
    """
    print("--- Trying Simplified Audio Preprocessing ---")

    ffmpeg_command = [
        "ffmpeg", "-i", input_path,
        "-acodec", "pcm_s16le",
        "-ac", "1",
        "-ar", "16000",
        "-af", "loudnorm,highpass=f=100,lowpass=f=7000",
        "-y", output_path
    ]

    try:
        subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Simplified preprocessing failed: {e.returncode}")
        # stderr is captured, so surface its last line; otherwise the reason is lost.
        stderr_lines = (e.stderr or "").strip().splitlines()
        if stderr_lines:
            print(f"  ffmpeg: {stderr_lines[-1]}")
        return False
    except OSError as e:
        # Raised when the ffmpeg executable itself cannot be started.
        print(f"Simplified preprocessing failed: {e}")
        return False

    print("Simplified preprocessing successful")
    return True

def audio_preprocessing_v3(input_path, output_path):
    """
    Version 3: Basic but reliable

    Runs ffmpeg to convert ``input_path`` into a mono, 16 kHz, ``pcm_s16le``
    file at ``output_path`` (overwriting it), applying only the ``loudnorm``
    filter with its default settings.

    Returns:
        bool: True when ffmpeg exits with status 0; False when it exits
        non-zero or cannot be launched at all (e.g. ffmpeg is not installed).
    """
    print("--- Trying Basic Audio Preprocessing ---")

    ffmpeg_command = [
        "ffmpeg", "-i", input_path,
        "-acodec", "pcm_s16le",
        "-ac", "1",
        "-ar", "16000",
        "-af", "loudnorm",
        "-y", output_path
    ]

    try:
        subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Basic preprocessing failed: {e.returncode}")
        # stderr is captured, so surface its last line; otherwise the reason is lost.
        stderr_lines = (e.stderr or "").strip().splitlines()
        if stderr_lines:
            print(f"  ffmpeg: {stderr_lines[-1]}")
        return False
    except OSError as e:
        # Raised when the ffmpeg executable itself cannot be started.
        print(f"Basic preprocessing failed: {e}")
        return False

    print("Basic preprocessing successful")
    return True

def audio_preprocessing_v4(input_path, output_path):
    """
    Version 4: Minimal - just format conversion

    Runs ffmpeg to convert ``input_path`` into a mono, 16 kHz, ``pcm_s16le``
    file at ``output_path`` (overwriting it) without any audio filter.

    Returns:
        bool: True when ffmpeg exits with status 0; False when it exits
        non-zero or cannot be launched at all (e.g. ffmpeg is not installed).
    """
    print("--- Trying Minimal Audio Processing ---")

    ffmpeg_command = [
        "ffmpeg", "-i", input_path,
        "-acodec", "pcm_s16le",
        "-ac", "1",
        "-ar", "16000",
        "-y", output_path
    ]

    try:
        subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"Minimal processing failed: {e.returncode}")
        # stderr is captured, so surface its last line; otherwise the reason is lost.
        stderr_lines = (e.stderr or "").strip().splitlines()
        if stderr_lines:
            print(f"  ffmpeg: {stderr_lines[-1]}")
        return False
    except OSError as e:
        # Raised when the ffmpeg executable itself cannot be started.
        print(f"Minimal processing failed: {e}")
        return False

    print("Minimal processing successful")
    return True

def smart_audio_preprocessing(input_path, output_path, duration_tolerance=1.0):
    """
    Try different preprocessing methods in order of preference

    Each ``audio_preprocessing_v*`` function is tried in turn, from the most
    to the least filtered. A method is accepted when it reports success, the
    output file exists, its duration can be measured, and that duration is
    within ``duration_tolerance`` seconds of the input's.

    Args:
        input_path: Audio file to convert.
        output_path: Where the converted file is written (overwritten by each attempt).
        duration_tolerance: Largest accepted difference, in seconds, between
            input and output duration.

    Returns:
        bool: True as soon as one method produces an accepted output,
        False when every method fails.
    """
    original_duration = get_audio_duration(input_path)
    print(f"Original audio duration: {original_duration:.2f} seconds")

    # get_audio_duration() returns 0 when probing fails. Comparing against that
    # sentinel would reject every good output, so fall back to a readability check.
    can_compare = original_duration > 0
    if not can_compare:
        print("⚠️  Could not measure the original duration; outputs will only be checked for readability.")

    # Try methods in order of sophistication
    methods = [
        audio_preprocessing_v1,  # Your original working method
        audio_preprocessing_v2,  # Simplified
        audio_preprocessing_v3,  # Basic
        audio_preprocessing_v4   # Minimal
    ]

    for i, method in enumerate(methods, 1):
        if not method(input_path, output_path):
            continue

        if not os.path.exists(output_path):
            print(f"⚠️  Method {i} reported success but produced no output file, trying next...")
            continue

        processed_duration = get_audio_duration(output_path)
        print(f"Processed audio duration: {processed_duration:.2f} seconds")

        # A 0 here means the output could not be probed; never treat that as a
        # match (previously 0 vs 0 passed the tolerance check as a false success).
        if processed_duration <= 0:
            print(f"⚠️  Output of method {i} could not be read back, trying next...")
            continue

        if not can_compare or abs(original_duration - processed_duration) < duration_tolerance:
            print(f"✅ Audio preprocessing successful with method {i}")
            return True

        print(f"⚠️  Duration mismatch with method {i}, trying next...")

    print("❌ All preprocessing methods failed!")
    return False


In [23]:
model

Whisper(
  (encoder): AudioEncoder(
    (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
    (blocks): ModuleList(
      (0-31): 32 x ResidualAttentionBlock(
        (attn): MultiHeadAttention(
          (query): Linear(in_features=1280, out_features=1280, bias=True)
          (key): Linear(in_features=1280, out_features=1280, bias=False)
          (value): Linear(in_features=1280, out_features=1280, bias=True)
          (out): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (attn_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=1280, out_features=5120, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=5120, out_features=1280, bias=True)
        )
        (mlp_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_post): LayerNorm(

In [26]:
def enhanced_whisper_transcription(audio_path):
    """
    Transcribe ``audio_path`` with the module-level Whisper ``model``.

    The audio is declared as Tamil (``language="ta"``) and decoded with
    ``task="translate"``. An insurance-domain vocabulary is passed as the
    initial prompt. Returns whatever ``model.transcribe`` returns.
    """
    print("--- Enhanced Whisper Transcription ---")

    # Insurance-domain vocabulary, joined into a single comma-separated prompt.
    domain_terms = (
        "Axis Maxlife Insurance", "Policy number", "fund value", "risk term",
        "Premium Due", "Due date",
        "Sum Assured", "Policy Status", "Late Fee", "Google Pay", "GPay",
        "PhonePe", "Paytm", "netbanking",
        "risk coverage", "policy benefits", "health declaration form",
        "renewal", "premium payment",
        "claim settlement", "maturity benefit", "surrender value",
        "loan against policy",
        "nominee details", "KYC documents", "medical examination",
        "insurance advisor", "customer service",
    )
    domain_prompt = ", ".join(domain_terms)

    transcribe_options = dict(
        language="ta",            # Tamil
        task="translate",         # Translate to English
        verbose=True,
        temperature=0.0,          # single temperature, no fallback schedule
        initial_prompt=domain_prompt,
        # NOTE(review): word-level timestamps combined with task="translate"
        # may be unreliable - confirm against the Whisper documentation.
        word_timestamps=True,
        # Quality thresholds
        compression_ratio_threshold=2.4,
        logprob_threshold=-1.0,
        no_speech_threshold=0.6,
        condition_on_previous_text=True,
    )

    return model.transcribe(audio_path, **transcribe_options)

In [27]:
# Common mis-hearings in the Indian insurance context (lower-case, single-spaced)
# mapped to the spelling wanted in the output.
_TEXT_CORRECTIONS = {
    'access max life': 'Axis Max Life',
    'axis max life': 'Axis Max Life',
    'g pay': 'GPay',
    'google pay': 'Google Pay',
    'phone pay': 'PhonePe',
    'phone pe': 'PhonePe',
    'pay tm': 'Paytm',
    'net banking': 'netbanking',
    'some assured': 'sum assured',
    'premium do': 'premium due',
    'do date': 'due date',
}

# One case-insensitive pattern for every key. Word boundaries stop a key from
# matching inside longer words ("making payment" contains "g pay"); longer keys
# come first so overlapping alternatives resolve to the longest phrase.
_TEXT_CORRECTION_PATTERN = re.compile(
    r'\b(?:' + '|'.join(
        r'\s+'.join(re.escape(word) for word in wrong.split())
        for wrong in sorted(_TEXT_CORRECTIONS, key=len, reverse=True)
    ) + r')\b',
    re.IGNORECASE,
)


def post_process_text(text):
    """
    Clean up transcribed text

    Applies the domain corrections in ``_TEXT_CORRECTIONS`` (whole-phrase,
    case-insensitive), upper-cases the first letter of each sentence and
    strips surrounding whitespace. Text outside the corrected phrases keeps
    its original casing.

    Args:
        text: Transcribed text; ``None`` or ``""`` is accepted.

    Returns:
        str: The cleaned text, or ``""`` for empty input.
    """
    if not text:
        return ""

    def _replacement(match):
        # Normalise case and inner whitespace so the match maps back to its key.
        key = ' '.join(match.group(0).lower().split())
        return _TEXT_CORRECTIONS.get(key, match.group(0))

    # Strip first so a leading space does not hide the first sentence from the
    # "^" anchor below.
    cleaned = _TEXT_CORRECTION_PATTERN.sub(_replacement, text.strip())

    # Capitalize first letter of sentences
    cleaned = re.sub(r'(^|[.!?]\s+)([a-z])',
                     lambda m: m.group(1) + m.group(2).upper(), cleaned)

    return cleaned.strip()


In [29]:
def _dominant_speaker(start_time, end_time, diarization_result):
    """Return the diarization label with the most overlap with [start_time, end_time].

    Returns "Speaker_Unknown" when there is no diarization result or no
    track overlaps the interval.
    """
    if not diarization_result:
        return "Speaker_Unknown"

    overlap_by_speaker = {}
    for turn, _, speaker in diarization_result.itertracks(yield_label=True):
        overlap = min(end_time, turn.end) - max(start_time, turn.start)
        if overlap > 0:
            overlap_by_speaker[speaker] = overlap_by_speaker.get(speaker, 0) + overlap

    if not overlap_by_speaker:
        return "Speaker_Unknown"
    return max(overlap_by_speaker, key=overlap_by_speaker.get)


def _merge_segments_by_speaker(segments, diarization_result, max_gap=3.0):
    """Group consecutive segments of the same speaker into dialogue turns.

    A segment extends the current turn when it has the same dominant speaker
    and starts less than ``max_gap`` seconds after the turn's end; otherwise it
    opens a new turn. Each turn is a dict with the keys 'speaker', 'text',
    'start_time' and 'end_time'.
    """
    turns = []
    for segment in segments:
        start = segment['start']
        end = segment['end']
        text = segment['text'].strip()
        speaker = _dominant_speaker(start, end, diarization_result)

        last = turns[-1] if turns else None
        if (last is not None
                and last['speaker'] == speaker
                and (start - last['end_time']) < max_gap):
            last['texts'].append(text)
            last['end_time'] = end
        else:
            turns.append({'speaker': speaker, 'texts': [text],
                          'start_time': start, 'end_time': end})

    return [
        {
            'speaker': turn['speaker'],
            'text': ' '.join(turn['texts']),
            'start_time': turn['start_time'],
            'end_time': turn['end_time'],
        }
        for turn in turns
    ]


def main():
    """
    Main processing pipeline

    Steps: preprocess INPUT_AUDIO_PATH into CLEAN_AUDIO_PATH, transcribe it
    with Whisper, clean each segment's text, run pyannote speaker diarization
    (best effort - on failure every turn is labelled "Speaker_Unknown"), merge
    segments into per-speaker dialogue turns, print them, and write everything
    to ``enhanced_transcription_results.json``.

    Returns None; returns early (after printing the reason) when preprocessing
    or transcription fails.
    """
    print("🎯 Starting Enhanced Audio Processing Pipeline")
    print("=" * 60)

    # Step 1: Smart Audio Preprocessing
    if not smart_audio_preprocessing(INPUT_AUDIO_PATH, CLEAN_AUDIO_PATH):
        print("❌ Audio preprocessing failed completely. Exiting.")
        return

    # Step 2: Enhanced Whisper Transcription
    try:
        whisper_result = enhanced_whisper_transcription(CLEAN_AUDIO_PATH)
        print("✅ Whisper transcription completed")
    except Exception as e:
        print(f"❌ Whisper transcription failed: {e}")
        return

    # Step 3: Post-process transcription
    processed_segments = []
    for segment in whisper_result["segments"]:
        processed_text = post_process_text(segment['text'])
        if processed_text.strip():  # Only keep non-empty segments
            segment_copy = segment.copy()
            segment_copy['text'] = processed_text
            processed_segments.append(segment_copy)

    whisper_result["segments"] = processed_segments

    # Step 4: Speaker Diarization
    print("\n--- Speaker Diarization ---")
    try:
        pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=HUGGING_FACE_ACCESS_TOKEN
        )
        # NOTE(review): from_pretrained appears to return None (rather than
        # raise) when the gated model cannot be downloaded - fail with a clear
        # message instead of an AttributeError/TypeError further down.
        if pipeline is None:
            raise RuntimeError(
                "pipeline could not be loaded; check the Hugging Face token "
                "and that the model's access conditions were accepted"
            )

        if torch.cuda.is_available():
            pipeline.to(torch.device("cuda"))
            print("✅ Using GPU for diarization")

        diarization = pipeline(CLEAN_AUDIO_PATH)
        print("✅ Speaker diarization completed")

    except Exception as e:
        # Diarization is best effort: the transcript is still useful without it.
        print(f"⚠️  Speaker diarization failed: {e}")
        diarization = None

    # Step 5: Generate Enhanced Dialogue
    print("\n--- Generating Dialogue ---")

    # Merge consecutive segments from same speaker (within 3 seconds)
    dialogue = _merge_segments_by_speaker(processed_segments, diarization, max_gap=3.0)

    # Step 6: Display Results
    print("\n" + "🎭 DIALOGUE OUTPUT" + "=" * 40)

    for entry in dialogue:
        timestamp = f"[{entry['start_time']:.1f}s - {entry['end_time']:.1f}s]"
        print(f"\n{entry['speaker']} {timestamp}:")
        print(f"  📝 {entry['text']}")

    # Step 7: Save Results
    # The transcription result is not expected to carry a 'duration' key, so the
    # old `.get('duration', 0)` always recorded 0; probe the cleaned file instead.
    total_duration = whisper_result.get('duration') or get_audio_duration(CLEAN_AUDIO_PATH)

    output_data = {
        'metadata': {
            'total_duration': total_duration,
            'total_speakers': len(set(d['speaker'] for d in dialogue)),
            'total_segments': len(dialogue),
            'model_used': 'whisper-large',
            'processing_successful': True
        },
        'dialogue': dialogue,
        'raw_transcription': whisper_result
    }

    output_path = 'enhanced_transcription_results.json'
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"\n💾 Results saved to: {output_path}")
    print("✅ Processing completed successfully!")

# Entry point: run the full pipeline when this code is executed directly
# (as a script or as a notebook cell), but not when it is imported.
if __name__ == "__main__":
    main()