In [1]:
!pip install --break-system-packages git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-fongbyha
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-fongbyha
  Resolved https://github.com/openai/whisper.git to commit c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_c

In [2]:
!pip install --break-system-packages pyannote.audio torchaudio

Collecting pyannote.audio
  Downloading pyannote.audio-3.3.2-py2.py3-none-any.whl.metadata (11 kB)
Collecting asteroid-filterbanks>=0.4 (from pyannote.audio)
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl.metadata (3.3 kB)
Collecting lightning>=2.0.1 (from pyannote.audio)
  Downloading lightning-2.5.2-py3-none-any.whl.metadata (38 kB)
Collecting pyannote.core>=5.0.0 (from pyannote.audio)
  Downloading pyannote.core-5.0.0-py3-none-any.whl.metadata (1.4 kB)
Collecting pyannote.database>=5.0.1 (from pyannote.audio)
  Downloading pyannote.database-5.1.3-py3-none-any.whl.metadata (1.1 kB)
Collecting pyannote.metrics>=3.2 (from pyannote.audio)
  Downloading pyannote.metrics-3.2.1-py3-none-any.whl.metadata (1.3 kB)
Collecting pyannote.pipeline>=3.0.1 (from pyannote.audio)
  Downloading pyannote.pipeline-3.0.1-py3-none-any.whl.metadata (897 bytes)
Collecting pytorch-metric-learning>=2.1.0 (from pyannote.audio)
  Downloading pytorch_metric_learning-2.8.1-py3-none-any.whl.metadata (18

In [3]:
import whisper
from pyannote.audio import Pipeline
import torch
import re
import os
import subprocess
import json
import librosa
import numpy as np

In [4]:
class Config:
    """Paths, credentials and prompt text shared by the pipeline functions."""

    # Source recording fed into the pipeline.
    INPUT_AUDIO_PATH = "call4.wav"
    # Destination of the ffmpeg-preprocessed copy used for ASR and diarization.
    CLEAN_AUDIO_PATH = "cleaned_audio_for_asr_and_diarization.wav"
    # Token handed to Pipeline.from_pretrained(). Read from the HF_TOKEN
    # environment variable so the secret is not stored in the notebook; the
    # original "hf_" placeholder is kept as the fallback when it is unset.
    HUGGING_FACE_ACCESS_TOKEN = os.environ.get("HF_TOKEN", "hf_")

    # Domain hint passed to Whisper as `initial_prompt` (optional).
    INSURANCE_PROMPT = (
        "This is customer support for Axis Max Life Insurance. "
        "Keywords: policy number, due date, fund value, sum assured, "
        "Google Pay, PhonePe, Paytm, netbanking, premium, late fee."
    )

In [6]:
# Load the Whisper "large" checkpoint once at notebook scope; fast_main()
# passes this global `model` to the transcription helpers.
model = whisper.load_model("large")

100%|█████████████████████████████████████| 2.88G/2.88G [01:01<00:00, 49.9MiB/s]


In [5]:
def get_audio_duration(audio_path):
    """Return the duration in seconds that ffprobe reports for ``audio_path``.

    Any failure (ffprobe missing, non-zero exit status, unparsable output)
    is printed and reported as a duration of 0 instead of raising.
    """
    probe_args = (
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        audio_path,
    )
    try:
        probe = subprocess.run(
            list(probe_args), check=True, text=True, capture_output=True
        )
        seconds = float(probe.stdout.strip())
    except Exception as err:
        print(f"Could not get duration: {err}")
        return 0
    return seconds

def quick_audio_check(audio_path):
    """Classify audio quality from its average bytes-per-second on disk.

    This is a rough heuristic only: the file size is divided by the duration
    reported by get_audio_duration() and bucketed.

    Args:
        audio_path: Path of the audio file to inspect.

    Returns:
        "low_quality" (< 8000 B/s), "medium_quality" (< 16000 B/s),
        "good_quality" otherwise, or "unknown_quality" when the duration
        could not be determined or any error occurred.
    """
    try:
        duration = get_audio_duration(audio_path)

        # get_audio_duration() signals failure with 0. Without a duration the
        # ratio is meaningless, so report "unknown" instead of letting a
        # ratio of 0 be misread as a low-bitrate (low quality) file.
        if duration <= 0:
            return "unknown_quality"

        # Simple file size vs duration ratio (rough quality indicator)
        size_per_second = os.path.getsize(audio_path) / duration

        # Very rough heuristic: smaller files might be lower quality
        if size_per_second < 8000:  # bytes per second
            return "low_quality"
        if size_per_second < 16000:
            return "medium_quality"
        return "good_quality"

    except Exception as e:
        print(f"Quick audio check failed: {e}")
        return "unknown_quality"

def fast_preprocessing(input_path, output_path, quality_hint="unknown_quality"):
    """Convert audio to 16 kHz mono 16-bit PCM with a quality-dependent filter chain.

    Args:
        input_path: Source audio file passed to ffmpeg.
        output_path: Destination file; overwritten if it exists (``-y``).
        quality_hint: "low_quality" or "medium_quality" select progressively
            lighter band-pass + denoise chains; any other value applies
            loudness normalisation only.

    Returns:
        True when ffmpeg exits successfully, False when ffmpeg fails or
        cannot be started (the reason is printed).
    """
    print(f"🔧 Fast preprocessing for {quality_hint} audio")

    # Filter chain per quality hint; unknown hints fall back to minimal processing.
    filters_by_quality = {
        # More aggressive for poor quality
        "low_quality": (
            "highpass=f=100,"
            "lowpass=f=7500,"
            "loudnorm=I=-20:TP=-2,"
            "afftdn=nr=15"
        ),
        # Moderate processing
        "medium_quality": (
            "highpass=f=80,"
            "lowpass=f=8000,"
            "loudnorm=I=-23:TP=-2,"
            "afftdn=nr=10"
        ),
    }
    af_filter = filters_by_quality.get(quality_hint, "loudnorm=I=-23:TP=-2")

    ffmpeg_command = [
        "ffmpeg", "-i", input_path,
        "-acodec", "pcm_s16le",
        "-ac", "1",
        "-ar", "16000",
        "-af", af_filter,
        "-y", output_path
    ]

    try:
        subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        print(f"❌ Fast preprocessing failed: {e}")
        # stderr is captured above, so surface its tail; otherwise the actual
        # ffmpeg error message would be lost.
        stderr_tail = (e.stderr or "").strip().splitlines()[-5:]
        if stderr_tail:
            print("ffmpeg stderr (last lines):")
            for line in stderr_tail:
                print(f"  {line}")
        return False
    except OSError as e:
        # Raised when the ffmpeg executable itself cannot be launched.
        print(f"❌ Fast preprocessing failed: {e}")
        return False

    print("✅ Fast preprocessing completed")
    return True

In [12]:
def optimized_whisper_transcription(model, audio_path):
    """Transcribe ``audio_path`` with a single fixed Whisper parameter set.

    Args:
        model: Object exposing Whisper's ``transcribe(audio, **options)``.
        audio_path: Path of the audio file to transcribe.

    Returns:
        Whatever ``model.transcribe`` returns (for Whisper, a dict that
        includes a ``'segments'`` list).
    """
    print("🎯 Running optimized Whisper transcription")

    result = model.transcribe(
        audio_path,
        language="ta",     # Source language: Tamil
        task="translate",  # Whisper's translate task emits English text
        # First attempt is deterministic (t=0.0, beam search). The higher
        # temperatures are only used as fallbacks when a window trips
        # compression_ratio_threshold / logprob_threshold below; with a single
        # scalar temperature those thresholds can never trigger a re-decode,
        # so repetition loops would be accepted as-is.
        temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
        beam_size=5,     # Good balance of speed vs accuracy
        patience=1.0,    # Standard patience
        condition_on_previous_text=True,  # Use context
        no_speech_threshold=0.6,
        compression_ratio_threshold=2.4,
        logprob_threshold=-1.0,
        word_timestamps=False,  # Disable to save time
        initial_prompt=Config.INSURANCE_PROMPT,  # Optional domain hint
        verbose=True,   # Print each decoded segment as it is produced
    )

    print("✅ Whisper transcription completed")
    return result


In [13]:
def smart_chunking_for_long_audio(model, audio_path, max_duration=300):
    """Transcribe audio, splitting it into overlapping chunks only when it is long.

    Audio no longer than ``max_duration`` seconds is transcribed in one call.
    Longer audio is cut with ffmpeg into 120 s chunks that overlap by 10 s;
    each chunk is transcribed and its segment timestamps are shifted back to
    positions in the full recording.

    Args:
        model: Whisper model forwarded to optimized_whisper_transcription().
        audio_path: Path of the audio file to transcribe.
        max_duration: Length in seconds above which chunking is used.

    Returns:
        For short audio, the result of optimized_whisper_transcription().
        For long audio, ``{'segments': [...]}`` with absolute timestamps.
        Chunks that fail to extract or transcribe are reported and skipped.
    """
    duration = get_audio_duration(audio_path)

    if duration <= max_duration:
        # Short audio - process normally
        return optimized_whisper_transcription(model, audio_path)

    print(f"🔄 Long audio detected ({duration:.1f}s). Using smart chunking...")

    chunk_duration = 120  # 2-minute chunks
    overlap = 10  # 10-second overlap
    all_segments = []
    # Absolute end time of the last segment accepted so far. Used to discard
    # the copies of speech that the overlap makes the next chunk re-transcribe.
    covered_until = 0.0

    for start_time in range(0, int(duration), chunk_duration - overlap):
        end_time = min(start_time + chunk_duration, duration)

        print(f"Processing chunk: {start_time}s - {end_time}s")

        # Extract chunk
        chunk_path = f"temp_chunk_{start_time}.wav"
        extract_command = [
            "ffmpeg", "-i", audio_path,
            "-ss", str(start_time),
            "-t", str(end_time - start_time),
            "-acodec", "pcm_s16le",
            "-ar", "16000",
            "-ac", "1",
            "-y", chunk_path
        ]

        try:
            subprocess.run(extract_command, check=True, capture_output=True)

            # Transcribe chunk
            chunk_result = optimized_whisper_transcription(model, chunk_path)

            for segment in chunk_result.get('segments', []):
                # Shift a copy to absolute time; the chunk result is left untouched.
                shifted = dict(segment)
                shifted['start'] = segment['start'] + start_time
                shifted['end'] = segment['end'] + start_time

                # A segment centred inside already-covered audio is the
                # overlap's duplicate of something the previous chunk produced.
                if (shifted['start'] + shifted['end']) / 2 < covered_until:
                    continue

                all_segments.append(shifted)
                covered_until = max(covered_until, shifted['end'])

        except Exception as e:
            print(f"Chunk processing failed for {start_time}-{end_time}: {e}")
        finally:
            # Clean up even when extraction or transcription failed.
            if os.path.exists(chunk_path):
                os.remove(chunk_path)

        # This chunk already reached the end of the audio; any later start
        # would only re-process its tail.
        if end_time >= duration:
            break

    return {'segments': all_segments}

In [14]:
def lightweight_repetition_removal(segments):
    """Drop segments that look like repetition loops.

    A segment is discarded when any of the following holds:
      * it has fewer than two whitespace-separated words;
      * one word (case-insensitive, ignoring surrounding ``.,!?``) makes up
        more than half of its words;
      * the same word occurs more than four times in a row.

    Args:
        segments: Iterable of dicts, each with a ``'text'`` string.

    Returns:
        A new list with the surviving segment dicts (same objects, same order).
    """
    print("🔍 Quick repetition removal...")

    cleaned_segments = []

    for segment in segments:
        words = segment['text'].strip().split()

        # Skip very short segments
        if len(words) < 2:
            continue

        # Case- and punctuation-insensitive form used by both checks.
        normalized = [word.lower().strip('.,!?') for word in words]

        # 1. Check for excessive same-word repetition
        word_counts = {}
        for token in normalized:
            word_counts[token] = word_counts.get(token, 0) + 1

        if max(word_counts.values()) / len(words) > 0.5:  # any word >50% of segment
            continue

        # 2. Check for immediate consecutive repetitions. Compare the
        # normalized tokens so "Hello, hello. Hello!" counts as one run;
        # tokens that are pure punctuation fall back to their raw text.
        run_tokens = [norm or word.lower() for norm, word in zip(normalized, words)]
        longest_run = current_run = 1
        for previous, current in zip(run_tokens, run_tokens[1:]):
            current_run = current_run + 1 if current == previous else 1
            longest_run = max(longest_run, current_run)

        if longest_run > 4:  # More than 4 consecutive same words
            continue

        # Segment passed basic checks
        cleaned_segments.append(segment)

    removed_count = len(segments) - len(cleaned_segments)
    print(f"📊 Quick cleaning: {len(segments)} → {len(cleaned_segments)} segments")
    print(f"🗑️  Removed {removed_count} problematic segments")

    return cleaned_segments


In [15]:
def simple_post_process_text(text):
    """Tidy whitespace and apply a few domain spelling corrections.

    The original casing of the text is preserved; only the matched phrases
    are rewritten and the first character is upper-cased.

    Args:
        text: Transcribed text, or a falsy value.

    Returns:
        The cleaned string, or "" when ``text`` is falsy.
    """
    if not text:
        return ""

    # Basic cleanup: trim and collapse runs of whitespace.
    cleaned = re.sub(r'\s{2,}', ' ', text.strip())

    # Simple domain corrections (most common ones only)
    corrections = {
        'axis max life': 'Axis Max Life',
        'g pay': 'GPay',
        'phone pe': 'PhonePe',
        'pay tm': 'Paytm',
        'some assured': 'sum assured',
        'do date': 'due date',
    }

    for wrong, correct in corrections.items():
        # Match case-insensitively on the original text instead of on a
        # lower-cased copy, so the rest of the sentence keeps its casing.
        # Word boundaries stop matches inside longer words ("undo date").
        pattern = r'\b' + re.escape(wrong) + r'\b'
        cleaned = re.sub(
            pattern, lambda _match, fixed=correct: fixed, cleaned, flags=re.IGNORECASE
        )

    # Capitalize first letter
    if cleaned:
        cleaned = cleaned[0].upper() + cleaned[1:]

    return cleaned

In [16]:
def fast_main():
    """Run the end-to-end pipeline: preprocess, transcribe, clean, diarize, save.

    Steps, in order:
      1. Estimate input quality and preprocess ``Config.INPUT_AUDIO_PATH``
         into ``Config.CLEAN_AUDIO_PATH`` (returns early if that fails).
      2. Transcribe the cleaned audio, drop repetitive segments and
         post-process the remaining text.
      3. Run pyannote speaker diarization; if it fails, the dialogue is still
         produced with every entry labelled "Speaker_Unknown".
      4. Print the dialogue and write it, with summary metadata, to
         ``fast_transcription_results.json``.

    Returns:
        None. Results are printed and written to disk.
    """
    print("🚀 Starting FAST Audio Processing Pipeline")
    print("=" * 50)

    # Step 1: Quick audio quality check (optional)
    quality_hint = quick_audio_check(Config.INPUT_AUDIO_PATH)
    print(f"Audio quality hint: {quality_hint}")

    # Step 2: Fast preprocessing
    if not fast_preprocessing(Config.INPUT_AUDIO_PATH, Config.CLEAN_AUDIO_PATH, quality_hint):
        print("❌ Audio preprocessing failed. Exiting.")
        return

    # Step 3: Reuse the notebook-level `model` when one is already loaded;
    # otherwise load it here so the function does not depend on another cell
    # having been executed first.
    whisper_model = globals().get("model")
    if whisper_model is None:
        print("📂 Loading Whisper model...")
        whisper_model = whisper.load_model("large")
    else:
        print("📂 Using already loaded Whisper model")

    # Step 4: Smart transcription (chunking only for very long audio)
    print("🎤 Starting transcription...")
    whisper_result = smart_chunking_for_long_audio(whisper_model, Config.CLEAN_AUDIO_PATH)

    # Step 5: Quick repetition removal
    if 'segments' in whisper_result:
        cleaned_segments = lightweight_repetition_removal(whisper_result['segments'])
    else:
        cleaned_segments = []

    # Step 6: Simple post-processing; segments whose text ends up 3 characters
    # or shorter are dropped.
    processed_segments = []
    for segment in cleaned_segments:
        processed_text = simple_post_process_text(segment['text'])
        if processed_text and len(processed_text) > 3:
            segment_copy = segment.copy()
            segment_copy['text'] = processed_text
            processed_segments.append(segment_copy)

    # Step 7: Speaker diarization. Any failure is reported and the pipeline
    # continues without speaker labels.
    print("👥 Starting speaker diarization...")
    try:
        pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=Config.HUGGING_FACE_ACCESS_TOKEN
        )

        if torch.cuda.is_available():
            pipeline.to(torch.device("cuda"))

        diarization = pipeline(Config.CLEAN_AUDIO_PATH)
        print("✅ Speaker diarization completed")

    except Exception as e:
        print(f"⚠️  Speaker diarization failed: {e}")
        diarization = None

    # Step 8: Generate dialogue (simplified)
    dialogue = []

    def get_dominant_speaker(start_time, end_time, diarization_result):
        """Return the speaker label with the most overlap in [start_time, end_time]."""
        if not diarization_result:
            return "Speaker_Unknown"

        # Total overlap in seconds per speaker label.
        speakers = {}
        for segment, _, speaker in diarization_result.itertracks(yield_label=True):
            overlap_start = max(start_time, segment.start)
            overlap_end = min(end_time, segment.end)
            overlap_duration = max(0, overlap_end - overlap_start)

            if overlap_duration > 0:
                speakers[speaker] = speakers.get(speaker, 0) + overlap_duration

        return max(speakers, key=speakers.get) if speakers else "Speaker_Unknown"

    # Simple dialogue generation
    for segment in processed_segments:
        speaker = get_dominant_speaker(segment['start'], segment['end'], diarization)
        dialogue.append({
            'speaker': speaker,
            'text': segment['text'],
            'start_time': segment['start'],
            'end_time': segment['end']
        })

    # Step 9: Display results
    print("\n" + "🎭 DIALOGUE OUTPUT" + "=" * 30)
    for entry in dialogue:
        timestamp = f"[{entry['start_time']:.1f}s - {entry['end_time']:.1f}s]"
        print(f"\n{entry['speaker']} {timestamp}:")
        print(f"  📝 {entry['text']}")

    # Step 10: Save results
    output_data = {
        'metadata': {
            'total_duration': get_audio_duration(Config.INPUT_AUDIO_PATH),
            'total_speakers': len(set(d['speaker'] for d in dialogue)),
            'total_segments': len(dialogue),
            'processing_mode': 'fast_optimized'
        },
        'dialogue': dialogue
    }

    with open('fast_transcription_results.json', 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"\n💾 Results saved to: fast_transcription_results.json")
    print("✅ Fast processing completed!")

# Run the pipeline when this cell executes. In a notebook kernel __name__ is
# "__main__", so the guard passes and the cell's output is the pipeline log.
if __name__ == "__main__":
    fast_main()

🚀 Starting FAST Audio Processing Pipeline
Audio quality hint: good_quality
🔧 Fast preprocessing for good_quality audio
✅ Fast preprocessing completed
📂 Loading Whisper model...
🎤 Starting transcription...
🎯 Running optimized Whisper transcription
[00:00.000 --> 00:30.000]  Hello! Hello! Hello! Hello! Hello! Hello! Hello! Hello! Hello! Hello! Hello! Hello! Hello! Hello!
[00:30.000 --> 00:37.000]  Hello! Hello! Hello! Hello! Hello!
[01:00.000 --> 01:02.000]  Hello! Hello! Hello! Hello!
[01:02.000 --> 01:04.000]  Hello! Hello! Hello! Hello!
[01:04.000 --> 01:06.000]  Hello! Hello! Hello!
[01:06.000 --> 01:08.000]  Hello! Hello! Hello!
[01:08.000 --> 01:10.000]  Hello! Hello! Hello!
[01:10.000 --> 01:12.000]  Hello! Hello! Hello!
[01:12.000 --> 01:14.000]  Hello! Hello! Hello!
[01:14.000 --> 01:16.000]  Hello! Hello! Hello!
[01:16.000 --> 01:18.000]  Hello! Hello! Hello!
[01:18.000 --> 01:20.000]  Hello! Hello! Hello!
[01:20.000 --> 01:22.000]  Hello! Hello! Hello!
[01:22.000 --> 01:24.000

  std = sequences.std(dim=-1, correction=1)


✅ Speaker diarization completed


💾 Results saved to: fast_transcription_results.json
✅ Fast processing completed!
