In [1]:
!pip install --break-system-packages git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-53qkjzae
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-53qkjzae
  Resolved https://github.com/openai/whisper.git to commit c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_c

In [2]:
!pip install --break-system-packages pyannote.audio torchaudio

Collecting pyannote.audio
  Downloading pyannote.audio-3.3.2-py2.py3-none-any.whl.metadata (11 kB)
Collecting asteroid-filterbanks>=0.4 (from pyannote.audio)
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl.metadata (3.3 kB)
Collecting lightning>=2.0.1 (from pyannote.audio)
  Downloading lightning-2.5.2-py3-none-any.whl.metadata (38 kB)
Collecting pyannote.core>=5.0.0 (from pyannote.audio)
  Downloading pyannote.core-5.0.0-py3-none-any.whl.metadata (1.4 kB)
Collecting pyannote.database>=5.0.1 (from pyannote.audio)
  Downloading pyannote.database-5.1.3-py3-none-any.whl.metadata (1.1 kB)
Collecting pyannote.metrics>=3.2 (from pyannote.audio)
  Downloading pyannote.metrics-3.2.1-py3-none-any.whl.metadata (1.3 kB)
Collecting pyannote.pipeline>=3.0.1 (from pyannote.audio)
  Downloading pyannote.pipeline-3.0.1-py3-none-any.whl.metadata (897 bytes)
Collecting pytorch-metric-learning>=2.1.0 (from pyannote.audio)
  Downloading pytorch_metric_learning-2.8.1-py3-none-any.whl.metadata (18

In [3]:
import whisper
from pyannote.audio import Pipeline
import torch
import re
import os
import subprocess
import json
from typing import Optional, List, Dict, Any

In [None]:
# Configuration
INPUT_AUDIO_PATH = "call4.wav"
CLEAN_AUDIO_PATH = "cleaned_audio_for_asr_and_diarization.wav"
HUGGING_FACE_ACCESS_TOKEN = "hf_"

In [5]:
# Load model globally
model = whisper.load_model("large-v3")

100%|█████████████████████████████████████| 2.88G/2.88G [05:14<00:00, 9.83MiB/s]


In [6]:
model

Whisper(
  (encoder): AudioEncoder(
    (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
    (blocks): ModuleList(
      (0-31): 32 x ResidualAttentionBlock(
        (attn): MultiHeadAttention(
          (query): Linear(in_features=1280, out_features=1280, bias=True)
          (key): Linear(in_features=1280, out_features=1280, bias=False)
          (value): Linear(in_features=1280, out_features=1280, bias=True)
          (out): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (attn_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=1280, out_features=5120, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=5120, out_features=1280, bias=True)
        )
        (mlp_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_post): LayerNorm(

In [7]:
def get_audio_duration(audio_path: str) -> float:
    """Get audio duration using ffprobe"""
    try:
        cmd = ["ffprobe", "-v", "error", "-show_entries", "format=duration",
               "-of", "default=noprint_wrappers=1:nokey=1", audio_path]
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        return float(result.stdout.strip())
    except Exception as e:
        print(f"Could not get duration: {e}")
        return 0


In [8]:
def analyze_audio_quality(audio_path: str) -> Dict[str, Any]:
    """Detect if audio is from call center environment"""
    try:
        cmd = ["ffmpeg", "-i", audio_path, "-af", "volumedetect", "-f", "null", "-"]
        result = subprocess.run(cmd, capture_output=True, text=True)

        is_callcenter = False
        if "mean_volume" in result.stderr:
            mean_vol = float(re.search(r"mean_volume: ([\d.-]+) dB", result.stderr).group(1))
            if mean_vol < -25:
                is_callcenter = True

        return {
            "is_callcenter": is_callcenter,
            "duration": get_audio_duration(audio_path)
        }
    except Exception as e:
        print(f"Audio analysis failed: {e}")
        return {"is_callcenter": False, "duration": 0}

def callcenter_specific_cleaning(input_path: str, output_path: str) -> bool:
    """Specialized cleaning for BPO call center audio"""
    print("--- Applying Call Center Specific Cleaning ---")
    ffmpeg_command = [
        "ffmpeg", "-i", input_path,
        "-acodec", "pcm_s16le",
        "-ac", "1",
        "-ar", "16000",
        "-af", "highpass=300,lowpass=3500,afftdn=nr=25:nf=-30,adeclick,adeclip",
        "-y", output_path
    ]
    try:
        subprocess.run(ffmpeg_command, check=True)
        return True
    except subprocess.CalledProcessError as e:
        print(f"Call center cleaning failed: {e}")
        return False

def clean_audio_preprocessing(input_path: str, output_path: str) -> bool:
    """Optimized preprocessing for clean 1:1 calls"""
    print("--- Applying Clean Audio Processing ---")
    ffmpeg_command = [
        "ffmpeg", "-i", input_path,
        "-acodec", "pcm_s16le",
        "-ac", "1",
        "-ar", "16000",
        "-af", "loudnorm=I=-23:TP=-2,highpass=100",
        "-y", output_path
    ]
    try:
        subprocess.run(ffmpeg_command, check=True)
        return True
    except subprocess.CalledProcessError as e:
        print(f"Clean audio processing failed: {e}")
        return False

def smart_audio_preprocessing(input_path: str, output_path: str) -> bool:
    """Automatically selects the best preprocessing approach"""
    audio_info = analyze_audio_quality(input_path)
    print(f"Audio Duration: {audio_info['duration']:.2f}s | Call Center: {audio_info['is_callcenter']}")

    if audio_info['is_callcenter']:
        return callcenter_specific_cleaning(input_path, output_path)
    else:
        return clean_audio_preprocessing(input_path, output_path)

In [9]:
def enhanced_whisper_transcription(audio_path: str, is_callcenter: bool = False) -> Dict:
    """Optimized Whisper transcription based on audio type"""
    print("--- Starting Enhanced Transcription ---")

    insurance_prompt = (
        "This is a customer support call for Axis Maxlife Insurance. "
        "Discussion includes policy numbers, due dates, fund values, "
        "sum assured amounts, and payment methods like Google Pay, PhonePe."
    )

    params = {
        "language": "ta",
        "task": "translate",
        "temperature": 0.0,
        "beam_size": 5,
        "condition_on_previous_text": False,
        "word_timestamps": True,
        "initial_prompt": insurance_prompt,
        "verbose": True
    }

    if is_callcenter:
        params.update({
            "vad_filter": True,
            "no_speech_threshold": 0.4,
            "compression_ratio_threshold": 2.4,
            "logprob_threshold": -0.4
        })

    result = model.transcribe(audio_path, **params)
    print("✅ Transcription completed")
    return result


In [10]:
def enhanced_post_process_text(text: str) -> str:
    """Advanced text cleaning with insurance-specific corrections"""
    if not text:
        return ""

    # 1. Remove excessive filler words and repetitions
    text = re.sub(r'\b(\w+)(?:\s+\1\b)+', r'\1', text)
    text = re.sub(r'\b(?:yes|okay|sir)\b\s*(?:yes|okay|sir\b\s*)+', 'yes sir', text, flags=re.IGNORECASE)

    # 2. Insurance term normalization
    corrections = {
        r'\bmax\s*life\b': 'Max Life',
        r'\bmaxlife\b': 'Max Life',
        r'\baxis\s*max\s*life\b': 'Axis Max Life',
        r'\b(\d{3})\s*(\d{3})\s*(\d{3})\b': r'\1\2\3',  # Policy numbers
        r'\b(\d{2})-(\d{2})-(\d{4})\b': r'\1/\2/\3',    # Dates
        r'\b(\d+),(\d+)\b': r'\1,\2',                   # Numbers
        r'\bdiscontinue\s*fund\b': 'discontinued fund',
        r'\bplaning\b': 'planning',
        r'\bgets\b': 'will get',
        r'\bhaa\b': 'yes',
        r'\bhmm\b': 'I understand',
        r'\b(rs|rs\.)\s*': '₹',
        r'\b(\d+)\s*(lakhs|lakh)\b': r'₹\1,00,000'
    }

    for pattern, replacement in corrections.items():
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

    # 3. Capitalization and punctuation
    text = re.sub(r'\bi\b', 'I', text)
    text = re.sub(r'\bsir\b', 'Sir', text, flags=re.IGNORECASE)
    text = re.sub(r'(\w)(\?|\.|\,)', r'\1 \2', text)
    text = re.sub(r'\s+([.,?!])', r'\1', text)

    # 4. Sentence capitalization
    sentences = re.split(r'(?<=[.!?])\s+', text)
    sentences = [s[0].upper() + s[1:] if s else s for s in sentences]
    text = ' '.join(sentences)

    return text.strip()

In [11]:
def improve_transcription_alignment(segments: List[Dict]) -> List[Dict]:
    """Enhance segments to match insurance conversation patterns"""
    improved_segments = []
    current_speaker = None

    for segment in segments:
        text = segment['text'].strip()

        # Identify speaker changes based on content
        if not current_speaker:
            if re.search(r'(calling from|policy number|due date)', text, re.IGNORECASE):
                current_speaker = "AGENT"
            else:
                current_speaker = "CUSTOMER"

        # Merge short segments with same speaker
        if len(text.split()) < 5 and improved_segments and improved_segments[-1]['speaker'] == current_speaker:
            improved_segments[-1]['text'] += " " + text
            improved_segments[-1]['end'] = segment['end']
        else:
            improved_segments.append({
                'speaker': current_speaker,
                'text': text,
                'start': segment['start'],
                'end': segment['end']
            })

        # Toggle speaker for next segment
        current_speaker = "AGENT" if current_speaker == "CUSTOMER" else "CUSTOMER"

    return improved_segments

In [15]:
def detect_and_remove_repetitions(segments: List[Dict], is_callcenter: bool) -> List[Dict]:
    """Adaptive repetition removal based on audio type"""
    print("🔍 Starting Repetition Detection")

    thresholds = {
        "word_dominance": 0.4 if is_callcenter else 0.5,
        "max_consecutive": 3 if is_callcenter else 4,
        "similarity": 0.7 if is_callcenter else 0.75
    }

    cleaned_segments = []

    for segment in segments:
        text = segment['text'].strip()
        words = [w.lower().strip('.,!?') for w in text.split()]

        if len(words) < 2:
            continue

        # Check word dominance
        word_counts = {}
        for word in words:
            word_counts[word] = word_counts.get(word, 0) + 1
        max_count = max(word_counts.values(), default=0)
        if max_count / len(words) > thresholds["word_dominance"]:
            continue

        # Check consecutive repeats
        consecutive = 0
        for i in range(1, len(words)):
            consecutive = consecutive + 1 if words[i] == words[i-1] else 0
            if consecutive > thresholds["max_consecutive"]:
                break
        if consecutive > thresholds["max_consecutive"]:
            continue

        cleaned_segments.append(segment)

    print(f"📊 Removed {len(segments) - len(cleaned_segments)} repetitive segments")
    return cleaned_segments

def format_final_output(dialogue: List[Dict]) -> str:
    """Generate properly formatted conversation output"""
    output = []
    for entry in dialogue:
        speaker = entry['speaker']  # Keep original AGENT/CUSTOMER labels
        timestamp = f"[{entry['start']:.1f}s-{entry['end']:.1f}s]"
        text = enhanced_post_process_text(entry['text'])
        output.append(f"{speaker} {timestamp}:\n  {text}")
    return '\n\n'.join(output)

def process_insurance_call(whisper_segments: List[Dict], is_callcenter: bool) -> tuple:
    """Enhanced processing pipeline for insurance calls"""
    # 1. Remove repetitions
    cleaned_segments = detect_and_remove_repetitions(whisper_segments, is_callcenter)

    # 2. Improve speaker alignment
    aligned_segments = improve_transcription_alignment(cleaned_segments)

    # 3. Apply advanced post-processing
    for segment in aligned_segments:
        segment['text'] = enhanced_post_process_text(segment['text'])

    # 4. Generate final formatted output
    final_output = format_final_output(aligned_segments)

    return final_output, aligned_segments

In [16]:
def main():
    print("🎯 Starting Enhanced Audio Processing Pipeline")
    print("=" * 60)

    # Step 1: Audio Analysis and Preprocessing
    audio_info = analyze_audio_quality(INPUT_AUDIO_PATH)
    if not smart_audio_preprocessing(INPUT_AUDIO_PATH, CLEAN_AUDIO_PATH):
        print("❌ Audio preprocessing failed")
        return

    # Step 2: Optimized Transcription
    try:
        whisper_result = enhanced_whisper_transcription(
            CLEAN_AUDIO_PATH,
            is_callcenter=audio_info["is_callcenter"]
        )
    except Exception as e:
        print(f"❌ Transcription failed: {e}")
        return

    # Step 3: Advanced Processing
    final_output, processed_segments = process_insurance_call(
        whisper_result["segments"],
        audio_info["is_callcenter"]
    )
    whisper_result["segments"] = processed_segments

    # Step 4: Speaker Diarization
    print("\n--- Speaker Diarization ---")
    try:
        pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=HUGGING_FACE_ACCESS_TOKEN
        )
        if torch.cuda.is_available():
            pipeline.to(torch.device("cuda"))
        diarization = pipeline(CLEAN_AUDIO_PATH)
        print("✅ Diarization completed")
    except Exception as e:
        print(f"⚠️ Diarization failed: {e}")
        diarization = None

    # Step 5: Generate Dialogue
    print("\n--- Generating Dialogue ---")
    dialogue = []
    current_speaker = None
    current_text = []
    current_start = 0

    for segment in processed_segments:
        speaker = "AGENT"  # Default to AGENT if diarization fails
        if diarization:
            # Calculate speaker overlap durations
            speaker_overlaps = []
            for seg, _, speaker_label in diarization.itertracks(yield_label=True):
                overlap_start = max(segment['start'], seg.start)
                overlap_end = min(segment['end'], seg.end)
                overlap_duration = max(0, overlap_end - overlap_start)
                if overlap_duration > 0:
                    speaker_overlaps.append((speaker_label, overlap_duration))

            # Get speaker with maximum overlap
            if speaker_overlaps:
                speaker_label = max(speaker_overlaps, key=lambda x: x[1])[0]
                speaker = "AGENT" if speaker_label.endswith("0") else "CUSTOMER"

        if speaker == current_speaker and segment['start'] - current_start < 3.0:
            current_text.append(segment['text'])
        else:
            if current_speaker and current_text:
                dialogue.append({
                    'speaker': current_speaker,
                    'text': ' '.join(current_text),
                    'start': current_start,
                    'end': segment['start']
                })
            current_speaker = speaker
            current_text = [segment['text']]
            current_start = segment['start']

    if current_speaker and current_text:
        dialogue.append({
            'speaker': current_speaker,
            'text': ' '.join(current_text),
            'start': current_start,
            'end': processed_segments[-1]['end']
        })

    # Step 6: Output Results
    print("\n🎭 FINAL DIALOGUE OUTPUT")
    print("=" * 40)
    print(format_final_output(dialogue))

    output_data = {
        'metadata': {
            'audio_type': 'callcenter' if audio_info['is_callcenter'] else 'clean',
            'duration': audio_info['duration'],
            'speakers': len(set(d['speaker'] for d in dialogue)),
            'model': 'whisper-large-v3'
        },
        'dialogue': [{
            'speaker': d['speaker'],
            'text': d['text'],
            'start_time': d['start'],
            'end_time': d['end']
        } for d in dialogue]
    }

    with open('enhanced_results.json', 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)
    print("\n💾 Results saved to enhanced_results.json")

if __name__ == "__main__":
    main()

🎯 Starting Enhanced Audio Processing Pipeline
Audio Duration: 160.96s | Call Center: False
--- Applying Clean Audio Processing ---
--- Starting Enhanced Transcription ---




[00:00.000 --> 00:00.820]  Hello!
[00:02.060 --> 00:04.720]  Hello, my name is Axis Maxlife Insurance and I am calling you.
[00:05.840 --> 00:06.400]  Yes, tell me.
[00:06.900 --> 00:08.180]  My name is Thala Kumar.
[00:09.280 --> 00:10.740]  Yes, tell me.
[00:10.920 --> 00:13.320]  Can you tell me the policy number of Axis Maxlife Insurance?
[00:14.720 --> 00:15.420]  Yes, tell me.
[00:16.160 --> 00:21.720]  The policy number of Axis Maxlife Insurance is Rs.690,000.
[00:21.720 --> 00:23.520]  The price is Rs.6178,000.
[00:23.700 --> 00:25.740]  The due date is June 6, 2024.
[00:26.960 --> 00:27.440]  Yes, it is.
[00:27.440 --> 00:28.740]  The total amount is Rs. 3.1 lakhs.
[00:29.220 --> 00:30.760]  It used to be Rs. 6 lakhs in 2017.
[00:30.760 --> 00:31.660]  Now it is Rs. 2 lakhs.
[00:33.040 --> 00:33.700]  Yes, yes.
[00:34.080 --> 00:36.160]  When are you going to pay for this?
[00:37.660 --> 00:39.080]  I will see if I can pay for it.
[00:39.360 --> 00:40.440]  I am not sure if I 

config.yaml:   0%|          | 0.00/469 [00:00<?, ?B/s]

DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover


pytorch_model.bin:   0%|          | 0.00/5.91M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/399 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/26.6M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/221 [00:00<?, ?B/s]

It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.

  std = sequences.std(dim=-1, correction=1)


✅ Diarization completed

--- Generating Dialogue ---

🎭 FINAL DIALOGUE OUTPUT
AGENT [2.1s-5.8s]:
  Hello, my name is Axis Max Life Insurance and I am calling you.

CUSTOMER [5.8s-6.9s]:
  Yes, tell me.

AGENT [6.9s-10.9s]:
  My name is Thala Kumar. Yes, tell me.

AGENT [10.9s-14.7s]:
  Can you tell me the policy number of Axis Max Life Insurance?

CUSTOMER [14.7s-16.2s]:
  Yes, tell me.

AGENT [16.2s-21.7s]:
  The policy number of Axis Max Life Insurance is ₹.690,000.

AGENT [21.7s-27.0s]:
  The price is ₹.6178,000. The due date is June 6, 2024.

AGENT [27.0s-30.8s]:
  Yes, it is. The total amount is ₹. 3.₹1,00,000. It used to be ₹. ₹6,00,000 in 2017.

AGENT [30.8s-34.1s]:
  Now it is ₹. ₹2,00,000.

AGENT [34.1s-37.7s]:
  When are you going to pay for this?

CUSTOMER [37.7s-41.5s]:
  I will see if I can pay for it. I am not sure if I will be able to pay for it.

CUSTOMER [41.5s-46.6s]:
  I will see if I can pay for it. I am not sure if I will be able to pay for it.

CUSTOMER [46.6s-49.

In [None]:
# Configuration
INPUT_AUDIO_PATH = "call6.wav"
CLEAN_AUDIO_PATH = "cleaned_audio_call-6.wav"
HUGGING_FACE_ACCESS_TOKEN = "hf_"

In [18]:
def main():
    print("🎯 Starting Enhanced Audio Processing Pipeline")
    print("=" * 60)

    # Step 1: Audio Analysis and Preprocessing
    audio_info = analyze_audio_quality(INPUT_AUDIO_PATH)
    if not smart_audio_preprocessing(INPUT_AUDIO_PATH, CLEAN_AUDIO_PATH):
        print("❌ Audio preprocessing failed")
        return

    # Step 2: Optimized Transcription
    try:
        whisper_result = enhanced_whisper_transcription(
            CLEAN_AUDIO_PATH,
            is_callcenter=audio_info["is_callcenter"]
        )
    except Exception as e:
        print(f"❌ Transcription failed: {e}")
        return

    # Step 3: Advanced Processing
    final_output, processed_segments = process_insurance_call(
        whisper_result["segments"],
        audio_info["is_callcenter"]
    )
    whisper_result["segments"] = processed_segments

    # Step 4: Speaker Diarization
    print("\n--- Speaker Diarization ---")
    try:
        pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=HUGGING_FACE_ACCESS_TOKEN
        )
        if torch.cuda.is_available():
            pipeline.to(torch.device("cuda"))
        diarization = pipeline(CLEAN_AUDIO_PATH)
        print("✅ Diarization completed")
    except Exception as e:
        print(f"⚠️ Diarization failed: {e}")
        diarization = None

    # Step 5: Generate Dialogue
    print("\n--- Generating Dialogue ---")
    dialogue = []
    current_speaker = None
    current_text = []
    current_start = 0

    for segment in processed_segments:
        speaker = "AGENT"  # Default to AGENT if diarization fails
        if diarization:
            # Calculate speaker overlap durations
            speaker_overlaps = []
            for seg, _, speaker_label in diarization.itertracks(yield_label=True):
                overlap_start = max(segment['start'], seg.start)
                overlap_end = min(segment['end'], seg.end)
                overlap_duration = max(0, overlap_end - overlap_start)
                if overlap_duration > 0:
                    speaker_overlaps.append((speaker_label, overlap_duration))

            # Get speaker with maximum overlap
            if speaker_overlaps:
                speaker_label = max(speaker_overlaps, key=lambda x: x[1])[0]
                speaker = "AGENT" if speaker_label.endswith("0") else "CUSTOMER"

        if speaker == current_speaker and segment['start'] - current_start < 3.0:
            current_text.append(segment['text'])
        else:
            if current_speaker and current_text:
                dialogue.append({
                    'speaker': current_speaker,
                    'text': ' '.join(current_text),
                    'start': current_start,
                    'end': segment['start']
                })
            current_speaker = speaker
            current_text = [segment['text']]
            current_start = segment['start']

    if current_speaker and current_text:
        dialogue.append({
            'speaker': current_speaker,
            'text': ' '.join(current_text),
            'start': current_start,
            'end': processed_segments[-1]['end']
        })

    # Step 6: Output Results
    print("\n🎭 FINAL DIALOGUE OUTPUT")
    print("=" * 40)
    print(format_final_output(dialogue))

    output_data = {
        'metadata': {
            'audio_type': 'callcenter' if audio_info['is_callcenter'] else 'clean',
            'duration': audio_info['duration'],
            'speakers': len(set(d['speaker'] for d in dialogue)),
            'model': 'whisper-large-v3'
        },
        'dialogue': [{
            'speaker': d['speaker'],
            'text': d['text'],
            'start_time': d['start'],
            'end_time': d['end']
        } for d in dialogue]
    }

    with open('enhanced_results-call6.json', 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)
    print("\n💾 Results saved to enhanced_results-call6.json")

if __name__ == "__main__":
    main()

🎯 Starting Enhanced Audio Processing Pipeline
Audio Duration: 172.48s | Call Center: False
--- Applying Clean Audio Processing ---
--- Starting Enhanced Transcription ---




[00:01.980 --> 00:10.000]  Discussion includes policy numbers, due dates, fund values, and payment methods like Google Pay, PhonePe.
[00:19.800 --> 00:25.880]  Discussion includes policy numbers, due dates, fund values, and payment methods like Google Pay, PhonePe.
[00:25.880 --> 00:27.140]  Do you know who is Maheshwari Vinodkumari Arun?
[00:29.100 --> 00:30.960]  She is my madam.
[00:31.880 --> 00:33.580]  She has a policy in her name.
[00:33.660 --> 00:34.620]  We have called her to talk about it.
[00:36.860 --> 00:39.160]  You spoke to her once that day.
[00:39.480 --> 00:41.760]  She even told my staff about it.
[00:44.060 --> 00:46.040]  What have you updated?
[00:48.620 --> 00:49.640]  One minute.
[00:50.220 --> 00:51.600]  I will talk to my staff.
[00:51.600 --> 00:56.720]  Yes, but till last month they have said that they will be paying for it.
[01:23.000 --> 01:23.960]  Hello ma'am.
[01:23.960 --> 01:24.620]  Hello ma'am.
[01:25.400 --> 01:27.160]  Ma'am, they are saying that

  std = sequences.std(dim=-1, correction=1)


✅ Diarization completed

--- Generating Dialogue ---

🎭 FINAL DIALOGUE OUTPUT
CUSTOMER [2.0s-19.8s]:
  Discussion includes policy numbers, due dates, fund values, and payment methods like Google Pay, PhonePe.

CUSTOMER [19.8s-25.9s]:
  Discussion includes policy numbers, due dates, fund values, and payment methods like Google Pay, PhonePe.

CUSTOMER [25.9s-29.1s]:
  Do you know who is Maheshwari Vinodkumari Arun?

AGENT [29.1s-31.9s]:
  She is my madam.

CUSTOMER [31.9s-36.9s]:
  She has a policy in her name. We have called her to talk about it.

AGENT [36.9s-44.1s]:
  You spoke to her once that day. She even told my staff about it.

CUSTOMER [44.1s-48.6s]:
  What have you updated?

AGENT [48.6s-51.6s]:
  One minute. I will talk to my staff.

CUSTOMER [51.6s-83.0s]:
  Yes, but till last month they have said that they will be paying for it.

AGENT [83.0s-90.5s]:
  Hello ma'am. Hello ma'am. Ma'am, they are saying that they will pay you on Monday.

CUSTOMER [90.5s-97.3s]:
  Monday? Okay m

In [19]:
from google.colab import files
files.download("cleaned_audio_for_asr_and_diarization.wav")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>