In [1]:
!pip install --break-system-packages git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-hargemjw
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-hargemjw
  Resolved https://github.com/openai/whisper.git to commit c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_c

In [2]:
!pip install --break-system-packages pyannote.audio torchaudio

Collecting pyannote.audio
  Downloading pyannote.audio-3.3.2-py2.py3-none-any.whl.metadata (11 kB)
Collecting asteroid-filterbanks>=0.4 (from pyannote.audio)
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl.metadata (3.3 kB)
Collecting lightning>=2.0.1 (from pyannote.audio)
  Downloading lightning-2.5.2-py3-none-any.whl.metadata (38 kB)
Collecting pyannote.core>=5.0.0 (from pyannote.audio)
  Downloading pyannote.core-5.0.0-py3-none-any.whl.metadata (1.4 kB)
Collecting pyannote.database>=5.0.1 (from pyannote.audio)
  Downloading pyannote.database-5.1.3-py3-none-any.whl.metadata (1.1 kB)
Collecting pyannote.metrics>=3.2 (from pyannote.audio)
  Downloading pyannote.metrics-3.2.1-py3-none-any.whl.metadata (1.3 kB)
Collecting pyannote.pipeline>=3.0.1 (from pyannote.audio)
  Downloading pyannote.pipeline-3.0.1-py3-none-any.whl.metadata (897 bytes)
Collecting pytorch-metric-learning>=2.1.0 (from pyannote.audio)
  Downloading pytorch_metric_learning-2.8.1-py3-none-any.whl.metadata (18

In [3]:
import whisper
from pyannote.audio import Pipeline
import torch
import re
import os
import subprocess
import json

In [4]:
# Configuration
INPUT_AUDIO_PATH = "/content/001_t2.wav"
CLEAN_AUDIO_PATH = "cleaned_audio_for_asr_and_diarization.wav"
HUGGING_FACE_ACCESS_TOKEN = "hf_"

In [5]:
model = whisper.load_model("large")

100%|█████████████████████████████████████| 2.88G/2.88G [01:20<00:00, 38.3MiB/s]


In [6]:
model

Whisper(
  (encoder): AudioEncoder(
    (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
    (blocks): ModuleList(
      (0-31): 32 x ResidualAttentionBlock(
        (attn): MultiHeadAttention(
          (query): Linear(in_features=1280, out_features=1280, bias=True)
          (key): Linear(in_features=1280, out_features=1280, bias=False)
          (value): Linear(in_features=1280, out_features=1280, bias=True)
          (out): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (attn_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=1280, out_features=5120, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=5120, out_features=1280, bias=True)
        )
        (mlp_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_post): LayerNorm(

In [7]:
def get_audio_duration(audio_path):
    """Get audio duration using ffprobe"""
    try:
        cmd = ["ffprobe", "-v", "error", "-show_entries", "format=duration",
               "-of", "default=noprint_wrappers=1:nokey=1", audio_path]
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        return float(result.stdout.strip())
    except Exception as e:
        print(f"Could not get duration: {e}")
        return 0


In [8]:
def audio_preprocessing_v1(input_path, output_path):
    """Advanced audio preprocessing with better parameters"""
    print("--- Trying Advanced Audio Preprocessing ---")

    # Improved ffmpeg command - less aggressive filtering to preserve speech
    ffmpeg_command = [
        "ffmpeg", "-i", input_path,
        "-acodec", "pcm_s16le",
        "-ac", "1",  # Mono
        "-ar", "16000",  # 16kHz sample rate
        "-af", "loudnorm=I=-23:TP=-2:LRA=7,highpass=f=80,lowpass=f=8000,afftdn=nr=10",
        "-y", output_path
    ]

    try:
        result = subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
        print("Advanced preprocessing successful")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Advanced preprocessing failed: {e.returncode}")
        return False

def audio_preprocessing_v2(input_path, output_path):
    """Simplified but effective preprocessing"""
    print("--- Trying Simplified Audio Preprocessing ---")

    ffmpeg_command = [
        "ffmpeg", "-i", input_path,
        "-acodec", "pcm_s16le",
        "-ac", "1",
        "-ar", "16000",
        "-af", "loudnorm=I=-23:TP=-2,highpass=f=100",
        "-y", output_path
    ]

    try:
        result = subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
        print("Simplified preprocessing successful")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Simplified preprocessing failed: {e.returncode}")
        return False

def audio_preprocessing_v3(input_path, output_path):
    """Basic but reliable preprocessing"""
    print("--- Trying Basic Audio Preprocessing ---")

    ffmpeg_command = [
        "ffmpeg", "-i", input_path,
        "-acodec", "pcm_s16le",
        "-ac", "1",
        "-ar", "16000",
        "-af", "loudnorm",
        "-y", output_path
    ]

    try:
        result = subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
        print("Basic preprocessing successful")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Basic preprocessing failed: {e.returncode}")
        return False

def audio_preprocessing_v4(input_path, output_path):
    """Minimal processing - just format conversion"""
    print("--- Trying Minimal Audio Processing ---")

    ffmpeg_command = [
        "ffmpeg", "-i", input_path,
        "-acodec", "pcm_s16le",
        "-ac", "1",
        "-ar", "16000",
        "-y", output_path
    ]

    try:
        result = subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
        print("Minimal processing successful")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Minimal processing failed: {e.returncode}")
        return False

def smart_audio_preprocessing(input_path, output_path):
    """Try different preprocessing methods in order of preference"""
    original_duration = get_audio_duration(input_path)
    print(f"Original audio duration: {original_duration:.2f} seconds")

    methods = [
        audio_preprocessing_v1,
        audio_preprocessing_v2,
        audio_preprocessing_v3,
        audio_preprocessing_v4
    ]

    for i, method in enumerate(methods, 1):
        if method(input_path, output_path):
            if os.path.exists(output_path):
                processed_duration = get_audio_duration(output_path)
                print(f"Processed audio duration: {processed_duration:.2f} seconds")

                if abs(original_duration - processed_duration) < 1.0:
                    print(f"✅ Audio preprocessing successful with method {i}")
                    return True
                else:
                    print(f"⚠️  Duration mismatch with method {i}, trying next...")
                    continue

    print("❌ All preprocessing methods failed!")
    return False

In [10]:
def enhanced_whisper_transcription(audio_path):
    """
    Enhanced Whisper transcription with optimal anti-repetition parameters
    """
    print("--- Enhanced Whisper Transcription (Optimal Single Strategy) ---")

    initial_prompt1 = (
        "Axis Maxlife Insurance, Policy number, fund value, Due date,"
        "Sum Assured, Policy Status, Late Fee, Google Pay, GPay, PhonePe, Paytm, netbanking,"
    )

    # initial_prompt1=(
    #     "Maxlife Insurance call, Policy number 600603260, due date 30th September 2021, amount Rs 1,20,000, "
    #     "status discontinued, payment, Google Pay, PhonePe, financial problem, fund value, locking period, "
    #     "Axis Maxlife, online payment, partial withdrawal, premium, 5 years, 10 years."
    # )

    # Single optimal strategy - no need for multiple attempts
    result = model.transcribe(
        audio_path,
        # language="ta",                      # Tamil
        task="translate",                   # Translate to English
        verbose=True,                       # Keep verbose for monitoring

        # temperature=[0.0],
        # beam_size=5,
        # best_of=5,
        temperature=[0.2, 0.4],
        beam_size=1,
        best_of=1,
        logprob_threshold=-1.0,
        compression_ratio_threshold=2.4,
        no_speech_threshold=0.3,
        condition_on_previous_text=False,
        # initial_prompt=None,
        word_timestamps=False,
    )

    print("✅ Whisper transcription completed with optimal parameters")
    return result

def calculate_repetition_score(segments):
    """
    Calculate a repetition score for transcription segments
    Lower score = less repetition = better
    """
    if not segments:
        return 0.0

    total_repetition = 0
    total_words = 0

    for segment in segments:
        text = segment.get('text', '').strip().lower()
        words = text.split()

        if len(words) < 2:
            continue

        total_words += len(words)

        # Count immediate word repetitions
        for i in range(len(words) - 1):
            if words[i] == words[i + 1]:
                total_repetition += 1

        # Count phrase repetitions within segment
        for phrase_len in range(2, min(len(words)//2 + 1, 6)):
            for start in range(len(words) - phrase_len * 2 + 1):
                phrase1 = ' '.join(words[start:start + phrase_len])
                phrase2 = ' '.join(words[start + phrase_len:start + phrase_len * 2])
                if phrase1 == phrase2:
                    total_repetition += phrase_len * 2  # Heavy penalty

    return total_repetition / max(total_words, 1)

def detect_and_remove_repetitions(segments, max_repetition_ratio=0.3):
    """
    AGGRESSIVE post-processing function to detect and remove repetitive segments
    """
    print("🔍 Starting aggressive repetition detection...")
    cleaned_segments = []

    for i, segment in enumerate(segments):
        text = segment['text'].strip()
        words = text.split()

        # Skip very short segments
        if len(words) < 2:
            continue

        # AGGRESSIVE: Check for excessive word repetition
        is_repetitive = False

        # Count word frequencies
        word_counts = {}
        for word in words:
            word_lower = word.lower().strip('.,!?')
            word_counts[word_lower] = word_counts.get(word_lower, 0) + 1

        # Check if any single word dominates the segment
        max_word_count = max(word_counts.values()) if word_counts else 0
        word_dominance = max_word_count / len(words) if words else 0

        if word_dominance > 0.4:  # If any word is >40% of the segment
            print(f"🚫 Rejecting word-dominated segment: {text[:50]}... (dominance: {word_dominance:.2f})")
            continue

        # Check for immediate repetitions (same word repeated consecutively)
        consecutive_repeats = 0
        max_consecutive = 0

        for j in range(1, len(words)):
            if words[j].lower().strip('.,!?') == words[j-1].lower().strip('.,!?'):
                consecutive_repeats += 1
                max_consecutive = max(max_consecutive, consecutive_repeats + 1)
            else:
                consecutive_repeats = 0

        if max_consecutive > 3:  # More than 3 consecutive identical words
            print(f"🚫 Rejecting consecutive repeat segment: {text[:50]}... (max consecutive: {max_consecutive})")
            continue

        # Check for pattern repetitions within segment
        for phrase_len in range(2, min(len(words)//3 + 1, 8)):
            for start in range(len(words) - phrase_len * 2 + 1):
                phrase1 = ' '.join(words[start:start + phrase_len]).lower()
                phrase2 = ' '.join(words[start + phrase_len:start + phrase_len * 2]).lower()

                if phrase1 == phrase2:
                    repetition_coverage = (phrase_len * 2) / len(words)
                    if repetition_coverage > max_repetition_ratio:
                        print(f"🚫 Rejecting pattern repeat segment: {text[:50]}... (coverage: {repetition_coverage:.2f})")
                        is_repetitive = True
                        break
            if is_repetitive:
                break

        if is_repetitive:
            continue

        # Check for similarity with recent segments (avoid near-duplicates)
        is_near_duplicate = False
        for prev_segment in cleaned_segments[-5:]:  # Check last 5 segments
            prev_words = prev_segment['text'].lower().split()
            current_words = [w.lower() for w in words]

            if prev_words and current_words:
                # Calculate Jaccard similarity
                prev_set = set(prev_words)
                current_set = set(current_words)
                intersection = len(prev_set.intersection(current_set))
                union = len(prev_set.union(current_set))

                similarity = intersection / union if union > 0 else 0

                if similarity > 0.7 and abs(len(prev_words) - len(current_words)) < 5:
                    print(f"🚫 Rejecting near-duplicate: {text[:30]}... (similarity: {similarity:.2f})")
                    is_near_duplicate = True
                    break

        if is_near_duplicate:
            continue

        # If we reach here, the segment passed all checks
        cleaned_segments.append(segment)

    removed_count = len(segments) - len(cleaned_segments)
    print(f"📊 Aggressive cleaning: {len(segments)} → {len(cleaned_segments)} segments")
    print(f"🗑️  Removed {removed_count} repetitive/problematic segments")

    return cleaned_segments

In [11]:
def post_process_text(text):
    """Clean up transcribed text"""
    if not text:
        return ""

    # Remove excessive repetitions within text
    words = text.split()
    cleaned_words = []

    i = 0
    while i < len(words):
        current_word = words[i].lower()

        # Look for immediate repetitions (same word repeated 3+ times)
        repetition_count = 1
        j = i + 1
        while j < len(words) and words[j].lower() == current_word:
            repetition_count += 1
            j += 1

        # Keep only 1-2 repetitions maximum
        keep_count = min(repetition_count, 2) if repetition_count <= 3 else 1
        for _ in range(keep_count):
            cleaned_words.append(words[i])

        i = i + repetition_count

    text = ' '.join(cleaned_words)

    # Common corrections for Indian insurance context
    corrections = {
        'access max life': 'Axis Max Life',
        'axis max life': 'Axis Max Life',
        'g pay': 'GPay',
        'google pay': 'Google Pay',
        'phone pay': 'PhonePe',
        'phone pe': 'PhonePe',
        'pay tm': 'Paytm',
        'net banking': 'netbanking',
        'some assured': 'sum assured',
        'premium do': 'premium due',
        'do date': 'due date',
    }

    text_lower = text.lower()
    for wrong, correct in corrections.items():
        text_lower = text_lower.replace(wrong, correct)

    # Capitalize first letter of sentences
    text_lower = re.sub(r'(^|[.!?]\s+)([a-z])',
                       lambda m: m.group(1) + m.group(2).upper(), text_lower)

    return text_lower.strip()

In [12]:
def main():
    """Main processing pipeline with repetition prevention"""
    print("🎯 Starting Enhanced Audio Processing Pipeline (Anti-Repetition)")
    print("=" * 60)

    # Step 1: Smart Audio Preprocessing
    if not smart_audio_preprocessing(INPUT_AUDIO_PATH, CLEAN_AUDIO_PATH):
        print("❌ Audio preprocessing failed completely. Exiting.")
        return

    # model = whisper.load_model("large")

    # Step 2: Enhanced Whisper Transcription with anti-repetition
    try:
        whisper_result = enhanced_whisper_transcription(CLEAN_AUDIO_PATH)
        print("✅ Whisper transcription completed")
    except Exception as e:
        print(f"❌ Whisper transcription failed: {e}")
        return

    # Step 3: Remove repetitive segments BEFORE post-processing
    print("\n--- Removing Repetitive Segments ---")
    cleaned_segments = detect_and_remove_repetitions(whisper_result["segments"])

    # Step 4: Post-process remaining transcription
    processed_segments = []
    for segment in cleaned_segments:
        processed_text = post_process_text(segment['text'])
        if processed_text.strip() and len(processed_text.strip()) > 5:  # Only keep meaningful segments
            segment_copy = segment.copy()
            segment_copy['text'] = processed_text
            processed_segments.append(segment_copy)

    whisper_result["segments"] = processed_segments

    # Step 5: Speaker Diarization
    print("\n--- Speaker Diarization ---")
    try:
        pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=HUGGING_FACE_ACCESS_TOKEN
        )

        if torch.cuda.is_available():
            pipeline.to(torch.device("cuda"))
            print("✅ Using GPU for diarization")

        diarization = pipeline(CLEAN_AUDIO_PATH)
        print("✅ Speaker diarization completed")

    except Exception as e:
        print(f"⚠️  Speaker diarization failed: {e}")
        diarization = None

    # Step 6: Generate Enhanced Dialogue
    print("\n--- Generating Dialogue ---")

    def get_dominant_speaker(start_time, end_time, diarization_result):
        if not diarization_result:
            return "Speaker_Unknown"

        speakers = {}
        for segment, _, speaker in diarization_result.itertracks(yield_label=True):
            overlap_start = max(start_time, segment.start)
            overlap_end = min(end_time, segment.end)
            overlap_duration = max(0, overlap_end - overlap_start)

            if overlap_duration > 0:
                speakers[speaker] = speakers.get(speaker, 0) + overlap_duration

        return max(speakers, key=speakers.get) if speakers else "Speaker_Unknown"

    # Combine segments by speaker
    dialogue = []
    current_speaker = None
    current_texts = []
    current_start = 0
    current_end = 0

    for segment in processed_segments:
        start = segment['start']
        end = segment['end']
        text = segment['text'].strip()

        speaker = get_dominant_speaker(start, end, diarization)

        # Merge consecutive segments from same speaker (within 3 seconds)
        if (speaker == current_speaker and
            current_speaker and
            (start - current_end) < 3.0):
            current_texts.append(text)
            current_end = end
        else:
            # Save previous speaker's dialogue
            if current_speaker and current_texts:
                combined_text = ' '.join(current_texts)
                # Final check for repetition in combined text
                if len(combined_text.strip()) > 10:  # Only keep substantial dialogue
                    dialogue.append({
                        'speaker': current_speaker,
                        'text': combined_text,
                        'start_time': current_start,
                        'end_time': current_end
                    })

            # Start new speaker segment
            current_speaker = speaker
            current_texts = [text]
            current_start = start
            current_end = end

    # Add final segment
    if current_speaker and current_texts:
        combined_text = ' '.join(current_texts)
        if len(combined_text.strip()) > 10:
            dialogue.append({
                'speaker': current_speaker,
                'text': combined_text,
                'start_time': current_start,
                'end_time': current_end
            })

    # Step 7: Display Results
    print("\n" + "🎭 DIALOGUE OUTPUT" + "=" * 40)

    for entry in dialogue:
        timestamp = f"[{entry['start_time']:.1f}s - {entry['end_time']:.1f}s]"
        print(f"\n{entry['speaker']} {timestamp}:")
        print(f"  📝 {entry['text']}")

    # Step 8: Save Results
    output_data = {
        'metadata': {
            'total_duration': whisper_result.get('duration', 0),
            'total_speakers': len(set(d['speaker'] for d in dialogue)),
            'total_segments': len(dialogue),
            'model_used': 'whisper-large',
            'processing_successful': True,
            'anti_repetition_applied': True
        },
        'dialogue': dialogue,
        'raw_transcription': whisper_result
    }

    with open('enhanced_transcription_results.json', 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"\n💾 Results saved to: enhanced_transcription_results.json")
    print("✅ Processing completed successfully!")

if __name__ == "__main__":
    main()

🎯 Starting Enhanced Audio Processing Pipeline (Anti-Repetition)
Original audio duration: 238.30 seconds
--- Trying Advanced Audio Preprocessing ---
Advanced preprocessing successful
Processed audio duration: 238.28 seconds
✅ Audio preprocessing successful with method 1
--- Enhanced Whisper Transcription (Optimal Single Strategy) ---




Detecting language using up to the first 30 seconds. Use `--language` to specify the language
Detected language: Tamil
[00:00.000 --> 00:07.000]  Please press any key to enter the search option.
[00:07.000 --> 00:15.000]  Hello Sir, My name is Arookya Mehri. Can I talk to Mr. Subbaraj Sir?
[00:15.000 --> 00:17.000]  Yes, I am coming.
[00:17.000 --> 00:23.000]  I have received a service call from Maxlife Insurance. Can I talk to you about the policy? Are you free?
[00:23.000 --> 00:25.000]  Yes, I am free.
[00:25.000 --> 00:28.000]  Thank you. You have taken a policy from your name.
[00:28.000 --> 00:33.000]  Max Life Fast Track Super Plan. Policy number is 00603260.
[00:33.000 --> 00:37.000]  The due date is 30th September 2021.
[00:37.000 --> 00:40.000]  The amount to be paid is 1,20,000.
[00:40.000 --> 00:44.000]  The policy status is discontinued. Can you tell me why you are not paying?
[00:44.000 --> 00:47.000]  Finance is not good.
[00:47.000 --> 00:48.000]  Sorry sir.
[00:48.000 

config.yaml:   0%|          | 0.00/469 [00:00<?, ?B/s]

DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover


pytorch_model.bin:   0%|          | 0.00/5.91M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/399 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/26.6M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/221 [00:00<?, ?B/s]

  std = sequences.std(dim=-1, correction=1)


✅ Speaker diarization completed

--- Generating Dialogue ---


SPEAKER_01 [0.0s - 15.0s]:
  📝 Please press any key to enter the search option. Hello sir, my name is arookya mehri. Can i talk to mr. Subbaraj sir?

SPEAKER_00 [15.0s - 17.0s]:
  📝 Yes, i am coming.

SPEAKER_01 [17.0s - 23.0s]:
  📝 I have received a service call from maxlife insurance. Can i talk to you about the policy? Are you free?

SPEAKER_00 [23.0s - 25.0s]:
  📝 Yes, i am free.

SPEAKER_01 [25.0s - 44.0s]:
  📝 Thank you. You have taken a policy from your name. Max life fast track super plan. Policy number is 00603260. The due date is 30th september 2021. The amount to be paid is 1,20,000. The policy status is discontinued. Can you tell me why you are not paying?

SPEAKER_00 [44.0s - 47.0s]:
  📝 Finance is not good.

SPEAKER_00 [50.0s - 58.1s]:
  📝 If i ask for 1 lakh rupees, can i take this amount? How long can i take?

SPEAKER_01 [58.1s - 68.2s]:
  📝 So you have a 5 year locking period. You have a policy in 2018. You

In [13]:
# Configuration
INPUT_AUDIO_PATH = "/content/001_t1.wav"
CLEAN_AUDIO_PATH = "cleaned_audio_for_asr_and_diarization_new.wav"
HUGGING_FACE_ACCESS_TOKEN = "hf_"

In [14]:
def main():
    """Main processing pipeline with repetition prevention"""
    print("🎯 Starting Enhanced Audio Processing Pipeline (Anti-Repetition)")
    print("=" * 60)

    # Step 1: Smart Audio Preprocessing
    if not smart_audio_preprocessing(INPUT_AUDIO_PATH, CLEAN_AUDIO_PATH):
        print("❌ Audio preprocessing failed completely. Exiting.")
        return

    # model = whisper.load_model("large")

    # Step 2: Enhanced Whisper Transcription with anti-repetition
    try:
        whisper_result = enhanced_whisper_transcription(CLEAN_AUDIO_PATH)
        print("✅ Whisper transcription completed")
    except Exception as e:
        print(f"❌ Whisper transcription failed: {e}")
        return

    # Step 3: Remove repetitive segments BEFORE post-processing
    print("\n--- Removing Repetitive Segments ---")
    cleaned_segments = detect_and_remove_repetitions(whisper_result["segments"])

    # Step 4: Post-process remaining transcription
    processed_segments = []
    for segment in cleaned_segments:
        processed_text = post_process_text(segment['text'])
        if processed_text.strip() and len(processed_text.strip()) > 5:  # Only keep meaningful segments
            segment_copy = segment.copy()
            segment_copy['text'] = processed_text
            processed_segments.append(segment_copy)

    whisper_result["segments"] = processed_segments

    # Step 5: Speaker Diarization
    print("\n--- Speaker Diarization ---")
    try:
        pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=HUGGING_FACE_ACCESS_TOKEN
        )

        if torch.cuda.is_available():
            pipeline.to(torch.device("cuda"))
            print("✅ Using GPU for diarization")

        diarization = pipeline(CLEAN_AUDIO_PATH)
        print("✅ Speaker diarization completed")

    except Exception as e:
        print(f"⚠️  Speaker diarization failed: {e}")
        diarization = None

    # Step 6: Generate Enhanced Dialogue
    print("\n--- Generating Dialogue ---")

    def get_dominant_speaker(start_time, end_time, diarization_result):
        if not diarization_result:
            return "Speaker_Unknown"

        speakers = {}
        for segment, _, speaker in diarization_result.itertracks(yield_label=True):
            overlap_start = max(start_time, segment.start)
            overlap_end = min(end_time, segment.end)
            overlap_duration = max(0, overlap_end - overlap_start)

            if overlap_duration > 0:
                speakers[speaker] = speakers.get(speaker, 0) + overlap_duration

        return max(speakers, key=speakers.get) if speakers else "Speaker_Unknown"

    # Combine segments by speaker
    dialogue = []
    current_speaker = None
    current_texts = []
    current_start = 0
    current_end = 0

    for segment in processed_segments:
        start = segment['start']
        end = segment['end']
        text = segment['text'].strip()

        speaker = get_dominant_speaker(start, end, diarization)

        # Merge consecutive segments from same speaker (within 3 seconds)
        if (speaker == current_speaker and
            current_speaker and
            (start - current_end) < 3.0):
            current_texts.append(text)
            current_end = end
        else:
            # Save previous speaker's dialogue
            if current_speaker and current_texts:
                combined_text = ' '.join(current_texts)
                # Final check for repetition in combined text
                if len(combined_text.strip()) > 10:  # Only keep substantial dialogue
                    dialogue.append({
                        'speaker': current_speaker,
                        'text': combined_text,
                        'start_time': current_start,
                        'end_time': current_end
                    })

            # Start new speaker segment
            current_speaker = speaker
            current_texts = [text]
            current_start = start
            current_end = end

    # Add final segment
    if current_speaker and current_texts:
        combined_text = ' '.join(current_texts)
        if len(combined_text.strip()) > 10:
            dialogue.append({
                'speaker': current_speaker,
                'text': combined_text,
                'start_time': current_start,
                'end_time': current_end
            })

    # Step 7: Display Results
    print("\n" + "🎭 DIALOGUE OUTPUT" + "=" * 40)

    for entry in dialogue:
        timestamp = f"[{entry['start_time']:.1f}s - {entry['end_time']:.1f}s]"
        print(f"\n{entry['speaker']} {timestamp}:")
        print(f"  📝 {entry['text']}")

    # Step 8: Save Results
    output_data = {
        'metadata': {
            'total_duration': whisper_result.get('duration', 0),
            'total_speakers': len(set(d['speaker'] for d in dialogue)),
            'total_segments': len(dialogue),
            'model_used': 'whisper-large',
            'processing_successful': True,
            'anti_repetition_applied': True
        },
        'dialogue': dialogue,
        'raw_transcription': whisper_result
    }

    with open('enhanced_transcription_results-t2.json', 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"\n💾 Results saved to: enhanced_transcription_results.json")
    print("✅ Processing completed successfully!")

if __name__ == "__main__":
    main()

🎯 Starting Enhanced Audio Processing Pipeline (Anti-Repetition)
Original audio duration: 258.16 seconds
--- Trying Advanced Audio Preprocessing ---
Advanced preprocessing successful
Processed audio duration: 258.12 seconds
✅ Audio preprocessing successful with method 1
--- Enhanced Whisper Transcription (Optimal Single Strategy) ---




Detecting language using up to the first 30 seconds. Use `--language` to specify the language
Detected language: Tamil
[00:00.000 --> 00:14.000]  Hello, Hello, Hello.
[00:14.000 --> 00:15.000]  Hello.
[00:15.000 --> 00:18.000]  Hello sir, my name is Aruke. Can you speak to Mrs. Vijaya?
[00:18.000 --> 00:23.000]  Yes, she is at home. I am at home.
[00:23.000 --> 00:25.000]  What is your relation with her?
[00:25.000 --> 00:28.000]  Yes, I am her husband.
[00:28.000 --> 00:31.000]  We have called from Maxlife Insurance. This is a service call.
[00:31.000 --> 00:33.000]  Can you talk about the policy for 2 minutes?
[00:33.000 --> 00:36.000]  Sir, do you have any details about it?
[00:36.000 --> 00:41.000]  Nothing. I am applying for it for 8 days.
[00:41.000 --> 00:48.000]  Sir, you have applied for the policy for 8600 rupees in 2018.
[00:48.000 --> 00:51.000]  Now, you have not paid 3 rupees and it is pending.
[00:51.000 --> 00:54.000]  Sir, you have applied for the Maxlife Gain Premier 

  std = sequences.std(dim=-1, correction=1)


✅ Speaker diarization completed

--- Generating Dialogue ---


SPEAKER_01 [15.0s - 18.0s]:
  📝 Hello sir, my name is aruke. Can you speak to mrs. Vijaya?

SPEAKER_00 [18.0s - 23.0s]:
  📝 Yes, she is at home. I am at home.

SPEAKER_01 [23.0s - 25.0s]:
  📝 What is your relation with her?

SPEAKER_00 [25.0s - 28.0s]:
  📝 Yes, i am her husband.

SPEAKER_01 [28.0s - 36.0s]:
  📝 We have called from maxlife insurance. This is a service call. Can you talk about the policy for 2 minutes? Sir, do you have any details about it?

SPEAKER_00 [36.0s - 41.0s]:
  📝 Nothing. I am applying for it for 8 days.

SPEAKER_01 [41.0s - 78.0s]:
  📝 Sir, you have applied for the policy for 8600 rupees in 2018. Now, you have not paid 3 rupees and it is pending. Sir, you have applied for the maxlife gain premier plan. The policy number is 31, 5, 53, 11. The due date is 23rd november 2020. The amount is 29,162.36 paise. You have taken a lot of time for the due date. The policy status is paid. The day by day you are