In [1]:
!pip install --break-system-packages git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-f22m3p_p
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-f22m3p_p
  Resolved https://github.com/openai/whisper.git to commit c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_c

In [2]:
!pip install --break-system-packages pyannote.audio torchaudio

Collecting pyannote.audio
  Downloading pyannote.audio-3.3.2-py2.py3-none-any.whl.metadata (11 kB)
Collecting asteroid-filterbanks>=0.4 (from pyannote.audio)
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl.metadata (3.3 kB)
Collecting lightning>=2.0.1 (from pyannote.audio)
  Downloading lightning-2.5.2-py3-none-any.whl.metadata (38 kB)
Collecting pyannote.core>=5.0.0 (from pyannote.audio)
  Downloading pyannote.core-5.0.0-py3-none-any.whl.metadata (1.4 kB)
Collecting pyannote.database>=5.0.1 (from pyannote.audio)
  Downloading pyannote.database-5.1.3-py3-none-any.whl.metadata (1.1 kB)
Collecting pyannote.metrics>=3.2 (from pyannote.audio)
  Downloading pyannote.metrics-3.2.1-py3-none-any.whl.metadata (1.3 kB)
Collecting pyannote.pipeline>=3.0.1 (from pyannote.audio)
  Downloading pyannote.pipeline-3.0.1-py3-none-any.whl.metadata (897 bytes)
Collecting pytorch-metric-learning>=2.1.0 (from pyannote.audio)
  Downloading pytorch_metric_learning-2.8.1-py3-none-any.whl.metadata (18

In [3]:
import whisper
from pyannote.audio import Pipeline
import torch
import re
import os
import subprocess
from glob import glob
from pathlib import Path
import json

In [None]:
# Configuration
# INPUT_AUDIO_PATH = "call6.wav"
# CLEAN_AUDIO_PATH = "cleaned_audio_for_asr_and_diarization.wav"
HUGGING_FACE_ACCESS_TOKEN = "hf_"

In [5]:
model = whisper.load_model("large")

100%|█████████████████████████████████████| 2.88G/2.88G [00:33<00:00, 91.0MiB/s]


In [6]:
model

Whisper(
  (encoder): AudioEncoder(
    (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
    (blocks): ModuleList(
      (0-31): 32 x ResidualAttentionBlock(
        (attn): MultiHeadAttention(
          (query): Linear(in_features=1280, out_features=1280, bias=True)
          (key): Linear(in_features=1280, out_features=1280, bias=False)
          (value): Linear(in_features=1280, out_features=1280, bias=True)
          (out): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (attn_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=1280, out_features=5120, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=5120, out_features=1280, bias=True)
        )
        (mlp_ln): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_post): LayerNorm(

In [7]:
def get_audio_duration(audio_path):
    """Get audio duration using ffprobe"""
    try:
        cmd = ["ffprobe", "-v", "error", "-show_entries", "format=duration",
               "-of", "default=noprint_wrappers=1:nokey=1", audio_path]
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        return float(result.stdout.strip())
    except Exception as e:
        print(f"Could not get duration: {e}")
        return 0


In [8]:
def audio_preprocessing_v1(input_path, output_path):
    """Advanced audio preprocessing with better parameters"""
    print("--- Trying Advanced Audio Preprocessing ---")

    # Improved ffmpeg command - less aggressive filtering to preserve speech
    ffmpeg_command = [
        "ffmpeg", "-i", input_path,
        "-acodec", "pcm_s16le",
        "-ac", "1",  # Mono
        "-ar", "16000",  # 16kHz sample rate
        "-af", "loudnorm=I=-23:TP=-2:LRA=7,highpass=f=80,lowpass=f=8000,afftdn=nr=10",
        "-y", output_path
    ]

    try:
        result = subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
        print("Advanced preprocessing successful")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Advanced preprocessing failed: {e.returncode}")
        return False

def audio_preprocessing_v2(input_path, output_path):
    """Simplified but effective preprocessing"""
    print("--- Trying Simplified Audio Preprocessing ---")

    ffmpeg_command = [
        "ffmpeg", "-i", input_path,
        "-acodec", "pcm_s16le",
        "-ac", "1",
        "-ar", "16000",
        "-af", "loudnorm=I=-23:TP=-2,highpass=f=100",
        "-y", output_path
    ]

    try:
        result = subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
        print("Simplified preprocessing successful")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Simplified preprocessing failed: {e.returncode}")
        return False

def audio_preprocessing_v3(input_path, output_path):
    """Basic but reliable preprocessing"""
    print("--- Trying Basic Audio Preprocessing ---")

    ffmpeg_command = [
        "ffmpeg", "-i", input_path,
        "-acodec", "pcm_s16le",
        "-ac", "1",
        "-ar", "16000",
        "-af", "loudnorm",
        "-y", output_path
    ]

    try:
        result = subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
        print("Basic preprocessing successful")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Basic preprocessing failed: {e.returncode}")
        return False

def audio_preprocessing_v4(input_path, output_path):
    """Minimal processing - just format conversion"""
    print("--- Trying Minimal Audio Processing ---")

    ffmpeg_command = [
        "ffmpeg", "-i", input_path,
        "-acodec", "pcm_s16le",
        "-ac", "1",
        "-ar", "16000",
        "-y", output_path
    ]

    try:
        result = subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
        print("Minimal processing successful")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Minimal processing failed: {e.returncode}")
        return False

def smart_audio_preprocessing(input_path, output_path):
    """Try different preprocessing methods in order of preference"""
    original_duration = get_audio_duration(input_path)
    print(f"Original audio duration: {original_duration:.2f} seconds")

    methods = [
        audio_preprocessing_v1,
        audio_preprocessing_v2,
        audio_preprocessing_v3,
        audio_preprocessing_v4
    ]

    for i, method in enumerate(methods, 1):
        if method(input_path, output_path):
            if os.path.exists(output_path):
                processed_duration = get_audio_duration(output_path)
                print(f"Processed audio duration: {processed_duration:.2f} seconds")

                if abs(original_duration - processed_duration) < 1.0:
                    print(f"✅ Audio preprocessing successful with method {i}")
                    return True
                else:
                    print(f"⚠️  Duration mismatch with method {i}, trying next...")
                    continue

    print("❌ All preprocessing methods failed!")
    return False

In [9]:
def post_process_text(text):
    """Clean up transcribed text from Whisper output for call center insurance context."""
    if not text:
        return ""

    # === 1. Remove excessive immediate repetitions ===
    words = text.split()
    cleaned_words = []
    i = 0
    while i < len(words):
        current_word = words[i].lower()
        repetition_count = 1
        j = i + 1
        while j < len(words) and words[j].lower() == current_word:
            repetition_count += 1
            j += 1

        keep_count = min(repetition_count, 2) if repetition_count <= 3 else 1
        for _ in range(keep_count):
            cleaned_words.append(words[i])
        i += repetition_count

    text = ' '.join(cleaned_words)

    # === 2. Remove filler sounds (non-verbal, repetitive) ===
    filler_sounds = ["uh", "um", "mm", "hmm", "ah", "oh", "huh", "ha ha"]
    soft_fillers = ["okay okay", "yes yes", "yes yes yes", "i mean", "you know", "like like", "ok ok"]

    for filler in filler_sounds + soft_fillers:
        text = re.sub(rf'\b{re.escape(filler)}\b', '', text, flags=re.IGNORECASE)

    # === 3. Insurance domain term normalization ===
    corrections = {
        'access max life': 'Axis Max Life',
        'axis max life': 'Axis Max Life',
        'g pay': 'GPay',
        'google pay': 'Google Pay',
        'phone pay': 'PhonePe',
        'phone pe': 'PhonePe',
        'pay tm': 'Paytm',
        'net banking': 'netbanking',
        'some assured': 'sum assured',
        'premium do': 'premium due',
        'do date': 'due date',
        'okay sir': 'Okay sir',
    }

    text_lower = text.lower()
    for wrong, correct in corrections.items():
        text_lower = text_lower.replace(wrong, correct)

    # === 3.5 Replace 'Rs.', 'Rs' → '₹' with optional space cleanup ===
    text_lower = re.sub(r'\brs[.]?\s*', '₹', text_lower)

    # === 4. Punctuation cleanup ===
    text_lower = re.sub(r'\s{2,}', ' ', text_lower)          # Extra spaces
    text_lower = re.sub(r'[,]{2,}', ',', text_lower)         # Repeated commas
    text_lower = re.sub(r'\s+,', ',', text_lower)            # Space before comma
    text_lower = re.sub(r'\s+\.', '.', text_lower)           # Space before period
    text_lower = re.sub(r'\s+[!?]', lambda m: m.group(0).strip(), text_lower)

    # === 5. Capitalize sentences ===
    text_lower = re.sub(r'(^|[.!?]\s+)([a-z])',
                        lambda m: m.group(1) + m.group(2).upper(),
                        text_lower)

    return text_lower.strip()

In [10]:
def enhanced_whisper_transcription(audio_path):
    """
    Enhanced Whisper transcription with optimal anti-repetition parameters
    """
    print("--- Enhanced Whisper Transcription (Optimal Single Strategy) ---")

    # prompt = (
    #     "Axis Maxlife Insurance, Policy number, fund value, Due date,"
    #     "Sum Assured, Policy Status, Late Fee, Google Pay, GPay, PhonePe, Paytm, netbanking,"
    # )

    prompt = (
        "This is a customer support call for Axis Maxlife Insurance. "
        "We will discuss policy numbers, due date, fund value, sum assured, late fee, "
        "and payment methods such as Google Pay, PhonePe, Paytm and net banking."
    )

    # Single optimal strategy - no need for multiple attempts
    result = model.transcribe(
        audio_path,
        language="ta",
        task="translate",
        temperature=0.0,
        beam_size=5,
        patience=1.2,
        condition_on_previous_text=False,
        no_speech_threshold=0.8,
        compression_ratio_threshold=2.0,
        logprob_threshold=-0.35,
        word_timestamps=False,
        initial_prompt=prompt,
        verbose=True,
    )

    print("✅ Whisper transcription completed with optimal parameters")
    return result


def calculate_repetition_score(segments):
    """
    Calculate a repetition score for transcription segments
    Lower score = less repetition = better
    """
    if not segments:
        return 0.0

    total_repetition = 0
    total_words = 0

    for segment in segments:
        text = segment.get('text', '').strip().lower()
        words = text.split()

        if len(words) < 2:
            continue

        total_words += len(words)

        # Count immediate word repetitions
        for i in range(len(words) - 1):
            if words[i] == words[i + 1]:
                total_repetition += 1

        # Count phrase repetitions within segment
        for phrase_len in range(2, min(len(words)//2 + 1, 6)):
            for start in range(len(words) - phrase_len * 2 + 1):
                phrase1 = ' '.join(words[start:start + phrase_len])
                phrase2 = ' '.join(words[start + phrase_len:start + phrase_len * 2])
                if phrase1 == phrase2:
                    total_repetition += phrase_len * 2  # Heavy penalty

    return total_repetition / max(total_words, 1)

def detect_and_remove_repetitions(segments, max_repetition_ratio=0.3):
    """
    AGGRESSIVE post-processing function to detect and remove repetitive segments
    """
    print("🔍 Starting aggressive repetition detection...")
    cleaned_segments = []

    for i, segment in enumerate(segments):
        text = segment['text'].strip()
        words = text.split()

        # Skip very short segments
        if len(words) < 2:
            continue

        # AGGRESSIVE: Check for excessive word repetition
        is_repetitive = False

        # Count word frequencies
        word_counts = {}
        for word in words:
            word_lower = word.lower().strip('.,!?')
            word_counts[word_lower] = word_counts.get(word_lower, 0) + 1

        # Check if any single word dominates the segment
        max_word_count = max(word_counts.values()) if word_counts else 0
        word_dominance = max_word_count / len(words) if words else 0

        if word_dominance > 0.4:  # If any word is >40% of the segment
            print(f"🚫 Rejecting word-dominated segment: {text[:50]}... (dominance: {word_dominance:.2f})")
            continue

        # Check for immediate repetitions (same word repeated consecutively)
        consecutive_repeats = 0
        max_consecutive = 0

        for j in range(1, len(words)):
            if words[j].lower().strip('.,!?') == words[j-1].lower().strip('.,!?'):
                consecutive_repeats += 1
                max_consecutive = max(max_consecutive, consecutive_repeats + 1)
            else:
                consecutive_repeats = 0

        if max_consecutive > 3:  # More than 3 consecutive identical words
            print(f"🚫 Rejecting consecutive repeat segment: {text[:50]}... (max consecutive: {max_consecutive})")
            continue

        # Check for pattern repetitions within segment
        for phrase_len in range(2, min(len(words)//3 + 1, 8)):
            for start in range(len(words) - phrase_len * 2 + 1):
                phrase1 = ' '.join(words[start:start + phrase_len]).lower()
                phrase2 = ' '.join(words[start + phrase_len:start + phrase_len * 2]).lower()

                if phrase1 == phrase2:
                    repetition_coverage = (phrase_len * 2) / len(words)
                    if repetition_coverage > max_repetition_ratio:
                        print(f"🚫 Rejecting pattern repeat segment: {text[:50]}... (coverage: {repetition_coverage:.2f})")
                        is_repetitive = True
                        break
            if is_repetitive:
                break

        if is_repetitive:
            continue

        # Check for similarity with recent segments (avoid near-duplicates)
        is_near_duplicate = False
        for prev_segment in cleaned_segments[-5:]:  # Check last 5 segments
            prev_words = prev_segment['text'].lower().split()
            current_words = [w.lower() for w in words]

            if prev_words and current_words:
                # Calculate Jaccard similarity
                prev_set = set(prev_words)
                current_set = set(current_words)
                intersection = len(prev_set.intersection(current_set))
                union = len(prev_set.union(current_set))

                similarity = intersection / union if union > 0 else 0

                if similarity > 0.7 and abs(len(prev_words) - len(current_words)) < 5:
                    print(f"🚫 Rejecting near-duplicate: {text[:30]}... (similarity: {similarity:.2f})")
                    is_near_duplicate = True
                    break

        if is_near_duplicate:
            continue

        # If we reach here, the segment passed all checks
        cleaned_segments.append(segment)

    removed_count = len(segments) - len(cleaned_segments)
    print(f"📊 Aggressive cleaning: {len(segments)} → {len(cleaned_segments)} segments")
    print(f"🗑️  Removed {removed_count} repetitive/problematic segments")

    return cleaned_segments

In [11]:
# Directory paths
INPUT_DIR = Path("training_data")
OUTPUT_DIR = Path("processed_output")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Manifest for Whisper fine-tuning
manifest_path = OUTPUT_DIR / "training_manifest.jsonl"
manifest_entries = []

def process_single_file(audio_file_path):
    print(f"\n📁 Processing file: {audio_file_path.name}")
    clean_audio_path = OUTPUT_DIR / f"{audio_file_path.stem}_clean.wav"
    json_output_path = OUTPUT_DIR / f"{audio_file_path.stem}_transcription.json"

    if not smart_audio_preprocessing(str(audio_file_path), str(clean_audio_path)):
        print("❌ Preprocessing failed, skipping file.")
        return

    try:
        whisper_result = enhanced_whisper_transcription(str(clean_audio_path))
    except Exception as e:
        print(f"❌ Whisper transcription failed: {e}")
        return

    cleaned_segments = detect_and_remove_repetitions(whisper_result["segments"])

    processed_segments = []
    for segment in cleaned_segments:
        cleaned = post_process_text(segment['text'])
        if cleaned.strip() and len(cleaned.strip()) > 5:
            new_segment = segment.copy()
            new_segment['text'] = cleaned
            processed_segments.append(new_segment)

    whisper_result["segments"] = processed_segments

    print("🔊 Performing Speaker Diarization...")
    try:
        pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=HUGGING_FACE_ACCESS_TOKEN
        )
        if torch.cuda.is_available():
            pipeline.to(torch.device("cuda"))
            print("✅ Using GPU")
        diarization = pipeline(str(clean_audio_path))
    except Exception as e:
        print(f"⚠️ Diarization failed: {e}")
        diarization = None

    def get_dominant_speaker(start, end, diarization_result):
        if not diarization_result:
            return "Speaker_Unknown"
        speakers = {}
        for segment, _, speaker in diarization_result.itertracks(yield_label=True):
            overlap = max(0, min(end, segment.end) - max(start, segment.start))
            if overlap > 0:
                speakers[speaker] = speakers.get(speaker, 0) + overlap
        return max(speakers, key=speakers.get) if speakers else "Speaker_Unknown"

    dialogue = []
    current_speaker, current_texts, current_start, current_end = None, [], 0, 0
    for seg in processed_segments:
        start, end, text = seg['start'], seg['end'], seg['text'].strip()
        speaker = get_dominant_speaker(start, end, diarization)
        if (speaker == current_speaker and current_speaker and (start - current_end) < 3.0):
            current_texts.append(text)
            current_end = end
        else:
            if current_speaker and current_texts:
                combined = ' '.join(current_texts)
                if len(combined.strip()) > 10:
                    dialogue.append({
                        'speaker': current_speaker,
                        'text': combined,
                        'start_time': current_start,
                        'end_time': current_end
                    })
            current_speaker, current_texts, current_start, current_end = speaker, [text], start, end

    if current_speaker and current_texts:
        combined = ' '.join(current_texts)
        if len(combined.strip()) > 10:
            dialogue.append({
                'speaker': current_speaker,
                'text': combined,
                'start_time': current_start,
                'end_time': current_end
            })

    # Save JSON per file
    output_data = {
        'metadata': {
            'audio_file': str(audio_file_path.name),
            'total_duration': whisper_result.get('duration', 0),
            'total_speakers': len(set(d['speaker'] for d in dialogue)),
            'total_segments': len(dialogue),
            'model_used': 'whisper-large',
            'processing_successful': True,
            'anti_repetition_applied': True
        },
        'dialogue': dialogue,
        'raw_transcription': whisper_result
    }

    with open(json_output_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)
    print(f"✅ Output saved: {json_output_path.name}")

    # Generate final conversation string
    full_text = "\n".join([f"{d['speaker']}: {d['text']}" for d in dialogue])

    # Add to manifest entry
    manifest_entries.append({
        "audio_filepath": str(clean_audio_path),
        "text": full_text,
        "language": "ta",
        "task": "translate"
    })

def main():
    audio_files = list(INPUT_DIR.glob("*.wav"))
    if not audio_files:
        print("❌ No .wav files found in 'training_data/' folder.")
        return

    print(f"🚀 Found {len(audio_files)} files to process...")
    for audio_file in audio_files:
        process_single_file(audio_file)

    # Save consolidated JSONL manifest
    with open(manifest_path, 'w', encoding='utf-8') as f:
        for entry in manifest_entries:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    print(f"\n📄 Training manifest saved to: {manifest_path}")
    print("✅ All files processed.")

if __name__ == "__main__":
    main()

🚀 Found 5 files to process...

📁 Processing file: call5.wav
Original audio duration: 169.70 seconds
--- Trying Advanced Audio Preprocessing ---
Advanced preprocessing successful
Processed audio duration: 169.68 seconds
✅ Audio preprocessing successful with method 1
--- Enhanced Whisper Transcription (Optimal Single Strategy) ---
[00:00.000 --> 00:04.000]  Hello, Good morning ma'am.
[00:04.000 --> 00:05.000]  Yes ma'am.
[00:05.000 --> 00:11.000]  I am calling from Axis Maxlife. I am from Sir.
[00:11.000 --> 00:12.000]  Okay.
[00:12.000 --> 00:13.000]  Hello.
[00:13.000 --> 00:19.000]  Actually, I have received the payment yesterday ma'am. 12,574.65.
[00:19.000 --> 00:20.000]  Okay.
[00:20.000 --> 00:29.000]  Okay. The health declaration form is pending. Did you submit it online ma'am?
[00:29.000 --> 00:35.000]  No, I haven't done the health declaration yet. I have already paid for it.
[00:35.000 --> 00:38.000]  Okay, so the form hasn't been opened yet?
[00:38.000 --> 00:39.000]  Not yet

config.yaml:   0%|          | 0.00/469 [00:00<?, ?B/s]

DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover


pytorch_model.bin:   0%|          | 0.00/5.91M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/399 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/26.6M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/221 [00:00<?, ?B/s]

✅ Using GPU


It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.

  std = sequences.std(dim=-1, correction=1)


✅ Output saved: call5_transcription.json

📁 Processing file: call2.wav
Original audio duration: 190.76 seconds
--- Trying Advanced Audio Preprocessing ---
Advanced preprocessing successful
Processed audio duration: 190.74 seconds
✅ Audio preprocessing successful with method 1
--- Enhanced Whisper Transcription (Optimal Single Strategy) ---
[00:00.000 --> 00:10.000]  Hello! Greetings! My name is Aathi. We are calling from Licensure. This is an email call. You have taken a policy from Axis Maxlife Insurance. Can you speak for 2 minutes?
[00:10.000 --> 00:18.000]  Yes, Madam. Quick call. I can't get it. If we cancel the policy, will it be refunded?
[00:18.000 --> 00:23.000]  Okay. I will inform you about the details of your policy in a short time.
[00:23.000 --> 00:27.000]  Okay.
[00:27.000 --> 00:29.000]  Okay. Can you speak in Himalayan?
[00:29.000 --> 00:31.000]  Yes.
[00:31.000 --> 00:33.000]  Okay. Can you tell me the reason why you did not commit the crime?
[00:33.000 --> 00:35.000]

  std = sequences.std(dim=-1, correction=1)


✅ Output saved: call2_transcription.json

📁 Processing file: call4.wav
Original audio duration: 160.96 seconds
--- Trying Advanced Audio Preprocessing ---
Advanced preprocessing successful
Processed audio duration: 160.93 seconds
✅ Audio preprocessing successful with method 1
--- Enhanced Whisper Transcription (Optimal Single Strategy) ---
[00:00.000 --> 00:02.000]  Hello!
[00:02.000 --> 00:06.000]  Hello! My name is Axis Maxlife Insurance.
[00:06.000 --> 00:08.000]  Hello!
[00:08.000 --> 00:10.000]  My name is Salwa Kumar.
[00:10.000 --> 00:12.000]  Hello!
[00:12.000 --> 00:14.000]  I would like to ask you about the policy for Axis Maxlife Insurance.
[00:14.000 --> 00:16.000]  Hello!
[00:16.000 --> 00:18.000]  The policy for Axis Maxlife Insurance is,
[00:18.000 --> 00:20.000]  The policy for Axis Maxlife Insurance is,
[00:20.000 --> 00:22.000]  The policy for Axis Maxlife Insurance is,
[00:22.000 --> 00:24.000]  The policy for Axis Maxlife Insurance is,
[00:24.000 --> 00:27.000]  It 

  std = sequences.std(dim=-1, correction=1)


✅ Output saved: call4_transcription.json

📁 Processing file: call1.wav
Original audio duration: 106.92 seconds
--- Trying Advanced Audio Preprocessing ---
Advanced preprocessing successful
Processed audio duration: 106.89 seconds
✅ Audio preprocessing successful with method 1
--- Enhanced Whisper Transcription (Optimal Single Strategy) ---
[00:00.000 --> 00:09.000]  Hello sir, I am Jai Prakash from Axis Maxlife. I am calling from Govindhara.
[00:09.000 --> 00:14.000]  We have a renewal call for policy DVR. Shall we talk about the policy details?
[00:44.000 --> 00:46.000]  It has been paid up.
[00:46.000 --> 00:52.000]  Rs.1,14,713.47
[00:54.000 --> 00:56.000]  The policy has been paid up.
[00:56.000 --> 00:57.000]  It is not active.
[00:57.000 --> 00:58.000]  It is not safe.
[00:58.000 --> 00:59.000]  It is not safe.
[00:59.000 --> 01:00.000]  It is not safe.
[01:00.000 --> 01:01.000]  It is not safe.
[01:01.000 --> 01:02.000]  It is not safe.
[01:02.000 --> 01:03.000]  It is not safe.

  std = sequences.std(dim=-1, correction=1)


✅ Output saved: call1_transcription.json

📁 Processing file: call3.wav
Original audio duration: 140.72 seconds
--- Trying Advanced Audio Preprocessing ---
Advanced preprocessing successful
Processed audio duration: 140.69 seconds
✅ Audio preprocessing successful with method 1
--- Enhanced Whisper Transcription (Optimal Single Strategy) ---
[00:00.000 --> 00:02.000]  Hello!
[00:02.000 --> 00:05.000]  Hello! I am Chandra from Axis Maxlife Insurance.
[00:05.000 --> 00:07.000]  What is this?
[00:07.000 --> 00:09.000]  Axis Maxlife Insurance.
[00:09.000 --> 00:11.000]  Yes, tell me.
[00:11.000 --> 00:15.000]  Are you free to talk about Axis Maxlife Insurance policy for a minute?
[00:15.000 --> 00:18.000]  Yes, I will.
[00:18.000 --> 00:21.000]  One minute. I will set the date.
[00:21.000 --> 00:25.000]  Russell B. Madan is speaking.
[00:25.000 --> 00:27.000]  Yes.
[00:27.000 --> 00:41.000]  The policy number is 142115666 and the total amount is Rs.3,522,10,898.
[00:41.000 --> 00:44.000]  Yo

  std = sequences.std(dim=-1, correction=1)


✅ Output saved: call3_transcription.json

📄 Training manifest saved to: processed_output/training_manifest.jsonl
✅ All files processed.


In [15]:
import shutil

# Zip the folder into processed_outputs.zip
shutil.make_archive('processed_output1-5', 'zip', 'processed_output')

'/content/processed_output1-5.zip'

In [16]:
from google.colab import files

# Download the zipped folder
files.download('processed_output1-5.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [17]:
# Directory paths
INPUT_DIR = Path("training_data_1")
OUTPUT_DIR = Path("processed_output_1")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Manifest for Whisper fine-tuning
manifest_path = OUTPUT_DIR / "training_manifest.jsonl"
manifest_entries = []

def process_single_file(audio_file_path):
    print(f"\n📁 Processing file: {audio_file_path.name}")
    clean_audio_path = OUTPUT_DIR / f"{audio_file_path.stem}_clean.wav"
    json_output_path = OUTPUT_DIR / f"{audio_file_path.stem}_transcription.json"

    if not smart_audio_preprocessing(str(audio_file_path), str(clean_audio_path)):
        print("❌ Preprocessing failed, skipping file.")
        return

    try:
        whisper_result = enhanced_whisper_transcription(str(clean_audio_path))
    except Exception as e:
        print(f"❌ Whisper transcription failed: {e}")
        return

    cleaned_segments = detect_and_remove_repetitions(whisper_result["segments"])

    processed_segments = []
    for segment in cleaned_segments:
        cleaned = post_process_text(segment['text'])
        if cleaned.strip() and len(cleaned.strip()) > 5:
            new_segment = segment.copy()
            new_segment['text'] = cleaned
            processed_segments.append(new_segment)

    whisper_result["segments"] = processed_segments

    print("🔊 Performing Speaker Diarization...")
    try:
        pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=HUGGING_FACE_ACCESS_TOKEN
        )
        if torch.cuda.is_available():
            pipeline.to(torch.device("cuda"))
            print("✅ Using GPU")
        diarization = pipeline(str(clean_audio_path))
    except Exception as e:
        print(f"⚠️ Diarization failed: {e}")
        diarization = None

    def get_dominant_speaker(start, end, diarization_result):
        if not diarization_result:
            return "Speaker_Unknown"
        speakers = {}
        for segment, _, speaker in diarization_result.itertracks(yield_label=True):
            overlap = max(0, min(end, segment.end) - max(start, segment.start))
            if overlap > 0:
                speakers[speaker] = speakers.get(speaker, 0) + overlap
        return max(speakers, key=speakers.get) if speakers else "Speaker_Unknown"

    dialogue = []
    current_speaker, current_texts, current_start, current_end = None, [], 0, 0
    for seg in processed_segments:
        start, end, text = seg['start'], seg['end'], seg['text'].strip()
        speaker = get_dominant_speaker(start, end, diarization)
        if (speaker == current_speaker and current_speaker and (start - current_end) < 3.0):
            current_texts.append(text)
            current_end = end
        else:
            if current_speaker and current_texts:
                combined = ' '.join(current_texts)
                if len(combined.strip()) > 10:
                    dialogue.append({
                        'speaker': current_speaker,
                        'text': combined,
                        'start_time': current_start,
                        'end_time': current_end
                    })
            current_speaker, current_texts, current_start, current_end = speaker, [text], start, end

    if current_speaker and current_texts:
        combined = ' '.join(current_texts)
        if len(combined.strip()) > 10:
            dialogue.append({
                'speaker': current_speaker,
                'text': combined,
                'start_time': current_start,
                'end_time': current_end
            })

    # Save JSON per file
    output_data = {
        'metadata': {
            'audio_file': str(audio_file_path.name),
            'total_duration': whisper_result.get('duration', 0),
            'total_speakers': len(set(d['speaker'] for d in dialogue)),
            'total_segments': len(dialogue),
            'model_used': 'whisper-large',
            'processing_successful': True,
            'anti_repetition_applied': True
        },
        'dialogue': dialogue,
        'raw_transcription': whisper_result
    }

    with open(json_output_path, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)
    print(f"✅ Output saved: {json_output_path.name}")

    # Generate final conversation string
    full_text = "\n".join([f"{d['speaker']}: {d['text']}" for d in dialogue])

    # Add to manifest entry
    manifest_entries.append({
        "audio_filepath": str(clean_audio_path),
        "text": full_text,
        "language": "ta",
        "task": "translate"
    })

def main():
    audio_files = list(INPUT_DIR.glob("*.wav"))
    if not audio_files:
        print("❌ No .wav files found in 'training_data/' folder.")
        return

    print(f"🚀 Found {len(audio_files)} files to process...")
    for audio_file in audio_files:
        process_single_file(audio_file)

    # Save consolidated JSONL manifest
    with open(manifest_path, 'w', encoding='utf-8') as f:
        for entry in manifest_entries:
            f.write(json.dumps(entry, ensure_ascii=False) + "\n")

    print(f"\n📄 Training manifest saved to: {manifest_path}")
    print("✅ All files processed.")

if __name__ == "__main__":
    main()

🚀 Found 5 files to process...

📁 Processing file: call6.wav
Original audio duration: 172.48 seconds
--- Trying Advanced Audio Preprocessing ---
Advanced preprocessing successful
Processed audio duration: 172.45 seconds
✅ Audio preprocessing successful with method 1
--- Enhanced Whisper Transcription (Optimal Single Strategy) ---
[00:00.000 --> 00:06.000]  Good afternoon ma'am. I am calling from Axis Maxlife for renewal.
[00:06.000 --> 00:09.000]  I am calling for renewal.
[00:09.000 --> 00:12.000]  Hello.
[00:12.000 --> 00:15.000]  Hello.
[00:15.000 --> 00:18.000]  Hello.
[00:18.000 --> 00:20.000]  Hello.
[00:20.000 --> 00:22.000]  Hello ma'am. Good afternoon.
[00:22.000 --> 00:23.000]  Yes, tell me ma'am.
[00:23.000 --> 00:28.000]  I am calling from Axis Maxlife. Do you know who is Ms. Maheshwari Vinodkumar?
[00:28.000 --> 00:32.000]  It's my madam.
[00:32.000 --> 00:36.000]  There is a policy in her name. We have called her to talk about it.
[00:36.000 --> 00:43.000]  You spoke to he

  std = sequences.std(dim=-1, correction=1)


✅ Output saved: call6_transcription.json

📁 Processing file: call9.wav
Original audio duration: 326.84 seconds
--- Trying Advanced Audio Preprocessing ---
Advanced preprocessing successful
Processed audio duration: 326.81 seconds
✅ Audio preprocessing successful with method 1
--- Enhanced Whisper Transcription (Optimal Single Strategy) ---
[00:00.000 --> 00:07.000]  Hello. Good morning sir. My name is Anand. I am a customer support for Axis Maxlife Insurance.
[00:07.000 --> 00:09.000]  I am from Maths Science Institute.
[00:09.000 --> 00:11.000]  Hello.
[00:11.000 --> 00:13.000]  Yes.
[00:13.000 --> 00:15.000]  This is Mr. Kannan Thyagaraj.
[00:15.000 --> 00:17.000]  Yes Madam.
[00:17.000 --> 00:19.000]  Sir, do you have a policy?
[00:19.000 --> 00:21.000]  I have called for policy renewal.
[00:21.000 --> 00:23.000]  Can I talk to you for 2 minutes?
[00:23.000 --> 00:25.000]  What is the procedure to close the policy?
[00:27.000 --> 00:29.000]  Actually, if you want to close it,
[00:29

  std = sequences.std(dim=-1, correction=1)


✅ Output saved: call9_transcription.json

📁 Processing file: call10.wav
Original audio duration: 439.42 seconds
--- Trying Advanced Audio Preprocessing ---
Advanced preprocessing successful
Processed audio duration: 439.39 seconds
✅ Audio preprocessing successful with method 1
--- Enhanced Whisper Transcription (Optimal Single Strategy) ---
[00:00.000 --> 00:07.000]  Good afternoon, sir. I am K. R. Singh. We are talking about Axis Maxlife Insurance.
[00:07.000 --> 00:09.000]  Yes, tell me.
[00:09.000 --> 00:11.000]  Do you know Ms. Mala's name?
[00:11.000 --> 00:14.000]  I am the one who took the policy.
[00:14.000 --> 00:20.000]  Okay, okay, sir. Actually, we have a policy in our name. We have called the policy for anyone.
[00:20.000 --> 00:23.000]  We will talk to you for 2 minutes, sir.
[00:23.000 --> 00:24.000]  Yes, tell me.
[00:24.000 --> 00:25.000]  Thank you, sir.
[00:25.000 --> 00:35.000]  Policy number is 333-744-8419. Plan name is Life Co-Fair Partner Super Plan. You have st

  std = sequences.std(dim=-1, correction=1)


✅ Output saved: call10_transcription.json

📁 Processing file: call7.wav
Original audio duration: 263.77 seconds
--- Trying Advanced Audio Preprocessing ---
Advanced preprocessing successful
Processed audio duration: 263.74 seconds
✅ Audio preprocessing successful with method 1
--- Enhanced Whisper Transcription (Optimal Single Strategy) ---
[00:00.000 --> 00:07.000]  Good afternoon, sir. I am Nithya Goswami. I am a customer support call for Axis Maxlife Insurance.
[00:07.000 --> 00:09.000]  Hello.
[00:09.000 --> 00:11.000]  Hello.
[00:11.000 --> 00:13.000]  Hello.
[00:13.000 --> 00:15.000]  Sir, this is Mr. Sengar Vaidyanathan.
[00:15.000 --> 00:17.000]  Yes.
[00:17.000 --> 00:22.000]  Sir, you have taken a policy and have called for policy renewal. Can I talk to you for one minute, sir?
[00:22.000 --> 00:23.000]  Yes.
[00:23.000 --> 00:25.000]  Can you hear me?
[00:25.000 --> 00:27.000]  Yes, I can hear you.
[00:27.000 --> 00:32.000]  Have you taken a policy? We have called for a poli

  std = sequences.std(dim=-1, correction=1)


✅ Output saved: call7_transcription.json

📁 Processing file: call8.wav
Original audio duration: 176.00 seconds
--- Trying Advanced Audio Preprocessing ---
Advanced preprocessing successful
Processed audio duration: 175.97 seconds
✅ Audio preprocessing successful with method 1
--- Enhanced Whisper Transcription (Optimal Single Strategy) ---
[00:00.000 --> 00:10.000]  We will discuss policy numbers, due date, fund value, sum assured, late fee, and payment methods such as Google Pay, PhonePe, Paytm and net banking.
[00:10.000 --> 00:19.000]  We will discuss policy numbers, due date, fund value, sum assured, late fee, and payment methods such as Google Paytm and net banking.
[00:19.000 --> 00:22.500]  10,37,29,10,69,20 ma'am
[00:22.500 --> 00:26.000]  Policy Divided 10,29,6,2,5,23 Divided
[00:26.000 --> 00:27.000]  Ok
[00:27.000 --> 00:30.000]  So, the amount to be paid is 1.35 lakhs
[00:30.000 --> 00:32.000]  and Rs.431.97
[00:32.000 --> 00:35.000]  The policy to be paid is in lapse ma'am

  std = sequences.std(dim=-1, correction=1)


✅ Output saved: call8_transcription.json

📄 Training manifest saved to: processed_output_1/training_manifest.jsonl
✅ All files processed.


In [18]:
import shutil

# Zip the folder into processed_outputs.zip
shutil.make_archive('processed_output6-10', 'zip', 'processed_output_1')

'/content/processed_output6-10.zip'

In [19]:
from google.colab import files

# Download the zipped folder
files.download('processed_output6-10.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>