In [1]:
# Install Whisper pinned to the exact commit the recorded run resolved, so a fresh
# "Restart & Run All" gets the same code instead of whatever the default branch is today.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install --break-system-packages git+https://github.com/openai/whisper.git@c0d2f624c09dc18e709e37c2ad90c039a4eb72a2

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-dbpxvvre
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-dbpxvvre
  Resolved https://github.com/openai/whisper.git to commit c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->openai-whisper==20250625)
  Downloading nvidia_cuda_c

In [2]:
# pyannote.audio is pinned to the version the recorded run downloaded (3.3.2); the diarization
# cell passes `use_auth_token=`, so an unpinned upgrade could break it — TODO confirm against
# the changelog before bumping.
# pydub is not strictly needed if only ffmpeg is used for audio processing.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install --break-system-packages pyannote.audio==3.3.2 torchaudio

Collecting pyannote.audio
  Downloading pyannote.audio-3.3.2-py2.py3-none-any.whl.metadata (11 kB)
Collecting asteroid-filterbanks>=0.4 (from pyannote.audio)
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl.metadata (3.3 kB)
Collecting lightning>=2.0.1 (from pyannote.audio)
  Downloading lightning-2.5.2-py3-none-any.whl.metadata (38 kB)
Collecting pyannote.core>=5.0.0 (from pyannote.audio)
  Downloading pyannote.core-5.0.0-py3-none-any.whl.metadata (1.4 kB)
Collecting pyannote.database>=5.0.1 (from pyannote.audio)
  Downloading pyannote.database-5.1.3-py3-none-any.whl.metadata (1.1 kB)
Collecting pyannote.metrics>=3.2 (from pyannote.audio)
  Downloading pyannote.metrics-3.2.1-py3-none-any.whl.metadata (1.3 kB)
Collecting pyannote.pipeline>=3.0.1 (from pyannote.audio)
  Downloading pyannote.pipeline-3.0.1-py3-none-any.whl.metadata (897 bytes)
Collecting pytorch-metric-learning>=2.1.0 (from pyannote.audio)
  Downloading pytorch_metric_learning-2.8.1-py3-none-any.whl.metadata (18

In [3]:
import whisper
from pyannote.audio import Pipeline
import torch
import re
import os
import subprocess

In [4]:
# Source recording to process (absolute path; /content is presumably the Colab working
# directory — adjust when running elsewhere).
input_audio_path = "/content/001_hindi.wav"
# Destination of the ffmpeg-preprocessed copy; both Whisper and pyannote read this file.
clean_audio_path = "cleaned_audio_for_asr_and_diarization.wav"

In [None]:
# Hugging Face access token handed to Pipeline.from_pretrained in the diarization step.
# It is read from the HF_TOKEN environment variable so the secret never has to be saved
# inside the notebook; the "hf_" placeholder is only a fallback for when the variable is
# unset or empty, and must then be replaced by hand.
HUGGING_FACE_ACCESS_TOKEN = os.environ.get("HF_TOKEN") or "hf_" # <-- REPLACE THIS if HF_TOKEN is not set!

In [6]:
# --- Step 1: Enhanced Audio Preprocessing with ffmpeg and Duration Verification ---

# Largest input/output duration difference (in seconds) that is still treated as "matching".
DURATION_TOLERANCE_SECONDS = 0.1


def _ffprobe_duration_output(audio_path):
    """Return ffprobe's `format=duration` value for `audio_path` as a stripped string.

    The value is returned unparsed so the caller can include it verbatim in its own
    message when it is not a valid number.

    Args:
        audio_path: Path of the audio file to inspect.

    Returns:
        The text ffprobe printed on stdout, with surrounding whitespace removed.

    Raises:
        subprocess.CalledProcessError: ffprobe exited with a non-zero status.
        FileNotFoundError: the ffprobe executable could not be found.
    """
    ffprobe_command = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        audio_path,
    ]
    completed = subprocess.run(ffprobe_command, check=True, capture_output=True, text=True)
    return completed.stdout.strip()


print("--- Starting Audio Preprocessing ---")

# First, get the duration of the original input file for comparison
print(f"--- Verifying Original Input Audio Duration ({input_audio_path}) ---")
try:
    input_duration_output = _ffprobe_duration_output(input_audio_path)
    original_input_duration_seconds = float(input_duration_output)
    print(f"Original input audio duration: {original_input_duration_seconds:.2f} seconds")
except FileNotFoundError:
    # Raised when the ffprobe executable itself is missing; a missing audio file shows up
    # as CalledProcessError instead, so this must not be reported as a bad input file.
    print("--- FFmpeg/FFprobe not found ---")
    print("Please ensure FFmpeg and FFprobe are installed and accessible in your environment's PATH.")
    raise
except Exception as e:
    print(f"Error getting original input audio duration: {e}. Please ensure the input WAV file exists and is valid.")
    # Without a reference duration the truncation check below is meaningless, so stop here.
    raise

# FFmpeg command using the original successful filters
ffmpeg_command = [
    "ffmpeg",
    "-i", input_audio_path,
    "-acodec", "pcm_s16le",  # 16-bit PCM
    "-ac", "1",              # mono
    "-ar", "16000",          # 16 kHz sample rate
    # Reverted to the original working filters: loudnorm, highpass, lowpass
    "-af", "loudnorm=I=-16:TP=-1.5:LRA=11, highpass=f=200, lowpass=f=3000",
    "-y", clean_audio_path # -y to overwrite output file without asking
]

try:
    result = subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
    if result.stdout:
        print("FFmpeg stdout:")
        print(result.stdout)
    if result.stderr:
        print("FFmpeg stderr (might contain warnings/info):")
        print(result.stderr)
    print(f"--- Audio Preprocessing Complete. Cleaned audio saved to {clean_audio_path} ---")

    # Verify duration of the cleaned audio file
    print(f"--- Verifying Cleaned Audio Duration ({clean_audio_path}) ---")
    duration_output = _ffprobe_duration_output(clean_audio_path)
    try:
        cleaned_audio_duration_seconds = float(duration_output)
        print(f"Cleaned audio duration: {cleaned_audio_duration_seconds:.2f} seconds")
        # Allow for small floating point differences
        if abs(original_input_duration_seconds - cleaned_audio_duration_seconds) > DURATION_TOLERANCE_SECONDS:
            print(f"WARNING: Cleaned audio duration ({cleaned_audio_duration_seconds:.2f}s) significantly differs from input ({original_input_duration_seconds:.2f}s). This might indicate a truncation problem during FFmpeg processing.")
        else:
            print("Cleaned audio duration matches input audio duration.")
    except ValueError:
        # An unparsable duration is reported but deliberately not treated as fatal.
        print(f"Could not parse duration from ffprobe for cleaned audio: {duration_output}")

except subprocess.CalledProcessError as e:
    # Covers a failing ffmpeg run as well as a failing ffprobe on the cleaned file;
    # the printed command shows which of the two it was.
    print("--- FFmpeg Error ---")
    print(f"Command: {' '.join(e.cmd)}")
    print(f"Return Code: {e.returncode}")
    print(f"STDOUT:\n{e.stdout}")
    print(f"STDERR:\n{e.stderr}")
    print("--- Audio Preprocessing Failed. Cannot proceed. ---")
    raise
except FileNotFoundError:
    print("--- FFmpeg/FFprobe not found ---")
    print("Please ensure FFmpeg and FFprobe are installed and accessible in your environment's PATH.")
    raise

--- Starting Audio Preprocessing ---
--- Verifying Original Input Audio Duration (/content/001_hindi.wav) ---
Original input audio duration: 372.36 seconds
ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-li

In [7]:
# --- Step 2: Whisper Transcription ---
# Progress banner marking the start of the transcription stage in the notebook output.
print("--- Starting Whisper Transcription ---")

--- Starting Whisper Transcription ---


In [8]:
# Load the "large" Whisper checkpoint; in the recorded run this downloaded 2.88G of weights,
# so expect this cell to take a while on a fresh machine.
model = whisper.load_model("large")

100%|█████████████████████████████████████| 2.88G/2.88G [01:13<00:00, 42.0MiB/s]


In [9]:
# Domain vocabulary passed to Whisper as `initial_prompt` in the transcription cell.
# Spelling matters here because the prompt is given to the model as text, so the previous
# "netbacking" typo is corrected to "netbanking".
initial_prompt_text = (
    "Axis Maxlife Insurance, Policy number, fund value, risk term, Premium Due, Due date, Sum Assured, Policy Status, "
    "Late Fee, Google pay, gpay, phone pay, paytm, netbanking, risk coverage, policy benefits, health declaration form"
)

In [10]:
# Transcribe the preprocessed audio and translate it to English in one pass.
# The result's "segments" list (start/end/text per segment) is consumed by the dialogue step.
whisper_result = model.transcribe(
    clean_audio_path,
    language="hi",       # Explicitly set source language to Hindi ("hi")
    task="translate",     # Translate the Hindi speech into English text
    verbose=True,         # Print each segment with its timestamps while decoding
    initial_prompt=initial_prompt_text  # Domain vocabulary defined in the previous cell
)



[00:00.000 --> 00:17.000]  Hello, Good Morning, this is Maxlife Insurance, I am speaking to you from Prabhas,
[00:17.000 --> 00:23.000]  Yes, tell me, Can we speak to you about the policy regarding Maxlife Insurance,
[00:23.000 --> 00:25.000]  Yes, tell me,
[00:25.000 --> 00:32.000]  Yes, thank you sir, this is a service call, the policy you have done is Maxlife Future Genius Education Plan,
[00:32.000 --> 00:45.000]  Policy number is 105368914, your policy issue which is 4 November 2016, your due date has crossed 4 November 2020,
[00:45.000 --> 00:52.000]  so for now your due premium amount is 1,3083.22 rupees,
[00:52.000 --> 00:59.000]  your late fee which has been added is 11,058.07 rupees,
[00:59.000 --> 01:06.000]  so can we know, sir, you have already deposited 4 years in this, why you are not able to pay now?
[01:06.000 --> 01:10.000]  Due to Corona, I have to pay 2 days of interest,
[01:10.000 --> 01:15.000]  Sorry sir, in which language are you comfortable?
[01:15.000 --> 01:2

In [11]:
# Progress banner marking the end of the transcription stage in the notebook output.
print("--- Whisper Transcription Complete ---")

--- Whisper Transcription Complete ---


In [12]:
# --- Step 3: Speaker Diarization with pyannote.audio ---
# Runs the pretrained pyannote pipeline on the cleaned audio. Any failure is reported and
# leaves `diarization` as None, so the dialogue step can still run without speaker labels.
print("\n--- Starting Speaker Diarization ---")
try:
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=HUGGING_FACE_ACCESS_TOKEN
    )

    # NOTE(review): from_pretrained appears to return None (rather than raise) when the
    # pipeline cannot be downloaded — confirm against the pyannote.audio docs. Fail here with
    # a clear message instead of a later "'NoneType' object ..." error.
    if pipeline is None:
        raise RuntimeError(
            "Pipeline.from_pretrained returned None: the pipeline could not be loaded."
        )

    # Send pipeline to GPU (if available)
    if torch.cuda.is_available():
        pipeline.to(torch.device("cuda"))
        print("Pyannote.audio moved to GPU.")
    else:
        print("CUDA not available, running pyannote.audio on CPU. This might be slow.")

    diarization = pipeline(clean_audio_path)
    print("--- Speaker Diarization Complete ---")

except Exception as e:
    # Deliberately broad: diarization is best-effort and must not abort the notebook.
    print("--- Speaker Diarization Failed ---")
    print(f"Error: {e}")
    print("Please ensure your Hugging Face Access Token is correct and has access to pyannote/speaker-diarization-3.1.")
    # If diarization fails, we can still proceed with transcription but without speaker labels
    diarization = None # Set diarization to None if it failed


--- Starting Speaker Diarization ---


config.yaml:   0%|          | 0.00/469 [00:00<?, ?B/s]

DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover


pytorch_model.bin:   0%|          | 0.00/5.91M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/399 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/26.6M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/221 [00:00<?, ?B/s]

CUDA not available, running pyannote.audio on CPU. This might be slow.


  std = sequences.std(dim=-1, correction=1)


--- Speaker Diarization Complete ---


In [13]:
# --- Step 4: Combine and Format Output ---
print("\n--- Generating Dialogue Output ---")
# Formatted "Speaker <label>: <text>" lines, one per speaker turn; filled by the merge loop
# later in this cell and reset here so re-running the cell does not duplicate turns.
dialogue_output = []

def get_dominant_speaker_for_segment(start_time, end_time, diarization_result):
    """Pick the speaker label that overlaps a time window the most.

    Args:
        start_time: Window start, in the same time unit as the diarization segments.
        end_time: Window end, in the same time unit as the diarization segments.
        diarization_result: Object whose ``itertracks(yield_label=True)`` yields
            ``(segment, track, label)`` triples where ``segment`` has ``start`` and
            ``end`` attributes; a falsy value means no diarization is available.

    Returns:
        The label with the largest accumulated overlap (the first one encountered wins
        a tie), or ``"Unknown"`` when there is no diarization or nothing overlaps.
    """
    if not diarization_result:
        # Diarization failed or produced nothing usable.
        return "Unknown"

    overlap_by_label = {}
    for turn, _track, label in diarization_result.itertracks(yield_label=True):
        # Length of the intersection of [start_time, end_time] and the diarized turn.
        shared = max(0, min(end_time, turn.end) - max(start_time, turn.start))
        if shared > 0:
            overlap_by_label[label] = overlap_by_label.get(label, 0) + shared

    if overlap_by_label:
        # Largest accumulated overlap wins.
        best_label, _ = max(overlap_by_label.items(), key=lambda pair: pair[1])
        return best_label
    return "Unknown"

# Group consecutive Whisper segments into speaker turns for better dialogue flow
# (experimental, can be adjusted).

# Same-speaker segments separated by a pause shorter than this (seconds) are merged.
MAX_MERGE_GAP_SECONDS = 2.0

current_speaker = None  # label of the turn being accumulated; None until the first segment
current_text = []       # text pieces of the turn being accumulated
current_end = -1        # end time of the last segment added to the current turn

for segment in whisper_result["segments"]:
    text = segment['text'].strip()
    if not text:
        # Segments without text would add blank pieces to a turn, and a zero-length one
        # overlaps no speaker, so it would be labelled "Unknown" and split the turn.
        continue

    start = segment['start']
    end = segment['end']
    speaker = get_dominant_speaker_for_segment(start, end, diarization)

    same_turn = (
        current_speaker is not None
        and speaker == current_speaker
        and (start - current_end) < MAX_MERGE_GAP_SECONDS
    )
    if same_turn:
        # Merge if same speaker and short pause
        current_text.append(text)
        current_end = end
    else:
        # Speaker changed (or long pause): flush the finished turn and start a new one.
        if current_speaker is not None:
            dialogue_output.append(f"Speaker {current_speaker}: {' '.join(current_text)}")
        current_speaker = speaker
        current_text = [text]
        current_end = end

# Add the last accumulated segment
if current_speaker is not None:
    dialogue_output.append(f"Speaker {current_speaker}: {' '.join(current_text)}")



--- Generating Dialogue Output ---


In [14]:
# --- Step 5: Print the final dialogue ---
# Write every formatted turn on its own line; an empty dialogue prints nothing at all.
if dialogue_output:
    print(*dialogue_output, sep="\n")

print("\n--- Processing Complete ---")

Speaker SPEAKER_00: Hello, Good Morning, this is Maxlife Insurance, I am speaking to you from Prabhas, Yes, tell me, Can we speak to you about the policy regarding Maxlife Insurance,
Speaker SPEAKER_01: Yes, tell me,
Speaker SPEAKER_00: Yes, thank you sir, this is a service call, the policy you have done is Maxlife Future Genius Education Plan, Policy number is 105368914, your policy issue which is 4 November 2016, your due date has crossed 4 November 2020, so for now your due premium amount is 1,3083.22 rupees, your late fee which has been added is 11,058.07 rupees, so can we know, sir, you have already deposited 4 years in this, why you are not able to pay now?
Speaker SPEAKER_01: Due to Corona, I have to pay 2 days of interest,
Speaker SPEAKER_00: Sorry sir, in which language are you comfortable?
Speaker SPEAKER_01: I am saying that when Corona came, that is why I could not pay,
Speaker SPEAKER_00: Okay, Sorry sir, look, I understand what you are saying, but now your policy is in pa