In [2]:
!pip install --break-system-packages git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-hm1ihwtk
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-hm1ihwtk
  Resolved https://github.com/openai/whisper.git to commit c0d2f624c09dc18e709e37c2ad90c039a4eb72a2
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [3]:
!pip install --break-system-packages pyannote.audio torchaudio # pydub not strictly needed if only ffmpeg is used for audio proc



In [4]:
import whisper
from pyannote.audio import Pipeline
import torch
import re
import os
import subprocess

In [5]:
input_audio_path = "/content/s1.wav"
clean_audio_path = "cleaned_audio_for_asr_and_diarization.wav"

In [6]:
HUGGING_FACE_ACCESS_TOKEN = "hf_" # <-- REPLACE THIS!

In [7]:
# --- Step 1: Enhanced Audio Preprocessing with ffmpeg and Duration Verification ---
print("--- Starting Audio Preprocessing ---")

# First, get the duration of the original input file for comparison
print(f"--- Verifying Original Input Audio Duration ({input_audio_path}) ---")
ffprobe_command_input = ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", input_audio_path]
try:
    input_duration_output = subprocess.run(ffprobe_command_input, check=True, capture_output=True, text=True).stdout.strip()
    original_input_duration_seconds = float(input_duration_output)
    print(f"Original input audio duration: {original_input_duration_seconds:.2f} seconds")
except Exception as e:
    print(f"Error getting original input audio duration: {e}. Please ensure the input WAV file exists and is valid.")
    # Exit or raise error if input audio duration cannot be determined
    raise

# FFmpeg command using the original successful filters
ffmpeg_command = [
    "ffmpeg",
    "-i", input_audio_path,
    "-acodec", "pcm_s16le",
    "-ac", "1",
    "-ar", "16000",
    # Reverted to the original working filters: loudnorm, highpass, lowpass
    "-af", "loudnorm=I=-16:TP=-1.5:LRA=11, highpass=f=200, lowpass=f=3000",
    "-y", clean_audio_path # -y to overwrite output file without asking
]

try:
    result = subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
    if result.stdout:
        print("FFmpeg stdout:")
        print(result.stdout)
    if result.stderr:
        print("FFmpeg stderr (might contain warnings/info):")
        print(result.stderr)
    print(f"--- Audio Preprocessing Complete. Cleaned audio saved to {clean_audio_path} ---")

    # Verify duration of the cleaned audio file
    print(f"--- Verifying Cleaned Audio Duration ({clean_audio_path}) ---")
    ffprobe_command_output = ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", clean_audio_path]
    duration_output = subprocess.run(ffprobe_command_output, check=True, capture_output=True, text=True).stdout.strip()
    try:
        cleaned_audio_duration_seconds = float(duration_output)
        print(f"Cleaned audio duration: {cleaned_audio_duration_seconds:.2f} seconds")
        if abs(original_input_duration_seconds - cleaned_audio_duration_seconds) > 0.1: # Allow for small floating point differences
            print(f"WARNING: Cleaned audio duration ({cleaned_audio_duration_seconds:.2f}s) significantly differs from input ({original_input_duration_seconds:.2f}s). This might indicate a truncation problem during FFmpeg processing.")
        else:
            print("Cleaned audio duration matches input audio duration.")
    except ValueError:
        print(f"Could not parse duration from ffprobe for cleaned audio: {duration_output}")

except subprocess.CalledProcessError as e:
    print(f"--- FFmpeg Error ---")
    print(f"Command: {' '.join(e.cmd)}")
    print(f"Return Code: {e.returncode}")
    print(f"STDOUT:\n{e.stdout}")
    print(f"STDERR:\n{e.stderr}")
    print(f"--- Audio Preprocessing Failed. Cannot proceed. ---")
    raise e
except FileNotFoundError:
    print("--- FFmpeg/FFprobe not found ---")
    print("Please ensure FFmpeg and FFprobe are installed and accessible in your environment's PATH.")
    raise

--- Starting Audio Preprocessing ---
--- Verifying Original Input Audio Duration (/content/s1.wav) ---
Original input audio duration: 178.13 seconds
ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr -

In [8]:
# --- Step 2: Whisper Transcription ---
print("--- Starting Whisper Transcription ---")

--- Starting Whisper Transcription ---


In [9]:
model = whisper.load_model("large")

In [10]:
# model

In [11]:
initial_prompt_text = (
    "Axis Maxlife Insurance, Policy number, fund value, risk term, Premium Due, Due date, Sum Assured, Policy Status, "
    "Late Fee, Google pay, gpay, phone pay, paytm, netbacking, risk coverage, policy benefits"
)

In [12]:
whisper_result = model.transcribe(
    clean_audio_path,
    language="ta",       # Explicitly set source language as Tamil
    task="translate",     # Translate from Tamil to English
    verbose=True,
    initial_prompt=initial_prompt_text
)



[00:00.000 --> 00:02.000]  Hello!
[00:02.000 --> 00:06.000]  Hello! My name is Swathi. We have called from License.
[00:06.000 --> 00:08.000]  Are you Ganeshwari mam?
[00:08.000 --> 00:10.000]  No, I am Nyaneshwari mam.
[00:10.000 --> 00:13.000]  Sorry, Nyaneshwari mam. Are you Nyaneshwari mam?
[00:13.000 --> 00:14.000]  Yes mam.
[00:14.000 --> 00:19.000]  You have taken policy in Maxlife Insurance. I will call you back. Please talk.
[00:19.000 --> 00:21.000]  Yes mam.
[00:21.000 --> 00:29.000]  Thank you. Your policy number is 831-801-543. Maxlife Home Life Participating Plan Policy.
[00:29.000 --> 00:35.000]  The duration of 10-12-2024 is Rs.5352.
[00:35.000 --> 00:39.000]  What are the reasons for your payment? When are you going to pay?
[00:39.000 --> 00:45.000]  We are going to pay in the evening. Normally, I go to the office for one session.
[00:45.000 --> 00:47.000]  After Monday, we pay.
[00:47.000 --> 00:49.000]  After Monday, we pay.
[00:49.000 --> 00:51.000]  You pay between

In [13]:
print("--- Whisper Transcription Complete ---")

--- Whisper Transcription Complete ---


In [14]:
# --- Step 3: Speaker Diarization with pyannote.audio ---
print("\n--- Starting Speaker Diarization ---")
try:
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=HUGGING_FACE_ACCESS_TOKEN
    )

    # Send pipeline to GPU (if available)
    if torch.cuda.is_available():
        pipeline.to(torch.device("cuda"))
        print("Pyannote.audio moved to GPU.")
    else:
        print("CUDA not available, running pyannote.audio on CPU. This might be slow.")

    diarization = pipeline(clean_audio_path)
    print("--- Speaker Diarization Complete ---")

except Exception as e:
    print(f"--- Speaker Diarization Failed ---")
    print(f"Error: {e}")
    print("Please ensure your Hugging Face Access Token is correct and has access to pyannote/speaker-diarization-3.1.")
    # If diarization fails, we can still proceed with transcription but without speaker labels
    diarization = None # Set diarization to None if it failed


--- Starting Speaker Diarization ---


DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover


CUDA not available, running pyannote.audio on CPU. This might be slow.


  std = sequences.std(dim=-1, correction=1)


--- Speaker Diarization Complete ---


In [15]:
# --- Step 4: Combine and Format Output ---
print("\n--- Generating Dialogue Output ---")
dialogue_output = []

# Helper function to find the dominant speaker for a given time segment
def get_dominant_speaker_for_segment(start_time, end_time, diarization_result):
    if not diarization_result: # If diarization failed
        return "Unknown"

    speakers_in_segment = {}
    for segment, _, speaker_label in diarization_result.itertracks(yield_label=True):
        # Calculate overlap between transcription segment and diarization segment
        overlap_start = max(start_time, segment.start)
        overlap_end = min(end_time, segment.end)
        overlap_duration = max(0, overlap_end - overlap_start)

        if overlap_duration > 0:
            speakers_in_segment[speaker_label] = speakers_in_segment.get(speaker_label, 0) + overlap_duration

    if not speakers_in_segment:
        return "Unknown"

    # Return the speaker with the most overlap
    return max(speakers_in_segment, key=speakers_in_segment.get)

# Group segments by speaker for better dialogue flow (experimental, can be adjusted)
current_speaker = None
current_text = []
current_start = -1
current_end = -1

for i, segment in enumerate(whisper_result["segments"]):
    start = segment['start']
    end = segment['end']
    text = segment['text'].strip()

    speaker = get_dominant_speaker_for_segment(start, end, diarization)

    if speaker == current_speaker and current_speaker is not None and (start - current_end) < 2.0: # Merge if same speaker and short pause
        current_text.append(text)
        current_end = end
    else:
        if current_speaker is not None:
            dialogue_output.append(f"Speaker {current_speaker}: {' '.join(current_text)}")
        current_speaker = speaker
        current_text = [text]
        current_start = start
        current_end = end

# Add the last accumulated segment
if current_speaker is not None:
    dialogue_output.append(f"Speaker {current_speaker}: {' '.join(current_text)}")



--- Generating Dialogue Output ---


In [16]:
# --- Step 5: Print the final dialogue ---
for line in dialogue_output:
    print(line)

print("\n--- Processing Complete ---")

Speaker SPEAKER_00: Hello! Hello! My name is Swathi. We have called from License. Are you Ganeshwari mam? No, I am Nyaneshwari mam. Sorry, Nyaneshwari mam. Are you Nyaneshwari mam?
Speaker SPEAKER_02: Yes mam.
Speaker SPEAKER_00: You have taken policy in Maxlife Insurance. I will call you back. Please talk.
Speaker SPEAKER_01: Yes mam.
Speaker SPEAKER_00: Thank you. Your policy number is 831-801-543. Maxlife Home Life Participating Plan Policy. The duration of 10-12-2024 is Rs.5352. What are the reasons for your payment? When are you going to pay?
Speaker SPEAKER_01: We are going to pay in the evening. Normally, I go to the office for one session. After Monday, we pay. After Monday, we pay.
Speaker SPEAKER_00: You pay between Monday and Tuesday.
Speaker SPEAKER_01: Yes.
Speaker SPEAKER_00: Okay. Are you doing payment in branch or online?
Speaker SPEAKER_01: Online. Online. Online.
Speaker SPEAKER_00: Can you do online? Yes. Okay. Its health declaration form is also in pending. You subm

In [17]:
#Second call

In [20]:
input_audio_path = "/content/s2.wav"
clean_audio_path = "cleaned_audio_for_asr_and_diarization.wav"

In [21]:
# --- Step 1: Enhanced Audio Preprocessing with ffmpeg and Duration Verification ---
print("--- Starting Audio Preprocessing ---")

# First, get the duration of the original input file for comparison
print(f"--- Verifying Original Input Audio Duration ({input_audio_path}) ---")
ffprobe_command_input = ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", input_audio_path]
try:
    input_duration_output = subprocess.run(ffprobe_command_input, check=True, capture_output=True, text=True).stdout.strip()
    original_input_duration_seconds = float(input_duration_output)
    print(f"Original input audio duration: {original_input_duration_seconds:.2f} seconds")
except Exception as e:
    print(f"Error getting original input audio duration: {e}. Please ensure the input WAV file exists and is valid.")
    # Exit or raise error if input audio duration cannot be determined
    raise

# FFmpeg command using the original successful filters
ffmpeg_command = [
    "ffmpeg",
    "-i", input_audio_path,
    "-acodec", "pcm_s16le",
    "-ac", "1",
    "-ar", "16000",
    # Reverted to the original working filters: loudnorm, highpass, lowpass
    "-af", "loudnorm=I=-16:TP=-1.5:LRA=11, highpass=f=200, lowpass=f=3000",
    "-y", clean_audio_path # -y to overwrite output file without asking
]

try:
    result = subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
    if result.stdout:
        print("FFmpeg stdout:")
        print(result.stdout)
    if result.stderr:
        print("FFmpeg stderr (might contain warnings/info):")
        print(result.stderr)
    print(f"--- Audio Preprocessing Complete. Cleaned audio saved to {clean_audio_path} ---")

    # Verify duration of the cleaned audio file
    print(f"--- Verifying Cleaned Audio Duration ({clean_audio_path}) ---")
    ffprobe_command_output = ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", clean_audio_path]
    duration_output = subprocess.run(ffprobe_command_output, check=True, capture_output=True, text=True).stdout.strip()
    try:
        cleaned_audio_duration_seconds = float(duration_output)
        print(f"Cleaned audio duration: {cleaned_audio_duration_seconds:.2f} seconds")
        if abs(original_input_duration_seconds - cleaned_audio_duration_seconds) > 0.1: # Allow for small floating point differences
            print(f"WARNING: Cleaned audio duration ({cleaned_audio_duration_seconds:.2f}s) significantly differs from input ({original_input_duration_seconds:.2f}s). This might indicate a truncation problem during FFmpeg processing.")
        else:
            print("Cleaned audio duration matches input audio duration.")
    except ValueError:
        print(f"Could not parse duration from ffprobe for cleaned audio: {duration_output}")

except subprocess.CalledProcessError as e:
    print(f"--- FFmpeg Error ---")
    print(f"Command: {' '.join(e.cmd)}")
    print(f"Return Code: {e.returncode}")
    print(f"STDOUT:\n{e.stdout}")
    print(f"STDERR:\n{e.stderr}")
    print(f"--- Audio Preprocessing Failed. Cannot proceed. ---")
    raise e
except FileNotFoundError:
    print("--- FFmpeg/FFprobe not found ---")
    print("Please ensure FFmpeg and FFprobe are installed and accessible in your environment's PATH.")
    raise

--- Starting Audio Preprocessing ---
--- Verifying Original Input Audio Duration (/content/s2.wav) ---
Original input audio duration: 231.34 seconds
ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr -

In [22]:
print("--- Starting Whisper Transcription ---")

--- Starting Whisper Transcription ---


In [23]:
whisper_result = model.transcribe(
    clean_audio_path,
    language="ta",       # Explicitly set source language as Tamil
    task="translate",     # Translate from Tamil to English
    verbose=True,
    initial_prompt=initial_prompt_text
)



[00:00.000 --> 00:04.000]  Hello, My name is Smati, We are calling from Life Insurance,
[00:04.000 --> 00:05.000]  Hello,
[00:05.000 --> 00:07.000]  Do you know Maheshwari Maa?
[00:07.000 --> 00:08.000]  Yes, I know, Tell me Maa
[00:08.000 --> 00:13.000]  Ok, Do you have policy in Access Maxlife Insurance? Can we talk about Renewal Pearl for 2 minutes?
[00:13.000 --> 00:15.000]  Yes, Tell me Maa
[00:15.000 --> 00:27.000]  Thank you, Your policy number is 326166147, Maxlife Holders Superfund Policy, 2772024, 34640 Rs.
[00:27.000 --> 00:36.000]  We have talked with you for last month, you have informed us about payment in this month, but still not received, do you know when and why you are getting delayed?
[00:36.000 --> 00:40.000]  I have paid it, I have paid it 4 days before
[00:40.000 --> 00:42.000]  You have paid it 4 days before?
[00:42.000 --> 00:43.000]  Yes, I have paid it
[00:43.000 --> 00:44.000]  When did you pay it?
[00:44.000 --> 00:47.000]  I have paid it for 1 year
[00:47.

In [24]:
print("--- Whisper Transcription Complete ---")

--- Whisper Transcription Complete ---


In [25]:
# --- Step 3: Speaker Diarization with pyannote.audio ---
print("\n--- Starting Speaker Diarization ---")
try:
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=HUGGING_FACE_ACCESS_TOKEN
    )

    # Send pipeline to GPU (if available)
    if torch.cuda.is_available():
        pipeline.to(torch.device("cuda"))
        print("Pyannote.audio moved to GPU.")
    else:
        print("CUDA not available, running pyannote.audio on CPU. This might be slow.")

    diarization = pipeline(clean_audio_path)
    print("--- Speaker Diarization Complete ---")

except Exception as e:
    print(f"--- Speaker Diarization Failed ---")
    print(f"Error: {e}")
    print("Please ensure your Hugging Face Access Token is correct and has access to pyannote/speaker-diarization-3.1.")
    # If diarization fails, we can still proceed with transcription but without speaker labels
    diarization = None # Set diarization to None if it failed


--- Starting Speaker Diarization ---
CUDA not available, running pyannote.audio on CPU. This might be slow.


  std = sequences.std(dim=-1, correction=1)


--- Speaker Diarization Complete ---


In [26]:
# --- Step 4: Combine and Format Output ---
print("\n--- Generating Dialogue Output ---")
dialogue_output = []

# Helper function to find the dominant speaker for a given time segment
def get_dominant_speaker_for_segment(start_time, end_time, diarization_result):
    if not diarization_result: # If diarization failed
        return "Unknown"

    speakers_in_segment = {}
    for segment, _, speaker_label in diarization_result.itertracks(yield_label=True):
        # Calculate overlap between transcription segment and diarization segment
        overlap_start = max(start_time, segment.start)
        overlap_end = min(end_time, segment.end)
        overlap_duration = max(0, overlap_end - overlap_start)

        if overlap_duration > 0:
            speakers_in_segment[speaker_label] = speakers_in_segment.get(speaker_label, 0) + overlap_duration

    if not speakers_in_segment:
        return "Unknown"

    # Return the speaker with the most overlap
    return max(speakers_in_segment, key=speakers_in_segment.get)

# Group segments by speaker for better dialogue flow (experimental, can be adjusted)
current_speaker = None
current_text = []
current_start = -1
current_end = -1

for i, segment in enumerate(whisper_result["segments"]):
    start = segment['start']
    end = segment['end']
    text = segment['text'].strip()

    speaker = get_dominant_speaker_for_segment(start, end, diarization)

    if speaker == current_speaker and current_speaker is not None and (start - current_end) < 2.0: # Merge if same speaker and short pause
        current_text.append(text)
        current_end = end
    else:
        if current_speaker is not None:
            dialogue_output.append(f"Speaker {current_speaker}: {' '.join(current_text)}")
        current_speaker = speaker
        current_text = [text]
        current_start = start
        current_end = end

# Add the last accumulated segment
if current_speaker is not None:
    dialogue_output.append(f"Speaker {current_speaker}: {' '.join(current_text)}")



--- Generating Dialogue Output ---


In [27]:
# --- Step 5: Print the final dialogue ---
for line in dialogue_output:
    print(line)

print("\n--- Processing Complete ---")

Speaker SPEAKER_01: Hello, My name is Smati, We are calling from Life Insurance, Hello, Do you know Maheshwari Maa?
Speaker SPEAKER_00: Yes, I know, Tell me Maa
Speaker SPEAKER_01: Ok, Do you have policy in Access Maxlife Insurance? Can we talk about Renewal Pearl for 2 minutes?
Speaker SPEAKER_00: Yes, Tell me Maa
Speaker SPEAKER_01: Thank you, Your policy number is 326166147, Maxlife Holders Superfund Policy, 2772024, 34640 Rs. We have talked with you for last month, you have informed us about payment in this month, but still not received, do you know when and why you are getting delayed?
Speaker SPEAKER_00: I have paid it, I have paid it 4 days before
Speaker SPEAKER_01: You have paid it 4 days before?
Speaker SPEAKER_00: Yes, I have paid it
Speaker SPEAKER_01: When did you pay it?
Speaker SPEAKER_00: I have paid it for 1 year
Speaker SPEAKER_01: For 1 year, you have paid it for 1.5-3 months
Speaker SPEAKER_00: Yes, I have paid it for 1.5-3 months
Speaker SPEAKER_01: Ok, When did yo