In [1]:
!pip install --break-system-packages transformers torch



In [2]:
!pip install --break-system-packages azure-cognitiveservices-speech

Collecting azure-cognitiveservices-speech
  Downloading azure_cognitiveservices_speech-1.45.0-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting azure-core>=1.31.0 (from azure-cognitiveservices-speech)
  Downloading azure_core-1.35.0-py3-none-any.whl.metadata (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Downloading azure_cognitiveservices_speech-1.45.0-py3-none-manylinux1_x86_64.whl (41.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 MB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading azure_core-1.35.0-py3-none-any.whl (210 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.7/210.7 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: azure-core, azure-cognitiveservices-speech
Successfully ins

In [3]:
import os
import json
import time
import soundfile as sf
import numpy as np
import pandas as pd

# Azure Speech SDK
import azure.cognitiveservices.speech as speechsdk

# Add for post-processing enhancement
from transformers import pipeline
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig

In [None]:
# ---------- Configuration ----------
# 👉 Put your Azure Speech credentials here 👈
SPEECH_KEY = ""
SPEECH_REGION = "centralindia"   # e.g., "eastus", "southeastasia", "centralindia"

INPUT_FILE    = "call2.wav"

SRC_LANGUAGE = "ta-IN"            # source audio language (Tamil – India)
TARGET_LANGUAGE = "en"            # translate into English

AGENT_LABEL = "Agent"
CUSTOMER_LABEL = "Customer"

# If your stereo channels are swapped, flip these
LEFT_CHANNEL_SPEAKER = AGENT_LABEL
RIGHT_CHANNEL_SPEAKER = CUSTOMER_LABEL
# ---------- Configuration ----------

In [5]:
def read_stereo_wav(path):
    data, sr = sf.read(path, always_2d=True)
    if data.shape[1] != 2:
        raise ValueError("Input must be a *stereo* WAV (2 channels).")
    return data, sr

def write_temp_mono_wav(np_float_mono, sr, path):
    sf.write(path, np_float_mono, sr, subtype="PCM_16")

def run_translation_on_file(wav_path, speaker_label, src_lang, tgt_lang):
    translation_config = speechsdk.translation.SpeechTranslationConfig(
        subscription=SPEECH_KEY,
        region=SPEECH_REGION,
        speech_recognition_language=src_lang
    )
    translation_config.add_target_language(tgt_lang)

    audio_config = speechsdk.AudioConfig(filename=wav_path)
    translator = speechsdk.translation.TranslationRecognizer(
        translation_config=translation_config,
        audio_config=audio_config
    )

    segments = []
    done = False

    def recognized(evt):
        res = evt.result
        if res.reason == speechsdk.ResultReason.TranslatedSpeech:
            start_sec = res.offset / 10_000_000.0
            end_sec   = (res.offset + res.duration) / 10_000_000.0
            text_src  = res.text or ""
            text_en   = (res.translations.get(tgt_lang) or "").strip()
            if text_en:
                segments.append({
                    "speaker": speaker_label,
                    "start_sec": float(start_sec),
                    "end_sec": float(end_sec),
                    "text_src": text_src,
                    "text_en": text_en
                })

    def canceled(evt):
        if evt.reason == speechsdk.CancellationReason.Error:
            print("CANCELED due to error:", evt.error_details)

    def session_stopped(evt):
        nonlocal done
        done = True

    translator.recognized.connect(recognized)
    translator.canceled.connect(canceled)
    translator.session_stopped.connect(session_stopped)

    translator.start_continuous_recognition_async().get()
    while not done:
        time.sleep(0.1)
    translator.stop_continuous_recognition_async().get()
    return segments

def merge_and_format(agent_segments, customer_segments):
    all_segments = agent_segments + customer_segments
    all_segments.sort(key=lambda s: (s["start_sec"], s["speaker"]))
    full_transcript_en = " ".join(s["text_en"] for s in all_segments).strip()

    dialog_lines = []
    for s in all_segments:
        t0 = s["start_sec"]
        mm, ss = divmod(int(t0), 60)
        timestamp = f"{mm:02d}:{ss:02d}"
        dialog_lines.append(f"[{timestamp}] {s['speaker']}: {s['text_en']}")
    return full_transcript_en, dialog_lines, all_segments

In [7]:
# ===================== RUN =====================
if not os.path.exists(INPUT_FILE):
    raise FileNotFoundError(f"File not found: {INPUT_FILE}")

print("Reading:", INPUT_FILE)
stereo, sr = read_stereo_wav(INPUT_FILE)
left, right = stereo[:,0], stereo[:,1]

left_path, right_path = "tmp_left.wav", "tmp_right.wav"
write_temp_mono_wav(left.astype(np.float32),  sr, left_path)
write_temp_mono_wav(right.astype(np.float32), sr, right_path)

print("Transcribing LEFT channel as", LEFT_CHANNEL_SPEAKER)
left_segments  = run_translation_on_file(left_path,  LEFT_CHANNEL_SPEAKER,  SRC_LANGUAGE, TARGET_LANGUAGE)
print("Transcribing RIGHT channel as", RIGHT_CHANNEL_SPEAKER)
right_segments = run_translation_on_file(right_path, RIGHT_CHANNEL_SPEAKER, SRC_LANGUAGE, TARGET_LANGUAGE)

full_en, dialog_lines, all_segments = merge_and_format(left_segments, right_segments)

Reading: call2.wav
Transcribing LEFT channel as Agent
Transcribing RIGHT channel as Customer


In [19]:
model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
device = "cpu" if not torch.cuda.is_available() else 0  # Use "cpu" for no GPU, 0 for first GPU
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 1024)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 1024)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1024, out_features=1024, bias=False)
              (k): Linear(in_features=1024, out_features=1024, bias=False)
              (v): Linear(in_features=1024, out_features=1024, bias=False)
              (o): Linear(in_features=1024, out_features=1024, bias=False)
              (relative_attention_bias): Embedding(32, 16)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=1024, out_features=2816, bias=False)
              (wi_1): Linear(in_features=1024, out_features=2816, bias=False)
       

In [20]:
# Set generation config to use max_length only
generation_config = GenerationConfig(max_length=512, num_beams=1, do_sample=False, decoder_start_token_id=tokenizer.pad_token_id)
model.generation_config = generation_config  # Assign the generation config to the model

In [21]:
# Create pipeline without passing generation_config directly
corrector = pipeline("text2text-generation", model=model, tokenizer=tokenizer, device=device)

Device set to use cuda:0


In [22]:
# Prepare prompts for batch processing with truncation
prompt_prefix = "Correct this speech: "
max_input_length = 400  # Leave room for generation (512 - max generation length)
prompts = [
    prompt_prefix + tokenizer.decode(
        tokenizer.encode(s['text_en'], max_length=max_input_length, truncation=True)[0],
        skip_special_tokens=True
    )
    for s in all_segments
]

In [23]:
# Batch processing with the pipeline (reduced batch_size)
corrected_texts = corrector(prompts, num_return_sequences=1, batch_size=8)  # Reduced from 32 to 8

In [24]:
# Rebuild full_en and dialog_lines with corrected text
full_en = " ".join(s["text_en"] for s in all_segments).strip()

In [25]:
dialog_lines = []
for s in all_segments:
    t0 = s["start_sec"]
    mm, ss = divmod(int(t0), 60)
    timestamp = f"{mm:02d}:{ss:02d}"
    dialog_lines.append(f"[{timestamp}] {s['speaker']}: {s['text_en']}")
# ----- End of enhancement -----

In [26]:
dialog_lines

['[00:01] Customer: Hello..................................................... side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. side. sid