In [None]:

# Install the Speech SDK plus python-dotenv, which call_transcriber() imports to load credentials from .env
!pip install azure-cognitiveservices-speech python-dotenv


In [None]:
import azure.cognitiveservices.speech as speechsdk
import time
import os
from typing import Optional, Dict, Any

import logging

# Root logging setup: timestamped records, DEBUG level and above
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s: %(message)s',
)
logger = logging.getLogger(__name__)

# Module-wide stop flag polled by call_transcriber()'s wait loop;
# the loop keeps sleeping until this becomes True.
transcribing_stop = False

class AzureConversationTranscriber:
    """
    Azure Conversation Transcriber with configurable VAD parameters.

    Thin wrapper that builds a ``speechsdk.SpeechConfig`` carrying the
    voice-activity-detection (VAD) / segmentation properties, creates a
    ``speechsdk.transcription.ConversationTranscriber`` from it, and wires
    up event handlers.

    Typical call order:
        1. ``create_conversation_transcriber(...)``
        2. ``setup_event_handlers(...)``
        3. ``self.transcriber.start_transcribing_async()``
    """

    def __init__(
        self,
        subscription_key: str,
        region: str,
        language: str = "en-US"
    ):
        """
        Initialize the conversation transcriber wrapper.

        No SDK object is created here; ``self.transcriber`` stays ``None``
        until ``create_conversation_transcriber`` is called.

        Args:
            subscription_key: Azure Speech Services subscription key
            region: Azure region (e.g., 'eastus', 'westus')
            language: Speech recognition language code
        """
        self.subscription_key = subscription_key
        self.region = region
        self.language = language
        self.transcriber: Optional[speechsdk.transcription.ConversationTranscriber] = None

    def configure_speech_config(
        self,
        initial_silence_timeout_ms: int = 5000,
        segmentation_silence_timeout_ms: int = 1500,
        segmentation_max_time_ms: Optional[int] = None,
        start_sensitivity: str = "medium",
        segmentation_strategy: str = "Time"
    ) -> speechsdk.SpeechConfig:
        """
        Configure speech recognition with comprehensive VAD parameters.

        Args:
            initial_silence_timeout_ms: Time to wait for speech to begin (default: 5000ms)
                - Range: 2000-15000ms
                - Use lower for fast interactions, higher for thoughtful responses

            segmentation_silence_timeout_ms: Silence duration to end utterance (default: 1500ms)
                - Range: 500-5000ms
                - Lower = faster response but may cut speech, Higher = complete utterances

            segmentation_max_time_ms: Maximum single utterance duration (optional)
                - Range: 5000-30000ms
                - Forces segmentation after this time regardless of pauses
                - ``None`` (or 0) leaves the property unset

            start_sensitivity: Speech start detection sensitivity (default: "medium")
                - Options: "low", "medium", "high"
                - high = faster detection, low = fewer false positives

            segmentation_strategy: How to segment speech (default: "Time")
                - Options: "Automatic", "Time", "Semantic"
                - "Time" recommended for predictable behavior

        Returns:
            Configured SpeechConfig object

        Raises:
            ValueError: If ``start_sensitivity`` is not "low", "medium" or "high"
        """
        # Validate up front so a bad argument fails before any SDK object is
        # created or partially configured.
        valid_sensitivities = ["low", "medium", "high"]
        if start_sensitivity not in valid_sensitivities:
            raise ValueError(f"start_sensitivity must be one of {valid_sensitivities}")

        logger.info("Configuring SpeechConfig with VAD parameters")
        # Log the timeout value itself -- never the subscription key, which is
        # a secret and must not end up in log output.
        logger.debug("Initial Silence Timeout: %s ms", initial_silence_timeout_ms)

        speech_config = speechsdk.SpeechConfig(
            subscription=self.subscription_key,
            region=self.region
        )

        # Set the recognition language
        speech_config.speech_recognition_language = self.language

        # 1. Initial silence timeout - how long to wait for speech to begin.
        #    Property values are passed as strings, hence the str() conversion.
        speech_config.set_property(
            speechsdk.PropertyId.SpeechServiceConnection_InitialSilenceTimeoutMs,
            str(initial_silence_timeout_ms)
        )

        # 2. Segmentation strategy - forwarded as given by the caller
        speech_config.set_property(
            speechsdk.PropertyId.Speech_SegmentationStrategy,
            segmentation_strategy
        )

        # 3. Speech start detection sensitivity (already validated above)
        speech_config.set_property(
            speechsdk.PropertyId.Speech_StartEventSensitivity,
            start_sensitivity
        )

        # 4. Segmentation silence timeout - silence duration to end utterance
        speech_config.set_property(
            speechsdk.PropertyId.Speech_SegmentationSilenceTimeoutMs,
            str(segmentation_silence_timeout_ms)
        )

        # 5. Optional maximum utterance duration; skipped when None or 0
        if segmentation_max_time_ms:
            speech_config.set_property(
                speechsdk.PropertyId.Speech_SegmentationMaximumTimeMs,
                str(segmentation_max_time_ms)
            )

        # Enable detailed result output for better analysis
        speech_config.output_format = speechsdk.OutputFormat.Detailed

        return speech_config

    def create_conversation_transcriber(
        self,
        audio_config: Optional[speechsdk.AudioConfig] = None,
        initial_silence_timeout_ms: int = 5000,
        segmentation_silence_timeout_ms: int = 1500,
        segmentation_max_time_ms: Optional[int] = None,
        start_sensitivity: str = "medium",
        segmentation_strategy: str = "Time"
    ) -> speechsdk.transcription.ConversationTranscriber:
        """
        Create and configure a conversation transcriber with full VAD control.

        The new transcriber is stored on ``self.transcriber`` (replacing any
        previous one) and also returned.

        Args:
            audio_config: Audio input configuration (default: microphone)
            initial_silence_timeout_ms: Wait time before speech starts (default: 5000ms)
            segmentation_silence_timeout_ms: Silence to end utterance (default: 1500ms)
            segmentation_max_time_ms: Max utterance duration (optional)
            start_sensitivity: Detection sensitivity - "low"/"medium"/"high"
            segmentation_strategy: Segmentation method - "Automatic"/"Time"/"Semantic"

        Returns:
            Configured ConversationTranscriber instance

        Raises:
            ValueError: If ``start_sensitivity`` is invalid
                (raised by ``configure_speech_config``)
        """
        # Configure speech settings with all VAD parameters
        speech_config = self.configure_speech_config(
            initial_silence_timeout_ms=initial_silence_timeout_ms,
            segmentation_silence_timeout_ms=segmentation_silence_timeout_ms,
            segmentation_max_time_ms=segmentation_max_time_ms,
            start_sensitivity=start_sensitivity,
            segmentation_strategy=segmentation_strategy
        )

        # Use default microphone if no audio config provided
        if audio_config is None:
            audio_config = speechsdk.AudioConfig(use_default_microphone=True)

        # Create conversation transcriber
        self.transcriber = speechsdk.transcription.ConversationTranscriber(
            speech_config=speech_config,
            audio_config=audio_config
        )

        return self.transcriber

    def setup_event_handlers(
        self,
        on_transcribed: Optional[callable] = None,
        on_transcribing: Optional[callable] = None,
        on_session_started: Optional[callable] = None,
        on_session_stopped: Optional[callable] = None,
        on_canceled: Optional[callable] = None
    ):
        """
        Setup event handlers for the transcriber.

        Each argument replaces the corresponding default handler. The default
        handlers only log; in addition, the default session-stopped and
        canceled handlers set the module-level ``transcribing_stop`` flag to
        ``True``. Custom handlers passed for those two events are responsible
        for their own stop signalling.

        Args:
            on_transcribed: Handler for final transcription results
            on_transcribing: Handler for interim transcription results
            on_session_started: Handler for session start event
            on_session_stopped: Handler for session stop event
            on_canceled: Handler for cancellation events

        Raises:
            ValueError: If ``create_conversation_transcriber`` has not been called
        """
        if self.transcriber is None:
            raise ValueError("Transcriber not initialized. Call create_conversation_transcriber first.")

        # Default handlers if none provided
        def default_transcribed(evt):
            if evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
                logger.info(f"[TRANSCRIBED] Speaker {evt.result.speaker_id}: {evt.result.text}")
            elif evt.result.reason == speechsdk.ResultReason.NoMatch:
                logger.warning("[NO MATCH] Speech could not be recognized")

        def default_transcribing(evt):
            logger.info(f"[TRANSCRIBING] {evt.result.text}")

        def default_session_started(evt):
            logger.info(f"[SESSION STARTED] Session ID: {evt.session_id}")

        def default_session_stopped(evt):
            # Without the ``global`` declaration the assignment below would
            # only create a local variable and the module flag would never
            # change.
            global transcribing_stop
            logger.info(f"[SESSION STOPPED] Session ID: {evt.session_id}")
            transcribing_stop = True

        def default_canceled(evt):
            # Same as above: must rebind the module-level flag, not a local.
            global transcribing_stop
            logger.warning(f"[CANCELED] Reason: {evt.result.reason}")
            if evt.result.reason == speechsdk.ResultReason.Canceled:
                cancellation = evt.result.cancellation_details
                logger.error(f"  Error details: {cancellation.error_details}")
            transcribing_stop = True

        # Connect event handlers, preferring caller-supplied ones
        self.transcriber.transcribed.connect(on_transcribed or default_transcribed)
        self.transcriber.transcribing.connect(on_transcribing or default_transcribing)
        self.transcriber.session_started.connect(on_session_started or default_session_started)
        self.transcriber.session_stopped.connect(on_session_stopped or default_session_stopped)
        self.transcriber.canceled.connect(on_canceled or default_canceled)


In [None]:
def call_transcriber(audio_filename: str = "sample1.wav"):
    """
    Initialize and run Azure Conversation Transcriber for audio file transcription.

    This function is optimized for fast-paced customer service interactions with:
    - Quick speech detection (5 second initial timeout)
    - Fast response segmentation (800ms silence to end utterance)
    - Time-based segmentation strategy for predictable behavior

    The function performs the following steps:
    1. Loads Azure credentials from environment variables (via a .env file)
    2. Creates an Azure transcriber instance with US English language
    3. Configures VAD (Voice Activity Detection) parameters
    4. Sets up event handlers for transcription events
    5. Starts asynchronous transcription
    6. Polls until the session stops or is canceled
    7. Waits for the user to press Enter, then stops transcription

    Args:
        audio_filename: Path of the WAV file to transcribe (default: "sample1.wav")

    Environment Variables Required:
        AZURE_SPEECH_KEY: Azure Speech Services subscription key
        AZURE_SPEECH_REGION: Azure region (e.g., 'eastus', 'westus')

    Note:
        AZURE_SPEECH_ENDPOINT_ID is not consumed here; the wrapper class
        takes only a key, a region and a language.

    Global Variables:
        transcribing_stop: Boolean flag to control transcription loop.
            Reset to False before the session starts and set to True when
            the session stops or is canceled.

    Raises:
        ValueError: If a required environment variable is not set
        Errors raised by the Speech SDK while creating, starting or stopping
        the transcriber propagate unchanged.
    """
    # Imports kept local so this notebook cell is self-contained
    import os
    from dotenv import load_dotenv

    global transcribing_stop

    # Load environment variables from .env file
    # This allows secure storage of API keys outside the code
    load_dotenv()

    # Retrieve Azure Speech Services credentials from environment and fail
    # early with a clear message instead of passing None to the SDK.
    speech_key = os.getenv("AZURE_SPEECH_KEY")
    service_region = os.getenv("AZURE_SPEECH_REGION")
    missing = [
        name
        for name, value in (
            ("AZURE_SPEECH_KEY", speech_key),
            ("AZURE_SPEECH_REGION", service_region),
        )
        if not value
    ]
    if missing:
        raise ValueError(
            f"Missing required environment variable(s): {', '.join(missing)}"
        )

    # Initialize the Azure Conversation Transcriber
    # This wrapper class manages speech recognition with configurable VAD parameters
    transcriber = AzureConversationTranscriber(
        subscription_key=speech_key,   # Azure subscription key
        region=service_region,         # Azure deployment region
        language="en-US"               # Speech recognition language
    )

    # Configure audio input from a WAV file
    # Alternative: use default microphone with AudioConfig(use_default_microphone=True)
    audio_input = speechsdk.AudioConfig(filename=audio_filename)

    # Create and configure the conversation transcriber with VAD parameters
    transcriber.create_conversation_transcriber(
        audio_config=audio_input,

        # Initial silence timeout: How long to wait for speech to begin (in milliseconds)
        # 5000ms = 5 seconds - Reasonable wait time before assuming no speech
        initial_silence_timeout_ms=5000,

        # Segmentation silence timeout: Duration of silence to end an utterance
        # 800ms = 0.8 seconds - Quick response time for fast-paced conversations
        # Lower values = faster response but may cut off speech
        # Higher values = more complete utterances but slower response
        segmentation_silence_timeout_ms=800,

        # segmentation_max_time_ms and start_sensitivity are left at their defaults

        # Segmentation strategy: How to divide continuous speech into segments
        # "Time" = Use time-based rules for predictable, consistent behavior
        # Alternative: "Automatic" (Azure decides) or "Semantic" (natural language breaks)
        segmentation_strategy="Time"
    )

    # Display configuration confirmation to user
    print("Customer Service Mode: Fast response optimized")

    # Setup default event handlers for transcription events
    # These handlers log:
    # - transcribing: Interim results as speech is being processed
    # - transcribed: Final transcription results with speaker identification
    # - session_started/stopped: Session lifecycle events
    # - canceled: Error conditions and cancellation details
    transcriber.setup_event_handlers()

    # Clear the flag so a previous run cannot make the wait loop exit at once
    transcribing_stop = False

    def _mark_stopped(evt):
        # Rebind the module-level flag; runs in addition to the logging
        # handlers connected above (an SDK event signal accepts several
        # callbacks).
        global transcribing_stop
        transcribing_stop = True

    transcriber.transcriber.session_stopped.connect(_mark_stopped)
    transcriber.transcriber.canceled.connect(_mark_stopped)

    # Start asynchronous transcription
    # The .get() method blocks until the async start operation completes
    transcriber.transcriber.start_transcribing_async().get()

    try:
        # Main transcription loop: poll the flag set by the stop/cancel handlers
        while not transcribing_stop:
            time.sleep(0.5)  # Sleep 500ms to avoid busy-waiting

        # Wait for user confirmation before stopping
        # Allows user to review transcription output before cleanup
        input("Press Enter to stop...")
    finally:
        # Always stop the transcriber, even if the wait is interrupted
        transcriber.transcriber.stop_transcribing_async().get()
    
# Run the demo when this cell executes; blocks in the wait loop and then on the Enter prompt
call_transcriber()