In [59]:
import sys
from pathlib import Path
import espnet
from espnet2.bin.asr_inference_streaming import Speech2TextStreaming
from espnet_model_zoo.downloader import ModelDownloader
import argparse
import numpy as np
import wave
import yaml
import time
from datetime import datetime
import pyaudio

In [60]:
# Define paths
data_dir = Path("data")
exp_dir = Path("exp/asr_stats_raw_jp_word/train")
config_path = data_dir / "config.yaml"
model_path = data_dir / "valid.acc.best.pth"
feats_stats_path = data_dir / "feats_stats.npz"
token_path = data_dir / "tokens.txt"

# Load configuration
with open(config_path, 'r') as f:
    config = yaml.safe_load(f)

# Initialize Speech2Text with local model
speech2text = Speech2TextStreaming(
    asr_train_config=str(config_path),
    asr_model_file=str(model_path),
    token_type=None,
    bpemodel=None,
    maxlenratio=0.0,
    minlenratio=0.0,
    beam_size=20,
    ctc_weight=0.5,
    lm_weight=0.0,
    penalty=0.0,
    nbest=1,
    device="cpu",
    disable_repetition_detection=True,
    decoder_text_length_limit=0,
    encoded_feat_length_limit=0
)

In [61]:
prev_lines = 0  # グローバル変数の定義

def progress_output(text):
    """Helper function to format and display progress output"""
    global prev_lines
    lines=[''] 
    for i in text:
        if len(lines[-1]) > 100:
            lines.append('')
        lines[-1] += i
    for i,line in enumerate(lines):
        if i == prev_lines:
            sys.stderr.write('\n\r')
        else:
            sys.stderr.write('\r\033[B\033[K')
        sys.stderr.write(line)

    prev_lines = len(lines)
    sys.stderr.flush()

def find_new_content(previous_text, current_text):
    """Find the new content that was added in the current text"""
    if not previous_text:
        return current_text
    
    i = 0
    min_len = min(len(previous_text), len(current_text))
    while i < min_len and previous_text[i] == current_text[i]:
        i += 1
    
    return current_text[i:]

In [62]:
def recognize_with_dysfl_latency_measurement(wavfile):
    """Analyze latency specifically for disfluency end tags with improved duplicate detection"""
    start_process_time = time.time()
    real_time_start = time.time()
    
    with wave.open(wavfile, 'rb') as wavfile:
        ch = wavfile.getnchannels()
        rate = wavfile.getframerate()
        nframes = wavfile.getnframes()
        buf = wavfile.readframes(-1)
        data = np.frombuffer(buf, dtype='int16')
    
    speech = data.astype(np.float16)/32767.0
    sim_chunk_length = 640
    previous_text = ""
    current_text = ""
    
    sys.stderr.write("\nDisfluency Tag Latency Analysis:\n")
    sys.stderr.write("Speech Time | Recognition Time | Latency | Disfluency Context\n")
    sys.stderr.write("-" * 70 + "\n")
    
    # Dictionary to store processed disfluencies with their full context
    processed_disfluencies = {}
    
    def get_disfluency_context(text, pos):
        """Extract the complete disfluency phrase"""
        start_pos = text.rfind("<dysfl>", 0, pos)
        if start_pos == -1:
            return None
        end_pos = pos + len("</dysfl>")
        return text[start_pos:end_pos]
    
    if sim_chunk_length > 0:
        for i in range(len(speech)//sim_chunk_length):
            speech_time = (i * sim_chunk_length) / rate
            
            results = speech2text(speech=speech[i*sim_chunk_length:(i+1)*sim_chunk_length], is_final=False)
            if results is not None and len(results) > 0:
                nbests = [text for text, token, token_int, hyp in results]
                text = nbests[0] if nbests is not None and len(nbests) > 0 else ""
                
                if text:
                    # Find all positions of </dysfl> in the text
                    pos = 0
                    while True:
                        pos = text.find('</dysfl>', pos)
                        if pos == -1:
                            break
                            
                        # Get full context of the disfluency
                        dysfl_context = get_disfluency_context(text, pos)
                        if dysfl_context and dysfl_context not in processed_disfluencies:
                            recognition_time = time.time() - real_time_start
                            latency = recognition_time - speech_time
                            
                            processed_disfluencies[dysfl_context] = {
                                'speech_time': speech_time,
                                'recognition_time': recognition_time,
                                'latency': latency,
                                'context': dysfl_context
                            }
                            
                            sys.stderr.write(f"{speech_time:>.2f}s | {recognition_time:>.2f}s | {latency:>+.2f}s | {dysfl_context}\n")
                        
                        pos += 1
                    
                    previous_text = text
    
    # Print summary statistics
    if processed_disfluencies:
        latencies = [info['latency'] for info in processed_disfluencies.values()]
        avg_latency = sum(latencies) / len(latencies)
        max_latency = max(latencies)
        min_latency = min(latencies)
        
        sys.stderr.write("\nDisfluency Latency Statistics:\n")
        sys.stderr.write(f"Average Latency: {avg_latency:>.2f}s\n")
        sys.stderr.write(f"Maximum Latency: {max_latency:>.2f}s\n")
        sys.stderr.write(f"Minimum Latency: {min_latency:>.2f}s\n")
        sys.stderr.write(f"Total Unique Disfluencies: {len(processed_disfluencies)}\n")

In [63]:
def process_realtime_with_latency_measurement():
    """Process real-time audio input with latency measurement"""
    CHUNK = 2048
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 16000
    RECORD_SECONDS = 5
    
    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT, channels=CHANNELS, rate=RATE, input=True, frames_per_buffer=CHUNK)
    
    previous_text = ""
    current_text = ""
    start_time = 0
    start_process_time = time.time()
    
    # Print header for timing information
    sys.stderr.write("\nTiming Information:\n")
    sys.stderr.write("Audio Time | Process Time | Latency | Recognition Result\n")
    sys.stderr.write("-" * 70 + "\n")
    
    try:
        for i in range(0, int(RATE/CHUNK*RECORD_SECONDS)+1):
            current_audio_time = i * CHUNK / RATE
            
            data = stream.read(CHUNK)
            data = np.frombuffer(data, dtype='int16')
            data = data.astype(np.float16)/32767.0
            
            if i == int(RATE/CHUNK*RECORD_SECONDS):
                results = speech2text(speech=data, is_final=True)
            else:
                results = speech2text(speech=data, is_final=False)
            
            if results is not None and len(results) > 0:
                nbests = [text for text, token, token_int, hyp in results]
                text = nbests[0] if nbests is not None and len(nbests) > 0 else ""
                
                if text and text != current_text:
                    new_content = find_new_content(previous_text, text)
                    if new_content:
                        process_time = time.time() - start_process_time
                        latency = process_time - current_audio_time
                        
                        sys.stderr.write(f"{current_audio_time:>.2f}s | {process_time:>.2f}s | {latency:>+.2f}s | ")
                        sys.stderr.write(f"<{start_time:.2f}-{current_audio_time:.2f}> {new_content}\n")
                    
                    previous_text = current_text
                    current_text = text
                    start_time = current_audio_time
        
        # Output final segment
        final_time = RECORD_SECONDS
        if current_text:
            new_content = find_new_content(previous_text, current_text)
            if new_content:
                process_time = time.time() - start_process_time
                latency = process_time - final_time
                sys.stderr.write(f"{final_time:>.2f}s | {process_time:>.2f}s | {latency:>+.2f}s | ")
                sys.stderr.write(f"<{start_time:.2f}-{final_time:.2f}> {new_content}\n")
            
    finally:
        stream.stop_stream()
        stream.close()
        p.terminate()

In [64]:
if __name__ == "__main__":
    # Test with audio file
    wavfile = 'sample.wav'
    recognize_with_dysfl_latency_measurement(wavfile)


Disfluency Tag Latency Analysis:
Speech Time | Recognition Time | Latency | Disfluency Context
----------------------------------------------------------------------
1.88s | 0.26s | -1.62s | <dysfl> す ー </dysfl>
2.40s | 0.60s | -1.80s | <dysfl> い </dysfl>
2.40s | 0.60s | -1.80s | <dysfl> あ の </dysfl>
3.40s | 1.18s | -2.22s | <dysfl> い す ー </dysfl>
6.48s | 3.07s | -3.41s | <dysfl> え っ と ー </dysfl>
11.08s | 7.50s | -3.58s | <dysfl> え ー と </dysfl>

Disfluency Latency Statistics:
Average Latency: -2.40s
Maximum Latency: -1.62s
Minimum Latency: -3.58s
Total Unique Disfluencies: 6
