In [None]:
from datetime import datetime, time
import os
import pandas as pd
import re
import subprocess
from pathlib import Path

def parse_time(time_str):
    """Convert an ``HH:MM:SS`` timestamp string to a number of seconds.

    The seconds field may carry a fractional part written with either a
    decimal comma or a decimal point (``"00:01:02,500"`` or
    ``"00:01:02.500"``).

    Args:
        time_str (str): Timestamp with exactly three colon-separated fields.

    Returns:
        float: Total seconds represented by the timestamp, hours included.

    Raises:
        ValueError: If the string does not have exactly three
            colon-separated fields or a field is not numeric.
    """
    # Normalise the decimal comma so float() can parse the seconds field.
    hours, minutes, seconds = time_str.replace(",", ".").split(":")
    # The hours field must be counted; dropping it makes every timestamp at
    # or past one hour wrap around to the start of the recording.
    return float(hours) * 3600 + float(minutes) * 60 + float(seconds)

buffer = 5.0  # Buffer time in seconds

# For every rating workbook in "output_ratings":
#   1. attach to each rated window the transcript utterances that fall inside
#      it (with `buffer` seconds of tolerance at the edges), and
#   2. cut the longest user utterance of that window out of the raw recording
#      with ffmpeg.
# NOTE(review): relies on `target_enjoyment_df` (columns "PID" and "Q1-Robot")
# being defined earlier in the session; it is not created in this cell.
for file in os.listdir("output_ratings"):
    person = file.split("_")[0].strip("P")
    print(f"Processing file: {file} for person {person}")
    if not file.endswith(".xlsx"):
        continue

    df = pd.read_excel(os.path.join("output_ratings", file))
    df["Exchange text"] = ""

    # Find the corresponding exchange text (looked up once per file).
    first_interaction_robot = target_enjoyment_df.loc[
        target_enjoyment_df["PID"] == int(person), "Q1-Robot"
    ].values[0]
    exchange_text_df = pd.read_excel(
        f"exchange-data/exchange_data-P{person}-{first_interaction_robot}.xlsx"
    )

    # Parse every transcript timestamp once per file instead of re-parsing it
    # several times for each rating row. The raw strings are kept because
    # ffmpeg is given the original timestamp text.
    exchanges = [
        (
            parse_time(exchange_row["Start Time"]),
            parse_time(exchange_row["End Time"]),
            exchange_row["Start Time"],
            exchange_row["End Time"],
            exchange_row["Utterance"],
        )
        for _, exchange_row in exchange_text_df.iterrows()
    ]

    audio_file = f"data/raw/P{person}/audio-{first_interaction_robot}-P{person}.wav"
    output_dir = f"data/audio/P{person}"

    for i, row in df.iterrows():
        start_time = parse_time(row["Start Time"])
        end_time = parse_time(row["End Time"])
        turn = row["Turn ID"]
        all_user_utterances = []
        text_parts = []
        last = "r"  # speaker of the most recently accepted utterance

        for ex_start, ex_end, raw_start, raw_end, utterance in exchanges:
            if start_time < ex_start and end_time > ex_end:
                # Utterance lies strictly inside the rated window.
                text_parts.append(utterance + " ")
                if utterance.startswith("User: "):
                    last = "u"
                    all_user_utterances.append(
                        {"start": raw_start, "end": raw_end, "text": utterance, "length": len(utterance)}
                    )
                else:
                    last = "r"
            elif start_time < ex_start + buffer and end_time > ex_end and ex_start < end_time:
                # Utterance began up to `buffer` seconds before the window
                # opened: only robot lines are accepted here.
                if utterance.startswith("Robot: "):
                    text_parts.append(utterance + " ")
                    last = "r"
            elif start_time < ex_start and end_time > ex_end - buffer:
                # Utterance ends up to `buffer` seconds after the window
                # closed: only a user line directly following a robot line.
                if utterance.startswith("User: ") and last == "r":
                    text_parts.append(utterance + " ")
                    all_user_utterances.append(
                        {"start": raw_start, "end": raw_end, "text": utterance, "length": len(utterance)}
                    )
                    last = "u"

        df.at[i, "Exchange text"] = "".join(text_parts)

        if not all_user_utterances:
            continue

        # Longest user utterance by character count; on ties the earliest
        # one wins, which matches a stable descending sort.
        longest_user_utterance = max(all_user_utterances, key=lambda u: u["length"])

        # ffmpeg expects a decimal point in timestamps.
        clip_start = longest_user_utterance["start"].replace(",", ".")
        clip_end = longest_user_utterance["end"].replace(",", ".")

        # Trim the audio file using ffmpeg
        os.makedirs(output_dir, exist_ok=True)
        print(f"Trimming audio file: {audio_file} from {clip_start} to {clip_end}")
        trimmed_audio_file = f"data/audio/P{person}/trimmed_audio-P{person}-{first_interaction_robot}-{turn}.wav"
        print(clip_start, clip_end)
        cmd = [
            "ffmpeg",
            "-ss", clip_start,
            "-i", audio_file,
            "-t", str(parse_time(clip_end) - parse_time(clip_start)),
            "-c", "copy",
            trimmed_audio_file,
        ]
        # Run the command; check=True raises CalledProcessError on failure.
        try:
            subprocess.run(
                cmd,
                check=True,
                # ffmpeg asks on stdin before overwriting an existing file;
                # with its output captured that prompt is invisible and can
                # block forever. An empty stdin makes it fail fast instead.
                stdin=subprocess.DEVNULL,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True,
            )
        except subprocess.CalledProcessError as e:
            # Print ffmpeg's full error output.
            print("ffmpeg failed with exit code", e.returncode)
            print("ffmpeg stderr:\n", e.stderr)
        else:
            print("Trimmed successfully:", trimmed_audio_file)

    os.makedirs(f"data/text/P{person}", exist_ok=True)
    # Save the updated DataFrame back to Excel
    df.to_excel(f"data/text/P{person}/text-aligned-P{person}.xlsx", index=False)
    # Print the number of entries in the processed file
    print(f"Processed {file} with {len(df)} entries.")

In [None]:
import webrtcvad
import collections
import contextlib
import wave
import pydub
import argparse
import os

def _find_speech_bounds(vad, raw_audio_data, sample_rate, frame_duration_ms, frame_size_bytes):
    """Return (first_speech_start_ms, last_speech_end_ms), or None if no frame is speech."""
    # Integer division drops any trailing partial frame, so every slice
    # below is exactly frame_size_bytes long.
    num_frames = len(raw_audio_data) // frame_size_bytes
    print(f"Processing {num_frames} frames of {frame_duration_ms}ms each...")

    speech_start_ms = None
    speech_end_ms = None
    for i in range(num_frames):
        offset = i * frame_size_bytes
        frame = raw_audio_data[offset:offset + frame_size_bytes]
        try:
            is_speech = vad.is_speech(frame, sample_rate)
        except Exception as e:
            # Best effort: report the frame that the VAD rejected and move on.
            print(f"Error processing frame {i}: {e}")
            continue

        if is_speech:
            frame_start_ms = i * frame_duration_ms
            if speech_start_ms is None:
                speech_start_ms = frame_start_ms
            # Always extend the end to the end of the latest speech frame.
            speech_end_ms = frame_start_ms + frame_duration_ms

    if speech_start_ms is None:
        return None
    return speech_start_ms, speech_end_ms


def trim_silence(audio_path, output_path, aggressiveness=2, frame_duration_ms=30, padding_duration_ms=500):
    """
    Trims silence from the beginning and end of an audio file using VAD.

    The audio is converted to what the VAD is fed (mono, 16-bit samples, and
    resampled to 32000 Hz when its rate is not 8000/16000/32000/48000 Hz),
    and the saved clip is cut from that converted audio.

    Args:
        audio_path (str): Path to the input audio file.
        output_path (str): Path to save the trimmed audio file. If it has no
                           extension, ".wav" is appended and WAV is written.
        aggressiveness (int): VAD aggressiveness mode (0-3), passed to
                             webrtcvad.Vad. Default: 2.
        frame_duration_ms (int): Duration of each audio frame for VAD (10, 20, or 30). Default: 30.
        padding_duration_ms (int): Duration of audio to keep before the first speech
                                   frame and after the last speech frame (in ms). Default: 500.

    Returns:
        bool: True if trimming was successful and file saved, False otherwise
              (unsupported frame duration, unreadable input, failed resampling,
              no speech detected, or export error).
    """
    # Reject unsupported frame lengths up front: 0 would divide by zero when
    # counting frames, and other values make the VAD fail on every frame.
    if frame_duration_ms not in (10, 20, 30):
        print(f"Error: frame_duration_ms must be 10, 20, or 30 (got {frame_duration_ms})")
        return False

    print(f"Loading audio file: {audio_path}")
    try:
        # Load audio using pydub - handles format conversion
        audio = pydub.AudioSegment.from_file(audio_path)
    except FileNotFoundError:
        print(f"Error: Input file not found at {audio_path}")
        return False
    except Exception as e:
        print(f"Error loading audio file: {e}")
        return False

    # Ensure audio is mono and 16-bit PCM for webrtcvad compatibility
    # webrtcvad supports 8000, 16000, 32000, 48000 Hz sample rates
    sample_rate = audio.frame_rate
    supported_rates = [8000, 16000, 32000, 48000]
    if sample_rate not in supported_rates:
        # Any unsupported rate is resampled to a fixed 32000 Hz.
        target_rate = 32000
        print(f"Warning: Sample rate {sample_rate}Hz not directly supported by webrtcvad. Resampling to {target_rate}Hz.")
        try:
            audio = audio.set_frame_rate(target_rate)
            sample_rate = target_rate
        except Exception as e:
            print(f"Error resampling audio: {e}")
            return False

    if audio.channels > 1:
        print(f"Converting audio to mono.")
        audio = audio.set_channels(1)

    if audio.sample_width != 2: # 2 bytes = 16 bits
        print(f"Converting audio to 16-bit PCM.")
        audio = audio.set_sample_width(2)

    print(f"Audio Properties for VAD: Rate={sample_rate}Hz, Channels={audio.channels}, SampleWidth={audio.sample_width} bytes")

    vad = webrtcvad.Vad(aggressiveness)

    # Calculate frame size in bytes
    samples_per_frame = int(sample_rate * frame_duration_ms / 1000)
    frame_size_bytes = samples_per_frame * audio.sample_width

    bounds = _find_speech_bounds(vad, audio.raw_data, sample_rate, frame_duration_ms, frame_size_bytes)
    if bounds is None:
        print("No speech detected in the audio file.")
        # Nothing is written in this case; the caller only gets False.
        return False
    speech_start_ms, speech_end_ms = bounds

    # Apply padding, clamped to the bounds of the clip
    start_trim_ms = max(0, speech_start_ms - padding_duration_ms)
    end_trim_ms = min(len(audio), speech_end_ms + padding_duration_ms) # len(audio) is in ms

    print(f"Detected speech from {speech_start_ms}ms to {speech_end_ms}ms")
    print(f"Trimming audio from {start_trim_ms}ms to {end_trim_ms}ms (including padding)")

    # Trim the audio using pydub slicing
    trimmed_audio = audio[start_trim_ms:end_trim_ms]

    # Export the trimmed audio
    try:
        # Determine output format from the output file extension
        output_format = os.path.splitext(output_path)[1][1:]
        if not output_format: # Default to wav if no extension
            output_format = "wav"
            output_path += ".wav"
            print(f"No output format specified, defaulting to WAV. Saving to {output_path}")

        print(f"Saving trimmed audio to: {output_path} (Format: {output_format})")
        trimmed_audio.export(output_path, format=output_format)
        return True
    except Exception as e:
        print(f"Error saving trimmed audio file: {e}")
        return False

# --- Main Execution ---
if __name__ == "__main__":
    # Batch driver: run trim_silence over every .wav clip of participants
    # P4..P42 and mirror the results under data/audio_vad_500/.
    for person in range(4, 43):
        input_dir = f"data/audio/P{person}"
        output_dir = f"data/audio_vad_500/P{person}"

        # A participant without a clip directory must not abort the whole
        # batch (os.listdir would raise FileNotFoundError).
        if not os.path.isdir(input_dir):
            print(f"Skipping P{person}: input directory not found: {input_dir}")
            continue

        # Create output directory if it doesn't exist
        if not os.path.exists(output_dir):
            print(f"Creating output directory: {output_dir}")
        os.makedirs(output_dir, exist_ok=True)

        for file in os.listdir(input_dir):
            if not file.endswith(".wav"):
                continue
            input_path = os.path.join(input_dir, file)
            output_path = os.path.join(output_dir, file)
            print(f"Trimming silence from {input_path} to {output_path}")
            success = trim_silence(
                audio_path=input_path,
                output_path=output_path,
            )
            if success:
                print("Trimming complete.")
            else:
                print("Trimming failed.")