# Time Difference Visualisation

In [None]:
import os
import glob
import numpy as np
import pandas as pd
from scipy.io import wavfile
from scipy.signal import find_peaks
import matplotlib.pyplot as plt
import matplotlib.ticker as tck
import seaborn as sns
# Four-colour palette used by the figures in this notebook; `palette2` is kept
# as a second name for the same palette object.
palette = sns.color_palette(["#D81B60", "#1E88E5", "#FFC107", "#004D40"])
palette2 = palette
sns.set_theme(
    context='poster',
    style='ticks',
    palette=palette,
    font_scale=1.0,
)

In [None]:
# Root directory of the experiment; the cells below read their CSV and WAV
# inputs from f"{base_dir}/experiment-data/".
base_dir = "C:/Users/ryoma/D/logbot-data/umineko/Umineko2024/v5-umineko-2024-playback/v5-vid-mic-zure-test"
# The path above is an absolute path on one specific machine: change it to
# match your environment, for example:
# base_dir = "../path_to_base_dir"

## Data Visualisation

In [None]:
# Load the audio-side and frame-side clap tables and join them on the clap id.
# A left join keeps every audio row even when no frame row shares its id.
experiment_dir = f"{base_dir}/experiment-data"
df_audio = pd.read_csv(f"{experiment_dir}/clap_start_audio.csv")
df_frame = pd.read_csv(f"{experiment_dir}/clap_start_frame.csv")
df = df_audio.merge(df_frame, on='id', how='left', suffixes=('', '_right'))

# Append `time_diff` (peak_time_ms - clap_start_ms) as the last column.
df.insert(df.shape[1], 'time_diff', df['peak_time_ms'] - df['clap_start_ms'])

# Descriptive statistics of the time difference, printed in a fixed order.
time_diff = df['time_diff']
summary_rows = [
    ("mean", np.mean(time_diff)),
    ("median", np.median(time_diff)),
    ("maximum", np.max(time_diff)),
    ("min", np.min(time_diff)),
    ("range", np.max(time_diff) - np.min(time_diff)),
    ("var", np.var(time_diff)),
    ("std", np.std(time_diff)),
    ("25% quantile", np.quantile(time_diff, q=0.25)),
    ("75% quantile", np.quantile(time_diff, q=0.75)),
]
print(f"N = {len(df)}")
for label, value in summary_rows:
    print(f"time diff {label}: {value:.2f}")

# Preview the first rows of the merged table.
display(df.head(3))

In [None]:
# Figure: distribution of the time difference per device, drawn as a violin
# plot with an offset box plot and the individual observations on top.
fig, ax = plt.subplots(1, 1, figsize=(8, 8))

# Summary statistics shown as horizontal reference lines. The variables are
# deliberately NOT called `min` / `max`: rebinding those names in the notebook's
# global namespace would shadow the builtins for every cell executed afterwards.
time_diff_median = np.median(df['time_diff'])
time_diff_min = np.min(df['time_diff'])
time_diff_max = np.max(df['time_diff'])

# axhline's xmin/xmax are fractions of the axes width (0 = left, 1 = right);
# the defaults already span the full width, so they are not passed.
ax.axhline(y=time_diff_max, color=palette[0], linestyle='-', label=f'max      = {time_diff_max:.2f}')
ax.axhline(y=time_diff_median, color=palette[1], linestyle='-', label=f'median = {time_diff_median:.2f}')
ax.axhline(y=time_diff_min, color=palette[2], linestyle='-', label=f'min       = {time_diff_min:.2f}')

# Horizontal grid lines on both the major ticks and minor ticks every 25 units.
ax.yaxis.set_minor_locator(tck.MultipleLocator(25))
ax.grid(which='minor', axis='y', linestyle='-', linewidth=0.75)
ax.grid(which='major', axis='y', linestyle='-', linewidth=0.75)

# Violin plot
violin_parts = sns.violinplot(
    x='device', y='time_diff', data=df,
    inner=None,
    linewidth=0.0, color='#333333', saturation=0.5,
    ax=ax
)
# Restyle the violin bodies as translucent grey shapes.
for pc in violin_parts.collections:
    pc.set_edgecolor('black')
    pc.set_facecolor('#333333')
    pc.set_alpha(0.25)

# Box plot, one box per device, placed slightly to the right of its violin.
unique_devices = df['device'].unique()
device_positions = np.arange(len(unique_devices)) + 0.25  # shift 0.25 to right
data_by_device = [df[df['device'] == device]['time_diff'] for device in unique_devices]

box_parts = ax.boxplot(
    data_by_device,
    positions=device_positions,
    widths=0.15,
    patch_artist=True,
    medianprops=dict(color='black', linewidth=3.0),
    whiskerprops=dict(color='black', linewidth=2.0),
    capprops=dict(color='black', linewidth=2.0),
    boxprops=dict(facecolor='white', color='black', linewidth=2.0),
)

# Put the tick marks back on the integer device slots and label them with the
# device names (the boxes were drawn at the shifted positions).
ax.set_xticks(np.arange(len(unique_devices)))
ax.set_xticklabels(unique_devices)

# Seed NumPy's global generator immediately before the jittered strip plot so
# that the point positions are the same on every run.
np.random.seed(558)
sns.stripplot(x='device', y='time_diff', data=df, color="#333333", jitter=True, size=9, alpha=0.7, ax=ax)
ax.set_yticks(np.arange(0, 500, 50))
ax.set_ylim(130, 330)

plt.legend(ncol=1)
plt.xlabel('Device', labelpad=10)
plt.ylabel('Time difference (ms)', labelpad=10)
plt.show()

# Output directory for the exported figure; uncomment the format(s) you need.
save_dir = "../output/figure-for-paper/"
# fig.savefig(f"{save_dir}/png/fig_s07_zure_video_audio_time_diff.png", dpi=350, bbox_inches="tight", pad_inches=0.25, transparent=False)
# fig.savefig(f"{save_dir}/pdf/fig_s07_zure_video_audio_time_diff.pdf", dpi=600, bbox_inches="tight", pad_inches=0.25, transparent=False)
# fig.savefig(f"{save_dir}/zure_video_audio_time_diff.svg", bbox_inches="tight", pad_inches=0.25, transparent=False)


In [None]:
# Milliseconds per frame at 30 frames per second (about 33.3 ms).
# NOTE(review): presumably printed as a reference scale for the time
# differences above — confirm.
print(1000 / 30)

## Audio Data Peak Detection

In [None]:
def road_wav_data_and_info(audio_path):
    """Load a WAV file, peak-normalise its samples and describe its location.

    Args:
        audio_path: Path to the ``.wav`` file to read.

    Returns:
        A tuple ``(sample_rate, data, dirname, device, fname)``:

        * ``sample_rate`` -- sample rate returned by ``scipy.io.wavfile.read``.
        * ``data`` -- floating-point samples divided by the largest absolute
          sample value, so the peak magnitude is 1.0. Silent or empty input is
          returned unscaled (all zeros) instead of being divided by zero.
        * ``dirname`` -- directory containing the file.
        * ``device`` -- base name of that directory.
        * ``fname`` -- file name without its extension.
    """
    dirname = os.path.dirname(audio_path)
    device = os.path.basename(dirname)
    # splitext removes only the trailing extension; str.replace would also
    # strip a ".wav" that happens to occur in the middle of the name.
    fname = os.path.splitext(os.path.basename(audio_path))[0]
    sample_rate, raw = wavfile.read(audio_path)

    # Promote integer PCM to float64 BEFORE taking the absolute value: in
    # int16, abs(-32768) overflows back to -32768, which would make the
    # normalisation divisor too small (or negative).
    if np.issubdtype(raw.dtype, np.floating):
        data = raw
    else:
        data = raw.astype(np.float64)

    peak = np.max(np.abs(data)) if data.size else 0.0
    if peak > 0:
        data = data / peak  # normalization

    print(f"audio_path: {audio_path}")
    print(f"sample_rate: {sample_rate}")
    print(f"len(data): {len(data)}")
    print(f"length: {len(data)/sample_rate}")

    return sample_rate, data, dirname, device, fname

def detect_clap_starts(sample_rate, data, threshold=0.5, distance=10000):
    """Find peaks in the squared signal and express them as times.

    Args:
        sample_rate: Number of samples per second; used to convert peak sample
            indices into seconds.
        data: Sample array (it is squared element-wise to obtain the energy).
        threshold: Minimum energy a peak must reach; forwarded to
            ``scipy.signal.find_peaks`` as ``height``.
        distance: Minimum spacing between neighbouring peaks, forwarded to
            ``find_peaks`` as ``distance``.

    Returns:
        ``(peak_times, energy)`` -- the peak positions divided by
        ``sample_rate`` and the squared signal the peaks were searched in.
    """
    energy = data * data
    peak_indices = find_peaks(energy, height=threshold, distance=distance)[0]
    return peak_indices / sample_rate, energy

In [None]:
def plot_raw_data_and_energy_with_peaks(sample_rate, data, energy, peak_times, sup_title=None):
    """Plot the waveform and its energy, marking the detected peaks.

    The figure has two stacked panels sharing the same x unit (sample index):
    the upper one shows ``data``, the lower one shows ``energy`` with one
    marker per entry of ``peak_times``.

    Args:
        sample_rate: Samples per second; used to convert ``peak_times`` back
            into sample indices.
        data: Samples drawn in the upper panel (y-axis limited to -1.1..1.1).
        energy: Values drawn in the lower panel (y-axis limited to -0.1..1.1);
            indexed at the peak positions to place the markers.
        peak_times: Array of peak times in seconds.
        sup_title: Optional figure title; omitted when ``None``.

    Returns:
        The created matplotlib figure (it is also shown via ``plt.show()``).
    """
    GRIDSPEC_KW = {'wspace': 0.1, 'hspace': 0.5}
    fig, (ax0, ax1) = plt.subplots(2, 1, figsize=(20, 8), gridspec_kw=GRIDSPEC_KW)

    # ax0: raw waveform
    ax0.plot(data, label='Data', color="#555555")
    ax0.set_yticks(np.arange(-2.0, 2.0, 0.5))
    ax0.set_ylim(-1.1, 1.1)
    ax0.set_ylabel("Amplitude", labelpad=10)
    ax0.legend()
    ax0.grid()

    # Convert seconds back to sample indices. Scale first and round afterwards:
    # casting the times to int before multiplying would truncate every peak to
    # a whole second and read the energy at the wrong sample.
    peak_indices = np.rint(np.asarray(peak_times) * sample_rate).astype(int)

    # ax1: energy with the detected peaks on top
    ax1.plot(energy, color=palette[1], label='Energy')
    ax1.scatter(
        peak_indices,
        energy[peak_indices],
        color=palette[0], zorder=5, label='Clap Starts'
    )
    ax1.set_yticks(np.arange(-1.0, 2.0, 0.5))
    ax1.set_ylim(-0.1, 1.1)
    # ax1.set_title("Clap Detection", pad=10)
    ax1.set_xlabel("Sample Index")
    ax1.set_ylabel("Energy", labelpad=10)
    ax1.legend()
    ax1.grid()

    if sup_title is not None:
        fig.suptitle(sup_title)

    plt.show()
    return fig

### Run

In [None]:
# Collect every WAV file located one directory level below experiment-data/.
audio_path_target = f'{base_dir}/experiment-data/*/*.wav'
audio_path_list = sorted(glob.glob(audio_path_target))
print(f"N of wav files: {len(audio_path_list)}")

# Detection settings, identical for every file.
threshold = 0.1   # minimum energy of a peak (the signal is peak-normalised)
distance = 10000  # minimum distance between peaks
fps = 30          # frames per second used to turn peak times into frame indices

# One entry per detected peak, accumulated over all files.
device_list = []
fname_list = []
clap_list = []
peak_times_s_list = []
peak_times_ms_list = []
peak_times_kakeru_30fps_list = []
peak_times_kakeru_30fps_hiku_1_list = []

# To try the pipeline on a subset, iterate over audio_path_list[:4] instead.
for audio_path in audio_path_list:

    # Load audio data
    sample_rate, data, dirname, device, fname = road_wav_data_and_info(audio_path)

    peak_times, energy = detect_clap_starts(sample_rate, data, threshold, distance)
    n_peaks = len(peak_times)
    print(f"N of peaks: {n_peaks}")

    # Show the detected clap start times
    # print(f'Detected clap start times (in seconds): {peak_times}')
    # print(f'Detected clap start frame index:, {peak_times * 30 - 1}')
    device_list.extend([device] * n_peaks)
    fname_list.extend([fname] * n_peaks)
    clap_list.extend(np.arange(1, n_peaks + 1, 1))  # 1-based clap number within the file
    peak_times_s_list.extend(peak_times)
    peak_times_ms_list.extend(peak_times * 1000)
    peak_times_kakeru_30fps_list.extend(peak_times * fps)
    peak_times_kakeru_30fps_hiku_1_list.extend(peak_times * fps - 1)

    # Visualisation
    sup_title = f"{device}: {fname}"
    plot_raw_data_and_energy_with_peaks(sample_rate, data, energy, peak_times, sup_title=sup_title)

In [None]:
# Assemble the accumulated per-peak lists into one table, one row per detected
# peak, with a 1-based running id.
n_rows = len(device_list)
data_dict = dict(
    id=list(np.arange(1, n_rows + 1, 1)),
    device=device_list,
    fname=fname_list,
    clap=clap_list,
    peak_time_s=peak_times_s_list,
    peak_time_ms=peak_times_ms_list,
    peak_index_1=peak_times_kakeru_30fps_list,         # peak time x 30
    peak_index_0=peak_times_kakeru_30fps_hiku_1_list,  # peak time x 30, minus 1
)
df = pd.DataFrame(data_dict)
display(df)

In [None]:
# Destination of the audio peak table. Writing is disabled by default;
# uncomment the line below to (over)write the CSV.
save_path = "/".join((base_dir, "experiment-data", "clap_start_audio.csv"))
# df.to_csv(save_path, index=False)

# Additional Note

## Point 1
In an uncorrected version of the video data:

The sound is heard slightly after the moment when the hands clap together.
This audio lag is inherent in this version of the video.
The original video conversion software can correct this issue.  

## Point 2
Because the distance is short, the difference in arrival time between light and sound should not be detectable.

Speed of Light: Approximately 880,000 times faster than the speed of sound.  
Speed of Sound: 340m/s.  
Given the speed of light is so fast, its delay time can be ignored.
  
At a distance of 500 mm (50 cm):  

$$t = \frac{\text{Distance}}{\text{Speed of Sound}} = \frac{0.5 \, \mathrm{m}}{340 \, \mathrm{m/s}} \approx 1.47 \, \mathrm{ms}$$

Theoretically, this slight delay does exist. However, it is at a level that humans cannot perceive. Moreover, when compared to the audio lag visible in the uncorrected video data, this theoretical delay is negligible.