In [None]:
# Audio Exploration for Medical Podcast Dataset

This notebook performs exploratory data analysis (EDA) on medical podcast audio files.
The objective is to understand audio duration and formats before applying preprocessing
and transcription pipelines.

This analysis supports design decisions for segmentation and summarization.


In [None]:

import os
import librosa
import numpy as np

# Path to raw medical podcast audio files
audio_folder = "../Data/audio_raw"

# Collect audio files (extension match is case-insensitive).
# os.listdir() returns entries in arbitrary order, so the listing is sorted
# to keep audio_files / audio_durations in a reproducible order across runs.
audio_files = sorted(
    f for f in os.listdir(audio_folder)
    if f.lower().endswith((".mp3", ".wav"))
)

print("Total audio files found:", len(audio_files))

# Compute duration of each audio file.
# audio_durations[i] is the duration (in seconds) of audio_files[i].
audio_durations = []

for file_name in audio_files:
    file_path = os.path.join(audio_folder, file_name)
    # sr=None keeps the file's native sampling rate instead of resampling.
    audio, sample_rate = librosa.load(file_path, sr=None)
    duration_seconds = librosa.get_duration(y=audio, sr=sample_rate)
    audio_durations.append(duration_seconds)

if audio_durations:
    # Compute the mean once and reuse it for both the seconds and minutes lines.
    mean_duration_seconds = np.mean(audio_durations)

    # Display statistics
    print("Minimum duration (seconds):", round(min(audio_durations), 2))
    print("Maximum duration (seconds):", round(max(audio_durations), 2))
    print("Average duration (seconds):", round(mean_duration_seconds, 2))
    print("Average duration (minutes):", round(mean_duration_seconds / 60, 2))

    # Observation note
    print("\nObservation:")
    print("- Audio files have varying durations.")
    print("- Duration analysis supports preprocessing and topic segmentation decisions.")
    print("- This notebook is used only for exploratory analysis.")
else:
    # min()/max() raise ValueError on an empty list, so report the situation
    # explicitly instead of crashing when the folder holds no matching files.
    print("No .mp3 or .wav files found in", repr(audio_folder), "- skipping duration statistics.")
