# Audio Exploration for Medical Podcast Dataset
 # DATASET EXPLORATION & PREPROCESSING

In [None]:
# Dataset root on Google Drive: /content/drive/MyDrive/Audio_recording_whisper/audio_recordings

In [None]:
import os

# Root folder of the dataset on the mounted Google Drive.
ROOT_DIR = "/content/drive/MyDrive/Audio_recording_whisper/audio_recordings"

# Build the sub-folder paths relative to ROOT_DIR. os.path.join discards every
# earlier component when a later one is absolute, so passing the full absolute
# path again would silently ignore ROOT_DIR.
AUDIO_DIR = os.path.join(ROOT_DIR, "Audio_Recordings")
TEXT_DIR  = os.path.join(ROOT_DIR, "Clean_Transcripts")

print("AUDIO_DIR:", AUDIO_DIR)
print("TEXT_DIR :", TEXT_DIR)

# Preview at most the first 20 entries (sorted by name) of each folder.
print("\nAudio files:")
print(sorted(os.listdir(AUDIO_DIR))[:20])

print("\nText files:")
print(sorted(os.listdir(TEXT_DIR))[:20])

In [None]:
import os

AUDIO_DIR = "/content/drive/MyDrive/Audio_recording_whisper/audio_recordings/Audio_Recordings"
TEXT_DIR  = "/content/drive/MyDrive/Audio_recording_whisper/audio_recordings/Clean_Transcripts"

# Upper bound on how many audio/transcript pairs are kept for exploration.
MAX_PAIRS = 30

# Audio candidates: every .mp3 (extension matched case-insensitively), sorted by name.
audio_files = sorted([f for f in os.listdir(AUDIO_DIR) if f.lower().endswith(".mp3")])
# A set gives constant-time membership checks for transcript names.
text_files  = set(os.listdir(TEXT_DIR))

# Pair each audio file with the transcript that shares its stem.
matched_pairs = []
for af in audio_files:
    stem = os.path.splitext(af)[0]    # 'CAR0001' from 'CAR0001.mp3'
    txt_name = stem + ".txt"          # 'CAR0001.txt'
    if txt_name in text_files:
        matched_pairs.append({"audio": af, "text": txt_name})

# Keep only the first MAX_PAIRS pairs; report the total separately so the
# printed "total" is not confused with the truncated subset.
pairs = matched_pairs[:MAX_PAIRS]
print("Total matching pairs:", len(matched_pairs))
print("Pairs kept for exploration:", len(pairs))
print("First 5 pairs:", pairs[:5])


In [None]:
import librosa
import numpy as np
import pandas as pd

# Build one summary record per audio/transcript pair, then tabulate them.
rows = []
for pair in pairs:
    audio_path = os.path.join(AUDIO_DIR, pair["audio"])
    text_path = os.path.join(TEXT_DIR, pair["text"])

    # sr=None asks librosa to keep the file's own sampling rate.
    y, sr = librosa.load(audio_path, sr=None)
    duration_s = len(y) / sr                          # number of samples / samples per second
    rms = float(np.sqrt(np.mean(np.square(y))))       # root-mean-square amplitude of the signal

    with open(text_path, "r", encoding="utf-8") as fh:
        transcript = fh.read().strip()

    rows.append(
        dict(
            file=pair["audio"],
            sr=sr,
            duration_s=duration_s,
            rms=rms,
            text_len=len(transcript),
            transcript=transcript,
        )
    )

df = pd.DataFrame(rows)
print("DF shape:", df.shape)
print(df.head())


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Headline statistics for the summary table.
print("\nSample rates:", df["sr"].unique())
print("Mean duration (s):", df["duration_s"].mean())
print("Mean text length (chars):", df["text_len"].mean())

# Two histograms side by side: audio duration (left) and transcript length (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = [
    ("duration_s", "Audio Duration Distribution", "Seconds"),
    ("text_len", "Transcript Length Distribution", "Characters"),
]
for ax, (column, title, xlabel) in zip(axes, panels):
    ax.hist(df[column], bins=10, edgecolor="black")
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Count")

fig.tight_layout()
plt.show()
