In [1]:
!pip install librosa soundfile noisereduce pydub torch torchaudio transformers tqdm webrtcvad numpy pandas

Collecting webrtcvad
  Using cached webrtcvad-2.0.10.tar.gz (66 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'


ERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: 'c:\\program files\\python38\\lib\\site-packages\\requests-2.32.3.dist-info\\METADATA'


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import librosa
import soundfile as sf
import numpy as np
import pandas as pd
import torch
import noisereduce as nr
from tqdm import tqdm
from pydub import AudioSegment


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# --- Pipeline configuration -------------------------------------------------
INPUT_DIR = "pitt_corpus_wav"          # source root, one sub-folder per class
OUTPUT_DIR = "pitt_corpus_processed"   # destination root for processed segments
SEGMENT_DURATION = 10   # length of each segment, in seconds
OVERLAP = 5             # overlap between consecutive segments, in seconds (0 = none)
TARGET_SR = 16000       # sample rate in Hz; 16kHz for Wav2Vec2/WavLM

# Make sure the output root and one sub-folder per class exist.
os.makedirs(OUTPUT_DIR, exist_ok=True)
for cls in ("Control", "Dementia"):
    class_dir = os.path.join(OUTPUT_DIR, cls)
    os.makedirs(class_dir, exist_ok=True)

# Prefer the GPU when torch can see one.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")



Using device: cuda


In [4]:
def load_audio(filepath, sr=TARGET_SR):
    """Read an audio file as a mono signal at the requested sample rate.

    Args:
        filepath: Path of the audio file to read.
        sr: Sample rate handed to ``librosa.load`` (defaults to ``TARGET_SR``).

    Returns:
        The ``(signal, sample_rate)`` pair produced by ``librosa.load``.
    """
    signal, sample_rate = librosa.load(filepath, sr=sr, mono=True)
    return signal, sample_rate

def reduce_noise(y, sr):
    """Denoise a signal using spectral-gating noise reduction.

    Args:
        y: Audio samples.
        sr: Sample rate of ``y``.

    Returns:
        The result of ``noisereduce.reduce_noise`` called with its default
        settings.
    """
    denoised = nr.reduce_noise(y=y, sr=sr)
    return denoised

def rms_normalize(y, target_db=-20):
    """Scale a signal so that its RMS level matches ``target_db`` (dBFS).

    Args:
        y: Audio samples as a NumPy array.
        target_db: Desired RMS level in dB relative to full scale.

    Returns:
        ``y`` multiplied by a single gain factor.
    """
    current_rms = np.sqrt(np.mean(y ** 2))
    # Convert the dB target to a linear amplitude.
    target_amplitude = 10 ** (target_db / 20)
    # The small epsilon keeps the division finite for an all-zero signal.
    gain = target_amplitude / (current_rms + 1e-9)
    return y * gain

def pre_emphasis(y, coeff=0.97):
    """Apply a first-order pre-emphasis filter to boost high frequencies.

    The first sample is kept as-is; every later sample becomes
    ``y[n] - coeff * y[n - 1]``.

    Args:
        y: Audio samples as a NumPy array.
        coeff: Pre-emphasis coefficient.

    Returns:
        The filtered signal, or ``y`` itself when it is empty.
    """
    if len(y) != 0:
        head = np.ravel(y[0])
        # Difference of each sample with its scaled predecessor.
        tail = np.ravel(y[1:] - coeff * y[:-1])
        return np.concatenate((head, tail))
    return y

def trim_silence(y, sr, top_db=30):
    """Strip silence from the beginning and end of a signal.

    Args:
        y: Audio samples.
        sr: Sample rate; accepted but not used by this function.
        top_db: Threshold forwarded to ``librosa.effects.trim``.

    Returns:
        The trimmed signal, i.e. the first element of the pair returned by
        ``librosa.effects.trim``.
    """
    trimmed = librosa.effects.trim(y, top_db=top_db)[0]
    return trimmed

def segment_audio(y, sr, segment_length=SEGMENT_DURATION, overlap=OVERLAP):
    """Split audio into fixed-length segments, optionally overlapping.

    A window of ``segment_length`` seconds is taken every
    ``segment_length - overlap`` seconds. A window that runs past the end of
    the signal is zero-padded to full length, and it is the last one emitted.

    Args:
        y: Audio samples as a NumPy array.
        sr: Sample rate of ``y`` in Hz.
        segment_length: Length of each segment in seconds.
        overlap: Overlap between consecutive segments in seconds
            (0 for no overlap). Must be smaller than ``segment_length``.

    Returns:
        A list of arrays, each ``int(segment_length * sr)`` samples long.
        The list is empty when ``y`` is empty.

    Raises:
        ValueError: If the segment length or the hop between segments does
            not amount to at least one sample.
    """
    samples_per_segment = int(segment_length * sr)
    step_size = int((segment_length - overlap) * sr)

    if samples_per_segment <= 0:
        raise ValueError(
            f"segment_length={segment_length!r} at sr={sr!r} yields no samples per segment"
        )
    # A zero hop would make range() fail with an unrelated message, and a
    # negative hop would silently produce no segments at all.
    if step_size <= 0:
        raise ValueError(
            f"overlap ({overlap!r}) must be smaller than segment_length "
            f"({segment_length!r}) so that consecutive segments advance"
        )

    segments = []
    for start in range(0, len(y), step_size):
        end = start + samples_per_segment
        segment = y[start:end]
        if len(segment) < samples_per_segment:
            # Zero-pad the final, partial window up to the full segment length.
            segment = np.pad(segment, (0, samples_per_segment - len(segment)))
        segments.append(segment)
        if end >= len(y):
            # This window already reaches the end of the signal; any further
            # window would only repeat its tail.
            break
    return segments

In [5]:
# Run the full preprocessing pipeline over every recording of both classes,
# write the resulting segments to OUTPUT_DIR/<class>/ and collect one metadata
# record per written segment.
metadata = []

for cls in ["Control", "Dementia"]:
    input_path = os.path.join(INPUT_DIR, cls)
    output_path = os.path.join(OUTPUT_DIR, cls)
    label = 0 if cls == "Control" else 1

    # os.listdir returns entries in arbitrary order; filter to WAV files and
    # sort so the processing order (and the metadata row order) is the same on
    # every run, and so the progress bar only counts files that are processed.
    wav_files = sorted(
        name for name in os.listdir(input_path) if name.lower().endswith(".wav")
    )

    for filename in tqdm(wav_files, desc=f"Processing {cls}"):
        filepath = os.path.join(input_path, filename)
        stem = os.path.splitext(filename)[0]

        # Load audio
        y, sr = load_audio(filepath)

        # Step 1: Noise reduction
        y = reduce_noise(y, sr)

        # Step 2: Silence trimming
        y = trim_silence(y, sr)
        if len(y) == 0:
            print(f"⚠️ Skipping {filename} — no speech detected after trimming.")
            continue

        # Step 3: RMS normalization
        y = rms_normalize(y)

        # Step 4: Pre-emphasis
        y = pre_emphasis(y)

        # Step 5: Segmentation
        segments = segment_audio(y, sr)

        # Step 6: Save each segment
        # NOTE(review): samples may exceed [-1, 1] after normalization and
        # pre-emphasis; with soundfile's default WAV subtype such peaks are
        # presumably clipped on write — confirm whether that is acceptable.
        for i, seg in enumerate(segments):
            seg_filename = f"{stem}_seg{i}.wav"
            seg_path = os.path.join(output_path, seg_filename)
            sf.write(seg_path, seg, sr)

            metadata.append({
                "filepath": seg_path,
                "label": label,
                "duration_sec": len(seg)/sr,
                # Lineage columns: segments of one recording can overlap, so
                # downstream splits should group rows by source recording.
                "source_file": filename,
                "segment_index": i,
            })


  sig_mult_above_thresh = (abs_sig_stft - sig_stft_smooth) / sig_stft_smooth
Processing Control:  94%|█████████▍| 214/228 [02:29<00:09,  1.43it/s]

⚠️ Skipping 323-1.wav — no speech detected after trimming.


Processing Control: 100%|██████████| 228/228 [02:39<00:00,  1.43it/s]
Processing Dementia: 100%|██████████| 297/297 [04:01<00:00,  1.23it/s]


In [6]:
# Turn the collected per-segment records into a table and store it as CSV in
# the output root, then report where it went.
metadata_csv = os.path.join(OUTPUT_DIR, "segment_metadata.csv")
df = pd.DataFrame(metadata)
df.to_csv(metadata_csv, index=False)
print(f"✅ Preprocessing complete! Total segments: {len(df)}")
print(f"Metadata saved at: {metadata_csv}")


✅ Preprocessing complete! Total segments: 6930
Metadata saved at: pitt_corpus_processed\segment_metadata.csv
