In [9]:
import os
import shutil
import numpy as np

# Seed for the shuffle: with a fixed seed (and the sorted traversal below) the
# mapping "<index>.wav -> label" is identical on every run over the same input tree.
SHUFFLE_SEED = 42

# Define class name to integer mapping (0 is used below for folders not listed here)
class_mapping = {
    'EGUT': 1,
    'ESTRI': 2,
    'MBON': 3,
    'MVEN': 4,
    'SQ': 5,
    'Vessel': 6
}

# Paths
source_root = r"/notebooks/audio_data8"
destination_folder = r"/notebooks/mixed12"
os.makedirs(destination_folder, exist_ok=True)
# NOTE: files already present in destination_folder are not removed; numbered WAVs
# left over from an earlier, larger run would no longer match labels.npy.

labels = []
file_paths = []

# Traverse all subdirectories in a deterministic order
for root, dirs, files in os.walk(source_root):
    dirs.sort()  # in-place sort controls the order os.walk descends (top-down walk)
    class_name = os.path.basename(root)
    # Assign label if in mapping, else 0 for unlabeled
    class_label = class_mapping.get(class_name, 0)
    for file in sorted(files):
        if file.lower().endswith('.wav'):
            full_path = os.path.join(root, file)
            file_paths.append(full_path)
            labels.append(class_label)

# Shuffle the files and labels together (seeded, so the result is reproducible)
rng = np.random.default_rng(SHUFFLE_SEED)
shuffle_order = rng.permutation(len(file_paths))
combined = [(file_paths[i], labels[i]) for i in shuffle_order]

# Copy files and save labels: the file at position k becomes "<k>.wav",
# and its label is stored at index k of labels.npy
shuffled_labels = []
for file_index, (src_path, class_label) in enumerate(combined):
    dst_path = os.path.join(destination_folder, f"{file_index}.wav")
    shutil.copy2(src_path, dst_path)
    shuffled_labels.append(class_label)

# Save labels to .npy file
labels_array = np.array(shuffled_labels, dtype=int)
np.save(os.path.join(destination_folder, "labels.npy"), labels_array)

print(f"Copied {len(labels_array)} files. Labels saved to {os.path.join(destination_folder, 'labels.npy')}")

Copied 21177 files. Labels saved to /notebooks/mixed12/labels.npy


In [10]:
!pip install librosa

[0m

In [11]:
import os
import numpy as np
import librosa
import shutil
import subprocess
from tqdm import tqdm
import csv

# ----------------------------
# Parameters
# ----------------------------
target_sr = 16000                          # sample rate requested from the loader (Hz)
target_duration = 2                        # length of one output segment (seconds)
target_len = target_duration * target_sr   # samples per segment: 2 * 16000 = 32000

# ----------------------------
# Paths
# ----------------------------
input_folder = r"/notebooks/mixed12/"                  # numbered WAVs + labels.npy
output_folder = "/notebooks/MX_PA_2022_preprocessed"   # receives the .npy segments
os.makedirs(output_folder, exist_ok=True)

# ----------------------------
# Helpers
# ----------------------------
def has_ffmpeg():
    """
    Report whether a working `ffmpeg` executable can be launched.

    Runs `ffmpeg -version` and returns True only if the process could be started
    AND exited with status 0. Returns False when the executable cannot be
    launched (missing, not executable, ...) or when it exits with an error.
    """
    try:
        probe = subprocess.run(["ffmpeg", "-version"], capture_output=True, check=False)
    except (OSError, subprocess.SubprocessError):
        # OSError covers FileNotFoundError / PermissionError raised at launch time.
        return False
    # check=False never raises on a bad exit status, so inspect it explicitly.
    return probe.returncode == 0

# Probed once here; try_repair_with_ffmpeg() consults this flag on every call.
FFMPEG_AVAILABLE = has_ffmpeg()

def filesize(path):
    """Return the size of `path` in bytes, or -1 if the size cannot be read (OSError)."""
    try:
        size_in_bytes = os.path.getsize(path)
    except OSError:
        size_in_bytes = -1
    return size_in_bytes

def try_repair_with_ffmpeg(in_path, out_path, sr):
    """
    Try to decode/transcode anything into a valid mono WAV at target SR.

    Args:
        in_path:  path of the (possibly damaged) input audio file.
        out_path: path where the re-encoded WAV is written (overwritten if present).
        sr:       sample rate passed to ffmpeg's `-ar` option.

    Returns True on success (and creates out_path), False otherwise. Always
    returns False without running anything when FFMPEG_AVAILABLE is falsy.
    """
    if not FFMPEG_AVAILABLE:
        return False
    # Ensure output dir exists. dirname is "" for a bare filename, and
    # os.makedirs("") raises, so only create it when there is a directory part.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Build ffmpeg command
    cmd = [
        "ffmpeg", "-y", "-v", "error",
        "-i", in_path,
        "-ac", "1",            # mono
        "-ar", str(sr),        # sample rate
        "-f", "wav",
        out_path
    ]
    try:
        # errors="replace": undecodable bytes in ffmpeg's output must not raise
        # and turn an otherwise successful conversion into a reported failure.
        proc = subprocess.run(cmd, capture_output=True, text=True, errors="replace")
    except Exception as e:
        print(f"[ffmpeg exception] {os.path.basename(in_path)}: {e}")
        return False
    # Success needs a clean exit AND an output larger than 44 bytes
    # (presumably the size of a bare WAV header, i.e. a file with no samples).
    if proc.returncode == 0 and os.path.exists(out_path) and filesize(out_path) > 44:
        return True
    # If ffmpeg printed errors, surface a short hint for debugging
    if proc.stderr:
        print(f"[ffmpeg stderr] {os.path.basename(in_path)}: {proc.stderr.strip()[:300]}...")
    return False

def safe_load(path, sr):
    """
    Load audio with a fallback repair step.

    Steps:
      1. librosa.load(path, sr=sr, mono=True)
      2. if that raises, transcode the file with try_repair_with_ffmpeg() into
         /tmp/fixed_<basename> and load that copy instead

    Returns (y, sr_used, used_fixed_path, error_msg):
      - y / sr_used: values returned by librosa.load, or None / None on failure
      - used_fixed_path: path of the repaired copy when one was produced, else None
        (the copy is not deleted here)
      - error_msg: description of the failure, or None on success
    """
    try:
        audio, rate = librosa.load(path, sr=sr, mono=True)
    except Exception as first_error:
        # The `as` name is cleared when the except block ends, so keep a reference.
        load_error = first_error
    else:
        return audio, rate, None, None

    # Direct load failed: attempt repair via ffmpeg
    repaired_path = os.path.join("/tmp", f"fixed_{os.path.basename(path)}")
    if not try_repair_with_ffmpeg(path, repaired_path, sr):
        return None, None, None, f"Initial load failed: {load_error}. Repair not possible or failed."

    try:
        audio, rate = librosa.load(repaired_path, sr=sr, mono=True)
    except Exception as second_error:
        return None, None, repaired_path, f"Post-repair load failed: {second_error}"
    return audio, rate, repaired_path, None

# ----------------------------
# Labels loading utilities
# ----------------------------
def load_labels(labels_path, wav_basenames, warn_on_miss=True):
    """
    Read a labels .npy file and turn it into a {basename-without-extension: label} dict.

    Supported layouts:
      - a dict stored with np.save (a single-element object array): every key is
        converted with str(); values are kept as-is
      - a 1D array: for each name in `wav_basenames` whose stem parses as an
        integer i with 0 <= i < len(array), the entry stem -> array[i] is added

    Args:
        labels_path:   path of the labels file.
        wav_basenames: iterable of file names (used only for the 1D-array layout).
        warn_on_miss:  print a warning for numeric stems outside the array range.

    Returns (labels_lookup, format_note) where format_note is one of
    "dict", "array", "unsupported", "failed to load", "no labels.npy found".
    """
    if not os.path.exists(labels_path):
        return {}, "no labels.npy found"

    # NOTE(security): allow_pickle=True is what makes dict-style files loadable, but
    # unpickling can run arbitrary code -- only use label files from a trusted source.
    try:
        arr = np.load(labels_path, allow_pickle=True)
    except Exception as e:
        print(f"[WARN] Failed to load labels.npy: {e}")
        return {}, "failed to load"

    is_array = isinstance(arr, np.ndarray)

    # Dict layout: .item() is only defined for single-element arrays
    if is_array and arr.size == 1:
        payload = arr.item()
        if isinstance(payload, dict):
            return {str(key): value for key, value in payload.items()}, "dict"

    # 1D array layout: position in the array == numeric file stem
    if is_array and arr.ndim == 1:
        by_stem = {}
        for name in wav_basenames:
            stem = os.path.splitext(name)[0]
            try:
                position = int(stem)
            except ValueError:
                continue  # non-numeric names have no slot in the array
            if 0 <= position < len(arr):
                by_stem[stem] = arr[position]
            elif warn_on_miss:
                print(f"[WARN] Label index out of range for '{stem}'")
        return by_stem, "array"

    # Anything else cannot be expanded
    print("[WARN] Unsupported labels.npy format; labels will not be expanded.")
    return {}, "unsupported"

# ----------------------------
# Collect files (numeric sort if names are numbers)
# ----------------------------
def numeric_key(name):
    """Sort key: int value of the file stem, or +inf for non-numeric stems (they sort last)."""
    stem = os.path.splitext(name)[0]
    try:
        key = int(stem)
    except ValueError:
        key = float('inf')
    return key

# Every *.wav entry (case-insensitive) of the input folder, ordered by numeric_key
wav_files = [entry for entry in os.listdir(input_folder) if entry.lower().endswith(".wav")]
wav_files.sort(key=numeric_key)

print(f"Found {len(wav_files)} audio files. Starting preprocessing...")

# ----------------------------
# Process: non-overlapping 2s chunks; pad only the final remainder
# ----------------------------
# Bookkeeping filled in by the processing loop
total_segments_saved = 0
repaired_files, skipped_files, bad_zero_byte = [], [], []

# File-level labels (if labels.npy is present and parseable)
labels_src = os.path.join(input_folder, "labels.npy")
labels_lookup, labels_format = load_labels(labels_src, wav_files)
have_labels = bool(labels_lookup)
print(
    f"[INFO] Loaded labels ({labels_format}). Will expand per segment."
    if have_labels
    else "[INFO] No usable labels found. Proceeding without label expansion."
)

# segment_name (without extension) -> label, one entry per saved segment
expanded_labels = {}

def seg_suffix(i, width=3):
    """Format the integer `i` as a decimal string zero-padded to at least `width` characters."""
    return "{:0{pad}d}".format(i, pad=width)

# For every input WAV: load it, cut it into consecutive target_len-sample chunks
# (the last chunk is zero-padded), save each chunk as <stem>_<NNN>.npy and record
# the file's label for every chunk.
for wav_file in tqdm(wav_files):
    input_path = os.path.join(input_folder, wav_file)

    # filesize() returns -1 when the size cannot be read, so this also skips those.
    if filesize(input_path) <= 0:
        bad_zero_byte.append(wav_file)
        print(f"[SKIP zero-byte] {wav_file}")
        skipped_files.append(wav_file)
        continue

    y, sr_used, fixed_path, err = safe_load(input_path, target_sr)

    # safe_load has already finished reading the repaired copy (successfully or not);
    # delete it here so temporary files do not accumulate across the run.
    if fixed_path is not None:
        try:
            os.remove(fixed_path)
        except OSError:
            pass  # best-effort cleanup; a leftover temp file must not abort the run

    if y is None:
        print(f"[SKIP bad] {wav_file} | {err}")
        skipped_files.append(wav_file)
        continue

    if fixed_path is not None:
        repaired_files.append(wav_file)

    # Ensure 1D float32
    y = np.asarray(y, dtype=np.float32).flatten()
    n = len(y)
    base = os.path.splitext(wav_file)[0]

    # Get file-level label if any
    file_label = None
    if have_labels:
        # Lookup by exact basename; also try the normalised numeric form
        # (e.g. "007" -> "7") in case the key spelling differs
        if base in labels_lookup:
            file_label = labels_lookup[base]
        else:
            try:
                ibase = int(base)
            except ValueError:
                pass
            else:
                if str(ibase) in labels_lookup:
                    file_label = labels_lookup[str(ibase)]
                elif ibase in labels_lookup:
                    file_label = labels_lookup[ibase]

    # Chunk starts: 0, target_len, 2*target_len, ... ; max(n, 1) guarantees that an
    # empty clip still yields exactly one (all-zero) segment.
    for seg_count, start in enumerate(range(0, max(n, 1), target_len)):
        seg = y[start : start + target_len]
        if len(seg) < target_len:
            # Only the final remainder can be short; pad it with zeros at the end
            seg = np.pad(seg, (0, target_len - len(seg)), mode='constant')
        out_name = f"{base}_{seg_suffix(seg_count)}"
        np.save(os.path.join(output_folder, f"{out_name}.npy"), seg.astype(np.float32))
        if file_label is not None:
            expanded_labels[out_name] = file_label
        total_segments_saved += 1

# ----------------------------
# Write expanded labels (if any)
# ----------------------------
if have_labels:
    labels_out_path = os.path.join(output_folder, "labels.npy")
    labels_csv_path = os.path.join(output_folder, "labels.csv")

    # Dict (segment_name -> label) stored as a pickled object inside labels.npy
    np.save(labels_out_path, expanded_labels, allow_pickle=True)

    # Same content as a two-column CSV for manual inspection
    with open(labels_csv_path, "w", newline="") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["segment", "label"])
        writer.writerows(expanded_labels.items())
    print(f"[INFO] Expanded labels saved to:\n  - {labels_out_path}\n  - {labels_csv_path}")
elif os.path.exists(labels_src):
    # A labels.npy was present but yielded no usable labels: keep a copy for reference
    print("[WARN] labels.npy existed but couldn't be parsed for expansion; copying original.")
    shutil.copy2(labels_src, os.path.join(output_folder, "labels_original.npy"))

# ----------------------------
# Summary
# ----------------------------
print("\n✅ Preprocessing complete.")
print(f"Output folder: {output_folder}")
print(f"Total input WAVs: {len(wav_files)}")
print(f"Total segments saved: {total_segments_saved}")

# Both file lists are reported the same way: a count, then at most
# `preview_limit` names, then how many were left out.
preview_limit = 20
for count_label, list_header, names in (
    ("Repaired via ffmpeg", "Repaired files:", repaired_files),
    ("Skipped (unloadable)", "Skipped files:", skipped_files),
):
    print(f"{count_label}: {len(names)}")
    if not names:
        continue
    print(list_header)
    for shown_name in names[:preview_limit]:
        print("  -", shown_name)
    hidden = len(names) - preview_limit
    if hidden > 0:
        print(f"  ... and {hidden} more")

if bad_zero_byte:
    print(f"Zero-byte files detected: {len(bad_zero_byte)}")

Found 21177 audio files. Starting preprocessing...
[INFO] Loaded labels (array). Will expand per segment.


100%|██████████| 21177/21177 [11:38<00:00, 30.32it/s]


[INFO] Expanded labels saved to:
  - /notebooks/MX_PA_2022_preprocessed/labels.npy
  - /notebooks/MX_PA_2022_preprocessed/labels.csv

✅ Preprocessing complete.
Output folder: /notebooks/MX_PA_2022_preprocessed
Total input WAVs: 21177
Total segments saved: 211770
Repaired via ffmpeg: 0
Skipped (unloadable): 0


In [8]:
import os
import numpy as np
import librosa
import shutil
import subprocess
from tqdm import tqdm

# ----------------------------
# Parameters
# ----------------------------
target_sr = 16000                          # sample rate requested from the loader (Hz)
target_duration = 2                        # length of every saved clip (seconds)
target_len = target_duration * target_sr   # samples per clip: 2 * 16000 = 32000

# ----------------------------
# Paths
# ----------------------------
input_folder = r"/notebooks/mixed1/"              # numbered WAVs + labels.npy
output_folder = "/notebooks/MXPA2020_preprocessed"   # receives one .npy per WAV
os.makedirs(output_folder, exist_ok=True)

# ----------------------------
# Helpers
# ----------------------------
def has_ffmpeg():
    """
    Report whether a working `ffmpeg` executable can be launched.

    Runs `ffmpeg -version` and returns True only if the process could be started
    AND exited with status 0. Returns False when the executable cannot be
    launched (missing, not executable, ...) or when it exits with an error.
    """
    try:
        probe = subprocess.run(["ffmpeg", "-version"], capture_output=True, check=False)
    except (OSError, subprocess.SubprocessError):
        # OSError covers FileNotFoundError / PermissionError raised at launch time.
        return False
    # check=False never raises on a bad exit status, so inspect it explicitly.
    return probe.returncode == 0

# Probed once here; try_repair_with_ffmpeg() consults this flag on every call.
FFMPEG_AVAILABLE = has_ffmpeg()

def filesize(path):
    """Return the size of `path` in bytes, or -1 if the size cannot be read (OSError)."""
    try:
        size_in_bytes = os.path.getsize(path)
    except OSError:
        size_in_bytes = -1
    return size_in_bytes

def try_repair_with_ffmpeg(in_path, out_path, sr):
    """
    Try to decode/transcode anything into a valid mono WAV at target SR.

    Args:
        in_path:  path of the (possibly damaged) input audio file.
        out_path: path where the re-encoded WAV is written (overwritten if present).
        sr:       sample rate passed to ffmpeg's `-ar` option.

    Returns True on success (and creates out_path), False otherwise. Always
    returns False without running anything when FFMPEG_AVAILABLE is falsy.
    """
    if not FFMPEG_AVAILABLE:
        return False
    # Ensure output dir exists. dirname is "" for a bare filename, and
    # os.makedirs("") raises, so only create it when there is a directory part.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Build ffmpeg command
    cmd = [
        "ffmpeg", "-y", "-v", "error",
        "-i", in_path,
        "-ac", "1",            # mono
        "-ar", str(sr),        # sample rate
        "-f", "wav",
        out_path
    ]
    try:
        # errors="replace": undecodable bytes in ffmpeg's output must not raise
        # and turn an otherwise successful conversion into a reported failure.
        proc = subprocess.run(cmd, capture_output=True, text=True, errors="replace")
    except Exception as e:
        print(f"[ffmpeg exception] {os.path.basename(in_path)}: {e}")
        return False
    # Success needs a clean exit AND an output larger than 44 bytes
    # (presumably the size of a bare WAV header, i.e. a file with no samples).
    if proc.returncode == 0 and os.path.exists(out_path) and filesize(out_path) > 44:
        return True
    # If ffmpeg printed errors, surface a short hint for debugging
    if proc.stderr:
        print(f"[ffmpeg stderr] {os.path.basename(in_path)}: {proc.stderr.strip()[:300]}...")
    return False

def safe_load(path, sr):
    """
    Load audio with a fallback repair step.

    Steps:
      1. librosa.load(path, sr=sr, mono=True)
      2. if that raises, transcode the file with try_repair_with_ffmpeg() into
         /tmp/fixed_<basename> and load that copy instead

    Returns (y, sr_used, used_fixed_path, error_msg):
      - y / sr_used: values returned by librosa.load, or None / None on failure
      - used_fixed_path: path of the repaired copy when one was produced, else None
        (the copy is not deleted here)
      - error_msg: description of the failure, or None on success
    """
    try:
        audio, rate = librosa.load(path, sr=sr, mono=True)
    except Exception as first_error:
        # The `as` name is cleared when the except block ends, so keep a reference.
        load_error = first_error
    else:
        return audio, rate, None, None

    # Direct load failed: attempt repair via ffmpeg
    repaired_path = os.path.join("/tmp", f"fixed_{os.path.basename(path)}")
    if not try_repair_with_ffmpeg(path, repaired_path, sr):
        return None, None, None, f"Initial load failed: {load_error}. Repair not possible or failed."

    try:
        audio, rate = librosa.load(repaired_path, sr=sr, mono=True)
    except Exception as second_error:
        return None, None, repaired_path, f"Post-repair load failed: {second_error}"
    return audio, rate, repaired_path, None

# ----------------------------
# Collect files (numeric sort if names are numbers)
# ----------------------------
def numeric_key(name):
    """Sort key: int value of the file stem, or +inf for non-numeric stems (they sort last)."""
    stem = os.path.splitext(name)[0]
    try:
        key = int(stem)
    except ValueError:
        key = float('inf')
    return key

# Every *.wav entry (case-insensitive) of the input folder, ordered by numeric_key
wav_files = [entry for entry in os.listdir(input_folder) if entry.lower().endswith(".wav")]
wav_files.sort(key=numeric_key)

print(f"Found {len(wav_files)} audio files. Starting preprocessing...")

# ----------------------------
# Process
# ----------------------------
# Bookkeeping for the summary printed after the loop
repaired_files = []
skipped_files = []
bad_zero_byte = []

# For every input WAV: load it, keep the first target_len samples (zero-padding
# shorter clips at the end) and save the result as <stem>.npy.
for wav_file in tqdm(wav_files):
    input_path = os.path.join(input_folder, wav_file)

    # filesize() returns -1 when the size cannot be read, so this also skips those.
    if filesize(input_path) <= 0:
        bad_zero_byte.append(wav_file)
        print(f"[SKIP zero-byte] {wav_file}")
        skipped_files.append(wav_file)
        continue

    y, sr_used, fixed_path, err = safe_load(input_path, target_sr)

    if y is not None and fixed_path is not None:
        repaired_files.append(wav_file)
        # Optionally keep the fixed file for inspection (must happen before the cleanup below):
        # shutil.copy2(fixed_path, os.path.join(output_folder, f"{os.path.splitext(wav_file)[0]}_fixed.wav"))

    # safe_load has already finished reading the repaired copy (successfully or not);
    # delete it here so temporary files do not accumulate across the run.
    if fixed_path is not None:
        try:
            os.remove(fixed_path)
        except OSError:
            pass  # best-effort cleanup; a leftover temp file must not abort the run

    if y is None:
        print(f"[SKIP bad] {wav_file} | {err}")
        skipped_files.append(wav_file)
        continue

    # Clip or pad to exactly target_len
    if len(y) > target_len:
        y = y[:target_len]
    elif len(y) < target_len:
        y = np.pad(y, (0, target_len - len(y)))

    # Save as .npy file named after the input file's stem
    index = os.path.splitext(wav_file)[0]
    output_path = os.path.join(output_folder, f"{index}.npy")
    np.save(output_path, y.astype(np.float32))  # Save as float32 to save space

# ----------------------------
# Copy labels.npy (if present)
# ----------------------------
# labels.npy is carried over unchanged next to the preprocessed clips
labels_src = os.path.join(input_folder, "labels.npy")
if not os.path.exists(labels_src):
    print("[WARN] labels.npy not found; skipping copy.")
else:
    shutil.copy2(labels_src, output_folder)

# ----------------------------
# Summary
# ----------------------------
print("\n✅ Preprocessing complete.")
print(f"Output folder: {output_folder}")
print(f"Total input WAVs: {len(wav_files)}")

# Both file lists are reported the same way: a count, then at most
# `preview_limit` names, then how many were left out.
preview_limit = 20
for count_label, list_header, names in (
    ("Repaired via ffmpeg", "Repaired files:", repaired_files),
    ("Skipped (unloadable)", "Skipped files:", skipped_files),
):
    print(f"{count_label}: {len(names)}")
    if not names:
        continue
    print(list_header)
    for shown_name in names[:preview_limit]:
        print("  -", shown_name)
    hidden = len(names) - preview_limit
    if hidden > 0:
        print(f"  ... and {hidden} more")

if bad_zero_byte:
    print(f"Zero-byte files detected: {len(bad_zero_byte)}")

Found 17556 audio files. Starting preprocessing...


 16%|█▌        | 2800/17556 [00:23<02:03, 119.11it/s]

[SKIP zero-byte] 2804.wav


100%|██████████| 17556/17556 [06:53<00:00, 42.41it/s]


✅ Preprocessing complete.
Output folder: /notebooks/MXPA2020_preprocessed
Total input WAVs: 17556
Repaired via ffmpeg: 0
Skipped (unloadable): 1
Skipped files:
  - 2804.wav
Zero-byte files detected: 1



