# Pool and Split Data

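Collect the valid 10-second WAV clips from every dataset split into one pooled folder, randomly split them into training (80%) and testing (20%) sets, and convert each clip into a log-mel spectrogram saved as a `.npy` file.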

In [1]:
import os
import shutil
import random
import wave
import numpy as np
import librosa
from tqdm import tqdm
import soundfile as sf

In [3]:
def _check_wav(wav_file, target_duration, duration_tolerance):
    """Return None if wav_file is a usable clip, otherwise a short rejection reason."""
    try:
        # Only the header values are needed. The file is closed before the caller
        # decides what to do with it, so a later os.remove() never runs on an
        # open handle.
        with wave.open(wav_file, 'rb') as wf:
            frames = wf.getnframes()
            rate = wf.getframerate()
    except Exception as e:  # any failure to open/parse the file counts as invalid
        return f"unreadable: {e}"

    if frames == 0:
        return "empty"
    if rate <= 0:
        return "invalid sample rate"

    duration = frames / float(rate)
    if abs(duration - target_duration) > duration_tolerance:
        return f"duration {duration:.2f}s out of range"
    return None


def _save_log_mel(file, dest_dir, idx, set_name):
    """Convert one audio file to a log-mel spectrogram saved as <set_name>_<idx+1>.npy.

    Returns True on success; on any error the error is printed and False is returned.
    """
    try:
        # sr=None keeps the sample rate stored in the file
        y, sr = librosa.load(file, sr=None)

        mel_spec = librosa.feature.melspectrogram(
            y=y,
            sr=sr,
            n_fft=1024,
            hop_length=512,
            n_mels=128,
            fmin=20,
            fmax=8000
        )
        log_mel_spec = librosa.power_to_db(mel_spec)

        output_path = os.path.join(dest_dir, f"{set_name}_{idx+1:04d}.npy")
        np.save(output_path, log_mel_spec)
        return True
    except Exception as e:
        print(f"Error processing {file}: {e}")
        return False


def _convert_set(files, dest_dir, set_name, label):
    """Convert every file of one set and return how many conversions succeeded."""
    success_count = 0
    for idx, file in enumerate(tqdm(files, desc=f"Processing {label} files")):
        if _save_log_mel(file, dest_dir, idx, set_name):
            success_count += 1
    print(f"Successfully processed {success_count} {label} files")
    return success_count


def _write_metadata(dest_dir, type_of_sound, label, n_samples):
    """Write the metadata.txt that describes the spectrograms stored in dest_dir."""
    with open(os.path.join(dest_dir, 'metadata.txt'), 'w') as f:
        f.write(f"{type_of_sound} {label} data\n")
        f.write(f"Number of samples: {n_samples}\n")
        f.write("Format: Log-Mel Spectrograms (128 mel bands)\n")
        f.write("Shape: [128, time_frames]\n")
        f.write("Sample rate: Original\n")


def pool_and_split_data(root_dir, path_to_files, type_of_sound, delete_invalid=True, *,
                        train_fraction=0.8, seed=None,
                        target_duration=10.0, duration_tolerance=0.5):
    """
    1. Pool all valid data into one folder
    2. Split the data into training and testing sets
    3. Process the data into an appropriate format for ML (log-mel spectrograms)

    Parameters
    ----------
    root_dir : str
        Directory in which ``all_data_<type_of_sound>``, ``training_<type_of_sound>``
        and ``testing_<type_of_sound>`` are created.
    path_to_files : str
        Path *prefix* of the split folders; the split name ('balanced_train',
        'unbalanced_train', 'eval') is appended directly to it.
    type_of_sound : str
        Name of the sub-folder (inside each split folder) that holds the WAV files.
    delete_invalid : bool, default True
        When True, WAV files that are unreadable, empty or outside the accepted
        duration window are DELETED from the source folders.
    train_fraction : float, default 0.8
        Fraction of the pooled files assigned to the training set.
    seed : int or None, default None
        Seed for the shuffle. None keeps the previous behaviour (module-level
        ``random`` state); an int makes the split reproducible.
    target_duration, duration_tolerance : float
        A clip is valid when ``abs(duration - target_duration) <= duration_tolerance``
        (seconds).

    Raises
    ------
    ValueError
        If ``train_fraction`` is not within [0, 1].
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(f"train_fraction must be within [0, 1], got {train_fraction}")

    # Create necessary directories
    all_data_dir = f'{root_dir}/all_data_{type_of_sound}'
    training_dir = f'{root_dir}/training_{type_of_sound}'
    testing_dir = f'{root_dir}/testing_{type_of_sound}'
    for directory in (all_data_dir, training_dir, testing_dir):
        os.makedirs(directory, exist_ok=True)

    # Step 1: find and validate the WAV files of every split
    splits = ['balanced_train', 'unbalanced_train', 'eval']
    all_valid_files = []

    print("Step 1: Checking and pooling valid files...")
    for split in splits:
        sound_path = os.path.join(f'{path_to_files}{split}', type_of_sound)

        if not os.path.exists(sound_path):
            print(f"Path does not exist: {sound_path}")
            continue

        # Sorted so the numbering of the pooled copies does not depend on the
        # order in which the file system happens to list the files.
        wav_files = sorted(
            os.path.join(root, f)
            for root, _, files in os.walk(sound_path)
            for f in files
            if f.endswith('.wav')
        )
        print(f"Found {len(wav_files)} files in {split}")

        rejected = 0
        for wav_file in tqdm(wav_files, desc=f"Checking {split} files"):
            reason = _check_wav(wav_file, target_duration, duration_tolerance)
            if reason is None:
                all_valid_files.append(wav_file)
                continue

            rejected += 1
            if delete_invalid:
                # Best effort: a failed deletion is reported but never aborts the run.
                try:
                    os.remove(wav_file)
                except OSError as e:
                    print(f"Could not delete invalid file {wav_file} ({reason}): {e}")

        if rejected:
            action = "deleted" if delete_invalid else "left in place"
            print(f"Rejected {rejected} invalid files in {split} ({action})")

    # Copy all valid files to all_data directory with unique names.
    # The "engine_knocking_" prefix is kept for every sound type so the pooled
    # file names stay the same as those written by earlier runs.
    print(f"\nCopying {len(all_valid_files)} valid files to {all_data_dir}...")
    pooled_files = []
    for i, src_file in enumerate(tqdm(all_valid_files, desc="Copying files")):
        dst_file = os.path.join(all_data_dir, f"engine_knocking_{i+1:04d}.wav")
        shutil.copy2(src_file, dst_file)
        pooled_files.append(dst_file)

    # Step 2: Split files into training and testing sets
    print("\nStep 2: Splitting data into training and testing sets...")
    if not pooled_files:
        # Nothing was pooled in this run: fall back to whatever is already in the
        # pooled directory. Otherwise only this run's files are split, so leftovers
        # of an earlier run cannot leak into the sets.
        pooled_files = sorted(
            os.path.join(all_data_dir, f)
            for f in os.listdir(all_data_dir)
            if f.endswith('.wav')
        )

    # Shuffle the files for random split
    if seed is None:
        random.shuffle(pooled_files)
    else:
        random.Random(seed).shuffle(pooled_files)

    split_idx = int(len(pooled_files) * train_fraction)
    training_files = pooled_files[:split_idx]
    testing_files = pooled_files[split_idx:]

    print(f"Training set: {len(training_files)} files")
    print(f"Testing set: {len(testing_files)} files")

    # Step 3: Process files into the preferred format for ML (Log-Mel Spectrograms)
    print("\nStep 3: Processing files into Log-Mel Spectrograms...")
    training_ok = _convert_set(training_files, training_dir, "train", "training")
    testing_ok = _convert_set(testing_files, testing_dir, "test", "testing")

    # Metadata reports the spectrograms actually written, not the attempted ones.
    _write_metadata(training_dir, type_of_sound, "training", training_ok)
    _write_metadata(testing_dir, type_of_sound, "testing", testing_ok)

    print("\nData processing complete!")
    print(f"All valid files: {all_data_dir}/")
    print(f"Training data: {training_dir}/")
    print(f"Testing data: {testing_dir}/")


In [4]:
# Pool, split and convert the "Engine knocking" clips found under
# engine_knocking_data/engine_knocking_data_<split>/Engine knocking/.
# NOTE: delete_invalid=True removes rejected source WAV files from disk.
print("Starting AudioSet Engine Knocking data pooling and processing...")
pool_and_split_data(root_dir='engine_knocking_data', 
                    path_to_files='engine_knocking_data/engine_knocking_data_', 
                    type_of_sound="Engine knocking", 
                    delete_invalid=True)

Starting AudioSet Engine Knocking data pooling and processing...
Step 1: Checking and pooling valid files...
Found 27 files in balanced_train


Checking balanced_train files: 100%|██████████| 27/27 [00:00<00:00, 8552.05it/s]


Found 209 files in unbalanced_train


Checking unbalanced_train files: 100%|██████████| 209/209 [00:00<00:00, 13750.09it/s]


Found 21 files in eval


Checking eval files: 100%|██████████| 21/21 [00:00<00:00, 9429.44it/s]



Copying 257 valid files to engine_knocking_data/all_data_Engine knocking...


Copying files: 100%|██████████| 257/257 [00:00<00:00, 421.31it/s]



Step 2: Splitting data into training and testing sets...
Training set: 205 files
Testing set: 52 files

Step 3: Processing files into Log-Mel Spectrograms...


Processing training files: 100%|██████████| 205/205 [00:05<00:00, 34.93it/s]


Successfully processed 205 training files


Processing testing files: 100%|██████████| 52/52 [00:01<00:00, 46.01it/s]

Successfully processed 52 testing files

Data processing complete!
All valid files: engine_knocking_data/all_data_Engine knocking/
Training data: engine_knocking_data/training_Engine knocking/
Testing data: engine_knocking_data/testing_Engine knocking/





In [5]:
# Same pipeline for the "Engine" clips found under
# engine_data/engine_data_<split>/Engine/.
# NOTE(review): the banner below still says "Engine Knocking" although this call
# processes the "Engine" class — confirm whether the message should be updated.
# NOTE: delete_invalid=True removes rejected source WAV files from disk.
print("Starting AudioSet Engine Knocking data pooling and processing...")
pool_and_split_data(root_dir='engine_data', 
                    path_to_files='engine_data/engine_data_', 
                    type_of_sound="Engine", 
                    delete_invalid=True)

Starting AudioSet Engine Knocking data pooling and processing...
Step 1: Checking and pooling valid files...
Found 108 files in balanced_train


Checking balanced_train files: 100%|██████████| 108/108 [00:00<00:00, 3204.00it/s]


Found 354 files in unbalanced_train


Checking unbalanced_train files: 100%|██████████| 354/354 [00:00<00:00, 5488.87it/s]


Found 62 files in eval


Checking eval files: 100%|██████████| 62/62 [00:00<00:00, 4701.20it/s]



Copying 250 valid files to engine_data/all_data_Engine...


Copying files: 100%|██████████| 250/250 [00:00<00:00, 383.51it/s]



Step 2: Splitting data into training and testing sets...
Training set: 200 files
Testing set: 50 files

Step 3: Processing files into Log-Mel Spectrograms...


Processing training files: 100%|██████████| 200/200 [00:03<00:00, 50.50it/s]


Successfully processed 200 training files


Processing testing files: 100%|██████████| 50/50 [00:00<00:00, 50.40it/s]

Successfully processed 50 testing files

Data processing complete!
All valid files: engine_data/all_data_Engine/
Training data: engine_data/training_Engine/
Testing data: engine_data/testing_Engine/



