## Channel Mixing for Stereo Audio

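The code below takes the audio on one channel, scales it by a random factor drawn uniformly from [0, 0.5], and adds it into the other channel -- and vice versa. If the mixed signal's peak absolute value exceeds 1, both channels are divided by that peak to prevent clipping.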

In [4]:
import os
import numpy as np
import soundfile as sf
from tqdm import tqdm

def stereo_mixer(audio, rng=None):
    """
    Bleed each stereo channel into the other with an independent random gain.

    The new left channel is ``left + g1 * right`` and the new right channel is
    ``right + g2 * left``, where ``g1`` and ``g2`` are drawn independently and
    uniformly from [0, 0.5). If the peak absolute value of the result exceeds 1,
    both channels are divided by that peak so their relative balance is kept.

    :param audio: 2-D array indexed as ``audio[sample, channel]``; only
        channels 0 and 1 are read. The input array is not modified.
    :param rng: Optional random source exposing ``uniform(low, high)`` (for
        example ``np.random.default_rng(seed)``). When omitted, the global
        ``np.random`` state is used, as before.
    :return: New array of shape ``(num_samples, 2)`` holding the mixed channels.
    """
    left, right = audio[:, 0], audio[:, 1]

    # Always draw both gains, in this order, so seeded runs stay reproducible.
    uniform_source = np.random if rng is None else rng
    right_into_left_gain = uniform_source.uniform(0, 0.5)
    left_into_right_gain = uniform_source.uniform(0, 0.5)

    mixed = np.column_stack((
        left + right_into_left_gain * right,
        right + left_into_right_gain * left,
    ))

    # A zero-length clip has no peak; np.max would raise on it.
    if mixed.size == 0:
        return mixed

    # Normalize to prevent clipping (only ever scales down).
    peak = np.max(np.abs(mixed))
    if peak > 1:
        mixed /= peak

    return mixed

def process_audio_file(input_path, output_path):
    """
    Read one audio file, apply ``stereo_mixer`` and write the result.

    A single-channel file is duplicated into two identical channels before
    mixing. The output is written with the sample rate read from the input.

    :param input_path: Path of the audio file to read.
    :param output_path: Path the mixed audio is written to.
    :raises ValueError: If the file has more than one channel but not exactly two.
    """
    samples, sample_rate = sf.read(input_path)

    is_mono = samples.ndim == 1
    if not is_mono and samples.shape[1] != 2:
        raise ValueError("Input audio must be mono or stereo")

    # Mono input becomes two identical columns so the mixer always sees stereo.
    stereo = np.column_stack((samples, samples)) if is_mono else samples

    sf.write(output_path, stereo_mixer(stereo), sample_rate)

def create_pipeline(input_dir, output_dir):
    """
    Process all WAV files in the input directory and save them to the output directory.

    The directory tree under ``input_dir`` is mirrored under ``output_dir``;
    each output file keeps its relative location and gets a ``_processed``
    suffix before the extension.

    The list of files is collected once, before any processing starts, so the
    progress total always matches the work done and files written during this
    run are never picked up as inputs.

    :param input_dir: Path to the directory containing input WAV files
    :param output_dir: Path to the directory where processed files will be saved
    """
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Single walk: gather every ``.wav`` path (case-sensitive match, as before)
    wav_paths = [
        os.path.join(root, file)
        for root, _, files in os.walk(input_dir)
        for file in files
        if file.endswith('.wav')
    ]

    # The context manager closes the bar even if a file raises mid-run
    with tqdm(total=len(wav_paths), unit='file') as pbar:
        for input_path in wav_paths:
            # Label the bar before the work starts so it names the current file
            pbar.set_description(f"Processing {os.path.basename(input_path)}")

            # Construct corresponding output path with "_processed" suffix
            rel_path = os.path.relpath(input_path, input_dir)
            file_name, file_ext = os.path.splitext(rel_path)
            output_path = os.path.join(output_dir, f"{file_name}_processed{file_ext}")

            # Ensure output subdirectory exists
            os.makedirs(os.path.dirname(output_path), exist_ok=True)

            process_audio_file(input_path, output_path)
            pbar.update(1)



In [None]:
# Example usage: run the CPU pipeline over a whole directory tree.
# NOTE: both paths are absolute and machine-specific; edit them before running elsewhere.
input_dir = "/home/erik/projects/data/Fisher/fisher_eng_tr_sp_d1/audio"
output_dir = "/home/serhan/Desktop/StreamingVAP/augmentations/mixed_audio"

# Writes one "<name>_processed.wav" per input file, mirroring the input sub-directories.
create_pipeline(input_dir, output_dir)
print("Pipeline completed. All files processed.")

In [6]:
import torch
import numpy as np
import soundfile as sf
import os

def gpu_stereo_mixer(audio_batch):
    """
    Cross-mix the two channels of every item in a batch with random gains.

    For each batch item, two gains are drawn independently and uniformly from
    [0, 0.5). The new left channel is ``left + g1 * right`` and the new right
    channel is ``right + g2 * left``. Any item whose peak absolute value then
    exceeds 1 is divided by its own peak; other items are returned unscaled.

    :param audio_batch: Tensor indexed as ``[batch, sample, channel]``; only
        channels 0 and 1 are read. Random gains are created on its device.
    :return: Tensor of shape ``(batch, samples, 2)`` with the mixed audio.
    """
    num_items = audio_batch.shape[0]
    target_device = audio_batch.device

    # One gain per batch item, shaped (batch, 1) so it broadcasts over samples.
    # The two draws stay in this order so seeded runs give the same result.
    gain_into_left = torch.rand(num_items, 1, device=target_device) * 0.5
    gain_into_right = torch.rand(num_items, 1, device=target_device) * 0.5

    left_channel = audio_batch.select(2, 0)
    right_channel = audio_batch.select(2, 1)

    blended = torch.stack(
        (
            left_channel + gain_into_left * right_channel,
            right_channel + gain_into_right * left_channel,
        ),
        dim=2,
    )

    # Per-item peak over samples and channels, kept as (batch, 1, 1) for broadcasting
    item_peak = blended.abs().amax(dim=(1, 2), keepdim=True)

    # Scale down only the items that would clip
    return torch.where(item_peak > 1, blended / item_peak, blended)

def test_gpu_stereo_mixer(input_path, output_folder):
    """
    Run ``gpu_stereo_mixer`` on a single file and save the result.

    The file is read, converted to a float32 tensor on CUDA when available
    (CPU otherwise), mixed as a batch of one, and written to
    ``<output_folder>/mixed_<input file name>`` at the input's sample rate.

    :param input_path: Path of the audio file to read.
    :param output_folder: Directory for the mixed file; created if missing.
    :return: Tuple ``(mixed_audio_np, sample_rate)`` with the mixed audio as a
        NumPy array and the sample rate read from the input.
    :raises ValueError: If the file has more than one channel but not exactly two.
    """
    os.makedirs(output_folder, exist_ok=True)

    samples, sample_rate = sf.read(input_path)

    is_mono = samples.ndim == 1
    if not is_mono and samples.shape[1] != 2:
        raise ValueError("Input audio must be mono or stereo")
    if is_mono:
        # Duplicate the single channel so the mixer receives two columns
        samples = np.column_stack((samples, samples))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # float32 tensor on the chosen device, with a leading batch dimension of 1
    batch = torch.from_numpy(samples).float().to(device).unsqueeze(0)

    # Drop the batch dimension again and bring the result back to host memory
    mixed_audio_np = gpu_stereo_mixer(batch).squeeze(0).cpu().numpy()

    output_path = os.path.join(output_folder, f"mixed_{os.path.basename(input_path)}")
    sf.write(output_path, mixed_audio_np, sample_rate)

    print(f"Mixed audio saved to: {output_path}")

    return mixed_audio_np, sample_rate

In [7]:
# Smoke test: mix a single WAV file and write it next to the other augmentations.
# In a notebook kernel __name__ is "__main__", so this guard only has an effect
# if the cell is exported to a module and imported.
if __name__ == "__main__":
    # NOTE: absolute, machine-specific paths; edit before running elsewhere.
    input_path = "/home/erik/projects/data/Fisher/fisher_eng_tr_sp_d1/audio/002/fe_03_00208.wav"
    output_folder = "/home/serhan/Desktop/StreamingVAP/augmentations/"

    # Return value (mixed array, sample rate) is intentionally discarded here.
    test_gpu_stereo_mixer(input_path, output_folder)

Mixed audio saved to: /home/serhan/Desktop/StreamingVAP/augmentations/mixed_fe_03_00208.wav


In [10]:
import torch
import numpy as np
import soundfile as sf
import os
from tqdm import tqdm
import time
import glob

def gpu_stereo_mixer(audio_batch):
    """
    Cross-mix the two channels of every item in a batch with random gains.

    NOTE(review): a function with this name and the same body is also defined
    in an earlier cell; whichever cell runs last wins.

    For each batch item, two gains are drawn independently and uniformly from
    [0, 0.5). The new left channel is ``left + g1 * right`` and the new right
    channel is ``right + g2 * left``. Any item whose peak absolute value then
    exceeds 1 is divided by its own peak; other items are returned unscaled.

    :param audio_batch: Tensor indexed as ``[batch, sample, channel]``; only
        channels 0 and 1 are read. Random gains are created on its device.
    :return: Tensor of shape ``(batch, samples, 2)`` with the mixed audio.
    """
    num_items = audio_batch.shape[0]
    target_device = audio_batch.device

    # One gain per batch item, shaped (batch, 1) so it broadcasts over samples.
    # The two draws stay in this order so seeded runs give the same result.
    gain_into_left = torch.rand(num_items, 1, device=target_device) * 0.5
    gain_into_right = torch.rand(num_items, 1, device=target_device) * 0.5

    left_channel = audio_batch.select(2, 0)
    right_channel = audio_batch.select(2, 1)

    blended = torch.stack(
        (
            left_channel + gain_into_left * right_channel,
            right_channel + gain_into_right * left_channel,
        ),
        dim=2,
    )

    # Per-item peak over samples and channels, kept as (batch, 1, 1) for broadcasting
    item_peak = blended.abs().amax(dim=(1, 2), keepdim=True)

    # Scale down only the items that would clip
    return torch.where(item_peak > 1, blended / item_peak, blended)

def process_audio_file(input_path, device):
    """
    Read one audio file and return its channel-mixed version as a NumPy array.

    NOTE(review): this reuses the name of the earlier
    ``process_audio_file(input_path, output_path)`` cell but takes a device
    and returns the audio instead of writing it; once this cell has run,
    callers of the earlier signature will no longer work.

    A single-channel file is duplicated into two identical channels. The audio
    is converted to a float32 tensor on ``device``, mixed by
    ``gpu_stereo_mixer`` as a batch of one, and copied back to host memory.

    :param input_path: Path of the audio file to read.
    :param device: Torch device the mixing runs on.
    :return: Tuple ``(mixed_audio_np, sample_rate)``.
    :raises ValueError: If the file has more than one channel but not exactly two.
    """
    samples, sample_rate = sf.read(input_path)

    is_mono = samples.ndim == 1
    if not is_mono and samples.shape[1] != 2:
        raise ValueError("Input audio must be mono or stereo")
    if is_mono:
        samples = np.column_stack((samples, samples))

    # Leading batch dimension of 1 for the mixer, removed again afterwards
    batch = torch.from_numpy(samples).float().to(device).unsqueeze(0)
    mixed_audio_np = gpu_stereo_mixer(batch).squeeze(0).cpu().numpy()

    return mixed_audio_np, sample_rate

def pipeline_stereo_mixer(input_folder, output_folder):
    """
    Channel-mix every WAV file under ``input_folder`` and save the results.

    Files are discovered recursively and processed in sorted order. Each
    output keeps its sub-directory relative to ``input_folder`` and gets a
    ``mixed_`` prefix on the *file name* (``a/b.wav`` -> ``a/mixed_b.wav``).
    A failure on one file is reported and counted, and the run continues with
    the next file.

    :param input_folder: Root directory searched recursively for ``*.wav``.
    :param output_folder: Root directory for the mixed files; created if missing.
    :return: None. Progress and a summary are printed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so the processing order does not depend on filesystem enumeration order
    audio_files = sorted(glob.glob(os.path.join(input_folder, "**", "*.wav"), recursive=True))
    total_files = len(audio_files)

    if total_files == 0:
        print("No WAV files found in the input folder.")
        return

    print(f"Found {total_files} WAV files to process.")

    start_time = time.perf_counter()
    processed_files = 0
    errors = 0

    # The context manager closes the bar even if the run is interrupted
    with tqdm(total=total_files, unit="file") as pbar:
        for audio_file in audio_files:
            try:
                mixed_audio, sample_rate = process_audio_file(audio_file, device)

                # Prefix the file name only, so nested inputs keep their directory layout
                rel_dir, file_name = os.path.split(os.path.relpath(audio_file, input_folder))
                output_path = os.path.join(output_folder, rel_dir, f"mixed_{file_name}")
                os.makedirs(os.path.dirname(output_path), exist_ok=True)

                # Save the mixed audio
                sf.write(output_path, mixed_audio, sample_rate)
                processed_files += 1

            except Exception as e:
                # tqdm.write prints above the bar instead of breaking it
                tqdm.write(f"Error processing {audio_file}: {str(e)}")
                errors += 1

            # Progress bookkeeping lives outside the try block so that it can
            # never be mistaken for a file-processing failure, and the bar
            # advances for failed files too.
            elapsed_time = time.perf_counter() - start_time
            if elapsed_time > 0:
                pbar.set_postfix({"Files/s": f"{processed_files / elapsed_time:.2f}"})
            pbar.update(1)

    # Floor the duration so the throughput division below cannot divide by zero
    elapsed_time = max(time.perf_counter() - start_time, 1e-9)

    if processed_files > 0:
        files_per_second = processed_files / elapsed_time
        print(f"Processed {processed_files} files in {elapsed_time:.2f} seconds")
        print(f"Average processing speed: {files_per_second:.2f} files/second")
    else:
        print("No files were processed successfully.")

    if errors > 0:
        print(f"Encountered errors while processing {errors} files.")



In [11]:
# Run the GPU pipeline over the whole input tree.
# In a notebook kernel __name__ is "__main__", so this guard only has an effect
# if the cell is exported to a module and imported.
if __name__ == "__main__":
    # NOTE: absolute, machine-specific paths; edit before running elsewhere.
    input_folder = "/home/erik/projects/data/Fisher/fisher_eng_tr_sp_d1/audio/"
    output_folder = "/home/serhan/Desktop/StreamingVAP/augmentations/mixed/"

    pipeline_stereo_mixer(input_folder, output_folder)

Using device: cuda
Found 799 WAV files to process.


100%|██████████| 799/799 [04:25<00:00,  3.01file/s, Files/s=3.01]

Processed 799 files in 265.21 seconds
Average processing speed: 3.01 files/second



