In [1]:
import os
from pydub import AudioSegment
import hashlib

def find_mp3_files(directory):
    """Recursively collect the paths of all MP3 files under ``directory``.

    Args:
        directory: Root directory to search; it is traversed with ``os.walk``.

    Returns:
        A sorted list of paths (each ``os.path.join(root, name)``) for every
        file whose name ends in ``.mp3``, compared case-insensitively. The
        list is empty when nothing matches, including when ``directory`` does
        not exist (``os.walk`` ignores listing errors by default).
    """
    mp3_files = [
        os.path.join(root, name)
        for root, _, names in os.walk(directory)
        for name in names
        # Lower-case the name so "TRACK.MP3" / "track.Mp3" are not skipped.
        if name.lower().endswith(".mp3")
    ]
    # os.walk yields entries in arbitrary filesystem order; sorting makes the
    # result (and anything built from it in sequence) reproducible across runs.
    mp3_files.sort()
    return mp3_files

def generate_unique_filename(file_path, extension="wav"):
    """Build a flat file name that stays distinct for same-named source files.

    The name has the form ``<stem>_<md5 of full path>.<extension>``, so two
    files with the same base name in different directories map to different
    output names.

    Args:
        file_path: Path of the source file, as a ``str``.
        extension: Extension for the generated name, without the leading dot.

    Returns:
        The generated file name (no directory component).
    """
    # Hash the whole path, not just the base name. "surrogatepass" yields the
    # same bytes as a plain UTF-8 encode for ordinary text, but also accepts
    # the lone surrogates that the OS layer uses for undecodable file names,
    # which a strict encode would reject with UnicodeEncodeError.
    file_hash = hashlib.md5(file_path.encode("utf-8", "surrogatepass")).hexdigest()
    # Base name with its final extension removed.
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    return f"{base_name}_{file_hash}.{extension}"

def convert_mp3_to_wav(mp3_path, wav_path):
    """Decode the MP3 at ``mp3_path`` and write it to ``wav_path`` as WAV.

    No conversion options are passed to ``export``, so the audio is written
    with whatever parameters the decoded segment carries.

    Args:
        mp3_path: Path of the MP3 file to read.
        wav_path: Destination path for the WAV file.
    """
    # Load and export in one chain; the intermediate segment is not needed.
    AudioSegment.from_mp3(mp3_path).export(wav_path, format="wav")

def merge_wav_files(wav_files, output_path):
    """Concatenate WAV files in the given order and write a single WAV.

    Args:
        wav_files: Iterable of WAV file paths; segments are appended in
            iteration order.
        output_path: Destination path for the combined WAV file.
    """
    # Start from an empty segment and add each loaded file onto it in turn.
    segments = (AudioSegment.from_wav(path) for path in wav_files)
    merged = sum(segments, AudioSegment.empty())
    merged.export(output_path, format="wav")





In [2]:
# Input directory with the source MP3s and output directory for the WAVs.
raw_data_dir = "raw_data"
preprocessing_dir = "dataset"
os.makedirs(preprocessing_dir, exist_ok=True)

mp3_files = find_mp3_files(raw_data_dir)
if not mp3_files:
    # A missing or empty input directory produces an empty list; stop here
    # instead of exporting a zero-length "combined" file and reporting success.
    raise FileNotFoundError(f"No MP3 files found under {raw_data_dir!r}")

# Convert every MP3 to a uniquely named WAV inside the output directory,
# remembering the produced paths in conversion order.
wav_files = []
for mp3_file in mp3_files:
    unique_wav_file = generate_unique_filename(mp3_file)
    wav_file_path = os.path.join(preprocessing_dir, unique_wav_file)
    convert_mp3_to_wav(mp3_file, wav_file_path)
    wav_files.append(wav_file_path)

# Concatenate all converted files into one WAV next to the individual ones.
output_path = os.path.join(preprocessing_dir, "full_audio.wav")
merge_wav_files(wav_files, output_path)
print(f"Combined audio saved to {output_path}")