In [1]:
from os import listdir, makedirs
from itertools import compress
from scipy.io import wavfile
import numpy as np
import shutil

## Reading and Filtering .wav Files

In [42]:
# Configuration for the audio part of the pipeline.
# dataSplit is the sub-folder (below both the source and destination roots)
# that the rest of the notebook reads from and writes to.
dataSplit = "train"

# Dataset folders, relative to the "Data" directory
preprocessDirectory = "SubURMPExtendedAudio/original"
processedDirectory = "SubURMPExtendedAudio/clean"

# Source root (unfiltered audio) and destination root (filtered audio)
srcPath = "Data/" + preprocessDirectory
dstPath = "Data/" + processedDirectory

In [48]:
# Reading in all .wav files of the selected data split
## Get all instrument directory names (hidden entries, i.e. names starting
## with ".", are skipped). The names are sorted because listdir returns them in
## arbitrary order, and instrumentNames / wavFileNames / wavFileData are used
## as parallel lists by the rest of the notebook.
instrumentNames = sorted(
    name for name in listdir(f"{srcPath}/{dataSplit}/") if name[0] != "."
)

## Get all .wav file names: one sorted list per instrument, in the same order
## as instrumentNames
wavFileNames = [
    sorted(
        name
        for name in listdir(f"{srcPath}/{dataSplit}/{instrument}/")
        if name[0] != "."
    )
    for instrument in instrumentNames
]

## Reading in all .wav files in each directory
## The sampling rate returned by wavfile.read is discarded; the author has
## checked that every file has a sampling rate == 44100.
wavFileData = []
for instrument, instrumentFiles in zip(instrumentNames, wavFileNames):
    instrumentData = []
    for file in instrumentFiles:
        _, data = wavfile.read(f"{srcPath}/{dataSplit}/{instrument}/{file}")
        # Keep the first channel only; the author has checked that both
        # channels are identical (data[:,0] == data[:,1]). A mono file is
        # already 1-D, so it is kept as-is instead of failing on data[:, 0].
        instrumentData.append(data if data.ndim == 1 else data[:, 0])

    wavFileData.append(instrumentData)

In [49]:
# Filtering all .wav files on the peak-amplitude condition, i.e. removing silent samples

## A recording is kept when max(abs(amplitude)) >= this threshold
AUDIBLE_PEAK_THRESHOLD = 450

## Creating list objects for filtered data and recording names of remaining and removed files
audiableWavData = []  # filtered data list
audiableWavFiles = []  # remaining file names
inaudiableWavFiles = []  # removed file names


## Loop over every instrument and split its recordings by the threshold
for instrumentData, instrumentFiles in zip(wavFileData, wavFileNames):
    ### Populating list object for whether condition is met (one bool per recording)
    maxConditionMetList = []
    for recording in instrumentData:
        if recording.size == 0:
            # Nothing to measure: an empty recording is treated as silent
            # (max() of an empty array would raise ValueError).
            maxConditionMetList.append(False)
            continue
        # Peak = max(|min|, |max|), taken on Python floats. Calling abs() on
        # the array itself wraps around for the most negative value of a
        # signed integer dtype (e.g. int16 -32768 stays negative), and the
        # builtin max() would walk the array element by element in Python.
        peak = max(abs(float(recording.max())), abs(float(recording.min())))
        maxConditionMetList.append(peak >= AUDIBLE_PEAK_THRESHOLD)

    ### Populating list object for whether condition is not met
    maxConditionFailedList = [not met for met in maxConditionMetList]

    ### Creating list objects for filtered data and filenames for this instrument
    wavDataKeep = list(compress(instrumentData, maxConditionMetList))  # filtered data
    wavFilesKeep = list(compress(instrumentFiles, maxConditionMetList))  # remaining file names
    wavFilesRemove = list(compress(instrumentFiles, maxConditionFailedList))  # removed file names

    ### Adding data for this instrument to the parent lists
    audiableWavData.append(wavDataKeep)
    audiableWavFiles.append(wavFilesKeep)
    inaudiableWavFiles.append(wavFilesRemove)




In [50]:
# Copying all audible files from the SubURMP dataset to the clean dataset
# Author's note: ONLY RUN THIS CODE ONCE

for name, fileList in zip(instrumentNames, audiableWavFiles):
    instrumentDstDir = f"{dstPath}/{dataSplit}/{name}"
    # shutil.copy does not create missing parent folders, so make sure the
    # per-instrument destination folder exists before copying into it.
    makedirs(instrumentDstDir, exist_ok=True)

    for file in fileList:
        sourcePath = f"{srcPath}/{dataSplit}/{name}/{file}"
        destPath = f"{instrumentDstDir}/{file}"

        shutil.copy(sourcePath, destPath)
  

## Pairing Filtered .wav files with Images


In [51]:
# Producing a list of wanted image file names: the audio file name with its
# ".wav" extension swapped for ".jpg". The result has one list per instrument,
# in the same order as audiableWavFiles.
# Only a trailing ".wav" is rewritten: str.replace("wav", "jpg") would rewrite
# every occurrence of "wav" in the name, including one inside the stem.
# Names that do not end in ".wav" are passed through unchanged.
audiableImgFiles = [
    [
        fileName[: -len(".wav")] + ".jpg" if fileName.endswith(".wav") else fileName
        for fileName in instrument
    ]
    for instrument in audiableWavFiles
]

In [52]:
# Root folders of the image dataset: images are read from the source folder
# and the ones paired with kept audio files are written to the destination.
_imageRoot = "Data/SubURMP64/images"
imageSrcDirectory = _imageRoot + "/original"
imageDstDirectory = _imageRoot + "/extendedAudio"

In [53]:
# Copying all images associated with audible files from the SubURMP dataset to the clean dataset
# Author's note: ONLY RUN THIS CODE ONCE

for name, fileList in zip(instrumentNames, audiableImgFiles):
    instrumentDstDir = f"{imageDstDirectory}/{dataSplit}/{name}"
    # shutil.copy does not create missing parent folders, so make sure the
    # per-instrument destination folder exists before copying into it.
    makedirs(instrumentDstDir, exist_ok=True)

    for file in fileList:
        sourcePath = f"{imageSrcDirectory}/{dataSplit}/{name}/{file}"
        destPath = f"{instrumentDstDir}/{file}"

        shutil.copy(sourcePath, destPath)

In [54]:
# Recording which files were removed by the filter, grouped by instrument:
# each instrument name on its own line, followed by one removed file name per line.
# The report name is derived from dataSplit (identical to the previous
# hard-coded name when dataSplit == "train"), so running the notebook for a
# different split does not overwrite the report of another split.
removedReportPath = f"Documentation/ExtendedAudio_{dataSplit}_files_removed.txt"
makedirs("Documentation", exist_ok=True)  # open() does not create the folder

with open(removedReportPath, "w") as f:
    for instrument, removedFiles in zip(instrumentNames, inaudiableWavFiles):
        f.write(f"{instrument}\n")
        f.writelines(f"{file}\n" for file in removedFiles)