In [1]:

import os
import re
import wave
from os import listdir

import numpy as np
from tqdm import tqdm


#### Reading in Data

In [2]:
# List the instrument directories of the validation split and the files inside each one.
validationDir = "Data/SubURMP/chunk/validation"

## Instrument directory names; entries whose name starts with "." (hidden) are skipped
instrumentNames = [name for name in listdir(f"{validationDir}/") if not name.startswith(".")]

## wavFileNames[k] holds the non-hidden file names found under instrumentNames[k].
## The loop variable is deliberately not called `dir`, so the builtin `dir()` stays usable in the kernel.
wavFileNames = []
for instrumentDir in instrumentNames:
    visibleNames = [
        name
        for name in listdir(f"{validationDir}/{instrumentDir}/")
        if not name.startswith(".")
    ]
    wavFileNames.append(visibleNames)


#### Ordering data 

In [3]:
# Keep only the digits of every file name, e.g. "tuba00_39200.wav" -> "0039200".
# The digits stay strings; numericFileNames has one inner list per instrument,
# in the same order as wavFileNames.
numericFileNames = [
    [re.sub("[^0-9]", "", fileName) for fileName in namesForInstrument]
    for namesForInstrument in wavFileNames
]

In [4]:
# Build two sort keys per file so the names can be ordered to match the directory.
# Both lists have one inner list per instrument, aligned with numericFileNames.

## First two digits of each numeric name
completeIdxOne = [
    [digits[:2] for digits in instrumentDigits]
    for instrumentDigits in numericFileNames
]

## All remaining digits of each numeric name
completeIdxTwo = [
    [digits[2:] for digits in instrumentDigits]
    for instrumentDigits in numericFileNames
]



# Excluding reading in data 

In [6]:
# Order every instrument's file names by the two numeric keys built above

## One sorted array of file names per instrument
sortedFileNames = []

for names, keyOne, keyTwo in zip(wavFileNames, completeIdxOne, completeIdxTwo):

    ### The keys are digit strings; compare them as integers
    primaryKey = np.array(keyOne).astype(int)
    secondaryKey = np.array(keyTwo).astype(int)

    ### np.lexsort uses the LAST key in the tuple as the primary sort key
    order = np.lexsort((secondaryKey, primaryKey))

    sortedFileNames.append(np.array(names)[order])



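#######
## The sorting above has been checked manually and found to be accurate.
## To re-check it, use the snippet below.
## NOTE(review): `sortedFileData` and `wavfile` are not defined in this notebook as shown
## (presumably left over from a version that also read the audio data) - confirm before running.
# sortedFileNames[8][387]
# sortedFileData[8][387]
# samplerate, data = wavfile.read("Data/SubURMPClean/audio/train/tuba/tuba00_39200.wav")
# data[:,0]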

    

In [7]:
# This cell divides each instrument's sorted file names into its individual performances.
# A file belongs to performance `idx` when the first two digits in its name equal `idx`.
# Files whose two-digit prefix is not listed in performanceID end up in no group.
performanceID = ['00', '01', '02', '03', '04', '05', '06', '07', '08', '09'] # All possible performance ID's


# Resulting layout: one list per instrument, holding one array of file names per performance
# that actually occurs for that instrument (empty performances are left out):
# Instrument A
#   - number 0
#   - number 1
# Instrument B
#   - number 0
#   - number 1
sortedPerformanceFileNames = []

for fileset in sortedFileNames:
    # Extract each name's two-digit prefix once, instead of once per performance ID
    prefixes = [re.sub(r'[^0-9]', '', name)[:2] for name in fileset]

    performanceNameList = []
    for idx in performanceID:
        belongsToPerformance = [prefix == idx for prefix in prefixes]  # Boolean mask over fileset
        names = fileset[belongsToPerformance]

        # Keep only performances that have at least one file
        if names.shape[0] != 0:
            performanceNameList.append(names)

    sortedPerformanceFileNames.append(performanceNameList)
  
        



In [8]:
#TODO Make sure that the last file to be created has the same number of samples in it as every other file. This way the audio lengths will be the same otherwise AudioCLIP will not run properly 

def extendAudio(fileNames, instrument, outputDirectory, overlapIncriment = 10, length = 10,
                inputDirectory = "Data/SubURMP/chunk/validation", chunkStride = 5,
                dropIncomplete = False):
    """Concatenate consecutive audio chunks of one performance into longer .wav files.

    Every `chunkStride`-th entry of `fileNames` is kept. A window of up to `length`
    kept files is then read from `{inputDirectory}/{instrument}/`, joined end to end,
    and written to `{outputDirectory}/{instrument}/` under the name of the window's
    first file. Windows start every `overlapIncriment` kept files, so they overlap
    whenever `overlapIncriment < length` (10 with length 10 == no overlap,
    1 == maximum overlap).

    Parameters
    ----------
    fileNames : sequence of str
        Ordered chunk file names of a single performance (list or 1-D array).
        An empty sequence produces no output.
    instrument : str
        Sub-directory name used under both the input and the output directory.
    outputDirectory : str
        Root of the output tree; `{outputDirectory}/{instrument}` must already exist.
    overlapIncriment : int
        Step between window starts, counted in kept files. Must not be 0.
    length : int
        Maximum number of kept files joined into one output file.
    inputDirectory : str
        Root of the input tree. Defaults to the previously hard-coded location.
    chunkStride : int
        Keep every `chunkStride`-th file name. Defaults to the previously hard-coded 5.
    dropIncomplete : bool
        When True, windows holding fewer than `length` files (the tail of the
        performance) are skipped instead of written as shorter clips.
        Defaults to False, which keeps the original behaviour.

    Notes
    -----
    The output header is copied from the first chunk of each window; all chunks
    are assumed to share channel count, sample width and frame rate (not checked).
    """
    keptNames = fileNames[::chunkStride]  # computed once instead of on every use

    for start in range(0, len(keptNames), overlapIncriment):
        window = keptNames[start:start + length]
        if dropIncomplete and len(window) < length:
            continue

        headerParams = None
        frameBlocks = []
        for file in window:
            # `with` closes the reader even if reading raises
            with wave.open(f"{inputDirectory}/{instrument}/{file}", 'rb') as reader:
                if headerParams is None:
                    headerParams = reader.getparams()
                frameBlocks.append(reader.readframes(reader.getnframes()))

        outputName = f"{outputDirectory}/{instrument}/{keptNames[start]}"
        with wave.open(outputName, 'wb') as writer:
            # The frame count stored in headerParams is corrected by the wave module on close
            writer.setparams(headerParams)
            for block in frameBlocks:
                writer.writeframes(block)
        

In [12]:
# Create one output directory per instrument.
# makedirs(..., exist_ok=True) also creates missing parent directories and does not
# fail when the directory already exists, so this cell can be re-run safely.
for instrument in instrumentNames:
    os.makedirs(f"Data/SubURMPExtendedAudio/validation/{instrument}", exist_ok=True)

In [13]:
# Write the extended clips for every performance of every instrument.
for instrument, performances in zip(instrumentNames, sortedPerformanceFileNames):
    # One progress bar per instrument, labelled with the instrument name in braces
    progress = tqdm(performances, desc=f"{{{instrument}}}")
    for performanceFiles in progress:
        extendAudio(
            performanceFiles,
            instrument,
            "Data/SubURMPExtendedAudio/validation",
            overlapIncriment=10,
        )
    

{cello}: 100%|██████████| 1/1 [00:00<00:00,  9.10it/s]
{violin}: 100%|██████████| 1/1 [00:00<00:00,  9.33it/s]
{trombone}: 100%|██████████| 1/1 [00:00<00:00, 11.67it/s]
{bassoon}: 100%|██████████| 1/1 [00:00<00:00, 22.02it/s]
{clarinet}: 100%|██████████| 1/1 [00:00<00:00,  9.76it/s]
{sax}: 100%|██████████| 2/2 [00:00<00:00, 20.28it/s]
{oboe}: 100%|██████████| 1/1 [00:00<00:00, 17.11it/s]
{trumpet}: 100%|██████████| 1/1 [00:00<00:00, 16.10it/s]
{tuba}: 100%|██████████| 1/1 [00:00<00:00, 15.87it/s]
{horn}: 100%|██████████| 1/1 [00:00<00:00, 17.70it/s]
{viola}: 100%|██████████| 1/1 [00:00<00:00, 15.23it/s]
{flute}: 100%|██████████| 1/1 [00:00<00:00,  7.20it/s]
{double_bass}: 100%|██████████| 1/1 [00:00<00:00,  7.92it/s]
