# 4. APPLICATION OF MODEL TO SELECTED AUDIO FILE
Necessary imports...

In [1]:
# import_ipynb allows a notebook (.ipynb) to be imported like a regular module.
import import_ipynb
# Search for notebooks in the current working directory.
N = import_ipynb.NotebookLoader(path=['.'])
N.load_module('CombiningInferences')
# NOTE(review): the cells below use KeyAndTempoClassifier, librosa, np and pd without
# importing them, so they presumably arrive through this star import — confirm
# against CombiningInferences.ipynb.
from CombiningInferences import *

importing Jupyter notebook from .\CombiningInferences.ipynb
importing Jupyter notebook from .\Functions.ipynb


# Introducing the piece

Initialising classifier...

In [2]:
# Classifier instance shared by every cell below; 'storage' is passed as its only audio folder.
classifier = KeyAndTempoClassifier(audio_folders=['storage'])
# Track analysed throughout this notebook (presumably a file inside 'storage' — verify).
MY_TRACK = 'Ambient_Life.mp3'

Listening to the piece as a whole...

In [None]:
# Play back the full track (presumably renders an audio player — confirm in classifier.listen).
classifier.listen(MY_TRACK)

# Inference for the whole piece
Obtaining inference for the whole track...

In [4]:
# Key and tempo prediction for the track as a whole; displayed as a one-row table below.
classifier.get_inferences(MY_TRACK)

Unnamed: 0,Track,Predicted key,Predicted tempo
0,Ambient_Life.mp3,C minor,125


# Inferences for each segment of the piece

In [5]:
# Load the raw audio samples and sample rate for the selected track.
signal, sr = classifier.load_track(MY_TRACK)

#________________________
# Short-time Fourier transform:
stft = librosa.core.stft(signal, hop_length=classifier.audio_params_1['hop_length'], n_fft=classifier.audio_params_1['n_fft'])
# Obtain the (magnitude) spectrogram:
spectrogram = np.abs(stft)

#________________________
# Evening out irregularities in dimensions...

# Fixed number of time frames that the segmentation below relies on (4000 frames,
# later cut into 200-frame and 100-frame windows). The same value is used for the
# length check, the padding and the truncation so the three can never disagree.
TARGET_FRAMES = 4000

# Pad the time axis with zeros if the track is too short:
if spectrogram.shape[1] < TARGET_FRAMES:
    spectrogram = np.pad(spectrogram, ((0, 0), (0, TARGET_FRAMES - spectrogram.shape[1])))
# Truncate the time axis if the track is too long:
spectrogram = spectrogram[:, :TARGET_FRAMES]

Obtain the melspectrograms for the current audio file...

In [6]:
# Convert the magnitude spectrogram to a decibel (log-amplitude) scale.
log_spectrogram = librosa.amplitude_to_db(spectrogram)

# Mel-scaled spectrograms, one per parameter set (audio_params_1 / audio_params_2).
# NOTE(review): librosa documents `S` as a spectrogram, while a dB-scaled array is
# passed here. Left unchanged — presumably this matches how the models were trained;
# confirm against the training notebook before altering.
melspectrogram_1 = librosa.feature.melspectrogram(
    S=log_spectrogram,
    **{name: classifier.audio_params_1[name] for name in ('sr', 'n_fft', 'hop_length', 'n_mels')}
)
melspectrogram_2 = librosa.feature.melspectrogram(
    S=log_spectrogram,
    **{name: classifier.audio_params_2[name] for name in ('sr', 'n_fft', 'hop_length', 'n_mels')}
)

# Segmenting the 4000-frame melspectrograms.
#
# NOTE:
# The division is done such that key classifying melspectrograms are 200 time stamps each,
# and tempo classifying melspectrograms are 100 time stamps each (to match the neural
# network architectures).
#
# The tempo classifier divides the audio file into twice as many segments as the key
# classifier. Hence each of the 20 key-wise windows is listed twice, giving 40 entries
# that line up one-to-one with the 40 tempo-wise windows.
melspectrograms_1 = np.array([
    melspectrogram_1[:, start:start + 200]
    for start in range(0, 4000, 200)
    for _ in range(2)
])

# Tempo-wise windows are transposed so that time becomes the first axis of each segment.
melspectrograms_2 = np.array([
    melspectrogram_2[:, start:start + 100].T
    for start in range(0, 4000, 100)
])

print(f'Shapes: {melspectrograms_1.shape}, {melspectrograms_2.shape}')

Shapes: (40, 128, 200), (40, 100, 12)


Getting the inferences for each segment of the current file...

In [7]:
# Run the classifier on the segment arrays built above; the two results are tabulated
# per segment in the next cell (presumably one prediction per segment — verify).
key_preds, tempo_preds = classifier.get_inferences_for_melspectrograms(melspectrograms_1, melspectrograms_2)

Presenting as dataframe...

In [8]:
# Tabulate the per-segment predictions (segments numbered from 1) and show the first ten rows.
pd.DataFrame({
    'Segment': range(1, 41),
    'Key predictions': key_preds,
    'Tempo predictions': tempo_preds,
}).head(10)

Unnamed: 0,Segment,Key predictions,Tempo predictions
0,1,C minor,125
1,2,C minor,0
2,3,F major,110
3,4,F major,0
4,5,C minor,0
5,6,C minor,0
6,7,C minor,120
7,8,C minor,0
8,9,F major,0
9,10,F major,0


Listening to each segment...

In [None]:
# Function to facilitate outputs:
def list_to_segment(i):
    """Play one of the 40 equal-length slices of the loaded track.

    Parameters
    ----------
    i : int
        1-based segment number, from 1 to 40 inclusive.

    Raises
    ------
    ValueError
        If ``i`` is outside 1..40. Without this check, 0 or values above 40 would
        slice an empty signal and negative values would silently select segments
        counted from the end of the track.
    """
    n_segments = 40
    if not 1 <= i <= n_segments:
        raise ValueError(f'segment must be between 1 and {n_segments}, got {i}')
    i = i - 1
    # NOTE(review): the slice length is derived from the full signal, whereas the
    # predictions come from a spectrogram padded/truncated to a fixed frame count —
    # confirm that both cover the same span of audio for tracks of any length.
    k = len(signal)//n_segments
    classifier.listen_to_signal(signal[i*k:(i+1)*k], sr=classifier.audio_params_1['sr'])

#================================================
# Inputs:
try:
    # Parsing happens inside the try so that non-numeric input is reported as
    # invalid instead of raising an unhandled ValueError.
    n = int(input('Enter segment: '))
    list_to_segment(n)
except ValueError:
    # Only input problems are reported here; unrelated failures (e.g. playback
    # errors, KeyboardInterrupt) are no longer hidden behind this message.
    print('Invalid input')

We can compare the pitch of a segment with a reference tone using:

https://www.szynalski.com/tone-generator/