In [None]:
import os

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, Dense, Dropout, Flatten, MaxPool2D
from tensorflow.keras.optimizers import Adam

Visualize a Single Audio File

In [None]:
# Path of the sample clip loaded by the cells that follow.
random_file_name = "./blues.00000.wav"

In [None]:
# Load the clip; passing sr=50000 makes librosa resample it to 50 kHz.
y, sr = librosa.load(random_file_name, sr=50000)

# Draw on an explicit Axes so the figure size and labels belong to this plot,
# and finish with plt.show() so the cell output is the figure, not an object repr.
fig, ax = plt.subplots(figsize=(14, 5))
librosa.display.waveshow(y, sr=sr, ax=ax)
ax.set(title="Waveform", ylabel="Amplitude")
plt.show()

In [None]:
# Shape of the waveform array returned by librosa.load.
y.shape

Playing Sound

In [None]:
from IPython.display import Audio
# Inline audio player for the waveform `y`, played back at the rate `sr`.
Audio(data=y,rate=sr)

Visualizing Chunks of the Audio

In [None]:
audio_path = "./blues.00000.wav"
# sr=None keeps the file's original sampling rate instead of resampling.
y, sr = librosa.load(audio_path, sr=None)

# Length of each chunk and of the overlap between neighbouring chunks, in seconds.
chunk_duration = 4
overlap_duration = 2

# Convert durations to sample counts.
chunk_samples = int(chunk_duration * sr)
overlap_samples = int(overlap_duration * sr)
# Distance between the starts of two consecutive chunks.
hop_samples = chunk_samples - overlap_samples

# Number of chunks needed to cover the signal; the last one may be shorter.
num_chunks = int(np.ceil((len(y) - chunk_samples) / hop_samples)) + 1

for i in range(num_chunks):
    # Start and end sample indices of this chunk.
    start = i * hop_samples
    end = start + chunk_samples
    chunk = y[start:end]
    # plt.figure (lowercase) registers the figure with pyplot so waveshow draws on it;
    # plt.Figure only builds a detached object, so its figsize was never applied.
    plt.figure(figsize=(4, 2))
    librosa.display.waveshow(chunk, sr=sr)
    plt.title(f"Chunk {i + 1}/{num_chunks}")
    plt.show()

Mel Spectrogram Visualization

In [None]:
#Plot the mel spectrogram of the entire audio clip
def plot_melspectogram(y, sr):
    """Show the mel spectrogram of a whole signal on a decibel scale.

    Args:
        y: Audio time series handed to ``librosa.feature.melspectrogram``.
        sr: Sampling rate of ``y``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    mel_power = librosa.feature.melspectrogram(y=y, sr=sr)
    # Log-scale the power values, using the maximum as the reference level.
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    fig = plt.figure(figsize=(10, 4))
    ax = fig.gca()
    mesh = librosa.display.specshow(mel_db, sr=sr, x_axis='time', y_axis='mel', ax=ax)
    fig.colorbar(mesh, ax=ax, format='%+2.0f dB')
    ax.set_title('Melspectogram')
    fig.tight_layout()
    plt.show()

In [None]:
# Load the sample clip, resampled to 50 kHz (sr=50000).
random_file_name = "./blues.00000.wav"
y,sr = librosa.load(random_file_name,sr=50000)

In [None]:
# Mel spectrogram of the entire clip.
plot_melspectogram(y,sr)

In [None]:
def plot_melspectogram_chunks(y, sr, chunk_duration=4, overlap_duration=2):
    """Plot one mel spectrogram per overlapping chunk of an audio signal.

    Args:
        y: Audio time series to split into chunks.
        sr: Sampling rate of ``y``.
        chunk_duration: Length of each chunk in seconds.
        overlap_duration: Overlap between consecutive chunks in seconds.

    Returns:
        None. One figure per chunk is displayed with ``plt.show()``.

    Raises:
        ValueError: If the overlap is not shorter than the chunk, which would
            leave no forward step between chunks.
    """
    # Convert durations to sample counts.
    chunk_samples = int(chunk_duration * sr)
    overlap_samples = int(overlap_duration * sr)
    # Distance between the starts of two consecutive chunks.
    hop_samples = chunk_samples - overlap_samples
    if hop_samples <= 0:
        raise ValueError("overlap_duration must be smaller than chunk_duration")

    # Number of chunks needed to cover the signal; the last one may be shorter.
    num_chunks = int(np.ceil((len(y) - chunk_samples) / hop_samples)) + 1

    for i in range(num_chunks):
        start = i * hop_samples
        chunk = y[start:start + chunk_samples]
        # Mel spectrogram of this chunk, converted to decibels relative to its maximum.
        melspectogram = librosa.feature.melspectrogram(y=chunk, sr=sr)
        spectogram_db = librosa.power_to_db(melspectogram, ref=np.max)
        plt.figure(figsize=(10, 4))
        librosa.display.specshow(spectogram_db, sr=sr, x_axis='time', y_axis='mel')
        plt.colorbar(format='%+2.0f dB')
        plt.title(f'Mel spectrogram - chunk {i + 1}/{num_chunks}')
        plt.tight_layout()
        plt.show()
    
    

In [None]:
# Load the sample clip again, resampled to 50 kHz (sr=50000).
random_file_name="./blues.00000.wav"
y,sr = librosa.load(random_file_name,sr=50000)

In [None]:
# Call the chunk-wise plotting helper on the loaded clip.
plot_melspectogram_chunks(y,sr)

In [None]:
# Dataset root; load_and_preprocess_data looks for one sub-directory per class name inside it.
data_dir = "./genres_original"
# Class (genre) names; a name's position in this list is used as its integer label.
classes = ['blues', 'classical','country','disco','hiphop','jazz','metal','pop','reggae','rock']

In [None]:
from tensorflow.image import resize
#Load and preprocess audio data
def load_and_preprocess_data(data_dir,classes,target_shape=(150,150),chunk_duration=4,overlap_duration=2):
    """Turn a folder-per-class audio dataset into fixed-size mel-spectrogram samples.

    Every ``.wav`` file under ``data_dir/<class_name>`` is loaded at its original
    sampling rate and cut into overlapping chunks. Each chunk becomes a power mel
    spectrogram (no dB conversion) that is resized to ``target_shape``.

    Args:
        data_dir: Directory containing one sub-directory per entry of ``classes``.
        classes: Sequence of class (sub-directory) names; the position of a name
            is used as its integer label.
        target_shape: ``(height, width)`` each spectrogram is resized to.
        chunk_duration: Chunk length in seconds.
        overlap_duration: Overlap between consecutive chunks in seconds; must be
            smaller than ``chunk_duration``.

    Returns:
        ``(data, labels)`` as NumPy arrays. ``data`` stacks one
        ``target_shape + (1,)`` array per chunk; ``labels`` holds the matching
        class indices.

    Raises:
        ValueError: If ``chunk_duration`` is not greater than ``overlap_duration``.
    """
    if chunk_duration <= overlap_duration:
        raise ValueError("chunk_duration must be greater than overlap_duration")

    data = []
    labels = []

    for i_class, class_name in enumerate(classes):
        class_dir = os.path.join(data_dir, class_name)
        print("Processing--", class_name)
        # os.listdir returns entries in arbitrary order; sort so the sample order
        # (and therefore any seeded split made from it) is reproducible.
        for filename in sorted(os.listdir(class_dir)):
            if not filename.endswith('.wav'):
                continue
            file_path = os.path.join(class_dir, filename)
            # sr=None keeps the file's original sampling rate.
            audio_data, sample_rate = librosa.load(file_path, sr=None)

            # Convert durations to sample counts for this file's sampling rate.
            chunk_samples = int(chunk_duration * sample_rate)
            hop_samples = chunk_samples - int(overlap_duration * sample_rate)

            # Number of chunks needed to cover the signal; the last may be shorter.
            num_chunks = int(np.ceil((len(audio_data) - chunk_samples) / hop_samples)) + 1

            for i in range(num_chunks):
                start = i * hop_samples
                chunk = audio_data[start:start + chunk_samples]
                mel_spectrogram = librosa.feature.melspectrogram(y=chunk, sr=sample_rate)
                # Add a channel axis, then resize to the requested height and width.
                mel_spectrogram = resize(np.expand_dims(mel_spectrogram, axis=-1), target_shape)
                # Store a NumPy array instead of the TensorFlow tensor so the final
                # np.array() call stacks plain arrays.
                data.append(mel_spectrogram.numpy())
                labels.append(i_class)

    return np.array(data), np.array(labels)

In [None]:
# Build the dataset from the .wav files found in each class sub-directory of data_dir.
data,labels = load_and_preprocess_data(data_dir,classes)

In [None]:
# Shape of the stacked spectrogram array.
data.shape

In [None]:
# Shape of the integer label array (one entry per chunk).
labels.shape

In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer class ids. The ndim guard keeps this cell idempotent:
# without it, re-running the cell would feed the already-encoded 2-D matrix back
# into to_categorical and produce a 3-D array.
if labels.ndim == 1:
    labels = to_categorical(labels, num_classes=len(classes))
# Show only the first rows instead of the whole array.
labels[:5]

In [None]:
# Shape of the labels after one-hot encoding.
labels.shape

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples, with a fixed seed for a repeatable split.
# NOTE(review): the split is made per chunk, and load_and_preprocess_data cuts
# overlapping chunks from every file, so chunks of one recording can land in both
# sets - confirm whether a per-file split is wanted.
X_train,X_test,Y_train,Y_test = train_test_split(data,labels,test_size=0.2,random_state=42)

In [None]:
# Create an empty Sequential model.
model = tf.keras.models.Sequential()

In [None]:
# Shape of a single training sample.
X_train[0].shape

In [None]:
# Build the whole network in a single cell. Appending layers with model.add()
# across many cells stacks duplicate layers whenever one of those cells is run
# twice; rebinding `model` here gives the same architecture on every run.
def conv_block(filters):
    """Return two 3x3 ReLU convolutions ('same' then default padding) and a 2x2 max-pool."""
    return [
        Conv2D(filters=filters, kernel_size=3, padding='same', activation='relu'),
        Conv2D(filters=filters, kernel_size=3, activation='relu'),
        MaxPool2D(pool_size=2, strides=2),
    ]


model = tf.keras.models.Sequential([
    # Input shape is taken from one training sample.
    tf.keras.Input(shape=X_train[0].shape),
    *conv_block(32),
    *conv_block(64),
    *conv_block(128),
    Dropout(0.3),
    *conv_block(256),
    *conv_block(512),
    Dropout(0.3),
    Flatten(),
    Dense(units=1200, activation='relu'),
    Dropout(0.45),
    # One softmax output per class.
    Dense(units=len(classes), activation='softmax'),
])

In [None]:
# Print the layer-by-layer summary of the network.
model.summary()

In [None]:
# Adam optimizer with learning rate 1e-4; categorical cross-entropy pairs with the one-hot labels.
model.compile(optimizer=Adam(learning_rate=0.0001),loss='categorical_crossentropy',metrics=['accuracy'])

In [None]:
# Train for 30 epochs with batches of 32 samples.
# NOTE(review): X_test/Y_test are passed as validation data, so no split stays
# unseen during training - confirm this is intended.
training_history = model.fit(X_train,Y_train,epochs=30,batch_size=32,validation_data=(X_test,Y_test))

In [None]:
# Save the trained model to Trained_model.h5 in the working directory.
model.save("Trained_model.h5")

In [None]:
# Metrics recorded by model.fit during training.
training_history.history