In [2]:
import os
import librosa
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Conv2D,MaxPool2D,Flatten,Dense,Dropout
from tensorflow.keras.optimizers.legacy import Adam #for MAC configuration for windows use directly from optimizers

## Visualize single audio

In [None]:
random_file = "rock.wav"

In [None]:
# Decode the clip, resampling it to 44.1 kHz, and draw its waveform.
y, sr = librosa.load(random_file, sr=44100)
plt.figure(figsize=(14,5))
librosa.display.waveshow(y, sr=sr)
# Disabled sanity check: compare the expected sample count of a 30 s clip
# at 44.1 kHz with the number of samples actually loaded.
# print(30*44100)
# print(y.shape)

## Play audio

In [None]:
from IPython.display import Audio
Audio(data=y,rate=sr)

## Visualizing chunks of audio

In [None]:
# Reload the clip at its native sampling rate (sr=None disables resampling).
audio_path = "rock.wav"
y, sr = librosa.load(audio_path, sr=None)

# Window layout in seconds: every chunk is 4 s long and shares 2 s with the
# previous chunk, so context carries over between neighbouring chunks.
chunk_duration = 4
overlap_duration = 2

# The same layout expressed in samples.
chunk_samples = sr * chunk_duration
overlap_samples = sr * overlap_duration
print(len(y), chunk_samples, sr)

# Consecutive chunks start (chunk - overlap) = 2 s apart: 0-4 s, 2-6 s, ...
# The ceil keeps one final, possibly shorter, chunk that covers the tail.
hop_samples = chunk_samples - overlap_samples
num_chunks = 1 + int(np.ceil((len(y) - chunk_samples) / hop_samples))
print(num_chunks)  # expected: 15 for a 30 s clip

# Draw one small waveform per chunk.
for i in range(num_chunks):
    start = hop_samples * i
    end = chunk_samples + start
    chunk = y[start:end]

    plt.figure(figsize=(4, 2))
    librosa.display.waveshow(chunk, sr=sr)
    plt.show()

## Melspectrogram Visualization

In [None]:
random_file_name = "rock.wav"
y,sr = librosa.load(random_file_name,sr=44100)

In [None]:
#Plotting Melspectrogram of Entire Audio
def plot_melspectrogram(y,sr):
    """Display the mel spectrogram of a whole signal on a decibel scale.

    y  -- audio samples, handed unchanged to librosa.feature.melspectrogram
    sr -- sampling rate of ``y``; also used to scale the time axis of the plot
    """
    # Mel power spectrogram, log-compressed relative to its own peak value.
    mel_db = librosa.power_to_db(
        librosa.feature.melspectrogram(y=y, sr=sr),
        ref=np.max,
    )

    plt.figure(figsize=(10, 4))
    librosa.display.specshow(mel_db, sr=sr, x_axis='time', y_axis='mel')
    plt.colorbar(format='%2.0f dB')
    plt.title("Spectrogram")
    plt.tight_layout()
    plt.show()

In [None]:
plot_melspectrogram(y, sr)

In [None]:
#Melspectrogram for audio chunks
def plot_melspectrogram_chunks(y,sr):
    """Plot one decibel-scaled mel spectrogram per overlapping chunk of ``y``.

    The signal is cut into 4 s windows whose starts are 2 s apart (each window
    shares 2 s with the previous one). The last window may be shorter than
    4 s. The shape of every raw spectrogram is printed before it is drawn.

    y  -- audio samples (sliceable, supports len())
    sr -- sampling rate of ``y`` in samples per second
    """
    window_len = 4 * sr
    shared_len = 2 * sr
    step = window_len - shared_len

    # Number of window starts needed to reach the end of the signal.
    n_windows = 1 + int(np.ceil((len(y) - window_len) / step))

    for offset in (k * step for k in range(n_windows)):
        segment = y[offset:offset + window_len]

        mel = librosa.feature.melspectrogram(y=segment, sr=sr)
        print(mel.shape)
        mel_db = librosa.power_to_db(mel, ref=np.max)

        plt.figure(figsize=(10, 4))
        librosa.display.specshow(mel_db, sr=sr, x_axis='time', y_axis='mel')
        plt.colorbar(format='%2.0f dB')
        plt.title("Spectrogram")
        plt.tight_layout()
        plt.show()

In [None]:
plot_melspectrogram_chunks(y,sr)

In [None]:
# Number of values in one chunk spectrogram -- presumably the (128, 345) shape
# printed by the cell above (mel bands x frames). The resize target used for
# the model input should hold roughly this many values.
128*345 #take close enough dimension to this

In [None]:
# Candidate square image size: 210*210 = 44,100 values, close to 128*345 = 44,160.
210*210

## Data Preprocessing

In [None]:
data_dir = "genres_original"
# classes = ['blues', 'classical','country','disco','hiphop','metal','pop','reggae','rock']
classes = ['hiphop','metal','rock']

In [None]:
from tensorflow.image import resize #resize for the above output dimension
#Load and preprocess audio data
def _chunk_bounds(total_samples, chunk_samples, hop_samples):
    """Return the (start, end) sample indices of every overlapping chunk.

    Chunk starts are ``hop_samples`` apart. The last chunk may extend past
    ``total_samples`` (slicing then yields a shorter chunk). A signal too short
    to produce a non-empty chunk gives an empty list.
    """
    if hop_samples <= 0:
        raise ValueError("chunk length must exceed the overlap by at least one sample")
    num_chunks = int(np.ceil((total_samples - chunk_samples) / hop_samples)) + 1
    return [(i * hop_samples, i * hop_samples + chunk_samples) for i in range(num_chunks)]


def load_and_preprocess_data(data_dir,classes,target_shape=(150,150),chunk_duration=4,overlap_duration=2):
    """Turn every ``.wav`` file under ``data_dir/<class>`` into resized mel spectrograms.

    Each file is loaded at its native sampling rate, cut into overlapping
    chunks, converted to a mel power spectrogram (not converted to decibels)
    and resized to ``target_shape``.

    Parameters
    ----------
    data_dir : str
        Directory holding one sub-directory per class.
    classes : sequence of str
        Sub-directory names; a sample's label is the index of its class here.
    target_shape : tuple of int
        (height, width) every spectrogram is resized to.
    chunk_duration : float
        Length of one chunk in seconds (default 4).
    overlap_duration : float
        Seconds shared by two consecutive chunks (default 2); must be smaller
        than ``chunk_duration``.

    Returns
    -------
    (data, labels) : tuple of numpy.ndarray
        ``data`` has shape (n_chunks, *target_shape, 1); ``labels`` holds the
        integer class index of each chunk.

    Raises
    ------
    ValueError
        If ``chunk_duration`` is not positive or ``overlap_duration`` is not in
        ``[0, chunk_duration)``.
    """
    if chunk_duration <= 0 or not 0 <= overlap_duration < chunk_duration:
        raise ValueError("need chunk_duration > 0 and 0 <= overlap_duration < chunk_duration")

    data = []
    labels = []

    for i_class, class_name in enumerate(classes):
        class_dir = os.path.join(data_dir, class_name)
        print("Processing--", class_name)
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # sample order (and any seeded split made from it) reproducible.
        for filename in sorted(os.listdir(class_dir)):
            if not filename.endswith('.wav'):
                continue
            file_path = os.path.join(class_dir, filename)
            audio_data, sample_rate = librosa.load(file_path, sr=None)

            # Durations in seconds -> lengths in whole samples
            chunk_samples = int(chunk_duration * sample_rate)
            hop_samples = chunk_samples - int(overlap_duration * sample_rate)

            for start, end in _chunk_bounds(len(audio_data), chunk_samples, hop_samples):
                chunk = audio_data[start:end]
                mel_spectrogram = librosa.feature.melspectrogram(y=chunk, sr=sample_rate)
                # Add a channel axis, then resize to the fixed model input size.
                # A shorter final chunk is stretched to the same size.
                mel_spectrogram = resize(np.expand_dims(mel_spectrogram, axis=-1), target_shape)
                # Store a plain ndarray so the final np.array() stacks arrays,
                # not a list of tensors.
                data.append(np.asarray(mel_spectrogram))
                labels.append(i_class)

    return np.array(data), np.array(labels)

In [None]:
data,labels = load_and_preprocess_data(data_dir,classes)

In [None]:
data.shape

In [None]:
labels.shape

In [None]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the integer class ids (one column per entry of `classes`).
# The ndim guard makes the cell safe to re-run: without it a second run would
# one-hot encode the already encoded 2-D array again.
if labels.ndim == 1:
    labels = to_categorical(labels,num_classes = len(classes))
labels

In [None]:
labels.shape

## Splitting dataset into training and test set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the chunks for testing; the fixed seed keeps the shuffle repeatable.
# NOTE(review): the split is made per chunk, and neighbouring chunks of one
# track share 2 s of audio, so parts of a track can land on both sides of the
# split -- consider splitting by track instead.
X_train, X_test, Y_train, Y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
Y_train.shape

In [None]:
Y_test.shape

## Building Model

In [None]:
model = tf.keras.models.Sequential()

In [None]:
X_train[0].shape

In [None]:
# Convolution block 1: two 3x3 convolutions with 32 filters, then 2x2 max pooling.
# The first convolution is padded ('same') so it keeps the input's height and
# width, and it fixes the model's input shape. The second is unpadded, so it
# trims one pixel from every border. Pooling then halves height and width.
for layer in (
    Conv2D(filters=32, kernel_size=3, padding='same', activation='relu', input_shape=X_train[0].shape),
    Conv2D(filters=32, kernel_size=3, activation='relu'),
    MaxPool2D(pool_size=2, strides=2),
):
    model.add(layer)

In [None]:
# Convolution block 2: same layout as block 1 with 64 filters.
# Per the model summary this maps 74x74x32 to 36x36x64.
for layer in (
    Conv2D(filters=64, kernel_size=3, padding='same', activation='relu'),
    Conv2D(filters=64, kernel_size=3, activation='relu'),
    MaxPool2D(pool_size=2, strides=2),
):
    model.add(layer)

In [None]:
# Convolution block 3: padded conv, unpadded conv, 2x2 pooling, 128 filters.
# Assuming the 36x36 input produced by the previous block, the output is
# 17x17x128 (36 -> 34 after the unpadded convolution -> 17 after pooling).
for layer in (
    Conv2D(filters=128, kernel_size=3, padding='same', activation='relu'),
    Conv2D(filters=128, kernel_size=3, activation='relu'),
    MaxPool2D(pool_size=2, strides=2),
):
    model.add(layer)

In [None]:
model.add(Dropout(0.3))

In [None]:
# Convolution block 4: padded conv, unpadded conv, 2x2 pooling, 256 filters.
# Assuming a 17x17 input, the output is 7x7x256
# (17 -> 15 after the unpadded convolution -> 7 after pooling, rounded down).
for layer in (
    Conv2D(filters=256, kernel_size=3, padding='same', activation='relu'),
    Conv2D(filters=256, kernel_size=3, activation='relu'),
    MaxPool2D(pool_size=2, strides=2),
):
    model.add(layer)

In [None]:
# Convolution block 5: padded conv, unpadded conv, 2x2 pooling, 512 filters.
# Assuming a 7x7 input, the output is 2x2x512
# (7 -> 5 after the unpadded convolution -> 2 after pooling, rounded down).
for layer in (
    Conv2D(filters=512, kernel_size=3, padding='same', activation='relu'),
    Conv2D(filters=512, kernel_size=3, activation='relu'),
    MaxPool2D(pool_size=2, strides=2),
):
    model.add(layer)

In [None]:
model.add(Dropout(0.3)) #drop 30% of neurons from that to deal with overfitting

In [None]:
model.add(Flatten())

In [None]:
model.add(Dense(units=1200,activation='relu'))

In [None]:
model.add(Dropout(0.45))

In [None]:
#Output layer
model.add(Dense(units=len(classes),activation='softmax')) #multiclass = softmax

In [None]:
model.summary()

In [None]:
#Compile the model
#multi class = categorical_crossentropy
model.compile(optimizer=Adam(learning_rate=0.0001),loss='categorical_crossentropy',metrics=['accuracy'])

In [None]:
#Training Model
# Trains for 30 epochs in batches of 32 and keeps the per-epoch metrics in
# `training_history`.
# NOTE(review): the held-out test split is passed as validation data, so the
# "validation" curves and the later test-set evaluation use the same samples.
training_history = model.fit(X_train,Y_train,epochs=30,batch_size=32,validation_data=(X_test,Y_test))

In [None]:
model.save("Trained_model.keras") #Mac

In [None]:
training_history.history

In [None]:
#Recording History in json
import json
# json cannot serialise NumPy scalars (e.g. float32), which some Keras
# versions and callbacks put into the history; convert every value to a
# plain Python float before writing.
history_as_floats = {
    metric: [float(value) for value in values]
    for metric, values in training_history.history.items()
}
with open('training_hist.json','w') as f:
    json.dump(history_as_floats,f)

In [None]:
X_train

In [3]:
#Reloading model variable
model = tf.keras.models.load_model("Trained_model.keras")
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 150, 150, 32)      320       
                                                                 
 conv2d_1 (Conv2D)           (None, 148, 148, 32)      9248      
                                                                 
 max_pooling2d (MaxPooling2  (None, 74, 74, 32)        0         
 D)                                                              
                                                                 
 conv2d_2 (Conv2D)           (None, 74, 74, 64)        18496     
                                                                 
 conv2d_3 (Conv2D)           (None, 72, 72, 64)        36928     
                                                                 
 max_pooling2d_1 (MaxPoolin  (None, 36, 36, 64)        0         
 g2D)                                                   

In [None]:
#Reloading Training history
import json
with open("training_hist.json",'r') as json_file:
    training_history_data = json.load(json_file)

In [None]:
training_history_data.keys()

## Model Evaluation

In [None]:
#Model evaluation on Training set
train_loss,train_accuracy = model.evaluate(X_train,Y_train)

In [None]:
train_loss,train_accuracy

In [None]:
#Model evaluation on Validation set
val_loss,val_accuracy = model.evaluate(X_test,Y_test)

In [None]:
val_loss,val_accuracy

In [None]:
training_history_data['val_loss']

## Accuracy and Loss Visualization

In [None]:
#Visualization of Loss
# Take the epoch axis from the recorded history instead of hard-coding 30, so
# the plot still works when the number of training epochs changes.
epochs = list(range(1, len(training_history_data['loss']) + 1))
plt.plot(epochs,training_history_data['loss'],label="Training Loss",color='red')
plt.plot(epochs,training_history_data['val_loss'],label="Validation Loss",color='blue')
plt.xlabel("No. of Epochs")
plt.ylabel("Loss")
plt.title("Visualization of Loss Result")
plt.legend()
plt.show()

In [None]:
#Visualization of Accuracy
# Take the epoch axis from the recorded history instead of hard-coding 30, so
# the plot still works when the number of training epochs changes.
epochs = list(range(1, len(training_history_data['accuracy']) + 1))
plt.plot(epochs,training_history_data['accuracy'],label="Training Accuracy",color='red')
plt.plot(epochs,training_history_data['val_accuracy'],label="Validation Accuracy",color='blue')
plt.xlabel("No. of Epochs")
plt.ylabel("Accuracy")
plt.title("Visualization of Accuracy Result")
plt.legend()
plt.show()

## Precision, Recall, Confusion Matrix

In [None]:
X_test.shape

In [None]:
y_pred = model.predict(X_test)
y_pred

In [None]:
y_pred.shape

In [None]:
predicted_categories = np.argmax(y_pred,axis=1)
predicted_categories

In [None]:
Y_test

In [None]:
Y_test.shape

In [None]:
true_categories = np.argmax(Y_test,axis=1)
true_categories

In [None]:
classes

In [None]:
from sklearn.metrics import confusion_matrix,classification_report
cm = confusion_matrix(true_categories,predicted_categories)
# Precision Recall F1score
print(classification_report(true_categories,predicted_categories,target_names=classes))

## Confusion Matrix Visualization

In [None]:
cm

In [None]:
import seaborn as sns
plt.figure(figsize=(15,15))
# fmt="d" prints the cell counts as whole numbers (the default '.2g' format
# shows counts of 100 or more in scientific notation), and the tick labels
# show class names instead of bare indices. Rows and columns follow the class
# index order used for the labels.
sns.heatmap(cm,annot=True,fmt="d",annot_kws={"size":10},xticklabels=classes,yticklabels=classes)
plt.xlabel("Predicted Class",fontsize=10)
plt.ylabel("Actual Class",fontsize=10)
plt.title("Music Genre Classification Confusion Matrix",fontsize=15)
plt.show()