Import Libraries

In [1]:
#TensorFlow is used for building the CNN model, while 
# scikit-learn is used for data preprocessing. 
# Pandas handles data manipulation 
# and joblib is used to save model components (scaler and label encoder).
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, Reshape, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib  # For saving scaler and label encoder

Load and extract features and labels from provided features_3_sec csv file

In [2]:
features_df = pd.read_csv("../../Misc/features_3_sec.csv")
X = features_df.drop(columns=['filename','label','length']).values  # Drop 'label' column and take all features
y = features_df['label'].values  # 'label' column contains the target classes (genres)

spilt into Train and Test, Encode and Normalize

In [3]:
from joblib import dump, load

# ML models generally work with numerical data, so we need to encode the music genres (which are categorical) into numerical labels. 
# The LabelEncoder will map each unique genre to an integer.
label_encoder = LabelEncoder()
features_df['label'] = label_encoder.fit_transform(features_df['label'])

# Save the label encoder o it can be reused after transforming the labels later (for model inference).
# Save the label encoder as pkl
joblib.dump(label_encoder, 'label_encoder_cnn.pkl')
# Save the label encoder as joblib
dump(label_encoder, 'label_encoder_cnn.joblib')

print("LabelEncoder saved successfully.")


#Split the datset into features and labels
X = features_df.drop(columns=['filename', 'label', 'length']) # Drop unnecessary columns including length as that is constant  
y = features_df['label']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=42)


# Scale the data after splitting so that Test data does not have visibility
# Normalization across instances should be done after splitting the data 
# between training and test set, using only the data from the training set.
# This is because the test set plays the role of fresh unseen data, 
# so it's not supposed to be accessible at the training stage. 
# Using any information coming from the test set before or during training 
# is a potential bias in the evaluation of the performance.

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Save the scaler so it can be reused after transforming the features
# Save the scaler as pkl
joblib.dump(scaler, 'scaler_cnn.pkl')
# Save the scaler as joblib
dump(scaler, 'scaler_cnn.joblib')
print("Scaler saved successfully.")

# print(f"Shape of features (X): {X.shape}")
# print(f"Shape of target (y): {y.shape}")
# # Print dataset shapes and verify
# print(f"Training Features Shape: {X_train.shape}")
# print(f"Testing Features Shape: {X_test.shape}")
# print(f"Training Labels Shape: {y_train.shape}")
# print(f"Testing Labels Shape: {y_test.shape}")


LabelEncoder saved successfully.
Scaler saved successfully.


Define the CNN model 

In [4]:

# Initialize the model
model = Sequential()

# Reshape the input to have a 3D shape (samples, timesteps, features) for Conv1D layers
# We add an extra dimension since Conv1D expects a 3D input.
model.add(Reshape((X_train.shape[1], 1), input_shape=(X_train.shape[1],)))

# First Conv1D layer with 32 filters, a kernel size of 3, ReLU activation, and L2 regularization
model.add(Conv1D(32, 3, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
# Batch normalization to help stabilize training by normalizing activations
model.add(BatchNormalization())
# Max-pooling layer to down-sample the feature map
model.add(MaxPooling1D(2))

# Second Conv1D layer with 64 filters, kernel size of 3, ReLU activation, and L2 regularization
model.add(Conv1D(64, 3, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
model.add(BatchNormalization())
# Another max-pooling layer
model.add(MaxPooling1D(2))

# Flatten the feature map to create a 1D vector, which is required for the dense layers
model.add(Flatten())
# Dense layer with 64 units, ReLU activation, and L2 regularization
model.add(Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.001)))
# Dropout layer to prevent overfitting by randomly dropping 30% of the neurons
model.add(Dropout(0.3))
# Output layer with softmax activation for multi-class classification
model.add(Dense(len(np.unique(y)), activation='softmax'))

# Compile the model using Adam optimizer with a learning rate of 0.001 and sparse categorical cross-entropy loss
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Display a summary of the model architecture
model.summary()

  super().__init__(**kwargs)


Define early stopping callback

In [5]:
# Early stopping is a callback function that helps prevent overfitting by stopping 
# the training process if the validation loss doesn't improve after a certain 
# number of epochs (patience=5). This will restore the best weights based on 
# the validation performance.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

Train and save the model

In [6]:

#train the model using the training data.
history = model.fit(X_train, y_train, epochs=50, batch_size=64, validation_data=(X_test, y_test), callbacks=[early_stopping])

# save the model to disk using the .save() method. 
model.save('music_genre_cnn_model.h5')

print("Model saved successfully.")

Epoch 1/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.4143 - loss: 1.9766 - val_accuracy: 0.4840 - val_loss: 1.7549
Epoch 2/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.6758 - loss: 1.1284 - val_accuracy: 0.6612 - val_loss: 1.2494
Epoch 3/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.7301 - loss: 0.9431 - val_accuracy: 0.7643 - val_loss: 0.9305
Epoch 4/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.7808 - loss: 0.7977 - val_accuracy: 0.7963 - val_loss: 0.7825
Epoch 5/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.8071 - loss: 0.7195 - val_accuracy: 0.8158 - val_loss: 0.7335
Epoch 6/50
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.8290 - loss: 0.6564 - val_accuracy: 0.8298 - val_loss: 0.6820
Epoch 7/50
[1m125/125[



Model saved successfully.


Evaluate the model on the test data

In [7]:
# evaluate the trained model on the test set to determine its performance 
# in terms of accuracy. This gives us an indication of how well the model 
# generalizes to unseen data.
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=2)
print(f"Test accuracy: {test_acc:.4f}")

63/63 - 0s - 2ms/step - accuracy: 0.8724 - loss: 0.5523
Test accuracy: 0.8724


Plot training history for accuracy and loss

In [None]:
# plot the training and validation accuracy and loss over epochs to visualize 
# how the model improved during training. This can help identify potential 
# issues like overfitting or underfitting.
plt.figure(figsize=(12, 6))

# Plot Accuracy
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Accuracy over epochs')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

# Plot Loss
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Loss over epochs')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()

Testing with sample data

In [None]:
%pip install tensorflow

In [2]:
import librosa
import numpy as np
from joblib import load
import pandas as pd
from tensorflow.keras.models import load_model

# Step 1: Load the pre-trained CNN model, scaler, and label encoder
model = load_model('music_genre_cnn_model.h5')
scaler = load('scaler_cnn.joblib')
label_encoder = load('label_encoder_cnn.joblib')
print("step 1")
# Define the genre names corresponding to the encoded labels
genre_names = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']

print("before step 2")
# Step 2: Function to extract all features from a .wav file
def extract_all_features_from_wav(wav_file):
    # Load audio file using librosa
    y, sr = librosa.load(wav_file, sr=None)  # sr=None preserves the original sample rate
    
    # Extract various audio features (matching CSV columns)
    
    # Length of the audio (duration in seconds)
    #length = librosa.get_duration(y=y, sr=sr)
    
    # Spectral Bandwidth
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    spectral_bandwidth_mean = np.mean(spectral_bandwidth)
    spectral_bandwidth_var = np.var(spectral_bandwidth)
    
    # Chroma STFT (Short-Time Fourier Transform)
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    chroma_stft_mean = np.mean(chroma_stft)
    chroma_stft_var = np.var(chroma_stft)
    
    # RMS (Root Mean Square)
    rms = librosa.feature.rms(y=y)
    rms_mean = np.mean(rms)
    rms_var = np.var(rms)
    
    # Spectral Centroid
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    spectral_centroid_mean = np.mean(spectral_centroid)
    spectral_centroid_var = np.var(spectral_centroid)
    
    # Roll-off (Spectral roll-off point)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr, roll_percent=0.85)
    rolloff_mean = np.mean(rolloff)
    rolloff_var = np.var(rolloff)
    
    # Zero-Crossing Rate
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y=y)
    zero_crossing_rate_mean = np.mean(zero_crossing_rate)
    zero_crossing_rate_var = np.var(zero_crossing_rate)
    
    # Harmony (harmonic-to-noise ratio)
    harmony = librosa.effects.harmonic(y)
    harmony_mean = np.mean(harmony)
    harmony_var = np.var(harmony)
    
    # Perceptual features
    perceptr_mean = np.mean(rms)  # Simplified perceptual mean
    perceptr_var = np.var(rms)    # Simplified perceptual variance
    
    # Tempo (beats per minute)
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    
    # MFCCs (Mel-frequency cepstral coefficients)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
    mfcc_features = [np.mean(mfcc[i]) for i in range(20)] + [np.var(mfcc[i]) for i in range(20)]
    
    # Combine all extracted features
    features = [
        chroma_stft_mean, chroma_stft_var, rms_mean, rms_var,
        spectral_centroid_mean, spectral_centroid_var, spectral_bandwidth_mean, spectral_bandwidth_var,
        rolloff_mean, rolloff_var, zero_crossing_rate_mean, zero_crossing_rate_var,
        harmony_mean, harmony_var, perceptr_mean, perceptr_var, tempo
    ] + mfcc_features
    
    print("just before returning in step 2")
    return features

# Step 3: Extract features from a sample input .wav file
#wav_file = 'klass.00010.wav'  
wav_file ='../rajan-hans/demo/Staylin_alive.wav'
extracted_features = extract_all_features_from_wav(wav_file)
# un-comment below section for debugging purpose
# print(f"Extracted features (length): {len(extracted_features)}")
# print(f"Extracted features: {extracted_features}")

# Step 4: Get all feature column names from the CSV file
feature_columns = pd.read_csv("../../Misc/features_3_sec.csv").columns.tolist()

# un-comment below section for debugging purpose
#print(f"Feature columns from CSV (length): {len(feature_columns)}")

# Step 5: Clean the feature columns (remove 'label' and 'filename')
feature_columns.remove('label')
feature_columns.remove('filename')
feature_columns.remove('length')

# un-comment below section for debugging purpose
#print(f"Feature columns after cleaning (length): {len(feature_columns)}")

# un-comment below section for debugging purpose
# # Step 6: Compare the number of extracted features to the expected number of features
# if len(extracted_features) != len(feature_columns):
#     print(f"Warning: Number of extracted features ({len(extracted_features)}) does not match the expected number of features ({len(feature_columns)})")
#     print(f"Extracted Features: {extracted_features}")
#     print(f"Feature Columns from CSV: {feature_columns}")
# else:
#     print("Number of extracted features matches the expected columns.")

# un-comment for debugging purpose
#for i in range(len(feature_columns)):
#    print(f"{feature_columns[i]}: {extracted_features[i]}")

# Step 7: Create a DataFrame with the correct columns
extracted_features_df = pd.DataFrame([extracted_features], columns=feature_columns)

# Step 8: Scale the extracted features using the saved scaler
X_scaled = scaler.transform(extracted_features_df)

# # Step 9: Use the trained model to predict the genre
# predicted_label_encoded = model.predict(X_scaled)

# print("step 9")
# # Step 10: Decode the predicted label to get the genre name
# predicted_label = label_encoder.inverse_transform(predicted_label_encoded)

# # Step 11: Map the encoded label to the actual genre name
# predicted_genre_name = genre_names[predicted_label_encoded[0]]  # Map the predicted label to the actual genre name

# # Step 12: Print the predicted genre
# print(f"The predicted genre of the song {wav_file} is: {predicted_genre_name}")


# Step 9: Predict the genre
predicted_label_encoded = model.predict(X_scaled)
predicted_label_index = np.argmax(predicted_label_encoded, axis=1)[0]

# Step 10-12: Map and print the genre
if predicted_label_index < len(genre_names):
    predicted_genre_name = genre_names[predicted_label_index]
    print(f"The predicted genre of the song {wav_file} is: {predicted_genre_name}")
else:
    print(f"Error: Predicted label {predicted_label_index} is out of range.")



step 1
before step 2
just before returning in step 2
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step
The predicted genre of the song ../rajan-hans/demo/Staylin_alive.wav is: rock
