In [None]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
!kaggle datasets download -d andradaolteanu/gtzan-dataset-music-genre-classification

Dataset URL: https://www.kaggle.com/datasets/andradaolteanu/gtzan-dataset-music-genre-classification
License(s): other
Downloading gtzan-dataset-music-genre-classification.zip to /content
100% 1.21G/1.21G [00:06<00:00, 260MB/s]
100% 1.21G/1.21G [00:06<00:00, 196MB/s]


In [None]:
!unzip -q '*.zip'

In [1]:
import os
import numpy as np
import librosa
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, CSVLogger

# === Config ===
DATA_DIR = "/content/drive/MyDrive/Data/Data/genres_original"
CLASSES = ['blues', 'classical', 'country', 'disco', 'hiphop',
           'jazz', 'metal', 'pop', 'reggae', 'rock']
TARGET_SHAPE = (150, 150)
BATCH_SIZE = 32
EPOCHS = 30

# === Checkpoint Setup ===
checkpoint_dir = "checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

checkpoint_path = "checkpoints/genre_model_epoch_{epoch:02d}_valacc_{val_accuracy:.2f}.h5"

checkpoint_cb = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode='max'
)

csv_logger = CSVLogger('training_log.csv', append=True)

# === GPU Info ===
print("Available GPUs:", tf.config.list_physical_devices('GPU'))

# === Data Loader ===
def load_and_preprocess_data(data_dir, classes, target_shape=(150, 150)):
    data = []
    labels = []

    for i_class, class_name in enumerate(classes):
        class_dir = os.path.join(data_dir, class_name)
        print("Processing --", class_name)
        for filename in os.listdir(class_dir):
            if filename.endswith('.wav'):
                file_path = os.path.join(class_dir, filename)
                try:
                    audio_data, sample_rate = librosa.load(file_path, sr=None)
                except Exception as e:
                    print(f"Skipped {file_path} due to error: {e}")
                    continue

                chunk_duration = 4
                overlap_duration = 2
                chunk_samples = chunk_duration * sample_rate
                overlap_samples = overlap_duration * sample_rate
                num_chunks = int(np.ceil((len(audio_data) - chunk_samples) / (chunk_samples - overlap_samples))) + 1

                for i in range(num_chunks):
                    start = i * (chunk_samples - overlap_samples)
                    end = start + chunk_samples
                    chunk = audio_data[start:end]
                    if len(chunk) < chunk_samples:
                        continue
                    mel = librosa.feature.melspectrogram(y=chunk, sr=sample_rate)
                    mel = librosa.power_to_db(mel, ref=np.max)
                    mel = np.expand_dims(mel, axis=-1)
                    mel = tf.image.resize(mel, target_shape).numpy()
                    data.append(mel)
                    labels.append(i_class)

    return np.array(data), np.array(labels)

# === Load Data ===
data, labels = load_and_preprocess_data(DATA_DIR, CLASSES, TARGET_SHAPE)
print("Data shape:", data.shape)
print("Labels shape:", labels.shape)

labels = to_categorical(labels, num_classes=len(CLASSES))
X_train, X_test, Y_train, Y_test = train_test_split(data, labels, test_size=0.2, random_state=42)

# === Load Last Checkpoint if Available ===
latest_model = None
saved_models = sorted([f for f in os.listdir(checkpoint_dir) if f.endswith(".keras")])
if saved_models:
    latest_model = os.path.join(checkpoint_dir, saved_models[-1])
    print(f"Loading model from checkpoint: {latest_model}")
    model = load_model(latest_model)
else:
    # === Build Model from Scratch ===
    model = Sequential()
    model.add(Conv2D(32, kernel_size=3, padding='same', activation='relu', input_shape=X_train[0].shape))
    model.add(Conv2D(32, kernel_size=3, activation='relu'))
    model.add(MaxPooling2D(pool_size=2, strides=2))

    model.add(Conv2D(64, kernel_size=3, padding='same', activation='relu'))
    model.add(Conv2D(64, kernel_size=3, activation='relu'))
    model.add(MaxPooling2D(pool_size=2, strides=2))

    model.add(Conv2D(128, kernel_size=3, padding='same', activation='relu'))
    model.add(Conv2D(128, kernel_size=3, activation='relu'))
    model.add(MaxPooling2D(pool_size=2, strides=2))
    model.add(Dropout(0.3))

    model.add(Conv2D(256, kernel_size=3, padding='same', activation='relu'))
    model.add(Conv2D(256, kernel_size=3, activation='relu'))
    model.add(MaxPooling2D(pool_size=2, strides=2))

    model.add(Conv2D(512, kernel_size=3, padding='same', activation='relu'))
    model.add(Conv2D(512, kernel_size=3, activation='relu'))
    model.add(MaxPooling2D(pool_size=2, strides=2))
    model.add(Dropout(0.3))

    model.add(Flatten())
    model.add(Dense(1200, activation='relu'))
    model.add(Dropout(0.45))
    model.add(Dense(len(CLASSES), activation='softmax'))

# === Compile Model ===
model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

# === Train Model ===
model.summary()
history = model.fit(
    X_train, Y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_test, Y_test),
    callbacks=[checkpoint_cb, csv_logger]
)

# === Final Save ===
model.save("genre_classifier_model.keras")
model.save("genre_classifier_model.h5")


Available GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Processing -- blues
Processing -- classical
Processing -- country
Processing -- disco
Processing -- hiphop
Processing -- jazz


  audio_data, sample_rate = librosa.load(file_path, sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Skipped /content/drive/MyDrive/Data/Data/genres_original/jazz/jazz.00054.wav due to error: 
Processing -- metal
Processing -- pop
Processing -- reggae
Processing -- rock
Data shape: (17694, 150, 150, 1)
Labels shape: (17694,)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/30
[1m443/443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step - accuracy: 0.2455 - loss: 2.1324
Epoch 1: val_accuracy improved from -inf to 0.36677, saving model to checkpoints/genre_model_epoch_01_valacc_0.37.h5




[1m443/443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 115ms/step - accuracy: 0.2455 - loss: 2.1320 - val_accuracy: 0.3668 - val_loss: 1.6167
Epoch 2/30
[1m442/443[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 73ms/step - accuracy: 0.4423 - loss: 1.5077
Epoch 2: val_accuracy improved from 0.36677 to 0.62645, saving model to checkpoints/genre_model_epoch_02_valacc_0.63.h5




[1m443/443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 78ms/step - accuracy: 0.4426 - loss: 1.5069 - val_accuracy: 0.6264 - val_loss: 1.0347
Epoch 3/30
[1m442/443[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 73ms/step - accuracy: 0.6411 - loss: 0.9887
Epoch 3: val_accuracy improved from 0.62645 to 0.75615, saving model to checkpoints/genre_model_epoch_03_valacc_0.76.h5




[1m443/443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 78ms/step - accuracy: 0.6412 - loss: 0.9883 - val_accuracy: 0.7561 - val_loss: 0.6862
Epoch 4/30
[1m442/443[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 74ms/step - accuracy: 0.7516 - loss: 0.7046
Epoch 4: val_accuracy improved from 0.75615 to 0.76180, saving model to checkpoints/genre_model_epoch_04_valacc_0.76.h5




[1m443/443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 79ms/step - accuracy: 0.7516 - loss: 0.7045 - val_accuracy: 0.7618 - val_loss: 0.6580
Epoch 5/30
[1m442/443[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 74ms/step - accuracy: 0.7912 - loss: 0.6027
Epoch 5: val_accuracy improved from 0.76180 to 0.84289, saving model to checkpoints/genre_model_epoch_05_valacc_0.84.h5




[1m443/443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 80ms/step - accuracy: 0.7913 - loss: 0.6026 - val_accuracy: 0.8429 - val_loss: 0.4553
Epoch 6/30
[1m442/443[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 74ms/step - accuracy: 0.8365 - loss: 0.4798
Epoch 6: val_accuracy improved from 0.84289 to 0.87172, saving model to checkpoints/genre_model_epoch_06_valacc_0.87.h5




[1m443/443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 80ms/step - accuracy: 0.8365 - loss: 0.4797 - val_accuracy: 0.8717 - val_loss: 0.3628
Epoch 7/30
[1m442/443[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 75ms/step - accuracy: 0.8637 - loss: 0.3901
Epoch 7: val_accuracy improved from 0.87172 to 0.87285, saving model to checkpoints/genre_model_epoch_07_valacc_0.87.h5




[1m443/443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 80ms/step - accuracy: 0.8637 - loss: 0.3901 - val_accuracy: 0.8728 - val_loss: 0.3584
Epoch 8/30
[1m442/443[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 75ms/step - accuracy: 0.8951 - loss: 0.3034
Epoch 8: val_accuracy improved from 0.87285 to 0.88358, saving model to checkpoints/genre_model_epoch_08_valacc_0.88.h5




[1m443/443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 81ms/step - accuracy: 0.8951 - loss: 0.3034 - val_accuracy: 0.8836 - val_loss: 0.3250
Epoch 9/30
[1m442/443[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 75ms/step - accuracy: 0.9160 - loss: 0.2515
Epoch 9: val_accuracy improved from 0.88358 to 0.91608, saving model to checkpoints/genre_model_epoch_09_valacc_0.92.h5




[1m443/443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 82ms/step - accuracy: 0.9160 - loss: 0.2515 - val_accuracy: 0.9161 - val_loss: 0.2430
Epoch 10/30
[1m442/443[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 75ms/step - accuracy: 0.9251 - loss: 0.2201
Epoch 10: val_accuracy improved from 0.91608 to 0.92342, saving model to checkpoints/genre_model_epoch_10_valacc_0.92.h5




[1m443/443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 81ms/step - accuracy: 0.9251 - loss: 0.2200 - val_accuracy: 0.9234 - val_loss: 0.2335
Epoch 11/30
[1m442/443[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 75ms/step - accuracy: 0.9363 - loss: 0.1819
Epoch 11: val_accuracy did not improve from 0.92342
[1m443/443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 81ms/step - accuracy: 0.9363 - loss: 0.1819 - val_accuracy: 0.9039 - val_loss: 0.2810
Epoch 12/30
[1m442/443[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 75ms/step - accuracy: 0.9494 - loss: 0.1454
Epoch 12: val_accuracy did not improve from 0.92342
[1m443/443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 81ms/step - accuracy: 0.9495 - loss: 0.1454 - val_accuracy: 0.9169 - val_loss: 0.2648
Epoch 13/30
[1m442/443[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 75ms/step - accuracy: 0.9523 - loss: 0.



[1m443/443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 81ms/step - accuracy: 0.9671 - loss: 0.0925 - val_accuracy: 0.9376 - val_loss: 0.2102
Epoch 16/30
[1m442/443[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 75ms/step - accuracy: 0.9643 - loss: 0.1069
Epoch 16: val_accuracy did not improve from 0.93755
[1m443/443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 81ms/step - accuracy: 0.9643 - loss: 0.1069 - val_accuracy: 0.9333 - val_loss: 0.2091
Epoch 17/30
[1m442/443[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 75ms/step - accuracy: 0.9727 - loss: 0.0751
Epoch 17: val_accuracy did not improve from 0.93755
[1m443/443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 81ms/step - accuracy: 0.9727 - loss: 0.0752 - val_accuracy: 0.9116 - val_loss: 0.2601
Epoch 18/30
[1m442/443[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 75ms/step - accuracy: 0.9802 - loss: 0.



[1m443/443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 81ms/step - accuracy: 0.9802 - loss: 0.0598 - val_accuracy: 0.9472 - val_loss: 0.1847
Epoch 19/30
[1m442/443[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 75ms/step - accuracy: 0.9795 - loss: 0.0607
Epoch 19: val_accuracy improved from 0.94716 to 0.95535, saving model to checkpoints/genre_model_epoch_19_valacc_0.96.h5




[1m443/443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 81ms/step - accuracy: 0.9795 - loss: 0.0607 - val_accuracy: 0.9554 - val_loss: 0.1508
Epoch 20/30
[1m442/443[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 75ms/step - accuracy: 0.9842 - loss: 0.0420
Epoch 20: val_accuracy did not improve from 0.95535
[1m443/443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 81ms/step - accuracy: 0.9842 - loss: 0.0421 - val_accuracy: 0.9373 - val_loss: 0.2047
Epoch 21/30
[1m442/443[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 75ms/step - accuracy: 0.9856 - loss: 0.0459
Epoch 21: val_accuracy did not improve from 0.95535
[1m443/443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 81ms/step - accuracy: 0.9856 - loss: 0.0460 - val_accuracy: 0.9291 - val_loss: 0.2462
Epoch 22/30
[1m442/443[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 75ms/step - accuracy: 0.9802 - loss: 0.



[1m443/443[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 81ms/step - accuracy: 0.9894 - loss: 0.0335 - val_accuracy: 0.9599 - val_loss: 0.1439




In [2]:
import os
import numpy as np
import librosa
import tensorflow as tf
from keras.models import load_model
from collections import Counter

# === Configuration ===
MODEL_PATH = "genre_classifier_model.h5"
AUDIO_PATH = "/content/drive/MyDrive/Data/Taylor Swift - Haunted.mp3"
CLASSES = ['blues', 'classical', 'country', 'disco', 'hiphop',
           'jazz', 'metal', 'pop', 'reggae', 'rock']
TARGET_SHAPE = (150, 150)

# === Load model ===
model = load_model(MODEL_PATH, compile=False)
print("Model loaded.")

def predict_genre_multiple_chunks(audio_path):
    audio_data, sample_rate = librosa.load(audio_path, sr=None)
    chunk_duration = 4
    overlap = 2
    chunk_samples = chunk_duration * sample_rate
    overlap_samples = overlap * sample_rate
    num_chunks = int(np.ceil((len(audio_data) - chunk_samples) / (chunk_samples - overlap_samples))) + 1

    predictions = []

    for i in range(num_chunks):
        start = i * (chunk_samples - overlap_samples)
        end = start + chunk_samples
        if end > len(audio_data):
            break
        chunk = audio_data[start:end]

        mel = librosa.feature.melspectrogram(y=chunk, sr=sample_rate)
        mel = librosa.power_to_db(mel, ref=np.max)
        mel = np.expand_dims(mel, axis=-1)
        mel = tf.image.resize(mel, TARGET_SHAPE).numpy()
        mel = np.expand_dims(mel, axis=0)

        pred = model.predict(mel, verbose=0)
        class_idx = np.argmax(pred[0])
        predictions.append(class_idx)

    if not predictions:
        print("No valid chunks to predict.")
        return

    vote_counts = Counter(predictions)
    most_common = vote_counts.most_common(1)[0]
    predicted_label = CLASSES[most_common[0]]
    print(f"Predicted Genre: {predicted_label} (from {num_chunks} chunks)")
    print("Votes:", {CLASSES[i]: count for i, count in vote_counts.items()})

# === Run ===
predict_genre_multiple_chunks(AUDIO_PATH)


Model loaded.
Predicted Genre: pop (from 108 chunks)
Votes: {'pop': 98, 'hiphop': 3, 'country': 4, 'rock': 2}
