# Music Genre Classifier


In [None]:
import os
import cv2
import wget
import random
import shutil
import tarfile
import numpy as np
import pandas as pd
import tensorflow as tf
from pandas import DataFrame
from keras import Sequential
from sklearn.model_selection import train_test_split
from keras.layers import Flatten, Dense, Conv2D, MaxPool2D, Activation

In [None]:
SEED_VALUE = 42

# Seed Python's `random`, NumPy and TensorFlow with the same value so that
# repeated runs draw the same pseudo-random numbers.
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(SEED_VALUE)

## Load the GTZAN Dataset


In [None]:
# When the COLAB_RELEASE_TAG environment variable is set (used throughout this
# notebook as the sign of running on Google Colab), mount Google Drive at
# /content/gdrive.
if os.environ.get('COLAB_RELEASE_TAG'):
    from google.colab import drive
    drive.mount('/content/gdrive')

In [None]:
PREPROCESSING = False

# Remember the directory the notebook was started from: outside Colab the
# archive is looked up in its "dataset" sub-directory.
pwd = os.getcwd()

# Work from /content/dataset from here on (the loading cell below uses the
# relative path './preprocessing/').
os.makedirs('/content/dataset/', exist_ok=True)
os.chdir('/content/dataset/')

# Only fetch and unpack the archive when the extracted features are missing,
# so an archive that was copied but never extracted is still unpacked.
if not os.path.isdir('preprocessing/mfcc'):
    if not os.path.isfile('preprocessing.tar.gz'):
        if os.getenv('COLAB_RELEASE_TAG'):
            archive_source = '/content/gdrive/MyDrive/preprocessing.tar.gz'
        else:
            archive_source = f'{pwd}/dataset/preprocessing.tar.gz'

        if not os.path.isfile(archive_source):
            # make sure to download the GTZAN dataset from "https://drive.google.com/file/d/1UdmqcrBw71EgOtCLy6ic_C6EDBz9KQt_/view?usp=share_link"
            # and place it at the location named in the error (on Colab:
            # upload it to your own Google Drive).
            raise FileNotFoundError(
                "Download the GTZAN dataset preprocessed. "
                f"Expected the archive at {archive_source!r}; it can be downloaded from "
                "https://drive.google.com/file/d/1UdmqcrBw71EgOtCLy6ic_C6EDBz9KQt_/view?usp=share_link"
            )

        shutil.copy2(archive_source, '.')

    # The context manager closes the archive even if extraction fails.
    with tarfile.open('preprocessing.tar.gz', 'r:gz') as tar:
        tar.extractall()

# Sorted so the order of the label columns does not depend on the arbitrary
# order in which the filesystem lists the genre directories.
GENRES = sorted(os.listdir('/content/dataset/preprocessing/mfcc'))


## Dataset


In [None]:
def load_data(src: str, feature: str, test_size: float = 0.3,
              random_state: int = SEED_VALUE, shuffle: bool = True,
              stratify: list = None):
    """Load the images of one feature and split them into three subsets.

    Every file found in ``{src}/{feature}/{genre}/`` is read with OpenCV and
    labelled with the name of its ``genre`` directory. The labels are one-hot
    encoded and the rows are split twice: first into a training part and a
    held-out part of relative size ``test_size``, then the held-out part is
    halved into the test and the validation subsets.

    Args:
        src: Directory that contains one sub-directory per feature.
        feature: Name of the feature sub-directory to load; also used as the
            name of the image column of the intermediate DataFrame.
        test_size: Fraction of the samples held out from training (test and
            validation together).
        random_state: Seed passed to both ``train_test_split`` calls.
        shuffle: Whether the rows are shuffled before each split.
        stratify: Names of the one-hot label columns to stratify both splits
            on, or ``None`` for no stratification.

    Returns:
        Three ``(images, labels)`` pairs in the order training, test,
        validation. ``images`` is a NumPy array stacking the loaded images and
        ``labels`` is a DataFrame holding the one-hot columns listed in the
        module-level ``GENRES``.

    Raises:
        ValueError: If a file cannot be decoded as an image.
    """
    feature_dir = os.path.join(src, feature)

    dataset = []
    # Directory listings are sorted so that the row order, and therefore the
    # seeded split, does not depend on the filesystem's listing order.
    for genre in sorted(os.listdir(feature_dir)):
        genre_dir = os.path.join(feature_dir, genre)
        for filename in sorted(os.listdir(genre_dir)):
            path = os.path.join(genre_dir, filename)
            img = cv2.imread(path)
            # cv2.imread signals failure by returning None instead of raising.
            if img is None:
                raise ValueError(f'Could not read image file: {path}')
            # img = cv2.resize(img, (256, 192))
            dataset.append([np.array(img, dtype=np.float32), genre])

    df = DataFrame(data=np.array(dataset, dtype=object), columns=[feature, 'genre'])

    # Replace the textual genre column by one indicator column per genre.
    one_hot = pd.get_dummies(df['genre'])
    df = pd.concat([df.drop(columns=['genre']), one_hot], axis=1)

    strat = df[stratify] if stratify else None
    train_set, test_set = train_test_split(df, test_size=test_size, random_state=random_state,
                                           shuffle=shuffle, stratify=strat)

    # Halve the held-out part into the final test and validation subsets.
    strat = test_set[stratify] if stratify else None
    test_set, val_set = train_test_split(test_set, test_size=0.5, random_state=random_state,
                                         shuffle=shuffle, stratify=strat)

    def to_arrays(subset):
        # Stack the per-row images into one array; select the label columns.
        return np.array(list(subset[feature])), subset[GENRES]

    return to_arrays(train_set), to_arrays(test_set), to_arrays(val_set)

In [None]:
# Load the 'mfcc' images and unpack the training, test and validation
# (features, labels) pairs; both splits are stratified on the genre columns.
(X_train, y_train), (X_test, y_test), (X_val, y_val) = load_data(
    src='./preprocessing/',
    feature='mfcc',
    stratify=GENRES,
)

## CNN


In [None]:
# Take the input shape and the number of output classes from the loaded data
# instead of hard-coding them, so the network always matches what it is fit on.
input_shape = tuple(X_train.shape[1:])
num_classes = y_train.shape[1]

# Four convolution + max-pooling stages with a doubling number of filters,
# followed by a small fully connected classifier. Each Conv2D applies ReLU
# itself, so no separate Activation('relu') layer is needed after it.
model = Sequential([
    Conv2D(filters=16, kernel_size=(3, 3), strides=1, padding='same',
           activation='relu', input_shape=input_shape),
    MaxPool2D(pool_size=(2, 2)),
    Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    Conv2D(filters=128, kernel_size=(3, 3), activation='relu'),
    MaxPool2D(pool_size=(2, 2)),

    Flatten(),
    Dense(units=256, activation='relu'),
    Dense(units=64, activation='relu'),
    # One softmax output per one-hot label column.
    Dense(units=num_classes, activation='softmax'),
])

model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])

In [None]:
# Print a summary of `model`.
model.summary()

In [None]:
# One Keras History object per completed training iteration of this run.
history = []

model_storage = '/content'

# On Colab the checkpoints are kept on the mounted Google Drive.
if os.getenv('COLAB_RELEASE_TAG'):
    model_storage = '/content/gdrive/MyDrive'

checkpoint_dir = f'{model_storage}/model_storage'
os.makedirs(checkpoint_dir, exist_ok=True)


def _checkpoint_index(filename):
    """Return the iteration number of a 'model_<n>.keras' file, else None."""
    index = filename.removeprefix('model_').removesuffix('.keras')
    # Rebuilding the name checks that both the prefix and the suffix were present.
    if filename == f'model_{index}.keras' and index.isascii() and index.isdigit():
        return int(index)
    return None


# Files that are not checkpoints (e.g. editor or OS artefacts) are ignored
# instead of crashing the integer conversion.
saved_iters = [
    index
    for index in map(_checkpoint_index, os.listdir(checkpoint_dir))
    if index is not None
]

last_iter = 0
start_iter = 0

if saved_iters:
    last_iter = max(saved_iters)
    model = tf.keras.saving.load_model(f'{checkpoint_dir}/model_{last_iter}.keras')
    # model_<last_iter> was saved after iteration last_iter finished, so
    # training resumes with the following iteration rather than repeating it.
    start_iter = last_iter + 1

for i in range(start_iter, 50):
    # NOTE(review): the test split is used as validation data here while
    # X_val / y_val are not used in this cell - confirm this is intended.
    history.append(model.fit(x=X_train, y=y_train, epochs=10, validation_data=(X_test, y_test)))
    model.save(f'{checkpoint_dir}/model_{i}.keras')