# GTZAN - Baseline CNN and Transfer learning

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import glob
import os
import PIL
import tensorflow as tf

from tensorflow.keras import models, layers
from tensorflow.keras.preprocessing import image_dataset_from_directory
from tensorflow.keras.models import Sequential
from tensorflow.keras.applications import EfficientNetB0, EfficientNetB3, MobileNetV2, InceptionV3
from tensorflow.keras.optimizers import Adam

In [None]:
# Directory handed to image_dataset_from_directory by the data loaders below.
img_data = (
    '../input/gtzan-dataset-music-genre-classification'
    '/Data/images_original/'
)

## Data Loader

In [None]:
# Settings shared by the data loaders and every model in this notebook.
BATCH_SIZE = 8     # batch_size passed to both data loaders
TARGET_SIZE = 224  # Based on EfficientNetB0
NUM_CLASSES = 10   # units in the final Dense layer of every model

In [None]:
# Training subset of the images under img_data, resized to
# TARGET_SIZE x TARGET_SIZE. The validation loader uses the same seed and
# validation_split, which keeps the two subsets consistent.
train_ds = image_dataset_from_directory(
    img_data,
    subset="training",
    validation_split=0.2,
    seed=123,
    batch_size=BATCH_SIZE,
    image_size=(TARGET_SIZE, TARGET_SIZE),
)

In [None]:
# Validation subset (validation_split=0.2) of the images under img_data.
# seed and validation_split are the same values the training loader uses.
val_ds = image_dataset_from_directory(
    img_data,
    subset="validation",
    validation_split=0.2,
    seed=123,
    batch_size=BATCH_SIZE,
    image_size=(TARGET_SIZE, TARGET_SIZE),
)

In [None]:
# Class names exposed by the training dataset; used for the plot titles in the
# preview cell below.
class_names = train_ds.class_names
print(class_names)

In [None]:
# Preview up to 8 images from the first training batch in a 3x3 grid.
plt.figure(figsize=(20, 20))
for images, labels in train_ds.take(1):
    # Bound the loop by the real batch length so a BATCH_SIZE below 8 cannot
    # index past the end of the batch.
    for i in range(min(images.shape[0], 8)):
        ax = plt.subplot(3, 3, i + 1)
        # Cast to uint8 so imshow treats the pixel values as 0-255 integers.
        plt.imshow(images[i].numpy().astype("uint8"))
        # Convert the scalar label tensor to a plain int before indexing the list.
        plt.title(class_names[int(labels[i])])
        plt.axis("off")

## Callbacks and Helper Functions

In [None]:
# Callbacks passed to every model.fit call in this notebook. All three monitor
# 'val_loss' with mode 'min'.
# NOTE(review): the same instances are reused for every experiment and they
# all write to './best_weights.h5'. ModelCheckpoint presumably keeps its
# running best value between fits, so later experiments may not overwrite the
# file -- confirm, and consider one checkpoint callback/path per experiment.
model_save = tf.keras.callbacks.ModelCheckpoint(
    './best_weights.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0.001,
    patience=10,
    restore_best_weights=True,
    verbose=1,
)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    mode='min',
    factor=0.3,
    patience=2,
    min_delta=0.001,
    verbose=1,
)

In [None]:
def plot_hist(history):
    """Plot training/validation accuracy and loss curves side by side.

    Args:
        history: object whose ``history`` attribute maps 'accuracy',
            'val_accuracy', 'loss' and 'val_loss' to per-epoch values
            (e.g. the value returned by ``model.fit``).
    """
    logs = history.history
    # Look up all four series first, so a missing key fails before any
    # figure is created.
    panels = (
        ('Accuracy', logs['accuracy'], logs['val_accuracy'], 'lower right'),
        ('Loss', logs['loss'], logs['val_loss'], 'upper right'),
    )

    plt.figure(figsize=(10, 5))
    for position, (name, train_curve, val_curve, legend_loc) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(train_curve, label=f'Training {name}')
        plt.plot(val_curve, label=f'Validation {name}')
        plt.legend(loc=legend_loc)
        plt.title(f'Training and Validation {name}')
        plt.grid()
    plt.show()

## Modeling

In [None]:
# Baseline CNN: inputs scaled by 1/255, three Conv2D + MaxPooling2D stages
# (16, 32, 64 filters), then a dense head. The last Dense layer has no
# activation, so the model outputs raw logits.
model = Sequential()
model.add(layers.experimental.preprocessing.Rescaling(
    1./255, input_shape=(TARGET_SIZE, TARGET_SIZE, 3)))
model.add(layers.Conv2D(16, 3, padding='same', activation='relu'))
model.add(layers.MaxPooling2D())
model.add(layers.Conv2D(32, 3, padding='same', activation='relu'))
model.add(layers.MaxPooling2D())
model.add(layers.Conv2D(64, 3, padding='same', activation='relu'))
model.add(layers.MaxPooling2D())
model.add(layers.Flatten())
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(NUM_CLASSES))

In [None]:
# The final Dense layer has no activation, so the loss is told to expect logits.
# `learning_rate` replaces the deprecated `lr` alias and matches the keyword
# already used by the fine-tuning cells of this notebook.
model.compile(optimizer=Adam(learning_rate=0.001),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
# Print the architecture summary of the baseline CNN.
model.summary()


In [None]:
# Train the baseline CNN for at most `epochs` epochs with the shared callbacks.
epochs = 30
history = model.fit(
    train_ds,
    epochs=epochs,
    validation_data=val_ds,
    verbose=2,
    callbacks=[model_save, early_stop, reduce_lr],
)

In [None]:
# Plot the accuracy/loss curves recorded by the baseline CNN fit.
plot_hist(history)

## CNN with Dropout

In [None]:
# Same CNN as the baseline, with a Dropout(0.2) layer inserted between the
# last pooling stage and Flatten. The last Dense layer has no activation, so
# the model outputs raw logits.
model = Sequential()
model.add(layers.experimental.preprocessing.Rescaling(
    1./255, input_shape=(TARGET_SIZE, TARGET_SIZE, 3)))
model.add(layers.Conv2D(16, 3, padding='same', activation='relu'))
model.add(layers.MaxPooling2D())
model.add(layers.Conv2D(32, 3, padding='same', activation='relu'))
model.add(layers.MaxPooling2D())
model.add(layers.Conv2D(64, 3, padding='same', activation='relu'))
model.add(layers.MaxPooling2D())
model.add(layers.Dropout(0.2))
model.add(layers.Flatten())
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(NUM_CLASSES))

In [None]:
# The final Dense layer has no activation, so the loss is told to expect logits.
# `learning_rate` replaces the deprecated `lr` alias and matches the keyword
# already used by the fine-tuning cells of this notebook.
model.compile(optimizer=Adam(learning_rate=0.001),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
# Print the architecture summary of the CNN with dropout.
model.summary()

In [None]:
# Train the dropout CNN for at most `epochs` epochs with the shared callbacks.
epochs = 30
history = model.fit(
    train_ds,
    epochs=epochs,
    validation_data=val_ds,
    verbose=2,
    callbacks=[model_save, early_stop, reduce_lr],
)

In [None]:
# Plot the accuracy/loss curves recorded by the dropout CNN fit.
plot_hist(history)

## EfficientNet train from scratch

In [None]:
def create_model():
    """Build and compile an EfficientNetB0 classifier without pre-trained weights.

    Returns:
        A compiled Keras model: EfficientNetB0 (``include_top=False``,
        ``weights=None``) followed by global average pooling and a
        ``NUM_CLASSES``-unit softmax layer.
    """
    conv_base = EfficientNetB0(include_top=False, weights=None,
                               input_shape=(TARGET_SIZE, TARGET_SIZE, 3))
    features = layers.GlobalAveragePooling2D()(conv_base.output)
    outputs = layers.Dense(NUM_CLASSES, activation="softmax")(features)
    model = models.Model(conv_base.input, outputs)

    # The head already applies softmax, so the string loss (which expects
    # probabilities by default) is used. `learning_rate` replaces the
    # deprecated `lr` alias.
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss="sparse_categorical_crossentropy",
                  metrics=["accuracy"])
    return model


model = create_model()
model.summary()

In [None]:
# Train the EfficientNetB0 model built without pre-trained weights.
epochs = 30
history = model.fit(
    train_ds,
    epochs=epochs,
    validation_data=val_ds,
    verbose=2,
    callbacks=[model_save, early_stop, reduce_lr],
)

In [None]:
# Plot the accuracy/loss curves of the from-scratch EfficientNetB0 fit.
plot_hist(history)

## Transfer learning - EfficientNet (mostly retrain)

In [None]:
def create_model():
    """Build and compile an ImageNet-initialised EfficientNetB0 classifier.

    No layer is frozen here, so the pre-trained base is trained together with
    the new classification head.

    Returns:
        A compiled Keras model: EfficientNetB0 (``include_top=False``,
        ``weights="imagenet"``, ``drop_connect_rate=0.6``) followed by global
        average pooling and a ``NUM_CLASSES``-unit softmax layer.
    """
    conv_base = EfficientNetB0(include_top=False, weights="imagenet",
                               drop_connect_rate=0.6,
                               input_shape=(TARGET_SIZE, TARGET_SIZE, 3))
    features = layers.GlobalAveragePooling2D()(conv_base.output)
    outputs = layers.Dense(NUM_CLASSES, activation="softmax")(features)
    model = models.Model(conv_base.input, outputs)

    # `learning_rate` replaces the deprecated `lr` alias.
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss="sparse_categorical_crossentropy",
                  metrics=["accuracy"])
    return model


model = create_model()
model.summary()

In [None]:
# Train the ImageNet-initialised EfficientNetB0 with all layers trainable.
epochs = 30
history = model.fit(
    train_ds,
    epochs=epochs,
    validation_data=val_ds,
    verbose=2,
    callbacks=[model_save, early_stop, reduce_lr],
)

In [None]:
# Plot the accuracy/loss curves of the fully trainable transfer-learning fit.
plot_hist(history)

## Transfer learning - EfficientNet (the usual way of transfer learning)

In [None]:
def create_model():
    """Build and compile EfficientNetB0 with a frozen ImageNet base and a new head.

    Returns:
        A compiled Keras model: the pre-trained base with ``trainable=False``,
        followed by global average pooling, batch normalisation, dropout and
        a ``NUM_CLASSES``-unit softmax layer.
    """
    conv_base = EfficientNetB0(include_top=False, weights="imagenet",
                               drop_connect_rate=0.6,
                               input_shape=(TARGET_SIZE, TARGET_SIZE, 3))
    # Freeze pre-trained layers
    conv_base.trainable = False

    # Re-build top layers
    head = layers.GlobalAveragePooling2D()(conv_base.output)
    head = layers.BatchNormalization()(head)

    dropout_rate = 0.2
    head = layers.Dropout(dropout_rate, name="top_dropout")(head)
    outputs = layers.Dense(NUM_CLASSES, activation="softmax")(head)
    model = models.Model(conv_base.input, outputs)

    # `learning_rate` replaces the deprecated `lr` alias. This stage uses 0.01,
    # ten times the rate of the other experiments in this notebook.
    model.compile(optimizer=Adam(learning_rate=0.01),
                  loss="sparse_categorical_crossentropy",
                  metrics=["accuracy"])
    return model


model = create_model()
model.summary()

In [None]:
# Train only the new head; the EfficientNetB0 base was frozen in create_model.
epochs = 30
history = model.fit(
    train_ds,
    epochs=epochs,
    validation_data=val_ds,
    verbose=2,
    callbacks=[model_save, early_stop, reduce_lr],
)

In [None]:
# Plot the accuracy/loss curves of the frozen-base fit.
plot_hist(history)

In [None]:
# Save the weights of the frozen-base model; both fine-tuning experiments
# below load them from weights_path as their starting point.
weights_path = './first_finetune_weights.h5'
model.save_weights(weights_path)

### Unfreeze some of the layers

In [None]:
NUM_UNFREEZE_LAYERS = 100

# Fine-tune a clone so `model` (the frozen-base model) is left untouched.
# The saved weights are loaded into the clone explicitly.
cont_model = tf.keras.models.clone_model(model)
cont_model.load_weights(weights_path)


def unfreeze_model(model, num_layers=NUM_UNFREEZE_LAYERS):
    """Make the last ``num_layers`` layers trainable and recompile ``model`` in place.

    BatchNormalization layers are skipped, so their ``trainable`` flag is left
    as it was.

    Args:
        model: Keras model to modify and recompile.
        num_layers: how many layers, counted from the output end, to
            unfreeze. Defaults to ``NUM_UNFREEZE_LAYERS``.
    """
    # `layers[-0:]` would select every layer, so a count of 0 (or less) is
    # handled explicitly as "unfreeze nothing".
    candidates = model.layers[-num_layers:] if num_layers > 0 else []
    for layer in candidates:
        if isinstance(layer, layers.BatchNormalization):
            continue
        layer.trainable = True

    # Recompile so the changed trainable flags take effect; the learning rate
    # (1e-4) is lower than the 0.01 used for the frozen-base stage.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )


unfreeze_model(cont_model)
cont_model.summary()

In [None]:
# Fine-tune the partially unfrozen clone with the shared callbacks.
epochs = 30  # @param {type: "slider", min:8, max:50}
history = cont_model.fit(
    train_ds,
    epochs=epochs,
    validation_data=val_ds,
    verbose=2,
    callbacks=[model_save, early_stop, reduce_lr],
)

In [None]:
# Plot the accuracy/loss curves of the partially unfrozen fine-tuning run.
plot_hist(history)

### Unfreeze all the layers

In [None]:
# Start again from a fresh clone of the frozen-base model and its saved weights.
cont_model = tf.keras.models.clone_model(model)
cont_model.load_weights(weights_path)


def unfreeze_whole_model(model):
    """Make every non-BatchNormalization layer trainable and recompile ``model`` in place.

    Args:
        model: Keras model to modify and recompile.
    """
    # BatchNormalization layers are skipped, so their trainable flag is unchanged.
    non_bn_layers = (
        candidate for candidate in model.layers
        if not isinstance(candidate, layers.BatchNormalization)
    )
    for candidate in non_bn_layers:
        candidate.trainable = True

    fine_tune_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=fine_tune_optimizer,
        metrics=["accuracy"],
    )


unfreeze_whole_model(cont_model)
cont_model.summary()

In [None]:
# Fine-tune the fully unfrozen clone with the shared callbacks.
epochs = 30  # @param {type: "slider", min:8, max:50}
history = cont_model.fit(
    train_ds,
    epochs=epochs,
    validation_data=val_ds,
    verbose=2,
    callbacks=[model_save, early_stop, reduce_lr],
)

In [None]:
# Plot the accuracy/loss curves of the fully unfrozen fine-tuning run.
plot_hist(history)

## Observations
- Results:
    - Baseline CNN - 0.5 val
    - CNN with Dropout - 0.6 val
    - EfficientNetB0 transfer learning - 0.78 val
- All models overfit; this may be due to the lack of training data for each class.
- Ordinary data augmentation may not be feasible for song data like GTZAN, because:
    - Typical image transformations such as rotation, zoom, and flipping cannot be used, because the resulting spectrogram would be nonsensical.
    - Audio transformations cannot be used, because they would distort the original song.
    - Solution: research data augmentation methods designed specifically for song data.
- Open question: why does the usual way of transfer learning not perform better than fine-tuning the whole EfficientNet model?

### Reference
- https://www.tensorflow.org/tutorials/images/classification
- https://keras.io/examples/vision/image_classification_efficientnet_fine_tuning/