In [None]:
import gc
import os
import shutil
import wave

import joblib
import librosa
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow.keras.preprocessing as pp
import tensorflow_addons as tfa
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tqdm import tqdm


# Render matplotlib figures inline in the notebook output.
%matplotlib inline
matplotlib.style.use('ggplot')
# Ask the inline backend for high-resolution ("retina") figures.
%config InlineBackend.figure_format = 'retina'
sns.set(font_scale=1.5)

In [None]:
# Record the TensorFlow version used for this run.
print(tf.__version__)

In [None]:
# Default figure size (width, height) in inches for the plots below.
fig_size = (15, 8)
# sns.set() resets every theme parameter it is not given, so font_scale is
# repeated here; otherwise this call would silently undo font_scale=1.5.
sns.set(font_scale=1.5, rc={'figure.figsize': fig_size})

In [None]:
# Training metadata; the cells below read its 'fname' and 'label' columns.
df = pd.read_csv('../input/freesound-audio-tagging/train_post_competition.csv')
# Directory prefix prepended to each 'fname' to build the audio file path.
TRAIN_FILES_PATH = '../input/freesound-audio-tagging/audio_train/'
# Seed constant referenced by the image-augmentation helpers.
RANDOM_STATE=42

In [None]:
# Preview the first rows of the metadata table.
df.head()

In [None]:
# Descriptive statistics computed separately for every label.
df.groupby('label').describe()

In [None]:
# Bar chart of the number of rows per label (class balance).
df.groupby('label')['label'].count().plot(kind='bar')

In [None]:
# Sampling rate (Hz) passed to librosa when loading every clip.
SAMPLE_RATE = 44100
# Number of MFCC coefficients per frame (also reused as n_mels in the mel-spectrogram demo cell).
NUM_MFCC = 40
# FFT window length, in samples, passed to the feature extractors.
N_FFT = 2048
# Hop between successive analysis frames, in samples.
HOP_LENGTH = 512
# Number of epochs passed to model.fit.
EPOCHS=50
# Length, in seconds, that every clip is cropped or padded to in pre_process.
AUDIO_LENGTH = 2

In [None]:
def get_file_length(file_name):
    """Return the number of audio frames stored in a training WAV file.

    Args:
        file_name: File name relative to ``TRAIN_FILES_PATH``.

    Returns:
        int: Frame count reported by the WAV header.
    """
    file_path = TRAIN_FILES_PATH + file_name
    # The context manager closes the handle again; this function is applied to
    # every row of the metadata table, so unclosed files would pile up.
    with wave.open(file_path) as wav_file:
        return wav_file.getnframes()

# Store the value get_file_length returns for each file in a 'length' column.
df['length'] = df['fname'].apply(get_file_length)

In [None]:
# Run a garbage-collection pass before the next cells.
gc.collect()

In [None]:
# Violin plot of the per-file 'length' column, one violin per label.
fig, ax = plt.subplots(figsize=(16, 4))
sns.violinplot(x="label", y="length", data=df, ax=ax)
# Label names are long, so print them vertically.
plt.xticks(rotation=90)
plt.title('Distribution of audio frames, per label', fontsize=16)
plt.show()

In [None]:
# Set to False to work on the first 2000 rows only (quick experiments).
COMPLETE_RUN = True
# Class names in order of first appearance; a label's position is its class id.
LABELS = list(df['label'].unique())
label_idx = {label: i for i, label in enumerate(LABELS)}

if not COMPLETE_RUN:
    # Copy the slice so the column assignment below writes to an independent
    # frame rather than to a view of the full table (SettingWithCopyWarning).
    df = df[:2000].copy()

# Every label is a key of label_idx by construction, so map() yields no NaN.
df["label_idx"] = df['label'].map(label_idx)
# Full audio file paths and their integer class ids, as NumPy arrays.
X = (TRAIN_FILES_PATH + df['fname']).values
y = df['label_idx'].values


In [None]:
# Path of the first training clip, used below to try out feature extraction.
# The 'fname' column is selected by name so the lookup does not depend on the
# column order of the CSV.
file_path = TRAIN_FILES_PATH + df['fname'].iloc[0]
file_path

In [None]:
# Load the sample clip at SAMPLE_RATE; the sample rate returned by librosa is discarded.
data, _ = librosa.core.load(file_path, sr=SAMPLE_RATE)


In [None]:
# Mel spectrogram of the sample clip, shown only to inspect its shape.
# The signal is passed as y=...: librosa 0.10 made the audio argument
# keyword-only, and the keyword form also works on older releases.
S = librosa.feature.melspectrogram(y=data, sr=SAMPLE_RATE, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=NUM_MFCC)
S.shape

In [None]:
# MFCCs of the same clip. They get their own name so `data` keeps holding the
# waveform and the cells above stay re-runnable; y=/sr= are passed by keyword
# because librosa 0.10 no longer accepts them positionally.
mfcc = librosa.feature.mfcc(y=data, sr=SAMPLE_RATE, n_mfcc=NUM_MFCC, n_fft=N_FFT, hop_length=HOP_LENGTH)
mfcc.shape

In [None]:
def prepare_datasets(X, y, test_size, validation_size, random_state=None):
    """Split features and labels into train, validation and test subsets.

    Args:
        X: Feature array, indexed along its first axis.
        y: Labels aligned with ``X``.
        test_size: Passed to ``train_test_split`` for the first split
            (share of all samples held out for testing).
        validation_size: Passed to ``train_test_split`` for the second split,
            which is taken from the *remaining* samples; e.g. 0.2 after a
            0.2 test split is 16% of all data.
        random_state: Seed for both shuffles. ``None`` falls back to the
            notebook-wide ``RANDOM_STATE`` so repeated runs give the same split.

    Returns:
        Tuple ``(X_train, X_validation, X_test, y_train, y_validation, y_test)``.
    """
    seed = RANDOM_STATE if random_state is None else random_state
    # First carve off the test set, then split what is left into train/validation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=seed)
    X_train, X_validation, y_train, y_validation = train_test_split(
        X_train, y_train, test_size=validation_size, random_state=seed)

    return X_train, X_validation, X_test, y_train, y_validation, y_test

In [None]:
def pre_process(file_path,
                audio_length=AUDIO_LENGTH,
                sample_rate=SAMPLE_RATE,
                num_mfcc=NUM_MFCC,
                n_fft=N_FFT,
                hop_length=HOP_LENGTH):
    """Load one audio file and convert it to a fixed-size MFCC feature map.

    The signal is brought to exactly ``audio_length`` seconds: longer clips are
    cropped at a random position, shorter clips are zero-padded with the audio
    placed at a random position inside the padding.

    Args:
        file_path: Path of the audio file; returned unchanged.
        audio_length: Target clip length in seconds.
        sample_rate: Sampling rate the file is loaded at.
        num_mfcc: Number of MFCC coefficients per frame.
        n_fft: FFT window length in samples.
        hop_length: Hop between frames in samples.

    Returns:
        tuple: ``(file_path, features)`` where ``features`` has shape
        ``(frames, num_mfcc, 1)``.
    """
    data, _ = librosa.core.load(file_path, sr=sample_rate, res_type="kaiser_fast")
    # int() keeps the target usable as a slice bound for fractional lengths.
    input_length = int(audio_length * sample_rate)

    # Positive: samples to drop. Negative: samples of silence to add.
    surplus = len(data) - input_length
    if surplus >= 0:
        # randint's upper bound is exclusive, hence +1: without it the last
        # valid window could never be chosen.
        offset = np.random.randint(surplus + 1)
        data = data[offset:offset + input_length]
    else:
        missing = -surplus
        offset = np.random.randint(missing + 1)
        data = np.pad(data, (offset, missing - offset), "constant")

    # y=/sr= by keyword: librosa 0.10 rejects them as positional arguments.
    features = librosa.feature.mfcc(y=data, sr=sample_rate, n_mfcc=num_mfcc,
                                    n_fft=n_fft, hop_length=hop_length)
    # (num_mfcc, frames) -> (frames, num_mfcc, 1): time first, plus a trailing
    # channel axis so the array can be fed to 2-D convolutions.
    features = features.T[..., np.newaxis]
    return file_path, features

In [None]:
# Default folder for objects written by save_object().
OUTPUT_PATH = "./output/"
# Folder that holds one cached feature file per audio clip.
CACHE_PATH = "./cache/"
AUTOTUNE = tf.data.experimental.AUTOTUNE
# Number of clips per batch while extracting features.
PROCESSING_BATCH_SIZE = 16
# Start from an empty cache so no feature files from an earlier run are reused.
# (shutil is imported with the other modules in the notebook's import cell.)
if os.path.exists(CACHE_PATH):
    shutil.rmtree(CACHE_PATH)

In [None]:
def path_to_id(file_path):
    """Return the file name of ``file_path`` up to (not including) its first dot.

    Only ``/`` is treated as a directory separator.
    """
    file_name = file_path.rsplit("/", 1)[-1]
    stem, _, _ = file_name.partition(".")
    return stem

In [None]:
def save_object(obj,name, folder_path = OUTPUT_PATH):
    """Serialise ``obj`` with joblib to ``<folder_path>/<name>.pkl``.

    Args:
        obj: Object to store.
        name: File name without extension.
        folder_path: Target directory; created if it does not exist.
    """
    # exist_ok=True replaces the separate exists() check, which could race
    # with another writer creating the same folder in between.
    os.makedirs(folder_path, exist_ok=True)
    # os.path.join works with or without a trailing slash on folder_path.
    joblib.dump(obj, os.path.join(folder_path, name + '.pkl'))

def load_object(name, folder_path = OUTPUT_PATH):
    """Load the object stored by ``save_object`` under ``name``.

    Args:
        name: File name without extension.
        folder_path: Directory the file was written to.

    Returns:
        The deserialised object.
    """
    # NOTE: joblib files are pickle-based; only load files this notebook wrote.
    return joblib.load(os.path.join(folder_path, name + '.pkl'))

In [None]:
def load_image_features(file_id):
    """Read the object cached under ``file_id`` in ``CACHE_PATH``."""
    cached_features = load_object(file_id, folder_path=CACHE_PATH)
    return cached_features

In [None]:
def extract_features_from_files(data):
    """Run ``pre_process`` on every distinct path in ``data`` and cache the result.

    Each feature array is written to ``CACHE_PATH`` under the id that
    ``path_to_id`` derives from its file path.

    Args:
        data: Iterable of audio file paths; duplicates are processed once.
    """
    unique_paths = list(set(data))
    path_ds = tf.data.Dataset.from_tensor_slices(unique_paths)

    def _extract(path_tensor):
        # pre_process is ordinary Python code, so it is wrapped in
        # numpy_function to be callable from the tf.data pipeline.
        return tf.numpy_function(func=pre_process,
                                 inp=[path_tensor],
                                 Tout=[tf.string, tf.float32])

    feature_ds = path_ds.map(_extract, num_parallel_calls=AUTOTUNE)

    # https://www.tensorflow.org/guide/data
    for path_batch, feature_batch in tqdm(feature_ds.batch(PROCESSING_BATCH_SIZE)):
        for path_tensor, feature_tensor in zip(path_batch, feature_batch):
            file_id = path_to_id(path_tensor.numpy().decode("utf-8"))
            save_object(feature_tensor.numpy(), file_id, folder_path=CACHE_PATH)

In [None]:
# Extract and cache the features of every training file listed in X.
extract_features_from_files(X)


In [None]:
# Shape of one cached sample: (frames, NUM_MFCC, 1). The frame count follows
# librosa's default framing of AUDIO_LENGTH seconds of audio.
dim = (1 + int(np.floor(AUDIO_LENGTH*SAMPLE_RATE/HOP_LENGTH)), NUM_MFCC, 1)

# The cached features are float32, so a float32 buffer holds them exactly at
# half the memory of NumPy's float64 default.
X_NEW = np.empty(shape=(X.shape[0], dim[0], dim[1], 1), dtype=np.float32)
for i, path in enumerate(tqdm(X, desc="loading cached features")):
    X_NEW[i] = load_image_features(path_to_id(path))

In [None]:
# Split first, then standardise with statistics of the training split only, so
# nothing about the validation/test samples leaks into training.
X_train, X_validation, X_test, y_train, y_validation, y_test = prepare_datasets(X_NEW, y, 0.2, 0.2)

# Mean and standard deviation per feature position (axis 0 is the sample axis).
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)
# A position that is constant across the training set has zero spread;
# dividing by 1 leaves it at 0 instead of producing NaN/inf.
std[std == 0] = 1.0

X_train = (X_train - mean) / std
X_validation = (X_validation - mean) / std
X_test = (X_test - mean) / std

In [None]:
# Buffer size handed to Dataset.shuffle in the training pipeline.
SHUFFLE_BUFFER_SIZE = 1000
# Number of samples per batch in the training and validation pipelines.
BATCH_SIZE = 64
# NOTE(review): same value as the AUTOTUNE assigned in the cache-configuration cell; this re-assignment looks redundant.
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [None]:
@tf.function
def augment_cutout(image):
    """Zero out one randomly placed square patch of a single sample.

    Args:
        image: One sample without batch axis.

    Returns:
        Tensor of the same shape with the patch set to 0.
    """
    # random_cutout operates on batches: add a batch axis, remove it afterwards.
    batch = tf.expand_dims(image, 0)
    # tensorflow_addons documents that mask_size must be divisible by 2, so the
    # former odd size (5, 5) is replaced by (4, 4).
    batch = tfa.image.random_cutout(batch, (4, 4), constant_values=0)
    return tf.squeeze(batch, [0])

In [None]:
@tf.function
def augment_brightness(image):
    """Apply ``tf.image.random_brightness`` with ``max_delta=0.95``."""
    brightened = tf.image.random_brightness(image, max_delta=0.95, seed=RANDOM_STATE)
    return brightened

In [None]:
@tf.function
def augment_contrast(image):
    """Apply ``tf.image.random_contrast`` with a factor between 0.1 and 0.9."""
    contrasted = tf.image.random_contrast(image, lower=0.1, upper=0.9, seed=RANDOM_STATE)
    return contrasted

In [None]:
@tf.function
def augment_saturation(image):
    """Apply ``tf.image.random_saturation`` with a factor between 0.1 and 0.9."""
    # NOTE(review): tf.image.random_saturation is documented for RGB images;
    # the features built in this notebook have one channel - verify before use.
    saturated = tf.image.random_saturation(image, lower=0.1, upper=0.9, seed=RANDOM_STATE)
    return saturated

In [None]:
@tf.function
def translate_tf(image):
    """Shift ``image`` by a random whole number of pixels along each axis.

    Args:
        image: Image tensor accepted by ``tfa.image.translate``.

    Returns:
        The translated image, resampled with bilinear interpolation.
    """
    # For integer draws maxval is exclusive, so 3 is needed to include +2 and
    # make the shift range symmetric: {-2, -1, 0, 1, 2}.
    shift = tf.random.uniform((2,), minval=-2, maxval=3, dtype=tf.dtypes.int32)
    # tfa.image.translate takes the offsets as floats.
    shift = tf.cast(shift, tf.dtypes.float32)
    return tfa.image.translate(image, shift, 'BILINEAR')

In [None]:
@tf.function
def augment(image, label):
    """Randomly augment one training sample; the label passes through unchanged.

    One of five equally likely outcomes is drawn. Only outcome 0 (cutout) is
    enabled, so a sample is modified with probability 1/5; the brightness,
    contrast and translation helpers are not wired in.

    Args:
        image: Feature map of one sample.
        label: Class id of the sample.

    Returns:
        tuple: ``(image, label)``.
    """
    # A shape-() draw yields the scalar predicate that the conditional expects,
    # instead of a one-element vector.
    choice = tf.random.uniform((), minval=0, maxval=5, dtype=tf.dtypes.int32)
    if choice == 0:
        image = augment_cutout(image)
    return image, label

In [None]:
def get_data_set(files, labels, is_training=False):
    """Build a batched, prefetching ``tf.data`` pipeline from in-memory arrays.

    Args:
        files: Per-sample inputs, sliced along the first axis.
        labels: Labels aligned with ``files``.
        is_training: When true, samples are shuffled and passed through
            ``augment`` before batching.

    Returns:
        tf.data.Dataset yielding ``(inputs, labels)`` batches of ``BATCH_SIZE``.
    """
    pipeline = tf.data.Dataset.from_tensor_slices((files, labels))
    if is_training:
        pipeline = (
            pipeline
            .shuffle(SHUFFLE_BUFFER_SIZE)
            .map(augment, num_parallel_calls=AUTOTUNE)
        )
    return pipeline.batch(BATCH_SIZE).prefetch(AUTOTUNE)

In [None]:
# Training pipeline (shuffled and augmented) and validation pipeline (neither).
train_dataset = get_data_set(X_train, y_train, is_training=True)
val_dataset = get_data_set(X_validation, y_validation)

In [None]:
# Sanity check: pull one batch from the training pipeline and print the shapes of its inputs and labels.
sample_file_batch, sample_label_batch = next(iter(train_dataset))
print(sample_file_batch.shape) 
print(sample_label_batch.shape) 

# Model

In [None]:
class CNNBlock(layers.Layer):
    """Conv2D -> BatchNormalization -> ReLU, optionally followed by max pooling and dropout.

    Args:
        out_channel: Number of convolution filters.
        kernel_size: Convolution kernel size.
        dropout: Dropout rate; a falsy value (``None`` or 0) disables dropout.
        has_maxpool: Whether to add a max-pooling layer.
        maxpool_strides: Strides of the pooling layer.
        max_pool_size: Window of the pooling layer.
        **kwargs: Standard Keras layer arguments such as ``name`` or ``dtype``,
            forwarded to ``layers.Layer``.
    """
    def __init__(self,
                 out_channel,
                 kernel_size=3,
                 dropout=None,
                 has_maxpool=True,
                 maxpool_strides=None,
                 max_pool_size = (2, 2),
                 **kwargs):
        super(CNNBlock, self).__init__(**kwargs)
        self.conv = layers.Conv2D(out_channel, kernel_size)
        self.bn = layers.BatchNormalization()
        self.activation = layers.Activation("relu")
        # Optional sub-layers stay None when disabled; call() skips them.
        self.maxpool = None
        self.dropout = None
        if has_maxpool:
            self.maxpool = layers.MaxPool2D(pool_size=max_pool_size, strides=maxpool_strides, padding="same")
        if dropout:
            self.dropout = layers.Dropout(dropout)

    def call(self, input_tensor, training=False):
        """Apply the block to ``input_tensor``.

        Args:
            input_tensor: Input feature map.
            training: Forwarded to batch normalisation and dropout, which
                behave differently in training and inference.

        Returns:
            The transformed feature map.
        """
        x = self.conv(input_tensor)
        x = self.bn(x, training=training)
        x = self.activation(x)
        # Compare against None explicitly rather than relying on the
        # truthiness of a layer object.
        if self.maxpool is not None:
            x = self.maxpool(x)
        if self.dropout is not None:
            # Pass the flag explicitly, the same way as for batch normalisation.
            x = self.dropout(x, training=training)
        return x

In [None]:
def get_uncompiled_model():
    """Build the CNN classifier without optimizer or loss.

    Architecture: three ``CNNBlock``s (32 filters, 3x3 kernels, dropout 0.3),
    then Flatten -> Dense(64) -> BatchNormalization -> ReLU -> Dropout(0.5)
    -> Dense(len(LABELS)). The last layer has no activation and outputs logits.

    Returns:
        keras.Sequential: The uncompiled model.
    """
    inputs = layers.Input(shape=(dim[0], dim[1], 1))
    model = keras.Sequential(
        [
            # Declaring the input shape up front builds the model right away,
            # so summary() works before the first call to fit().
            inputs,
            CNNBlock(32, kernel_size=(3, 3), dropout=.3),
            CNNBlock(32, kernel_size=(3, 3), dropout=.3),
            CNNBlock(32, kernel_size=(3, 3), dropout=.3),
            layers.Flatten(),
            layers.Dense(64),
            layers.BatchNormalization(),
            layers.Activation("relu"),
            layers.Dropout(0.5),
            layers.Dense(len(LABELS)),
        ]
    )
    return model

In [None]:
def get_compiled_model():
    """Create the network and attach optimizer, loss and accuracy metric.

    Returns:
        The compiled model from ``get_uncompiled_model``.
    """
    compiled = get_uncompiled_model()
    adam = keras.optimizers.Adam(learning_rate=0.01)
    # Integer class ids as targets; from_logits=True because the loss is given
    # the raw outputs of the final Dense layer.
    cross_entropy = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    compiled.compile(optimizer=adam, loss=[cross_entropy], metrics=["accuracy"])
    return compiled

In [None]:
# Instantiate the compiled network that is trained below.
model = get_compiled_model()

In [None]:
# Halve the learning rate when val_loss stops improving (patience 3), never below 1e-6.
reduce_lr_on_plateau = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=3, min_lr=0.000001, verbose=1)

# Write the weights to disk only when val_loss reaches a new minimum.
filepath = 'model.h5'
checkpoint = ModelCheckpoint(
    filepath, monitor='val_loss', mode='min', verbose=1,
    save_best_only=True, save_weights_only=True)

callbacks_list = [reduce_lr_on_plateau, checkpoint]

# Train on the tf.data pipelines; only the per-epoch metric history is kept.
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCHS,
    callbacks=callbacks_list,
    verbose=1,
).history

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Learning curves. The history holds one value per epoch, so the x axis is
# labelled in epochs rather than training steps, and a legend tells the
# training curve from the validation curve.
for metric, y_label, y_limits in (
        ("loss", "Loss (training and validation)", (0, 4)),
        ("accuracy", "Accuracy (training and validation)", (0, 1))):
    fig, ax = plt.subplots()
    ax.plot(history[metric], label="training")
    ax.plot(history["val_" + metric], label="validation")
    ax.set(xlabel="Epoch", ylabel=y_label, ylim=y_limits)
    ax.legend()
plt.show()



# Reference
Kaggle notebook: [Beginner's Guide to Audio Data](https://www.kaggle.com/fizzbuzz/beginner-s-guide-to-audio-data)