## Load Packages

In [None]:
import os
import shutil
import gc

import IPython
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

import librosa
import librosa.display

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import keras
from keras.models import Sequential
from keras.layers import *
from keras_preprocessing.image import ImageDataGenerator

In [None]:
import viztools02

## Load DataFrames

In [None]:
# Root folder of the competition data and the folder with the training audio.
path = '../input/freesound-audio-tagging/'
train_path = f'{path}audio_train/'

# Report how many entries the training-audio folder contains.
n_train_files = len(os.listdir(train_path))
print(n_train_files)

In [None]:
# Training metadata; later cells rely on its `fname`, `label` and
# `manually_verified` columns.
train = pd.read_csv("../input/freesound-audio-tagging/train.csv")
# The sample-submission file is used as the table of test clips.
test = pd.read_csv("../input/freesound-audio-tagging/sample_submission.csv")

# Preview the first rows of the training metadata.
train.head()

In [None]:
# (rows, columns) of the training and test tables.
print('Train Shape:', train.shape)
print('Test Shape: ', test.shape)

## Unique Labels

In [None]:
# Distinct class names found in the training metadata, and how many there are.
uniq_labels = train['label'].unique()
n_labels = len(uniq_labels)
print(n_labels, '\n')
print(uniq_labels)

## Distribution of Classes

In [None]:
# Count clips per (label, manually_verified) pair, then pivot the
# verification flag into columns so each label becomes one stacked bar.
category_group = train.groupby(['label', 'manually_verified']).count()
counts_by_label = category_group.unstack()

# Order the bars from the smallest to the largest category.
labels_by_size = counts_by_label.sum(axis=1).sort_values().index

plot = counts_by_label.reindex(labels_by_size).plot(
    kind='bar',
    stacked=True,
    title="Number of Audio Samples per Category",
    figsize=(16,10),
)
plot.set_xlabel("Category")
plot.set_ylabel("Number of Samples");

In [None]:
# Size of the smallest and the largest class.
label_counts = train.label.value_counts()
print('Minimum samples per category = ', label_counts.min())
print('Maximum samples per category = ', label_counts.max())

## Analyze a Single Sound

In [None]:
# Decode one example clip, resampled to 44.1 kHz.
fname = f'{train_path}002d256b.wav'
clip, sr = librosa.load(fname, sr=44100)

# Shape of the sample array, then the sampling rate.
print(clip.shape, sr, sep='\n')

In [None]:
# Waveform of the whole clip.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(clip)
plt.show()

In [None]:
# Zoom in on samples 2000-2999 of the clip.
fig, ax = plt.subplots(figsize=(12, 6))
zoomed = clip[2000:3000]
ax.plot(zoomed)
plt.show()

## Subset DataFrame

Work with a random subset of 500 clips for quicker training/experimentation.

In [None]:
# Keep the full metadata table around, then continue with a reproducible
# (random_state=1) random sample of 500 rows.
train_orig = train.copy()
train = train_orig.sample(n=500, random_state=1)

## Determine Number of Frames for Each File

In [None]:
%%time 
# Decode every clip at 44.1 kHz and record how many samples it contains.
train['nframes'] = [
    librosa.load(train_path + audio_name, sr=44100)[0].shape[0]
    for audio_name in train['fname']
]
train.head()

In [None]:
# Histogram of clip lengths (in samples).
fig, ax = plt.subplots(figsize=(12, 4))
ax.hist(train['nframes'], bins=200)
plt.show()

In [None]:
# Shortest and longest clip, in samples.
print(train['nframes'].min())
print(train['nframes'].max())

In [None]:
# One violin per label showing how clip length varies within that class.
_, ax = plt.subplots(figsize=(20, 4))
sns.violinplot(data=train, x="label", y="nframes", ax=ax)
ax.set_title('Distribution of audio frames, per label', fontsize=16)
plt.xticks(rotation=90)  # class names are long; turn them vertical
plt.show()

## Mel Spectrogram

In [None]:
# Decode the example clip at 44.1 kHz.
fname = f'{train_path}002d256b.wav'
clip, sr = librosa.load(fname, sr=44100)

# Mel power spectrogram, then converted to decibels relative to its peak.
S1 = librosa.feature.melspectrogram(y=clip, sr=sr)
S2 = librosa.power_to_db(S1, ref=np.max)
print(S2.shape)

librosa.display.specshow(S2)
plt.show()

## Display Several Spectrograms

In [None]:
# Display the (subsampled) training table, which now includes `nframes`.
train

In [None]:
# Draw 16 random clips and show their dB-scaled mel spectrograms in a 4x4 grid.
sample = train.sample(16)

plt.figure(figsize=[16,9])

columns = zip(sample.fname, sample.label, sample.nframes)
for position, (audio_name, label, nframes) in enumerate(columns, start=1):
    fname = train_path + audio_name
    clip, sr = librosa.load(fname, sr=44100)
    S1 = librosa.feature.melspectrogram(y=clip, sr=44100)
    S2 = librosa.power_to_db(S1, ref=np.max)

    plt.subplot(4, 4, position)
    librosa.display.specshow(S2)
    # Title: class name, spectrogram shape, clip length in samples.
    plt.title(f'{label} - {S2.shape[:2]} - {nframes}')

plt.tight_layout()
plt.show()


In [None]:
# For `rows * cols` random clips, draw alternating figure rows: first the raw
# waveforms, then the matching dB-scaled mel spectrograms underneath.
rows = 3
cols = 4

sample = train.sample(rows * cols)

for i in range(rows):
    # Clips belonging to this figure row. Indexing with `cols` (rather than a
    # literal 4) keeps the grid correct when `cols` is changed.
    row_sample = sample.iloc[cols * i:cols * (i + 1)]

    # Decode each clip once, at the same 44.1 kHz rate used for `nframes`,
    # and reuse it for both the waveform and the spectrogram.
    clips = [
        librosa.load(train_path + audio_name, sr=44100)[0]
        for audio_name in row_sample.fname
    ]

    # Waveforms.
    plt.figure(figsize=[16,2])
    for j, wf in enumerate(clips):
        plt.subplot(1, cols, j + 1)
        plt.plot(wf)
        plt.axis('off')

    plt.tight_layout()
    plt.show()

    # Spectrograms, titled with class name, spectrogram shape and clip length.
    plt.figure(figsize=[16,3])
    row_columns = zip(clips, row_sample.label, row_sample.nframes)
    for j, (clip, label, nframes) in enumerate(row_columns):
        S1 = librosa.feature.melspectrogram(y=clip, sr=44100)
        S2 = librosa.power_to_db(S1, ref=np.max)

        plt.subplot(1, cols, j + 1)
        librosa.display.specshow(S2)
        plt.title(f'{label} - {S2.shape[:2]} - {nframes}')

    plt.tight_layout()
    plt.show()


## Create Spectrogram Files

In [None]:
# Create the output folders for the spectrogram images. exist_ok=True makes
# the cell safe to re-run, and no shell command is needed.
for image_dir in ('train_images', 'test_images'):
    os.makedirs(image_dir, exist_ok=True)

In [None]:
def create_spectrogram_file(fname, source, dest, dpi):
    """Render the mel spectrogram of one audio file and save it as a JPEG.

    The clip is decoded at 44.1 kHz, converted to a mel power spectrogram,
    scaled to decibels relative to its peak, drawn with
    ``librosa.display.specshow`` and written without padding.

    Args:
        fname: Audio file name, e.g. ``'002d256b.wav'``.
        source: Prefix prepended to ``fname`` to form the input path
            (callers in this notebook pass a folder ending in ``/``).
        dest: Prefix prepended to the image name to form the output path.
        dpi: Resolution passed to ``savefig``.

    Returns:
        None. The image is written to ``dest + <fname stem> + '.jpg'``.
    """
    path = source + fname
    # splitext instead of fname[:-4]: extensions that are not exactly three
    # characters long are still stripped correctly.
    save_path = dest + os.path.splitext(fname)[0] + '.jpg'

    clip, _ = librosa.load(path, sr=44100)

    S1 = librosa.feature.melspectrogram(y=clip, sr=44100)
    S2 = librosa.power_to_db(S1, ref=np.max)

    fig = plt.figure()
    try:
        # specshow draws on the current axes, i.e. on `fig` created above.
        librosa.display.specshow(S2)
        fig.savefig(save_path, dpi=dpi, bbox_inches='tight', pad_inches=0)
    finally:
        # Close only this figure, even when drawing or saving fails, so that
        # figures do not pile up and unrelated open figures are left alone.
        plt.close(fig)

    # Drop the large temporaries and collect right away; this function is
    # called in a long loop, so memory should be released after every file.
    del S1, S2, clip, fig
    gc.collect()

Create training images

In [None]:
%%time

# Range of rows to convert; lets a long run be resumed in chunks.
start = 0
end = 10000
n = start

source = train_path
dest = 'train_images/'

# `n` is the 1-based count of files processed so far (offset by `start`).
for n, f in enumerate(train.fname.values[start:end], start=start + 1):
    create_spectrogram_file(f, source, dest, 50)

    # Progress marker every 50 files.
    if not n % 50:
        print(n, end=' ')

    # Extra garbage-collection pass every 1000 files.
    if not n % 1000:
        gc.collect()

## Add New Columns to Training Set

In [None]:
# Name of the spectrogram image belonging to each clip. splitext (instead of
# slicing off the last four characters) strips the extension whatever its
# length, matching how the image files are named on disk for '.wav' inputs.
train['image_file'] = train.fname.apply(lambda x: os.path.splitext(x)[0] + '.jpg')
test['image_file'] = test.fname.apply(lambda x: os.path.splitext(x)[0] + '.jpg')

# Integer-encode the class names; the encoder is fitted on the training labels.
le = LabelEncoder()
train['enc_label'] = le.fit_transform(train.label)
# The test table is not encoded here:
#test['enc_label'] = le.transform(test.label)

## Data Generators

In [None]:
# Scale pixel values by 1/255 and reserve 20% of the rows for validation
# (selected later via subset="training" / subset="validation").
train_datagen = ImageDataGenerator(rescale=1/255, validation_split=0.2)
# A separate test generator is not set up yet:
#test_datagen = ImageDataGenerator(rescale=1/255)

In [None]:
bs = 64
target_size = (128, 128)

# Settings shared by the training and the validation generator.
common_flow_args = dict(
    dataframe=train,
    x_col='image_file',
    y_col='label',
    batch_size=bs,
    class_mode='categorical',
    target_size=target_size,
)

# Training subset: shuffled batches.
train_generator = train_datagen.flow_from_dataframe(
    directory='train_images/',
    subset="training",
    shuffle=True,
    **common_flow_args,
)

# Validation subset: fixed order, so predictions line up with the rows.
valid_generator = train_datagen.flow_from_dataframe(
    directory='train_images',
    subset="validation",
    seed=1,
    shuffle=False,
    **common_flow_args,
)


In [None]:
# Length of each generator; used below as the number of steps per epoch.
tr_batches = len(train_generator)
va_batches = len(valid_generator)

print(tr_batches)
print(va_batches)

## Build CNN

In [None]:
# NOTE(review): np.random.seed alone may not make the backend's weight
# initialisation reproducible — confirm for the installed Keras version.
np.random.seed(1)

# Take the input and output sizes from the data pipeline instead of
# hard-coding them: the subsampled training set is not guaranteed to contain
# every class, and `target_size` may be changed in the generator cell.
input_shape = tuple(target_size) + (3,)          # generators yield RGB images
n_classes = len(train_generator.class_indices)   # one output unit per class

cnn = Sequential()

# Block 1: two 3x3 convolutions with 16 filters, then 2x2 max pooling.
cnn.add(Conv2D(16, (3,3), activation = 'relu', padding = 'same', input_shape=input_shape))
cnn.add(Conv2D(16, (3,3), activation = 'relu', padding = 'same'))
cnn.add(MaxPooling2D(2,2))
cnn.add(Dropout(0.25))
cnn.add(BatchNormalization())

# Block 2: two 3x3 convolutions with 32 filters, then 2x2 max pooling.
cnn.add(Conv2D(32, (3,3), activation = 'relu', padding = 'same'))
cnn.add(Conv2D(32, (3,3), activation = 'relu', padding = 'same'))
cnn.add(MaxPooling2D(2,2))
cnn.add(Dropout(0.5))
cnn.add(BatchNormalization())

# Classifier head.
cnn.add(Flatten())
cnn.add(Dense(64, activation='relu'))
cnn.add(Dropout(0.75))
cnn.add(BatchNormalization())

cnn.add(Dense(n_classes, activation='softmax'))

cnn.summary()

## Train Network

In [None]:
%%time 

# Adam with a 0.001 learning rate; categorical cross-entropy matches the
# one-hot labels produced by the generators.
opt = keras.optimizers.Adam(0.001)
cnn.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

# Train for 20 epochs, running through every training and validation batch
# once per epoch; the returned history object is kept for plotting.
h1 = cnn.fit(
    train_generator,
    epochs=20,
    steps_per_epoch=tr_batches,
    validation_data=valid_generator,
    validation_steps=va_batches,
    verbose=1,
)

In [None]:
# Visualise the training run with the project-local viztools02 helper
# (presumably loss/accuracy curves per epoch — confirm against viztools02).
viztools02.vis_training([h1])