In [1]:
import scipy
import numpy as np
import pandas as pd
import librosa as lb
import librosa.display as lbd
import IPython.display as ipd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import tensorflow.keras as k
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy, CategoricalCrossentropy
from tensorflow.keras.layers import Conv2D, Dense, Flatten, MaxPool2D, Activation, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array, array_to_img
from sklearn.utils import shuffle
from sklearn.metrics import label_ranking_average_precision_score as lrap
from PIL import Image
import seaborn as sns
import warnings
import glob
import json
import os
import re

# Show at most 20 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 20)
# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
# Silence every Python warning for the rest of the notebook.
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Collect the paths of every entry in the competition's train and test folders.
train_samples = glob.glob('../input/rfcx-species-audio-detection/train/*')
test_samples = glob.glob('../input/rfcx-species-audio-detection/test/*')
# Report how many entries each folder contains.
print("Number of train samples : ", len(train_samples), "\nNumber of test  samples : ", len(test_samples))

In [None]:
# Annotation tables shipped with the competition data.
# NOTE(review): "tp"/"fp" presumably stand for true-positive / false-positive labels - confirm.
data_tp = pd.read_csv('../input/rfcx-species-audio-detection/train_tp.csv')
data_fp = pd.read_csv('../input/rfcx-species-audio-detection/train_fp.csv')

In [None]:
# Preview the first rows of train_tp.csv.
data_tp.head()

In [None]:
# Preview the first rows of train_fp.csv.
data_fp.head()

# **DATA PRE-PROCESSING**

In [None]:
# Shuffle the annotation rows before they are split into train/validation.
# A fixed random_state keeps the split identical across kernel restarts, so the
# images written to ./train and ./val (and every metric derived from them) are
# reproducible.
shuffled_data = shuffle(data_tp, random_state=42)
shuffled_data.head()

In [None]:
# 90/10 train/validation split of the shuffled annotation rows.
n_train_rows = int(0.9*len(shuffled_data))
train_data, val_data = shuffled_data[:n_train_rows], shuffled_data[n_train_rows:]

In [None]:
num_classes = 24  # number of class_<id> folders created for each split
sampling_rate = 44100  # sample rate passed to librosa.load when reading recordings
window_size = 10  # length of the window cut around each annotation (same unit as t_min/t_max)
base_path = '../input/rfcx-species-audio-detection/train/'  # folder holding the training .flac recordings

In [None]:
# Create ./train/class_<id> and ./val/class_<id> for every class id.
# os.makedirs(..., exist_ok=True) creates the parent folder as needed and does
# not raise when the folders already exist, so the cell can be re-run safely.
for split_dir in ('./train', './val'):
    for cls in range(num_classes):
        os.makedirs(os.path.join(split_dir, 'class_' + str(cls)), exist_ok=True)

In [None]:
def create_spectrogram(rec_id, species_id, start, end, base_file, count):
    """Render one annotated clip as a mel-spectrogram image and save it as a JPEG.

    The recording ``base_path + rec_id + '.flac'`` is loaded at ``sampling_rate``,
    cut to the samples between ``start`` and ``end`` and drawn without axes or
    frame. The image is written to ``<base_file>/class_<species_id>/img_<count>.jpg``.

    Parameters
    ----------
    rec_id : str
        Recording id (file name without the ``.flac`` extension).
    species_id : int
        Class id; selects the ``class_<species_id>`` sub-folder.
    start, end : float
        Window boundaries in seconds.
    base_file : str
        Root output folder (e.g. ``'./train/'``); the class folder must exist.
    count : int
        Running image number used in the output file name.
    """
    plt.interactive(False)
    sound_clip, sample_rate = lb.load(base_path + rec_id + '.flac', sr=sampling_rate)
    trimmed_sound_clip = sound_clip[int(start*sample_rate):int(end*sample_rate)]

    fig = plt.figure(figsize=(3.56, 3.56))
    try:
        ax = fig.add_subplot(111)
        ax.axes.get_xaxis().set_visible(False)
        ax.axes.get_yaxis().set_visible(False)
        ax.set_frame_on(False)
        S = lb.feature.melspectrogram(y=trimmed_sound_clip, sr=sample_rate)
        # Draw on this figure's axes explicitly instead of relying on pyplot's
        # "current axes" state.
        lbd.specshow(lb.power_to_db(S, ref=np.max), ax=ax)
        filename = os.path.join(base_file, 'class_' + str(species_id), 'img_' + str(count) + '.jpg')
        fig.savefig(filename)
    finally:
        # Always release the figure, even when plotting or saving fails, and
        # leave any unrelated open figures untouched.
        plt.close(fig)

In [None]:
def _clip_window(t_min, t_max, window, clip_length):
    """Return (start, end) of a `window`-long span centred on [t_min, t_max], clamped to [0, clip_length]."""
    center = (t_min + t_max) / 2
    # True division keeps the span exactly `window` long for odd window sizes too.
    start = center - window / 2
    end = center + window / 2
    if start < 0:
        start, end = 0, window
    if end > clip_length:
        start, end = clip_length - window, clip_length
    return start, end


def create_train_and_val_data(data, base_file, clip_length=60):
    """Write one spectrogram image per annotation row of ``data``.

    For every row a ``window_size``-long window is centred on the annotation
    (``t_min``..``t_max``), shifted back inside the recording when it would
    start before 0 or end after ``clip_length``, and handed to
    ``create_spectrogram``. Images are numbered from 1 in row order.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows with ``recording_id``, ``species_id``, ``t_min`` and ``t_max`` columns.
    base_file : str
        Root output folder passed through to ``create_spectrogram``.
    clip_length : float, optional
        Upper bound for the window end, in the same unit as ``t_max``
        (default 60, the value previously hard-coded).
    """
    for count, row in enumerate(data.itertuples(index=False), start=1):
        start, end = _clip_window(row.t_min, row.t_max, window_size, clip_length)
        create_spectrogram(row.recording_id, row.species_id, start, end, base_file, count)

In [None]:
# Uncomment the two calls below to regenerate the train/val spectrogram images ("data for mislabelled data").

#create_train_and_val_data(train_data, './train/')
#create_train_and_val_data(val_data, './val/')

# Model for labelling unlabelled data

In [None]:
# Image generator that multiplies every pixel value by 1/255 before it reaches the model.
datagen = ImageDataGenerator(rescale=1./255)

In [None]:
input_shape = (256, 256, 3)  # shape of one image fed to the first Conv2D layer
batch_size = 1  # images per batch yielded by the directory iterators
num_classes = 24  # width of the softmax output layer

In [None]:
# Directory iterators over the pre-rendered spectrogram images. Both splits use
# the same resize target, batch size, one-hot ("categorical") labels and shuffling.
flow_options = dict(
    target_size=input_shape[0:2],
    batch_size=batch_size,
    class_mode='categorical',
    shuffle=True,
)
train = datagen.flow_from_directory('../input/data-for-mislabelled-data/data_preprocessing/train/', **flow_options)
val = datagen.flow_from_directory('../input/data-for-mislabelled-data/data_preprocessing/val/', **flow_options)

In [None]:
# Single-label classifier: four convolution/max-pool stages with growing
# kernel sizes and filter counts, two heavily regularised dense layers and a
# softmax over num_classes outputs.
model = Sequential([
    # Stage 1: 8 filters, 5x5 kernels.
    Conv2D(input_shape=input_shape, filters=8, kernel_size=(5, 5), activation='relu'),
    MaxPool2D(pool_size=(2, 2), strides=(2, 2)),

    # Stage 2: 16 filters, 7x7 kernels.
    Conv2D(filters=16, kernel_size=(7, 7), activation='relu'),
    MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
    Dropout(0.5),

    # Stage 3: 32 filters, 9x9 kernels.
    Conv2D(filters=32, kernel_size=(9, 9), activation='relu'),
    MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
    Dropout(0.5),

    # Stage 4: 64 filters, 9x9 kernels.
    Conv2D(filters=64, kernel_size=(9, 9), activation='relu'),
    MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
    Dropout(0.6),

    Flatten(),

    # Classifier head.
    Dense(1098, activation='relu'),
    Dropout(0.78),

    Dense(1098, activation='relu'),
    Dropout(0.78),

    Dense(num_classes, activation='softmax'),
])

In [None]:
# Adam with a learning rate of 1e-4; categorical cross-entropy matches the
# softmax output layer, and accuracy is tracked as the metric.
opt = Adam(learning_rate=0.0001)
model.compile(optimizer=opt, loss=CategoricalCrossentropy(), metrics=['accuracy'])

In [None]:
# Train for 100 epochs; every epoch runs len(train) steps and then validates on
# len(val) steps. The returned History object is kept for the plots below.
# NOTE(review): workers/use_multiprocessing are not accepted by every Keras
# version - confirm against the installed TensorFlow.
history = model.fit(train, steps_per_epoch=len(train), epochs=100, verbose=1, validation_data=val, validation_steps=len(val), workers=4, use_multiprocessing=True)

In [None]:
# Side-by-side learning curves (train vs validation) for loss and accuracy.
# The curves are plotted against their epoch index, which matplotlib supplies
# automatically when only the y-values are given.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(15, 8))
for ax, metric, title in (
    (loss_ax, 'loss', 'Train vs Val Loss'),
    (acc_ax, 'accuracy', 'Train vs Val Accuracy'),
):
    ax.plot(history.history[metric], label='train')
    ax.plot(history.history['val_' + metric], label='val')
    ax.set_title(title)
    ax.set_xlabel('epoch')
    ax.set_ylabel(metric)
    ax.legend()
plt.show()

In [None]:
# Print the layer-by-layer summary of the classifier.
model.summary()

In [None]:
# Persist the trained classifier to an HDF5 file in the working directory.
model.save('model_for_data.h5')

# **Final data and labels creation**

In [None]:
 model = load_model('../input/model-for-unlabelled-data/model_for_data.h5')

In [None]:
# Map the classifier's output index to the species id encoded in the class
# folder name. Folder names are sorted as strings ('class_0', 'class_1',
# 'class_10', ...).
# NOTE(review): presumably this mirrors the class-index order assigned by
# flow_from_directory - confirm.
classes = sorted(
    re.findall(r'class_\d+', class_dir)[0]
    for class_dir in glob.glob('../input/data-for-mislabelled-data/data_preprocessing/train/*')
)
label_dict = {
    position: int(re.findall(r'class_(\d+)', class_name)[0])
    for position, class_name in enumerate(classes)
}

In [None]:
# Display the output-index -> species-id mapping.
label_dict

In [None]:
# Output folder for the full-recording spectrograms. ./train may already exist
# (an earlier cell creates it for the per-class images), so do not fail then.
os.makedirs('./train', exist_ok=True)

In [None]:
def spectogram(audio, sample_rate):
    """Draw the mel-spectrogram (in dB relative to its maximum) of ``audio``.

    The plot is rendered on a 3.56 x 3.56 inch figure with both axes and the
    frame hidden. The figure is closed in pyplot before it is returned, but
    the returned object can still be drawn or saved by the caller.

    Parameters
    ----------
    audio : array-like
        Audio samples passed to ``librosa.feature.melspectrogram``.
    sample_rate : int
        Sample rate of ``audio``.

    Returns
    -------
    matplotlib.figure.Figure
    """
    plt.interactive(False)
    figure = plt.figure(figsize=(3.56, 3.56))
    axis = figure.add_subplot(111)
    for hidden in (axis.axes.get_xaxis(), axis.axes.get_yaxis()):
        hidden.set_visible(False)
    axis.set_frame_on(False)
    mel_power = lb.feature.melspectrogram(y=audio, sr=sample_rate)
    mel_db = lb.power_to_db(mel_power, ref=np.max)
    lbd.specshow(mel_db)
    plt.close()
    return figure

In [None]:
def return_species_if_labelled(name, df):
    """Return the ``species_id`` values of every row of ``df`` whose ``recording_id`` equals ``name``.

    The result is a plain list (empty when the recording has no rows).
    """
    matches_recording = df['recording_id'] == name
    return list(df.loc[matches_recording, 'species_id'].values)

In [None]:
def create_label(audio, sample_rate, label_dict, model, name, df,
                 num_classes=24, window_seconds=10, clip_seconds=60):
    """Build a multi-hot species label for one recording.

    The label starts from the species annotated for ``name`` in ``df`` and is
    extended with the classifier's top prediction for every consecutive
    ``window_seconds`` window of the first ``clip_seconds`` of ``audio``.

    Parameters
    ----------
    audio : numpy.ndarray
        Samples of the whole recording.
    sample_rate : int
        Sample rate of ``audio``.
    label_dict : dict
        Maps the classifier's output index to a species id.
    model : keras model
        Single-label classifier applied to each window's spectrogram image.
    name : str
        Recording id looked up in ``df``.
    df : pandas.DataFrame
        Annotation table with ``recording_id`` and ``species_id`` columns.
    num_classes : int, optional
        Length of the returned label (default 24).
    window_seconds, clip_seconds : int, optional
        Window length and total span to scan (defaults 10 and 60).

    Returns
    -------
    list of int
        ``num_classes`` zeros/ones.
    """
    label = [0] * num_classes
    for species in return_species_if_labelled(name, df):
        label[species] = 1

    for window_start in range(0, clip_seconds, window_seconds):
        window = audio[window_start*sample_rate:(window_start + window_seconds)*sample_rate]
        if len(window) == 0:
            # Recording is shorter than clip_seconds: nothing left to classify.
            continue
        fig = spectogram(window, sample_rate)
        try:
            fig.canvas.draw()
            width, height = fig.canvas.get_width_height()
            # np.frombuffer replaces the deprecated binary mode of np.fromstring.
            pixels = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
            # The classifier was trained on images scaled by 1/255 (see the
            # ImageDataGenerator used for training), so scale the raw canvas
            # pixels the same way before predicting.
            # NOTE(review): assumes the rendered canvas matches the model's
            # input size, which depends on the figure dpi - confirm.
            batch = pixels.reshape((1, height, width, 3)).astype(np.float32) / 255.0
            predicted_index = int(np.argmax(model.predict(batch)))
            label[label_dict[predicted_index]] = 1
        finally:
            # Release the figure even if drawing or prediction fails.
            plt.close(fig)
    return label

In [None]:
def create_train_data_and_label(record, name, frmt, label_dict, sampling_rate, model, df):
    """Save the full-recording spectrogram of ``record`` and return its label.

    The audio is loaded at ``sampling_rate``, labelled with ``create_label``
    and rendered with ``spectogram``; the image is written to
    ``./train/<name>.<frmt>``.

    Returns
    -------
    list of int
        The multi-hot label produced by ``create_label``.
    """
    audio, sample_rate = lb.load(record, sr=sampling_rate)
    label = create_label(audio, sample_rate, label_dict, model, name, df)

    full_clip_figure = spectogram(audio, sample_rate)
    full_clip_figure.savefig('./train/' + name + '.' + frmt)

    # Clear the figure and close every open pyplot figure, this one included.
    full_clip_figure.clf()
    plt.close('all')
    return label

In [None]:
# Render and label every training recording: `labels` maps the recording id to
# its multi-hot label, and the matching image is written to ./train/<id>.jpg.
labels = {}
for index, record in enumerate(glob.glob('../input/rfcx-species-audio-detection/train/*'), start=1):
    # The dot is escaped and the pattern anchored so only a literal ".flac"
    # extension at the end of the path is stripped.
    name = re.findall(r'/train/(.+)\.flac$', record)[0]
    labels[name] = create_train_data_and_label(record, name, 'jpg', label_dict, sampling_rate, model, data_tp)
    print(str(index) + ' record completed!')

In [None]:
# Persist the recording-id -> label mapping as JSON in the working directory.
with open('labels.json', 'w') as f:
    json.dump(labels, f)

In [None]:
!tar -cvzf train_data.tar.gz ./

In [None]:
# Show a download link for the archive created above. The module is imported
# as `ipd` (the bare name `IPython` is never bound in this notebook), and the
# archive is named train_data.tar.gz.
ipd.FileLink('./train_data.tar.gz')

# **Final Model**

In [2]:
# Paths of the full-recording spectrogram images, in a reproducible shuffled
# order: glob's order is filesystem-dependent, so sort first and shuffle with a
# fixed seed. The train/validation split below is positional, so this makes the
# reported validation score repeatable.
train = shuffle(sorted(glob.glob('../input/training-data/train/*')), random_state=42)
# Recording id -> multi-hot label list, as written by the labelling step.
with open('../input/training-data/labels.json', 'r') as f:
    labels = json.load(f)

In [3]:
# Pre-allocate the image tensor and the multi-hot label matrix.
# float32 halves the memory of the default float64 buffer and is the precision
# the network computes in anyway.
X = np.zeros((len(train), 256, 256, 3), dtype=np.float32)
Y = np.zeros((len(train), 24), dtype=np.float32)

In [4]:
def extract_name(path):
    """Return the recording id of an image path: its file name without the extension.

    Works for any folder (``.../train/<id>.jpg`` as well as ``.../test/<id>.jpg``),
    so the same helper serves both the training and the submission steps.
    """
    return os.path.splitext(os.path.basename(path))[0]

In [5]:
# Fill X with the images scaled to [0, 1] and Y with the label stored for the
# recording each image was rendered from.
for row, image_path in enumerate(train):
    pixels = np.array(Image.open(image_path))
    X[row] = pixels/255
    Y[row] = np.array(labels[extract_name(image_path)])

In [6]:
# Sanity check: both arrays must have one row per image.
print('Shape of X : ', X.shape)
print('Shape of Y : ', Y.shape)

Shape of X :  (4727, 256, 256, 3)
Shape of Y :  (4727, 24)


In [7]:
# Positional 90/10 split: the first 90% of rows train the model, the rest validate it.
n_fit = int(0.90*len(train))
X_train, X_val = X[:n_fit], X[n_fit:]
y_train, y_val = Y[:n_fit], Y[n_fit:]

In [8]:
# Drop the names of the full arrays now that the splits exist.
# NOTE(review): the splits were made by basic slicing, so they are presumably
# views on the same buffers and this frees little or no memory - confirm.
del X, Y

In [9]:
# Sanity check of the train/validation split sizes.
print('Shape of X_train : ', X_train.shape)
print('Shape of y_train : ', y_train.shape)
print('Shape of X_val : ', X_val.shape)
print('Shape of y_val : ', y_val.shape)

Shape of X_train :  (4254, 256, 256, 3)
Shape of y_train :  (4254, 24)
Shape of X_val :  (473, 256, 256, 3)
Shape of y_val :  (473, 24)


In [10]:
# Multi-label network: three convolution/max-pool stages, two heavily
# regularised dense layers and 24 independent sigmoid outputs (one per label).
model = Sequential([
    # Stage 1: 8 filters, 5x5 kernels.
    Conv2D(input_shape=X_train.shape[1:], filters=8, kernel_size=(5, 5), activation='relu'),
    MaxPool2D(pool_size=(2, 2), strides=(2, 2)),

    # Stage 2: 16 filters, 7x7 kernels.
    Conv2D(filters=16, kernel_size=(7, 7), activation='relu'),
    MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
    Dropout(0.5),

    # Stage 3: 32 filters, 9x9 kernels.
    Conv2D(filters=32, kernel_size=(9, 9), activation='relu'),
    MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
    Dropout(0.5),

    Flatten(),

    # Classifier head.
    Dense(1098, activation='relu'),
    Dropout(0.78),

    Dense(1098, activation='relu'),
    Dropout(0.78),

    Dense(24, activation='sigmoid'),
])

In [19]:
# Binary cross-entropy treats each of the 24 sigmoid outputs as an independent
# yes/no decision. `learning_rate` is the supported keyword (the `lr` alias is
# deprecated) and matches the earlier compile cell.
model.compile(optimizer=Adam(learning_rate=0.001), loss=BinaryCrossentropy(), metrics=['accuracy'])

In [12]:
batch_size = 32  # rows per training step in the manual loop below
epochs = 20  # number of passes over X_train

In [22]:
# Manual mini-batch training loop. After each gradient step the label-ranking
# average precision (LRAP) of the batch just trained on is recorded; the mean
# over all batches is reported per epoch and collected in `lraps`.
lraps = []
num_train = X_train.shape[0]
for epoch in range(epochs):
    batch_scores = []
    # Stepping over the whole array lets slicing produce the final, shorter
    # batch naturally, so every sample is trained on exactly once per epoch
    # (no separate remainder step, no re-use of the last full batch).
    for batch in range(0, num_train, batch_size):
        x_t = X_train[batch:batch+batch_size]
        y_t = y_train[batch:batch+batch_size]
        # One gradient update on exactly this batch.
        model.train_on_batch(x_t, y_t)
        batch_scores.append(lrap(y_t, model.predict(x_t, verbose=0)))
    avg_lrap = sum(batch_scores)/len(batch_scores)
    lraps.append(avg_lrap)
    print('Epoch', (epoch+1),':\nTraining accuracy (LRAP): ', avg_lrap)

Training accuracy :  0.9482475630471786
Training accuracy :  0.9519299064481883
Training accuracy :  0.9509968985664559
Training accuracy :  0.9553735746184059
Training accuracy :  0.9583465821287896
Training accuracy :  0.9624669338847007
Training accuracy :  0.9636812487865929
Training accuracy :  0.9672239209855997
Training accuracy :  0.9686337009382788
Training accuracy :  0.9682411562642468
Training accuracy :  0.9703845761373022
Training accuracy :  0.9717381901215159
Training accuracy :  0.9735570178283703
Training accuracy :  0.9744232043582471
Training accuracy :  0.973977260103162
Training accuracy :  0.9769787712027953
Training accuracy :  0.9773779778623983
Training accuracy :  0.9788506524804993
Training accuracy :  0.9801706991624493
Training accuracy :  0.9806836479306009


In [23]:
# LRAP of the trained model on the held-out validation rows.
print('Validation Accuracy (LRAP): ', lrap(y_val, model.predict(X_val)))

0.8190176079035058

In [None]:
# Paths of the pre-rendered test images.
test = glob.glob('../input/test-data/test/*')

In [27]:
# Sample submission; its columns are reused as the layout of the real submission.
submission_example = pd.read_csv('../input/rfcx-species-audio-detection/sample_submission.csv')

In [41]:
# Empty submission frame with one row per *test* image (it was previously sized
# from the training list, which produced the wrong number of rows).
submission = pd.DataFrame(columns=submission_example.columns, index=range(len(test)))

In [None]:
def extract_name(path):
    """Return the recording id of an image path: its file name without the extension.

    Folder-agnostic (``.../test/<id>.jpg`` as well as ``.../train/<id>.jpg``),
    so redefining the helper here does not change how training paths are handled.
    """
    return os.path.splitext(os.path.basename(path))[0]

In [None]:
# Fill one submission row per test image: the recording id followed by the
# model's per-label probabilities.
for exp, image_path in enumerate(test):
    image = np.array(Image.open(image_path))/255
    # squeeze() must be applied to the prediction array (shape (1, n_labels))
    # before converting it to a list; a list has no squeeze method.
    probabilities = model.predict(image.reshape((1, 256, 256, 3))).squeeze()
    submission.iloc[exp, :] = [extract_name(image_path)] + list(probabilities)

In [59]:
# Make recording_id the index so it is written as the first CSV column.
# NOTE(review): this mutates `submission` in place, so re-running the cell will
# presumably fail because the column no longer exists - confirm.
submission.set_index('recording_id', inplace=True)

In [60]:
# Write the submission file to the working directory.
submission.to_csv('submission.csv')