In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import librosa
import librosa.display
import IPython.display
import matplotlib.pyplot as plt
import torch
import torch.nn as nn, torch.nn.init as init
import torchaudio
import torchaudio.transforms as AT
import torch.optim as optim

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

from scipy.io import wavfile

from sklearn.preprocessing import StandardScaler

from tqdm import tqdm

# Any results you write to the current directory are saved as output.

In [None]:
# Choose the compute device: the GPU when CUDA is usable, otherwise the CPU.
CUDA_AVAILABLE = torch.cuda.is_available()
if CUDA_AVAILABLE:
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

In [None]:
# Load the training metadata; later cells unpack each row as (fname, label, verified).
train = pd.read_csv('../input/freesound-audio-tagging/train.csv')
train

In [None]:
# Inspect the first training clip: the raw WAV contents first, then a resampled copy for playback.
audio_path = '../input/freesound-audio-tagging/audio_train/'
fname, label, verified = train.values[0]
# scipy returns the file's own sampling rate together with the sample array.
rate, data = wavfile.read(audio_path+fname)

print(label)
print('Sampling Rate:\t{}'.format(rate))
print('Total Frames:\t{}'.format(data.shape[0]))
print(data)

# Reload through librosa, resampled to 11025 Hz. The `sr` bound here is reused by
# later cells to derive the fixed clip length (sr * 5).
y, sr = librosa.load(audio_path+fname,sr=11025)
IPython.display.Audio(data=y, rate=sr)


In [None]:
# Load the pre-extracted waveforms for all clips.
# Presumably one 11025 Hz waveform per train.csv row, stored as an object array of
# variable-length arrays (inferred from the file name and the per-element len()
# calls in the padding cell) — TODO confirm.
# NOTE(review): allow_pickle=True makes np.load unpickle objects from the file;
# only use it with files from a trusted source.
pad_audio_data =np.load('../input/audio-data/audio_data_11025.npy', allow_pickle=True)
pad_audio_data

In [None]:
# Plot the first loaded clip as a quick sanity check.
plt.plot(pad_audio_data[0])

In [None]:
# Bring every clip to exactly five seconds' worth of samples at rate `sr`.
maxlen = 5 * sr

for i, clip in enumerate(pad_audio_data):
    shortfall = maxlen - len(clip)
    if shortfall < 0:
        # Longer than the target: keep only the leading `maxlen` samples.
        pad_audio_data[i] = clip[:maxlen]
    elif shortfall > 0:
        # Shorter than the target: prepend float32 zeros so the audio ends up right-aligned.
        pad_audio_data[i] = np.append(np.zeros(shortfall, dtype='float32'), clip)

In [None]:
# Collapse the collection of equal-length clips into one dense 2-D float32 matrix
# (one row per clip), then show its shape.
tmp = list(pad_audio_data)
pad_audio_data = np.asarray(tmp, dtype='float32')

pad_audio_data.shape

In [None]:
# Standardise the padded clip matrix to zero mean / unit variance.
# NOTE(review): StandardScaler computes its statistics per column, i.e. per sample
# position across all clips, not per clip — confirm this is the intended normalisation.
sds = StandardScaler()
sds_audio_data = sds.fit_transform(pad_audio_data)
# Visual check of the first clip after scaling.
plt.plot(sds_audio_data[0])

In [None]:
# Map every distinct label string to an integer id, numbered in the order
# the labels first appear in train.csv.
labelEncoder = {name: idx for idx, name in enumerate(train['label'].unique())}

In [None]:
# Display the label -> integer id mapping.
labelEncoder

In [None]:
# Integer class id for every row of train.csv, in file order.
# The length comes from the frame itself instead of the hard-coded 9473, so the
# cell keeps working if the CSV changes size; the vectorised lookup also avoids
# rebuilding `train.values` on every loop iteration.
Encoding_label = train['label'].map(labelEncoder).to_numpy()

In [None]:
# Display the per-row integer class ids.
Encoding_label

In [None]:
# One-hot encode the integer class ids into an (n_rows, n_classes) float32 matrix.
# Both sizes are derived from the data (number of ids, number of entries in
# labelEncoder) rather than the hard-coded 9473 and 41.
class_ids = np.asarray(Encoding_label, dtype='int64')
# Guard against re-running this cell after Encoding_label has already been one-hot encoded.
assert class_ids.ndim == 1, "Encoding_label is already one-hot encoded; re-run the label-id cell first"

n_rows = class_ids.shape[0]
Encoding_label = np.zeros((n_rows, len(labelEncoder)), dtype='float32')
# Set a single 1.0 per row at that row's class column.
Encoding_label[np.arange(n_rows), class_ids] = 1.0
Encoding_label[0]

In [None]:
# Wrap the standardised clips in a torch tensor; from_numpy shares memory with the NumPy array (no copy).
sds_audio_data_tensor = torch.from_numpy(sds_audio_data)
sds_audio_data_tensor

In [None]:
# Convert the one-hot label matrix to a torch tensor.
# NOTE(review): this rebinds `Encoding_label` from ndarray to Tensor, so running the
# cell a second time raises TypeError (from_numpy only accepts ndarrays).
Encoding_label = torch.from_numpy(Encoding_label)
Encoding_label

In [None]:
# Keras setup for the convolutional classifier.
# NOTE(review): these imports sit mid-notebook; consider moving them to the first cell.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D,Dense,Dropout,MaxPool1D,Flatten,GlobalMaxPool1D
from tensorflow.keras import optimizers, regularizers

# Model input length in samples: five seconds at `sr` (bound in the audio-inspection cell).
input_length = sr * 5
# Number of distinct labels in train.csv.
n_classes = train['label'].unique().shape[0]
# Append a trailing axis of size 1 so each clip has shape (samples, 1).
input_audio_data = np.expand_dims(sds_audio_data, axis=2)

# Candidate optimizers, all at learning rate 0.001 (presumably for side-by-side
# comparison; only `adam` is referenced by create_cnn).
sgd = optimizers.SGD(learning_rate=0.001, momentum=0.0, nesterov=False)
momentum = optimizers.SGD(learning_rate=0.001, momentum=0.9, nesterov=False)
nag = optimizers.SGD(learning_rate=0.001, momentum=0.9, nesterov=True)
adagrad = optimizers.Adagrad(learning_rate=0.001)
rmsprop = optimizers.RMSprop(learning_rate=0.001, rho=0.9)
adam = optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
#radam = RAdam(learning_rate=0.001,beta_1=0.9,beta_2=0.999)
# Reverse lookup: optimizer instance -> short display name.
optimizer_dict = {sgd:'sgd', momentum:'momentum', nag:'nag', adagrad:'adagrad', rmsprop:'rmsprop', adam:'adam'}

In [None]:
# `train` is intentionally NOT deleted here: the random spot-check cells near the
# end of the notebook still read `train.shape` and `train.values`, so removing it
# made a fresh Restart & Run All fail with NameError.

In [None]:
# Torch view of the channel-expanded clips (from_numpy shares memory with input_audio_data).
input_audio_data_tensor = torch.from_numpy(input_audio_data)
# Round-trip the first clip back to NumPy for plotting; `test` is just that clip, not a test split.
test = input_audio_data_tensor[0].numpy()

plt.plot(test)

In [None]:
def create_cnn(optimizer=None):
    """Build and compile the 1-D convolutional audio classifier.

    The network consumes waveforms of shape ``(input_length, 1)`` and emits a
    softmax distribution over ``n_classes`` labels; both sizes are read from
    the notebook-level globals of the same name.

    Args:
        optimizer: Optional Keras optimizer (instance or string name) to
            compile with. When omitted, a fresh Adam with the same
            hyper-parameters as the notebook-level ``adam`` is created for
            this model. Previously every call compiled with the single shared
            ``adam`` instance, so a second model reused an optimizer that was
            already tied to the first model's variables and state.

    Returns:
        A compiled ``Sequential`` model using categorical cross-entropy loss
        and an accuracy metric.
    """
    if optimizer is None:
        optimizer = optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)

    model = Sequential()

    # Stage 1: two wide convolutions, then aggressive temporal down-sampling.
    model.add(Conv1D(16, 9, activation='relu', input_shape=(input_length, 1)))
    model.add(Conv1D(32, 9, activation='relu'))
    model.add(MaxPool1D(16))
    model.add(Dropout(0.1))

    # Stage 2.
    model.add(Conv1D(64, 9, activation='relu'))
    model.add(Conv1D(64, 9, activation='relu'))
    model.add(MaxPool1D(16))

    # Stage 3.
    model.add(Conv1D(128, 9, activation='relu'))
    model.add(Conv1D(128, 9, activation='relu'))
    model.add(MaxPool1D(4))
    model.add(Dropout(0.1))

    # Stage 4: narrow kernels, then collapse the time axis to one vector per clip.
    model.add(Conv1D(256, 3, activation='relu'))
    model.add(Conv1D(256, 3, activation='relu'))
    model.add(GlobalMaxPool1D())
    model.add(Dropout(0.2))

    # Flatten is a no-op after global pooling (output is already 2-D); it is
    # kept so the layer layout matches earlier runs of this notebook.
    model.add(Flatten())

    # Classification head.
    model.add(Dense(64, activation = 'relu'))
    # NOTE(review): 1028 units looks like a typo for 1024 — left unchanged, confirm.
    model.add(Dense(1028, activation = 'relu'))
    model.add(Dense(n_classes, activation = 'softmax'))

    model.compile(loss = 'categorical_crossentropy', metrics = ['accuracy'], optimizer = optimizer)
    return model

In [None]:
# Number of clips, taken from the data rather than the hard-coded 9473.
num_data = input_audio_data_tensor.shape[0]
epoch = 10

# Layer sizes derived from the tensors the training cell actually uses.
samples_per_clip = input_audio_data_tensor.shape[1]
n_outputs = Encoding_label.shape[1]

# Fully-connected baseline. The clip is flattened to (batch, samples_per_clip)
# first and the last layer emits one value per class, so the output has shape
# (batch, n_outputs) like the label matrix. The previous nn.Linear(1, 50) stack
# acted on the trailing channel axis of size 1 instead, producing a
# (batch, samples, 1) output that cannot be compared with the labels and
# requiring a (batch, samples, 50) activation tensor along the way.
model = nn.Sequential(
    nn.Flatten(),
    nn.Linear(samples_per_clip, 50),
    nn.ReLU(),
    nn.Linear(50, 100),
    nn.ReLU(),
    nn.Linear(100, 50),
    nn.ReLU(),
    nn.Linear(50, 10),
    nn.ReLU(),
    nn.Linear(10, n_outputs)
)

In [None]:
# Full-batch training of `model` with mean-absolute-error loss and plain SGD.
loss_func = nn.L1Loss()
optimizer = optim.SGD(model.parameters(), lr = 0.0002)

# One loss value is appended per epoch.
loss_arr = []

# Targets: the one-hot labels plus one unit-variance Gaussian offset per clip;
# the (num_data, 1) noise column broadcasts across the label columns.
# NOTE(review): regressing noisy one-hot targets with L1 loss is unusual for a
# classification task — confirm this is intended.
noise = init.normal_(torch.FloatTensor(num_data, 1), std = 1.0)
y_noise = Encoding_label + noise

for i in range(epoch):
    optimizer.zero_grad()
    # The whole dataset is pushed through the model in a single batch.
    # NOTE(review): `output` must match (or broadcast to) the shape of `y_noise`;
    # verify against the model definition.
    output = model(input_audio_data_tensor)

    loss = loss_func(output, y_noise)
    loss.backward()
    optimizer.step()

    # Stored as a 0-d NumPy array; .numpy() assumes the loss lives on the CPU.
    loss_arr.append(loss.detach().numpy())
    print(loss)

In [None]:
# Training-loss curve for the PyTorch loop. Only one series is plotted (no
# validation pass is run), so the legend names just that series instead of
# advertising a 'Validation' curve that does not exist.
plt.plot(loss_arr)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train'], loc = 'upper left')
plt.show()

In [None]:
#model = create_cnn()
#model.summary()

In [None]:
# Train the Keras CNN. `model` is rebuilt here because at this point it still
# refers to the PyTorch nn.Sequential defined earlier, which has no .fit();
# the plotting and prediction cells that follow need a Keras model and its
# History object.
model = create_cnn()
# Encoding_label was converted to a torch tensor earlier; hand Keras a plain ndarray.
history = model.fit(input_audio_data, np.asarray(Encoding_label), epochs=30, validation_split = 1/6)

In [None]:
# Accuracy per epoch from the Keras History: training first, then validation.
for key in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[key])
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(['Train','Validation'], loc = 'upper left')
plt.show()

In [None]:
# Loss per epoch from the Keras History: training first, then validation.
for key in ('loss', 'val_loss'):
    plt.plot(history.history[key])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train','Validation'], loc = 'upper left')
plt.show()

In [None]:
import random

# Draw a random row index into the training set for a spot check.
# NOTE(review): unseeded, so a different clip is drawn on every run; this cell
# also requires `train` to still be defined.
stop = train.shape[0]
rand = random.randrange(0, stop)
rand

In [None]:
# Look up the sampled clip's file name and ground-truth label, then load it at 11025 Hz for playback.
fname, label, verified = train.values[rand]

y_predict, sr_predict = librosa.load(audio_path+fname,sr=11025)
IPython.display.Audio(data=y_predict, rate=sr_predict)

In [None]:
# Predict on the sampled clip: add a leading batch axis of size 1, then take the
# index of the largest model output as the predicted class id.
np.argmax(model.predict(np.expand_dims(input_audio_data[rand], axis=0)))

In [None]:
# Ground-truth label string of the sampled clip.
label

In [None]:
# Label -> id mapping, for translating the predicted class id back to a label name.
labelEncoder