In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_walk = os.walk('/kaggle/input')
for dirname, _, filenames in input_walk:
    for filename in filenames:
        input_file = os.path.join(dirname, filename)
        print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Audio Classification
Objective: Use a simple neural network to classify audio samples into their categories, based on features extracted with Librosa.

In [None]:
## Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import IPython.display as ipd

'''Librosa is a special library used for audio analysis'''
import librosa
import librosa.display

## Data Understanding and Exploration

In [None]:
# Pick one example clip from the dataset to explore.
audio_file_path='../input/urbansound8k/fold1/101415-3-0-2.wav'

# Decode the clip: y is the waveform samples, sr the sampling rate used by librosa.load.
y, sr = librosa.load(audio_file_path)

# librosa.display.waveplot was removed in librosa 0.10 in favour of waveshow.
# Prefer waveshow and fall back to waveplot only on older releases without it.
plot_waveform = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot

# Draw the waveform, then render an inline audio player as the cell output.
plt.figure(figsize=(14,3))
plot_waveform(y, sr=sr)
ipd.Audio(audio_file_path)

It is the sound of a dog barking.

In [None]:
# Show the decoded waveform samples and the sampling rate of the example clip.
for label, value in (('Time series data :- ', y), ('Sample rate :- ', sr)):
    print(label, value)

A `time series` is a series of data points indexed in time order.
Here, the time series of an audio signal is represented as a one-dimensional numpy.ndarray of floating-point values, where y[t] is the amplitude of the waveform at sample t.


A `sample rate` (or sampling rate) defines how many times per second a sound is sampled. The default sampling rate used by Librosa is 22050 Hz.

In [None]:
# Read the UrbanSound8K metadata CSV and preview its first rows.
metadata_csv_path = '../input/urbansound8k/UrbanSound8K.csv'
metadata = pd.read_csv(metadata_csv_path)
metadata.head()

In [None]:
# Count the samples per class to check whether the dataset is imbalanced.
metadata['class'].value_counts()

## Data Preprocessing

##### Feature Extraction

Let's extract the Mel-frequency cepstral coefficients from the raw signal y

In [None]:
# Compute 40 MFCCs for the example clip (y, sr) and print the shape of the
# resulting matrix.
mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
print(mfccs.shape)

The output of the `librosa.feature.mfcc` function is a matrix: a numpy.ndarray of shape (n_mfcc, T), where T denotes the track duration in frames.

In [None]:
# Extracting MFCC's for every audio file
import pandas as pd
import os
import librosa

# Root directory of the UrbanSound8K dataset; the per-fold audio directories
# and the metadata CSV both live underneath it.
audio_dataset_path = '../input/urbansound8k'

# Derive the CSV location from the dataset root instead of repeating the path,
# so that pointing audio_dataset_path elsewhere moves both together.
metadata = pd.read_csv(os.path.join(audio_dataset_path, 'UrbanSound8K.csv'))
metadata.head()

In [None]:
def features_extractor(file):
    """Return a fixed-length MFCC feature vector for one audio file.

    The clip is decoded with librosa, 40 MFCCs are computed per frame, and
    the coefficients are averaged over all frames so that clips of any
    duration map to a vector of the same length.

    Parameters
    ----------
    file : path of the audio file to load (passed straight to librosa.load).

    Returns
    -------
    numpy.ndarray
        1-D array with one mean value per MFCC coefficient (40 entries).
    """
    # Load from the `file` argument. The previous body read a global named
    # `file_name`, so the function silently ignored its own parameter and only
    # worked when the caller happened to define that global first.
    audio, sample_rate = librosa.load(file, res_type='kaiser_fast')
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    # mfcc returns (n_mfcc, frames); transpose and average over the frame axis.
    mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)
    return mfccs_scaled_features

In [None]:
import numpy as np
from tqdm import tqdm
## Iterate through every audio file listed in the metadata and extract its
## Mel-frequency cepstral coefficient (MFCC) feature vector.
dataset_root = os.path.abspath(audio_dataset_path)  # loop-invariant, resolve once
extracted_features=[]
# iterrows() is a generator with no length, so pass total= explicitly;
# without it tqdm can only show a running count, not a percentage or ETA.
for index_num, row in tqdm(metadata.iterrows(), total=len(metadata)):
    # Clips are stored as <dataset root>/fold<N>/<slice_file_name>.
    file_name = os.path.join(dataset_root,'fold'+str(row["fold"])+'/', str(row['slice_file_name']))
    final_class_labels = row['class']
    data = features_extractor(file_name)
    # Keep each feature vector paired with its class label.
    extracted_features.append([data,final_class_labels])

In [None]:
## Converting extracted_features to pandas dataframe
# Each entry of extracted_features is a [feature_vector, class_label] pair, so
# the frame gets one row per clip with the whole vector stored in 'feature'.
extracted_features_df = pd.DataFrame(extracted_features, columns=['feature','class'])
extracted_features_df.head()

In [None]:
## Separate the model inputs (X) from the target labels (y).
# Note: y rebinds the name that earlier held the example clip's waveform.
feature_rows = extracted_features_df['feature'].tolist()
class_labels = extracted_features_df['class'].tolist()
X = np.asarray(feature_rows)
y = np.asarray(class_labels)

In [None]:
# One row per extracted clip; each row is the 40-value MFCC mean vector (n_mfcc=40).
X.shape

In [None]:
## Label Encoder
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
# Map the class-name strings to integer ids, then one-hot encode the ids.
# The labels are read from the dataframe rather than from `y` itself so the
# cell can be re-run safely: `y = f(y)` would feed the already one-hot 2-D
# array back into LabelEncoder on a second run and fail.
labelencoder = LabelEncoder()
class_names = np.array(extracted_features_df['class'].tolist())
y = to_categorical(labelencoder.fit_transform(class_names))

In [None]:
# After to_categorical, y is 2-D: one row per sample, one column per class.
y.shape

In [None]:
## Train Test Split
# Hold out 20% of the samples for testing; random_state=42 makes the shuffle
# reproducible across runs.
# NOTE(review): the metadata has a `fold` column (used above to locate files)
# that this random split ignores - confirm that mixing folds between the
# train and test sets is intended.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.2,random_state=42)

In [None]:
# Inspect the training feature matrix (MFCC mean vectors).
X_train

In [None]:
# Inspect the one-hot encoded training labels.
y_train

In [None]:
# Print the shape of each split: train/test features, then train/test labels.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

#### Model Creation

In [None]:
# Record the TensorFlow version in use; the Keras API differs between releases.
import tensorflow as tf
print(tf.__version__)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.optimizers import Adam
from sklearn import metrics

In [None]:
### Number of classes
# y is one-hot encoded by to_categorical, so its second dimension is the class count.
num_labels = y.shape[1]

In [None]:
# Feed-forward classifier: three ReLU hidden layers (100, 200, 100 units), each
# followed by 50% dropout, then a softmax layer with one unit per class.
model = Sequential([
    # first hidden block; expects 40 input features per sample
    Dense(100, input_shape=(40,)),
    Activation('relu'),
    Dropout(0.5),
    # second hidden block
    Dense(200),
    Activation('relu'),
    Dropout(0.5),
    # third hidden block
    Dense(100),
    Activation('relu'),
    Dropout(0.5),
    # output block: class probabilities
    Dense(num_labels),
    Activation('softmax'),
])

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Multi-class setup: categorical cross-entropy loss, Adam optimizer, and
# accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
## Training my model
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime

num_epochs = 100
num_batch_size = 32

# Checkpoint to a single named file. A bare './' path writes the saved-model
# artefacts straight into the working directory, and Keras 3 rejects any
# full-model checkpoint path that does not end in .keras or .h5.
checkpointer = ModelCheckpoint(filepath='./audio_classification_best.h5', verbose = 1, save_best_only=True)
start = datetime.now()

# NOTE(review): the test split doubles as validation data here, so the "best"
# checkpoint is selected on the same samples later used for evaluation.
# Keep the returned History object so the training curves can be inspected.
history = model.fit(X_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data = (X_test, y_test), callbacks=[checkpointer])

# Report the wall-clock training time.
duration = datetime.now() - start
print('Training completed in time: ', duration)

#### Evaluate the model

In [None]:
# evaluate() returns [loss, accuracy] for the compiled loss and single metric;
# print only the accuracy.
test_accuracy = model.evaluate(X_test, y_test, verbose=0)
test_loss_value, test_accuracy_value = test_accuracy
print(test_accuracy_value)

So, this model gives 74.81% accuracy on the test data.

### Testing some test audio data
Steps:
- Preprocess the new audio data
- Predict the class
- Inverse-transform the predicted label

In [None]:
# Build the feature vector for one clip in the same way as for the training
# data: load, compute 40 MFCCs, then average them over the frames.
filename = '../input/urbansound8k/fold6/108638-9-0-1.wav'
audio, sample_rate = librosa.load(filename, res_type='kaiser_fast')
mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)
print(mfccs_scaled_features)

# The model expects a batch, so add a leading batch axis: (40,) -> (1, 40).
mfccs_scaled_features = np.expand_dims(mfccs_scaled_features, axis=0)
for shown in (mfccs_scaled_features, mfccs_scaled_features.shape):
    print(shown)

In [None]:
# Sequential.predict_classes was removed in TensorFlow 2.6. Taking the argmax
# of the softmax probabilities gives the same class ids on every TF 2.x release.
predicted_probabilities = model.predict(mfccs_scaled_features)
predicted_label = np.argmax(predicted_probabilities, axis=-1)
print(predicted_label)
# Map the integer class id back to its original class name.
prediction_class = labelencoder.inverse_transform(predicted_label)
prediction_class

This model predicted the given audio file as `street_music`. Let's check what audio it actually is.

In [None]:
# Load the clip that was just classified so it can be plotted and played back.
data, sample_rate = librosa.load(filename)

# librosa.display.waveplot was removed in librosa 0.10 in favour of waveshow.
# Prefer waveshow and fall back to waveplot only on older releases without it.
plot_waveform = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot

plt.figure(figsize=(14,5))
plot_waveform(data, sr=sample_rate)
ipd.Audio(filename)

So, the model predicted it correctly. It is street music.