# Modules needed
## Preprocessing
 * install pandas
 * wavefile
 * matplotplib
 * LibROSA
 * numba==0.48.0
## Machine learning
 * numpy
 * keras
 * sklearn
 * tensorflow
 * tqdm (just for fun)

If you are using anaconda tensorflow should already be setup
`pip install pandas wavefile matplotlib librosa numba==0.48.0 numpy keras sklearn tqdm keras tensorflow-gpu`
 
## To enable progress bars
`jupyter nbextension enable --py widgetsnbextension`
`jupyter labextension install @jupyter-widgets/jupyterlab-manager`

In [1]:
#!pip freeze > requirements.txt

In [2]:
import pandas as pd
from os.path import join as join_path
from wavefile import WaveReader

### Performance Tweaking
How many threads to use for multithreading and such.
Some of the processing takes forever using default single threaded capabilities

In [3]:
max_threads = 24

In [4]:
usl = "../resources/UrbanSound8K/"

us_meta = pd.read_csv(usl + 'metadata/UrbanSound8K.csv')

In [5]:
audio_data = []
for i, entry in us_meta.iterrows():
    file_loc = join_path(usl, "audio", 'fold' + str(entry["fold"]), str(entry["slice_file_name"]))
    with WaveReader(file_loc) as r:
        # Probably easier way with this library to read the bit depth.
        audio_data.append((r.channels, r.samplerate, int((r.byterate) / (r.samplerate * r.channels) * 8)))

audio_df = pd.DataFrame(audio_data, columns=['num_channels', 'sample_rate', 'bit_depth'])

### Summaries of Sample Data

In [6]:
print("Number of channels")
print(audio_df.num_channels.value_counts(normalize=True))

print("\nSample Rates")
print(audio_df.sample_rate.value_counts(normalize=True))

print("\nBit Depth")
print(audio_df.bit_depth.value_counts(normalize=True))

Number of channels
2    0.915369
1    0.084631
Name: num_channels, dtype: float64

Sample Rates
44100     0.614979
48000     0.286532
96000     0.069858
24000     0.009391
16000     0.005153
22050     0.005039
11025     0.004466
192000    0.001947
8000      0.001374
11024     0.000802
32000     0.000458
Name: sample_rate, dtype: float64

Bit Depth
16    0.659414
24    0.315277
32    0.019354
8     0.004924
4     0.001031
Name: bit_depth, dtype: float64


### Preprocess files to be similar to the format being used in odas

In [7]:
import librosa
import librosa.display
#import tqdm.notebook as tqdm
import tqdm
import numpy as np
from multiprocessing import Pool

max_pad_len = 174

def extract_features(file_name):
   
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast') 
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        pad_width = max_pad_len - mfccs.shape[1]
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        
    except Exception as e:
        print("Error encountered while parsing file: ", file_name, e)
        return None 
     
    return mfccs

def process_entry(file_entry):
    i, entry = file_entry
    entry_loc = join_path(usl, "audio", 'fold' + str(entry["fold"]), str(entry["slice_file_name"]))
    class_label = entry["class"]
    return [extract_features(entry_loc), class_label];

In [8]:
urban_sound_features = []

print("Extracting features: UrbanSound8K")
with Pool(max_threads) as p:
    entries = us_meta.iterrows()
    for value in tqdm.tqdm(p.imap(process_entry, entries), total=us_meta.shape[0]):
        if value[1] != "gun_shot" and value[1] != "dog_bark":
            urban_sound_features.append(value)
        pass

Extracting features: UrbanSound8K


100%|██████████| 8732/8732 [01:00<00:00, 144.87it/s]


In [9]:
from os import listdir
from os.path import isfile, join, isdir

custom_sound_features = []
custom_sound_loc = "../resources/CustomSounds"
sound_class_folders = [f for f in listdir(custom_sound_loc) if isdir(join(custom_sound_loc, f))]
# (fileLoc, class)test
print("Extracting features: CustomSounds")
custom_sounds = []
with Pool(max_threads) as p:
    for class_name in sound_class_folders:
        print("Extracting:", class_name)
        sound_files = [f for f in listdir(join(custom_sound_loc, class_name))\
                       if isfile(join(custom_sound_loc, class_name, f))]
        if len(sound_files) == 0:
            print("No sounds found for:", class_name)
            continue
        file_locations = list(map(lambda f: join(custom_sound_loc, class_name, f), sound_files))
        for value in tqdm.tqdm(p.imap(extract_features, file_locations), total=len(file_locations)):
            custom_sound_features.append((value, class_name))
    
    

Extracting features: CustomSounds


  0%|          | 0/309 [00:00<?, ?it/s]

Extracting: fire_alarm


100%|██████████| 309/309 [00:01<00:00, 233.25it/s]
100%|██████████| 2/2 [00:00<00:00, 465.75it/s]


Extracting: drilling
No sounds found for: drilling
Extracting: jackhammer


In [10]:
features = urban_sound_features + custom_sound_features
features_df = pd.DataFrame(features, columns=['feature','class_label'])
print('Finished feature extraction from ', len(features_df), ' files')

print("Label Distribution")
print(features_df.class_label.value_counts(normalize=True))

Finished feature extraction from  8669  files
Label Distribution
jackhammer          0.115584
children_playing    0.115354
air_conditioner     0.115354
street_music        0.115354
drilling            0.115354
dog_bark            0.115354
engine_idling       0.115354
siren               0.107163
car_horn            0.049487
fire_alarm          0.035644
Name: class_label, dtype: float64


### Prep learning and training dataset
Will need to check at some point the 8k datasets because it does say something about don't randomise it or something.
Though for now lets get to training! :D

In [13]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Convert features and corresponding classification labels into numpy arrays
X = np.array(features_df.feature.tolist())
y = np.array(features_df.class_label.tolist())

# Encode the classification labels
le = LabelEncoder()
yy = to_categorical(le.fit_transform(y)) 

# split the dataset 
from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(X, yy, test_size=0.2, random_state = 42)

### Store pre-processed data

In [14]:
### Save values

%store x_train 
%store x_test 
%store y_train 
%store y_test 
%store yy 
%store le
%store X
%store y


Stored 'x_train' (ndarray)
Stored 'x_test' (ndarray)
Stored 'y_train' (ndarray)
Stored 'y_test' (ndarray)
Stored 'yy' (ndarray)
Stored 'le' (LabelEncoder)
Stored 'X' (ndarray)
Stored 'y' (ndarray)
