In [2]:
#import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
import os
import glob
import random as rand
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten, GRU
from keras.callbacks import Callback, EarlyStopping
from sklearn.model_selection import train_test_split
from keras import backend as K
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.metrics import confusion_matrix  
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn import metrics
from sklearn.metrics import accuracy_score
import librosa
import librosa.display

**Data Preprocessing**


In [51]:

TRAIN_DIR = os.path.join('../input/audiodata/', 'train/*') 
trainF = glob.glob(TRAIN_DIR)
max_length=0
temp={}                                                        #create dict for storing the training npy file data
arr={}
for files in trainF: 
  temp[files]=np.load(files) 
  arr[files]=temp[files].copy()
  df = pd.DataFrame(arr[files][0])
  # df=df.loc[:, (df**2).sum() != 0]
  if max_length<df.shape[1] :
    max_length=df.shape[1]                                     #calculate maximum length of audio sample, =2584 for train data

for files in trainF: 
  p,q,r = arr[files].shape
  start_padding = rand.randint(0, abs(300 - r))                 #reduce max_length, taken to be 200 here
  
  if(r<300):
    arr[files] = np.pad(arr[files], [(0,0),(0,0),(0,300-r)], 'constant', constant_values=0)

    
  elif(r>300):
    end_padding = abs(r - start_padding-300)                   #Randomly slice from left and right to provide better accuracy
    arr[files]=(arr[files][:,:,start_padding:r-end_padding])
      

In [52]:
#upload annotations.csv file from local
# from google.colab import files
# upload = files.upload()


In [53]:
dataFile = "../input/audiofiles/annotations.csv"
df = pd.read_csv(dataFile)
print(df)

     Unnamed: 0         fname               label  split
0             0     02639.npy                Bark  train
1             1      1580.npy                Bark  train
2             2     24030.npy                Bark  train
3             3    316499.npy                Bark  train
4             4    413718.npy                Bark  train
..          ...           ...                 ...    ...
995         995   9956925.npy  Walk_and_footsteps  train
996         996  99611454.npy  Walk_and_footsteps  train
997         997   9979729.npy  Walk_and_footsteps  train
998         998  99818869.npy  Walk_and_footsteps  train
999         999    999433.npy  Walk_and_footsteps  train

[1000 rows x 4 columns]


In [54]:
data=[]                                                 #stores the input training data
output=[]                                               #stores the output labels in the same order
x=""
for i in range(1000):
  x=df.iloc[i,1]
  y=df.iloc[i,2]
  x=os.path.join('../input/audiodata/train/', str(x))
  data.append(arr[x])
  output.append(y)

  #fill the empty columns by mean of other columns
  imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
  imputer=imputer.fit(data[i][0]) 
  data[i][0]=imputer.transform(data[i][0])

In [55]:
!pip install scikit-learn

[0m

In [56]:
# Convert string labels to integer labels
encoder =  LabelEncoder()
y1 = encoder.fit_transform(output)

In [57]:
#convert output labels to one-hot vector
Y = pd.get_dummies(y1).values

In [58]:
data=np.stack(data)
# print(data.shape)

In [59]:
data=np.transpose(data,(0,2,3,1))                     #change the shape to (1000,128,200,1)

In [60]:
# split data into training and validation
X_train,X_test,y_train,y_test=train_test_split(data,Y,test_size=0.2)

In [61]:
#define functions to measure recall, precision and f1-score
def recall_m(y_true, y_preds):
    true_positives = K.sum(K.round(K.clip(y_true * y_preds, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_preds):
    true_positives = K.sum(K.round(K.clip(y_true * y_preds, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_preds, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_preds):
    precision = precision_m(y_true, y_preds)
    recall = recall_m(y_true, y_preds)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [91]:
#CNN model with 2 convolution layers and 2 dense layers
from keras import regularizers
model = Sequential()

model.add(Conv2D(32, 3, activation = 'relu', input_shape = (X_train.shape[1:])))
model.add(Conv2D(32, 3, activation = 'relu'))
model.add(MaxPooling2D(pool_size = (3,3)))
model.add(Dropout(0.2))
model.add(Conv2D(128, 3, activation = 'relu'))
model.add(Conv2D(256, 3, activation = 'relu'))
# model.add(Conv2D(256, 3, activation = 'relu'))
# model.add(Conv2D(, 3, activation = 'relu'))
model.add(MaxPooling2D(pool_size = (3,3)))
model.add(Dropout(0.2))
model.add(Conv2D(512, 3, activation = 'relu'))
model.add(MaxPooling2D(pool_size = (2,2)))
model.add(Dropout(0.2))
model.add(Flatten())

model.add(Dense(256, activation = 'relu', activity_regularizer = regularizers.l2(0.002)))

model.add(Dense(10, activation = 'softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', f1_m, precision_m, recall_m])
model.summary()

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_79 (Conv2D)           (None, 126, 298, 32)      320       
_________________________________________________________________
conv2d_80 (Conv2D)           (None, 124, 296, 32)      9248      
_________________________________________________________________
max_pooling2d_38 (MaxPooling (None, 41, 98, 32)        0         
_________________________________________________________________
dropout_35 (Dropout)         (None, 41, 98, 32)        0         
_________________________________________________________________
conv2d_81 (Conv2D)           (None, 39, 96, 128)       36992     
_________________________________________________________________
conv2d_82 (Conv2D)           (None, 37, 94, 256)       295168    
_________________________________________________________________
max_pooling2d_39 (MaxPooling (None, 12, 31, 256)     

In [92]:
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=8) #implement early stopping 

In [93]:
history = model.fit(X_train, y_train, validation_data=(X_test, y_test),epochs=20, callbacks=es)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [115]:
TRAIN_DIRE = os.path.join('../input/testset/', 'Audio_Classification-MLSP-test/*') 
testF = glob.glob(TRAIN_DIRE)
test_data=[]
name_of_files=[]
max_length=0
temp1={}                                                        #create dict for storing the training npy file data
test_arr={}
for files in testF:
  temp1[files]=np.load(files) 
  test_arr[files]=temp1[files].copy()
  df = pd.DataFrame(test_arr[files][0])
  # df=df.loc[:, (df**2).sum() != 0]
  if max_length<df.shape[1] :
    max_length=df.shape[1]

print(max_length)

2584


In [116]:
for files in testF: 
  p,q,r = test_arr[files].shape
  start_padding = rand.randint(0, abs(300 - r))                 #reduce max_length, taken to be 200 here
  
  if(r<300):
    test_arr[files] = np.pad(test_arr[files], [(0,0),(0,0),(0,300-r)], 'constant', constant_values=0)

    
  elif(r>300):
    end_padding = abs(r - start_padding-300)                   #Randomly slice from left and right to provide better accuracy
    test_arr[files]=(test_arr[files][:,:,start_padding:r-end_padding])
      
for files in testF:
  name_of_files.append(files[17:])
  test_data.append(test_arr[files])  
test_data=np.stack(test_data) #calculate maximum length of audio sample, =2584 for train data


In [117]:
test_data=np.transpose(test_data,(0,2,3,1))    

In [118]:
y_preds = model.predict(test_data)

df1 = pd.DataFrame(y_preds, index=name_of_files)
# df2 = pd.DataFrame(y_test)
# df1.set_index(name_of_files)
df3=df1.idxmax(axis=1)
df3.to_csv('outputcheck.csv', encoding = 'utf-8-sig') 
newdf = pd.read_csv('outputcheck.csv')
print(newdf)
# files.download('outputcheck.csv')
# print(df2.idxmax(axis=1))

                                            Unnamed: 0  0
0    Audio_Classification-MLSP-test/MLSP_test144347...  5
1    Audio_Classification-MLSP-test/MLSP_test160245...  9
2    Audio_Classification-MLSP-test/MLSP_test463146...  0
3    Audio_Classification-MLSP-test/MLSP_test181039...  6
4    Audio_Classification-MLSP-test/MLSP_test616123...  2
..                                                 ... ..
195  Audio_Classification-MLSP-test/MLSP_test192098...  6
196  Audio_Classification-MLSP-test/MLSP_test17186.npy  5
197  Audio_Classification-MLSP-test/MLSP_test220428...  6
198  Audio_Classification-MLSP-test/MLSP_test901102...  3
199  Audio_Classification-MLSP-test/MLSP_test485355...  0

[200 rows x 2 columns]


In [None]:
#graphs
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

plt.plot(history.history['f1_m'])
plt.plot(history.history['val_f1_m'])
plt.title('model f1')
plt.ylabel('f1')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()


In [120]:
#use other methods on 1d array
nsamples, nx, ny, nz = X_train.shape
new_X_train = X_train.reshape((nsamples,nx*ny*nz))
nsamples, nx, ny, nz = X_test.shape
new_X_test = X_test.reshape((nsamples,nx*ny*nz))

In [121]:
knnmodel= KNeighborsClassifier(n_neighbors=3,p=2,metric='euclidean' )  
knnmodel=knnmodel.fit(new_X_train, y_train)
y_preds= knnmodel.predict(new_X_test)
# accuracy calculation works for both knn and decision tree
print(accuracy_score(y_test, y_preds))
print(metrics.classification_report(y_test, y_preds, digits=3))
# cm= confusion_matrix(y_test[:,0], y_preds[:,0]) 
# print(cm) 

0.435
              precision    recall  f1-score   support

           0      0.750     0.522     0.615        23
           1      1.000     0.250     0.400        24
           2      0.571     0.632     0.600        19
           3      0.615     0.500     0.552        16
           4      0.500     0.526     0.513        19
           5      0.440     0.647     0.524        17
           6      0.700     0.467     0.560        15
           7      0.857     0.333     0.480        18
           8      0.471     0.381     0.421        21
           9      0.389     0.250     0.304        28

   micro avg      0.569     0.435     0.493       200
   macro avg      0.629     0.451     0.497       200
weighted avg      0.628     0.435     0.485       200
 samples avg      0.435     0.435     0.435       200



  _warn_prf(average, modifier, msg_start, len(result))


In [122]:
#create decision tree classifier
DTmodel = tree.DecisionTreeClassifier()
DTmodel=DTmodel.fit(new_X_train, y_train)
y_preds = DTmodel.predict(new_X_test)
print(accuracy_score(y_test, y_preds))
print(metrics.classification_report(y_test, y_preds, digits=3))

0.46
              precision    recall  f1-score   support

           0      0.579     0.478     0.524        23
           1      0.429     0.375     0.400        24
           2      0.500     0.632     0.558        19
           3      0.500     0.562     0.529        16
           4      0.391     0.474     0.429        19
           5      0.467     0.412     0.437        17
           6      0.550     0.733     0.629        15
           7      0.611     0.611     0.611        18
           8      0.318     0.333     0.326        21
           9      0.300     0.214     0.250        28

   micro avg      0.460     0.460     0.460       200
   macro avg      0.464     0.482     0.469       200
weighted avg      0.454     0.460     0.453       200
 samples avg      0.460     0.460     0.460       200



In [None]:
TRAIN_DIR = os.path.join('/content/drive/MyDrive/', 'train/*') 
trainF = glob.glob(TRAIN_DIR)
for files in trainF:
  print(files)
  files=np.load(files)
  plt.figure(figsize=(20, 5))
  librosa.display.waveplot(files[0], sr=22050)
  plt.title('Waveplot', fontdict=dict(size=18))
  plt.xlabel('Time', fontdict=dict(size=15))
  plt.ylabel('Amplitude', fontdict=dict(size=15))
  plt.show()

In [None]:
TRAIN_DIR = os.path.join('/content/drive/MyDrive/', 'train/*') 
trainF = glob.glob(TRAIN_DIR)
for files in trainF:
  print(files)
  files=np.load(files)
  trial=files[0].flatten()
  audio_stft = librosa.core.stft(trial, hop_length=512, n_fft=2048, window='hann')
  # gathering the absolute values for all values in our audio_stft 
  spectrogram = np.abs(audio_stft)
  # Converting the amplitude to decibels
  log_spectro = librosa.amplitude_to_db(spectrogram)
  # Plotting the short-time Fourier Transformation
  plt.figure(figsize=(20, 5))
  # Using librosa.display.specshow() to create our spectrogram
  librosa.display.specshow(log_spectro, sr=22050, x_axis='time', y_axis='hz', hop_length=512, cmap='magma')
  plt.colorbar(label='Decibels')
  plt.title('Spectrogram (dB)', fontdict=dict(size=18))
  plt.xlabel('Time', fontdict=dict(size=15))
  plt.ylabel('Frequency', fontdict=dict(size=15))
  plt.show()