In [None]:
import numpy
import librosa
import matplotlib.pyplot as plot
from sklearn.model_selection import train_test_split
import random
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.utils import to_categorical

# --- Experiment configuration -------------------------------------------
# Duration, in seconds, of every clip cut out of a recording.
clip_length_seconds = 120
# Stride used when thinning a clip: one sample out of this many is kept.
downsampling_rate = 100
# Share of the clips that goes to training; the rest is held out.
split_ratio = 0.95
# Seed for the train/test shuffle. It is drawn anew (1..101 inclusive) on
# every run, so the split is not reproducible between runs.
random_state = random.randint(1, 101)
# Number of recordings to load (vid1.wav .. vid<count_files>.wav).
count_files = 14

# One class label (0, 1 or 2) per recording. The literal is reversed before
# use -- presumably it was written down in reverse file order (confirm).
y = numpy.array([1, 2, 1, 2, 2, 0, 0, 2, 1, 0, 0, 2, 2, 2])[::-1]

# Keep only as many labels as there are recordings to load.
y = y[:count_files]

# Build the dataset: cut every recording into fixed-length clips, decimate
# each clip, and turn it into an MFCC matrix labelled with the class of the
# recording it came from.
X_vectors = []
y_vectors = []

for i in range(1, count_files + 1):
    filename = 'vid' + str(i) + '.wav'

    # sr=None keeps the file's native sample rate instead of resampling.
    wave, sr = librosa.load(filename, mono=True, sr=None)

    # int() guards against a non-integer sample rate; slice bounds and
    # range() both need integers.
    samples_per_clip = int(clip_length_seconds * sr)

    # Floor division: only whole clips are used, a trailing partial clip is
    # dropped. (True division `/` yields a float, which range() rejects with
    # a TypeError on Python 3.)
    clip_count = wave.shape[0] // samples_per_clip

    # Keeping every Nth sample divides the sample rate by N, so the MFCC call
    # must be given the rate of the signal it actually receives.
    # NOTE(review): plain slicing applies no anti-aliasing filter;
    # librosa.resample would be the cleaner way to downsample.
    downsampled_sr = sr / downsampling_rate

    for j in range(clip_count):
        start = j * samples_per_clip
        end = start + samples_per_clip

        wave_clip = wave[start:end]
        wave_downsampled = wave_clip[::downsampling_rate]

        # Keyword arguments: recent librosa releases reject positional audio
        # arguments, while keywords are accepted by older releases as well.
        mfcc = librosa.feature.mfcc(y=wave_downsampled, sr=downsampled_sr)

        X_vectors.append(mfcc)
        # Every clip inherits the label of its source recording (files are
        # numbered from 1, labels are indexed from 0).
        y_vectors.append(y[i - 1])

        # Optional debugging plot of the MFCC matrix (disabled).
        #plot.subplot(312)
        #mfcc -= (numpy.mean(mfcc,axis=0) + 1e-8)
        #plot.imshow(mfcc.T, cmap=plot.cm.jet, aspect='auto')
        # plot.xticks(numpy.arange(0, (mfcc.T).shape[1], int((mfcc.T).shape[1] / 6)), ['0s', '0.5s', '1s', '1.5s','2.5s','3s','3.5'])
        #ax = plot.gca()
        #ax.invert_yaxis()
        #plot.title('the Normalized MFCC spectrum image for video ' + str(i))
        #plot.show()

# NOTE(review): stacking only gives a regular 3-D array if every clip's MFCC
# matrix has the same shape -- presumably this requires all recordings to
# share one sample rate; confirm against the data.
X = numpy.array(X_vectors)
# From here on `y` holds one label per clip rather than one per recording.
y = numpy.array(y_vectors)

# Print the per-clip labels as a quick visual check of the dataset.
print(y)

# Internal invariant: exactly one label per feature matrix.
assert X.shape[0] == y.shape[0], 'X and y must hold the same number of clips'

# Shuffle and hold out a (1 - split_ratio) share of the clips.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=(1 - split_ratio), random_state=random_state, shuffle=True)

# Append a trailing channel axis of size 1, as expected by Conv2D:
# (clips, rows, cols) -> (clips, rows, cols, 1).
X_train = X_train.reshape(X_train.shape + (1,))
X_test = X_test.reshape(X_test.shape + (1,))

# Pin the one-hot width. Without num_classes, to_categorical sizes its output
# from the largest label present, so a small held-out set that happens to
# lack class 2 would get fewer than three columns and no longer match the
# three-unit softmax output of the model.
num_classes = 3
y_train_hot = to_categorical(y_train, num_classes=num_classes)
y_test_hot = to_categorical(y_test, num_classes=num_classes)

# Small CNN classifier over the MFCC "images": one convolution + pooling
# stage, a 128-unit hidden layer, and a 3-way softmax output.
model = Sequential([
    Conv2D(
        64,
        kernel_size=(5, 5),
        activation='relu',
        # Per-sample shape: everything after the leading sample axis.
        input_shape=X_train.shape[1:],
    ),
    MaxPooling2D(pool_size=(5, 5)),
    #Dropout(rate = 1 - 0.25),
    Flatten(),
    Dense(128, activation='relu'),
    #Dropout(rate = 1 - 0.25),
    Dense(3, activation='softmax'),
])

model.compile(
    loss=keras.losses.categorical_crossentropy,
    optimizer=keras.optimizers.Adam(),
    metrics=['accuracy'],
)

# The held-out clips are passed as validation data, so they are evaluated
# after every epoch.
model.fit(
    X_train,
    y_train_hot,
    batch_size=2,
    epochs=1000,
    verbose=1,
    validation_data=(X_test, y_test_hot),
)