We will build a deep neural network with Keras that predicts a speaker's age group from features extracted from audio recordings.

In [1]:
# import some modules we may need
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# try out PCA as well as some feature selection. or try both!
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Activation

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the cleaned training set from its CSV file into a pandas DataFrame.
df_train = pd.read_csv(filepath_or_buffer='audio_training_data_cleaned.csv')

In [3]:
# Safety net: discard every row that still has a missing value in any column.
df_train = df_train.dropna(axis='index', how='any')

In [4]:
# Integer code for each age-group label (1 = youngest group). Feature
# selection below needs numeric targets rather than strings.
replaced = {
    label: code
    for code, label in enumerate(
        ['teens','twenties','thirties','fourties','fifties','sixties',
         'seventies','eighties'],
        start=1,
    )
}

# Target: the 'age' column, kept both as raw labels and as integer codes.
y_train = df_train['age']
y_train_encoded = y_train.replace(replaced)

# Features: every column that remains once the non-feature columns are dropped,
# as a plain NumPy array.
non_feature_columns = ['filename','age','Unnamed: 0']
X_train = df_train.drop(columns=non_feature_columns).values

In [5]:
# use LASSO technique to do some feature selection because there are 170 features
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

# Fit an L1-penalised linear SVM on the full feature matrix; SelectFromModel
# then uses the fitted coefficients to decide which feature columns to keep.
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_train, y_train_encoded)
model = SelectFromModel(lsvc, prefit=True)

# Store the reduced matrix under its own name. The cells below read `X_new`,
# which was previously never defined (NameError on a fresh kernel), and leaving
# `X_train` untouched keeps this cell safe to re-run.
X_new = model.transform(X_train)
print(X_new.shape) # our new shape has decreased to 81 features

(73765, 81)


In [6]:
# try normal 3-layer perceptron
from keras.utils import to_categorical

# Age groups are coded 1..len(replaced). to_categorical also creates a column
# for class 0, which is never used, so it is sliced off. num_classes is pinned
# so the one-hot width always matches the output layer instead of depending on
# the largest label that happens to be present in the data.
n_classes = len(replaced)
y_train_keras_encoded = to_categorical(y_train_encoded, num_classes=n_classes + 1)[:,1:]

# Two hidden ReLU layers followed by a softmax over the age groups.
DL_model = Sequential()

DL_model.add(Dense(200, activation='relu', input_shape = (X_new.shape[1],)))

DL_model.add(Dense(250, activation='relu'))

DL_model.add(Dense(n_classes, activation='softmax'))

DL_model.compile(optimizer='adam',loss='categorical_crossentropy',
             metrics=['accuracy'])
DL_model.fit(X_new,y_train_keras_encoded,epochs=50,batch_size=1000)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1a2032dd90>

In [7]:
# Evaluation needs the held-out test set: read it from CSV and, as with the
# training data, discard any row that still contains a missing value.
testing_df = (
    pd.read_csv('audio_testing_data_cleaned.csv')
    .dropna(how='any', axis=0)
)

Unnamed: 0.1,Unnamed: 0,filename,mean zero crossing rate,std zero crossing rate,max zero crossing rate,min zero crossing rate,median zero crossing rate,mean energy,std energy,max energy,...,std chroma12,max chroma12,min chroma12,median chroma12,mean chroma deviation,std chroma deviation,max chroma deviation,min chroma deviation,median chroma deviation,age
0,0,sample-000001.mp3,0.065252,0.063561,0.19925,0.005002,0.031055,0.020665,0.039363,0.167026,...,0.004051,0.022685,7.637082e-06,0.001875,0.024829,0.012904,0.05983,0.002064,0.024758,twenties
1,1,sample-000003.mp3,0.067284,0.085895,0.361817,0.0,0.036265,0.007341,0.013896,0.093219,...,0.016081,0.113979,0.0,0.002619,0.024565,0.014182,0.067191,0.000372,0.023678,twenties
2,2,sample-000005.mp3,0.056188,0.072168,0.309296,0.005836,0.02501,0.011656,0.020846,0.119734,...,0.013246,0.093047,2.442842e-05,0.002809,0.026102,0.014403,0.065812,0.000456,0.026022,twenties
3,3,sample-000006.mp3,0.107507,0.064129,0.198416,0.006253,0.129429,0.020982,0.0609,0.339483,...,0.004951,0.037732,2.343016e-05,0.00217,0.020776,0.01727,0.071816,0.000648,0.015771,twenties
4,4,sample-000007.mp3,0.081098,0.064335,0.230096,0.0,0.052939,0.008568,0.015405,0.081486,...,0.007254,0.061649,5.368893e-37,0.002164,0.029061,0.017429,0.074686,0.001601,0.025904,thirties
5,5,sample-000008.mp3,0.081751,0.076781,0.275948,0.0,0.047937,0.021823,0.027527,0.172971,...,0.011994,0.088578,2.855427e-38,0.002984,0.017411,0.015406,0.075583,0.00047,0.01407,thirties
6,6,sample-000009.mp3,0.071803,0.057462,0.293456,0.00917,0.049604,0.01147,0.015898,0.069122,...,0.015954,0.121773,0.0001365552,0.004455,0.020392,0.015884,0.080845,0.000808,0.015823,fifties
7,7,sample-000011.mp3,0.069486,0.060474,0.270946,0.005836,0.044602,0.005833,0.009417,0.048313,...,0.003538,0.032242,2.78363e-05,0.002077,0.023875,0.016795,0.081621,0.00102,0.021428,twenties
8,8,sample-000014.mp3,0.067858,0.063011,0.306794,0.0,0.052105,0.009783,0.018284,0.083497,...,0.011845,0.094221,5.625067e-35,0.00112,0.035463,0.022117,0.098675,0.001122,0.035639,twenties
9,9,sample-000016.mp3,0.091057,0.070631,0.244268,0.0,0.074406,0.007853,0.016308,0.083802,...,0.006594,0.039425,0.0,0.003439,0.015862,0.011808,0.061004,0.000791,0.013966,twenties


In [15]:
# Build the test features with the same column drops used for training, then
# keep only the columns chosen by the fitted selector `model`.
# NOTE(review): the preview of the test frame lists an 'Unnamed: 0.1' column
# that is not dropped here — confirm the test columns match the training columns.
X_test = testing_df.drop(columns=['filename','age','Unnamed: 0']).values
X_test = model.transform(X_test)

# Encode the test labels with the same mapping as the training labels.
y_test = testing_df['age']
y_test_encoded = y_test.replace(replaced)
# Pin num_classes so the one-hot width is always len(replaced) after the unused
# class-0 column is sliced off, even if the test split lacks the highest age group.
y_test_keras_encoded = to_categorical(y_test_encoded, num_classes=len(replaced) + 1)[:,1:]

In [20]:
# Evaluate the trained network on the test set; `score` holds the loss first
# and the accuracy metric second.
score = DL_model.evaluate(X_test, y_test_keras_encoded, batch_size=32)
for template, value in zip(("Loss: {}", "Accuracy: {}"), score):
    print(template.format(value))



[0.8655181968567746, 0.7016861219195849]