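We will build a deep neural network with Keras that predicts the age bracket (`teens` through `eighties`) from the features in the cleaned audio datasets.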

In [1]:
# import some modules we may need
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# try out PCA as well as some feature selection. or try both!
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, Activation

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the cleaned training data from its CSV file into a DataFrame.
training_csv_path = 'audio_training_data_cleaned.csv'
df_train = pd.read_csv(training_csv_path)

In [3]:
# Safety net: discard every row that still contains at least one missing value.
# This rebinds df_train, so the unfiltered frame is no longer reachable afterwards.
df_train = df_train.dropna(axis='index', how='any')

In [4]:
# Separate the feature matrix from the target column. Everything except the
# listed columns is kept as a feature and converted to a plain NumPy array.
# ('Unnamed: 0' is presumably a leftover index column from the CSV export - confirm.)
non_feature_columns = ['filename','age','Unnamed: 0']
X_train = df_train.drop(columns=non_feature_columns).values
y_train = df_train['age']

# Feature selection needs numeric targets, so every age-bracket label is given
# an integer code starting at 1, in ascending age order. The label spellings
# (including 'fourties') are lookup keys and must match the data exactly.
age_labels = ['teens','twenties','thirties','fourties','fifties','sixties',
              'seventies','eighties']
replaced = {label: code for code, label in enumerate(age_labels, 1)}
y_train_encoded = y_train.replace(replaced)

In [5]:
%%time
# use LASSO technique to do some feature selection because there are 170 features
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_train, y_train_encoded)
model = SelectFromModel(lsvc, prefit=True)
X_train = model.transform(X_train)

CPU times: user 1min 42s, sys: 1.44 s, total: 1min 43s
Wall time: 1min 45s


In [6]:
%%time
# try normal 3-layer perceptron
from keras.utils import to_categorical
from keras.layers import Dropout
y_train_keras_encoded = to_categorical(y_train_encoded)[:,1:] # adds unneeded extra column
DL_model = Sequential()
DL_model.add(Dropout(0.15))
DL_model.add(Dense(256, activation='relu', input_shape = (X_train.shape[1],)))

DL_model.add(Dense(1024, activation='relu'))

DL_model.add(Dense(1024, activation='relu'))

DL_model.add(Dense(256, activation='relu'))

DL_model.add(Dense(8, activation='softmax'))

DL_model.compile(optimizer='adam',loss='categorical_crossentropy',
             metrics=['accuracy'])
DL_model.fit(X_train,y_train_keras_encoded,epochs=50,batch_size=100)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
CPU times: user 1h 1min 39s, sys: 7min 29s, total: 1h 9min 8s
Wall time: 23min 28s


In [7]:
# Evaluation needs held-out data: load both cleaned test sets and, as with the
# training data, discard any row that still contains a missing value.

# primary testing set
testing_df = pd.read_csv('audio_testing_data_cleaned.csv').dropna(axis='index', how='any')

# second ("other") testing set
other_testing_df = pd.read_csv('audio_other_testing_data_cleaned.csv').dropna(axis='index', how='any')

In [8]:
def _prepare_eval_set(frame):
    """Turn a raw test DataFrame into network-ready features and labels.

    The frame gets the same treatment as the training data: the non-feature
    columns are dropped, the fitted selector `model` reduces the features,
    and the 'age' labels are integer-coded with `replaced` and one-hot encoded.

    Returns a tuple (features, labels, labels_encoded, labels_onehot).
    """
    features = frame.drop(columns=['filename','age','Unnamed: 0']).values
    features = model.transform(features)
    labels = frame['age']
    labels_encoded = labels.replace(replaced)
    # Codes start at 1, so column 0 of the one-hot matrix is unused and sliced
    # off. num_classes is fixed explicitly: if it were inferred from the data,
    # a test set lacking the highest age bracket would produce fewer columns
    # than the network has outputs and evaluation would fail on the shapes.
    labels_onehot = to_categorical(labels_encoded,
                                   num_classes=len(replaced) + 1)[:, 1:]
    return features, labels, labels_encoded, labels_onehot


# primary testing set
X_test, y_test, y_test_encoded, y_test_keras_encoded = _prepare_eval_set(testing_df)

# second ("other") testing set
(X_other_test, y_other_test,
 y_other_test_encoded, y_other_test_keras_encoded) = _prepare_eval_set(other_testing_df)

In [9]:
%%time
score1 = DL_model.evaluate(X_test,y_test_keras_encoded,batch_size=32)
print("Results on primary testing set:")
print("Loss: {}".format(score1[0]))
print("Accuracy: {}".format(score1[1]))

Results on primary testing set:
Loss: 0.792143861994
Accuracy: 0.708819714656
CPU times: user 496 ms, sys: 39.6 ms, total: 536 ms
Wall time: 244 ms
