**Import Libraries and Define constants**

In [1]:
#### Training based on features of audio
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier
from sknn.mlp import Classifier, Layer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
import pickle
import numpy as np
import random
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier

# Constants

# Integer class id for every full emotion name.
EMOTION_ANNOTATORS = dict(
    anger=0, happiness=1, sadness=2, neutral=3, frustration=4,
    excited=5, fear=6, surprise=7, disgust=8, other=9,
)

# Integer class id for every three-letter emotion code. Ids 0-9 follow the
# same order as EMOTION_ANNOTATORS; 'xxx' takes the additional id 10.
EMOTION = dict(
    ang=0, hap=1, sad=2, neu=3, fru=4, exc=5,
    fea=6, sur=7, dis=8, oth=9, xxx=10,
)

# Reduced three-class mapping (disabled):
#EMOTION = {'ang': 0, 'hap' : 1, 'sad' : 2}






**Load data**

In [2]:

## Loading data from files
# NOTE(review): pickle.load can execute arbitrary code, so these files must
# come from a trusted preprocessing step.
# The names `input` / `output` are kept because later cells read them, even
# though `input` shadows the Python builtin of the same name.
# `with` closes each file handle as soon as its object has been loaded.
with open('processed-data/input.obj', 'rb') as filehandlerInput:
    input = pickle.load(filehandlerInput)
with open('processed-data/output.obj', 'rb') as filehandlerOutput:
    output = pickle.load(filehandlerOutput)
print("Size input, output:", len(input),", ", len(output))

Size input, output: 2706 ,  2706


**Analyze data**

In [3]:
# NOTE(review): this list is not used below and appears to name only part of
# the feature columns -- TODO confirm against the feature extraction step.
feature_name= ['energy', 
               'f0', 'intensity', 'f1', 'f2', 'f3','f1-bw','f2-bw','f3-bw' ,
               'f2-f1', 'f3-f1', 
               'jitter', 'shimmer', 'duration',
              'unvoiced_percent', 'breaks_degree', 'max_dur_pause', 'average_dur_pause']

num_feas = len(input[0])

# Compute the NaN mask once and reuse it for the per-feature counts and the
# row filter below.
nan_mask = np.isnan(input)

# NaN count of every feature column, as plain Python ints.
numNan = nan_mask.sum(axis=0).tolist()
print("Number of Nan values in each features in all sample:", numNan)

# index_fea_contain_Nan = [i for i in range(0, len(numNan)) if numNan[i] != 0]
# print("Index of features containing Nan values: ", index_fea_contain_Nan)

# fea_contain_Nan = [feature_name[index] for index in index_fea_contain_Nan]
# print("Name of features containing Nan values: ", fea_contain_Nan)

# Filter samples containing Nan values: keep a row only when none of its
# features is NaN, and apply the same row selection to the labels.
rows_without_nan = ~nan_mask.any(axis=1)
input_filtered = input[rows_without_nan]
output_filtered = output[rows_without_nan]
print("Size filteres input, output: ", len(input_filtered), ", ", len(output_filtered))

# Normalize input: min-max scale each feature column to [0, 1].
# A constant column has max == min; dividing by that zero range would fill the
# column with NaN, so such columns are divided by 1 and end up as all zeros.
# NOTE(review): the min/max are taken over all samples before the train/test
# split, so the test rows influence the scaling -- confirm this is acceptable.
feature_min = input_filtered.min(axis=0)
feature_range = input_filtered.max(axis=0) - feature_min
safe_range = np.where(feature_range == 0, 1, feature_range)
input_filtered = (input_filtered - feature_min) / safe_range

print(EMOTION)


Number of Nan values in each features in all sample: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
Size filteres input, output:  2705 ,  2705
{'ang': 0, 'hap': 1, 'sad': 2, 'neu': 3, 'fru': 4, 'exc': 5, 'fea': 6, 'sur': 7, 'dis': 8, 'oth': 9, 'xxx': 10}


**Cluster emotion**

In [4]:
# Fold the 'exc' label into 'hap' so both are treated as one positive class.
is_excited = np.equal(output_filtered, 5)   # 5 is EMOTION['exc']
output_filtered[is_excited] = 1             # 1 is EMOTION['hap']

# Disabled alternative: fold 'sad' (2) into 'ang' (0) as a negative class,
# then relabel 4 as 2.
#output_filtered[output_filtered == 2] = 0
#output_filtered[output_filtered == 4] = 2


**Preprocess data**

In [5]:
def printQuantitySample(output):
    """Print the label mapping and the number of samples per occurring label.

    Args:
        output: 1-D array of non-negative integer labels (passed to
            ``np.bincount``).
    """
    counts = np.bincount(output)
    # Only labels that actually occur are reported.
    present = np.flatnonzero(counts)
    label_counts = [(label, counts[label]) for label in present]
    print("EMOTION_ANNOTATE: ", EMOTION)
    print("\nThe quantity of each label: ", label_counts, "\n")
    
def filterLabels(input, output, labels=('ang','hap','sad')):
    """Keep only the samples whose label is one of ``labels``.

    Args:
        input: array-like of samples, one row per sample.
        output: array-like of integer labels, aligned with ``input``.
        labels: iterable of emotion codes (keys of ``EMOTION``) to keep.
            The default is an immutable tuple rather than a shared list.

    Returns:
        Tuple ``(input, output)`` of numpy arrays holding the kept rows.

    Raises:
        KeyError: if a code in ``labels`` is not a key of ``EMOTION``.
    """
    # Accept plain Python sequences as well as numpy arrays.
    input = np.asarray(input)
    output = np.asarray(output)
    labels_int = [EMOTION[l] for l in labels]
    # Vectorised membership test instead of a Python-level loop per sample.
    condition = np.isin(output, labels_int)
    return input[condition], output[condition]
    
    
# Remove labels that have small quantity.
input_filtered, output_filtered = filterLabels(input_filtered, output_filtered, [ 'sad', 'hap' , 'ang', 'neu'])
printQuantitySample(output_filtered)

# Shuffle samples and labels together with a single fixed-seed permutation,
# so a fresh run reproduces the same ordering. Indexing both arrays with the
# same permutation keeps every sample aligned with its label, and it also
# works when the selection is empty.
shuffle_order = np.random.RandomState(42).permutation(len(input_filtered))
input_filtered = input_filtered[shuffle_order]
output_filtered = output_filtered[shuffle_order]







EMOTION_ANNOTATE:  {'ang': 0, 'hap': 1, 'sad': 2, 'neu': 3, 'fru': 4, 'exc': 5, 'fea': 6, 'sur': 7, 'dis': 8, 'oth': 9, 'xxx': 10}

The quantity of each label:  [(0, 471), (1, 489), (2, 491), (3, 604)] 



**Split data**

In [6]:
# Hold out 40% of the samples for testing; the fixed random_state makes the
# split repeatable for a given input ordering.
split_parts = train_test_split(
    input_filtered,
    output_filtered,
    test_size=0.4,
    random_state=300,
)
X_train, X_test, y_train, y_test = split_parts
print("Size training, testing set: ", len(X_train), ", ", len(X_test))
print(y_train)


Size training, testing set:  1438 ,  617
[1 3 0 ... 2 2 0]


**Search best parameters for RandomForest model**

In [7]:
# Candidate hyper-parameter values for a randomized search over a random forest.

# Number of trees: 4 evenly spaced values between 200 and 1000, truncated to int.
n_estimators = np.linspace(200, 1000, num=4).astype(int).tolist()

# Number of features considered at every split.
max_features = ['auto', 'sqrt']

# Maximum tree depth: 5 evenly spaced values between 10 and 30, plus None.
max_depth = np.linspace(10, 30, num=5).astype(int).tolist() + [None]

# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 4]

# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]

# Assemble the search space, keyed by estimator parameter name.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
print(random_grid)

# Parameter tuning (disabled).
# NOTE(review): the snippet below builds a RandomForestRegressor although the
# targets are class labels -- confirm before re-enabling.
# rf = RandomForestRegressor()
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# rf_random.fit(X_train, y_train)
# print(rf_random.best_params_)
    


{'n_estimators': [200, 466, 733, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 15, 20, 25, 30, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


**Experiment with other models**

In [8]:
# from sklearn import svm
# from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import cross_val_score
# clf =  MLPClassifier(solver='lbfgs', alpha=1e-7,
#                  hidden_layer_sizes=(300), random_state=200)

# log_loss = cross_val_score(clf, X_train, y_train, scoring= 'neg_log_loss', cv = 5) 
# accs = cross_val_score(clf, X_train, y_train, scoring= 'accuracy', cv = 5) 
# print("Log-loss: ", log_loss)
# print("Accuracy: ",  accs)



**Training**

In [17]:
def _resample(sampler, X, y):
    """Oversample ``(X, y)`` with ``sampler`` using whichever method it exposes."""
    # Newer imbalanced-learn releases provide fit_resample; older ones only
    # have fit_sample, which is the call this notebook originally used.
    if hasattr(sampler, "fit_resample"):
        return sampler.fit_resample(X, y)
    return sampler.fit_sample(X, y)


def _new_forest(n_features, random_state):
    """Build the random forest used for cross-validation and the final fit."""
    return RandomForestClassifier(
        n_estimators=300,
        # 88 was hard-coded; capping it at the actual column count keeps the
        # setting valid for narrower feature matrices.
        max_features=min(88, n_features),
        max_depth=5,
        class_weight="balanced",
        random_state=random_state,
    )


def training(X_train, y_train, n_splits=5, random_state=42):
    """Cross-validate a random forest, then fit a final one on all the data.

    Each fold oversamples its training part with SMOTE, fits a fresh forest
    and prints the accuracy on the oversampled training part and on the
    untouched validation part. Afterwards the mean and standard deviation of
    both accuracies are printed, the whole training set is oversampled, and a
    new forest is fitted on it.

    Args:
        X_train: 2-D array-like of training samples.
        y_train: 1-D array-like of integer labels aligned with ``X_train``.
        n_splits: number of cross-validation folds.
        random_state: seed for the fold shuffling and the forests. Pass
            ``None`` to get the previous unseeded behaviour.

    Returns:
        The RandomForestClassifier fitted on the oversampled training set.
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    n_features = X_train.shape[1]

    sm = SMOTE(random_state=42)
    kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)
    accuracy_train_results = []
    accuracy_valid_results = []

    for train_index, valid_index in kf.split(X_train):
        x_train_sub, x_valid_sub = X_train[train_index], X_train[valid_index]
        y_train_sub, y_valid_sub = y_train[train_index], y_train[valid_index]

        fold_clf = _new_forest(n_features, random_state)

        # Upsample only the training part; the validation part stays as-is so
        # its score reflects the real class distribution.
        x_train_sub, y_train_sub = _resample(sm, x_train_sub, y_train_sub)
        fold_clf.fit(x_train_sub, y_train_sub)

        score = fold_clf.score(x_train_sub, y_train_sub)
        score1 = fold_clf.score(x_valid_sub, y_valid_sub)
        accuracy_train_results.append(score)
        accuracy_valid_results.append(score1)

        print("Score of training set: ", score)
        print("Score of validation set: ", score1)

    avg_accuracy_train_result = np.mean(accuracy_train_results)
    avg_accuracy_valid_result = np.mean(accuracy_valid_results)
    print("Average accuracy training set, std:", avg_accuracy_train_result, " ",\
          np.std(accuracy_train_results))
    print("Average accuracy validation set, std:", avg_accuracy_valid_result," ", \
          np.std(accuracy_valid_results))

    # Final model: a fresh forest on the whole upsampled training set, rather
    # than re-fitting the object left over from the last fold.
    X_train, y_train = _resample(sm, X_train, y_train)
    printQuantitySample(y_train)
    clf = _new_forest(n_features, random_state)
    clf.fit(X_train, y_train)

    return clf
   
# Training
clf = training(X_train, y_train)

# Save model into file; `with` closes the handle so the pickle is fully
# flushed to disk before the cell finishes.
filename = 'model/model1.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)
print("Saved model into file")

Score of training set:  0.809190031152648
Score of validation set:  0.6423611111111112
Score of training set:  0.7879464285714286
Score of validation set:  0.6458333333333334
Score of training set:  0.8174486803519062
Score of validation set:  0.6701388888888888
Score of training set:  0.8196969696969697
Score of validation set:  0.6202090592334495
Score of training set:  0.81015625
Score of validation set:  0.6027874564459931
Average accuracy training set, std: 0.8088876719545904   0.011228251584615311
Average accuracy validation set, std: 0.6362659698025552   0.023042113937399068
EMOTION_ANNOTATE:  {'ang': 0, 'hap': 1, 'sad': 2, 'neu': 3, 'fru': 4, 'exc': 5, 'fea': 6, 'sur': 7, 'dis': 8, 'oth': 9, 'xxx': 10}

The quantity of each label:  [(0, 412), (1, 412), (2, 412), (3, 412)] 

Saved model into file


**Test**

In [18]:
def test(clf, X_test, y_test):
    predicts = clf.predict(X_test)
   # pro = clf.predict_proba(X_test)

    score_test = clf.score(X_test, y_test)
    print("\nScore for test set: ", score_test)
    
    matrix = confusion_matrix(y_test, predicts)
    print ("\nConfusion matrix:..................... \n",matrix)
    
    sum_colum = np.sum(matrix, axis = 0)
    sum_row = np.sum(matrix, axis = 1)

    TP = [matrix[i,i] for i in range(0, len(matrix))]
    print("\nTP: ", TP,"\n")   
    FP = [sum_colum[i] - matrix[i,i] for i in range(0, len(matrix))]
    print("FP: ", FP,"\n")
    FN = [sum_row[i] - matrix[i,i] for i in range(0, len(matrix))]
    print("FN: ", FN,"\n")
    Presision = [TP[i] /(TP[i] + FP[i])  for i in range(0, len(matrix))]
    Recall = [TP[i] /(TP[i] + FN[i])  for i in range(0, len(matrix))]
    F1_score = [2 * Presision[i] * Recall[i] /(Presision[i] + Recall[i])  for i in range(0, len(matrix))]
    
    print("\nPrecision: ", Presision,"\n")
    print("Recall: ", Recall,"\n")
    print("F1_scrore: ", F1_score, "\n")

# Evaluate the trained classifier on the held-out test split.
test(clf, X_test, y_test)




Score for test set:  0.6288492706645057

Confusion matrix:..................... 
 [[103  27   2   9]
 [ 41  55  13  37]
 [  3   3 114  18]
 [  7  21  48 116]]

TP:  [103, 55, 114, 116] 

FP:  [51, 51, 63, 64] 

FN:  [38, 91, 24, 76] 


Precision:  [0.6688311688311688, 0.5188679245283019, 0.6440677966101694, 0.6444444444444445] 

Recall:  [0.7304964539007093, 0.3767123287671233, 0.8260869565217391, 0.6041666666666666] 

F1_scrore:  [0.6983050847457627, 0.43650793650793657, 0.7238095238095238, 0.6236559139784946] 



**Test probability**

In [16]:
from sklearn.metrics import log_loss

predicts = clf.predict(X_test)
pro = clf.predict_proba(X_test)

# A prediction counts as "consistent" when some class receives more than
# this probability.
theshold_consitent = 0.5
# Vectorised row test: True where any class probability exceeds the threshold.
consitent_predictions = pro[(pro > theshold_consitent).any(axis=1)]
print("Ratio of cositent prediction: ", len(consitent_predictions) / len (pro))
print(pro[0:4])
print(y_test[0:4])
print(predicts[0:4])

# Pass the classifier's own class order so the probability columns stay
# aligned with the labels even when y_test does not contain every class.
log_loss_value = log_loss(y_test, pro, labels=clf.classes_)
print("Log loss value:", log_loss_value)
# exp(-log loss) is the geometric mean of the probability given to the true
# class, which is not the same thing as classification accuracy.
print("Prediciton accuracy based on log loss: ", np.exp(-log_loss_value))




Ratio of cositent prediction:  0.48946515397082657
[[0.06417086 0.17810933 0.41643606 0.34128375]
 [0.00497508 0.03919117 0.84216358 0.11367017]
 [0.78202933 0.17239961 0.00932225 0.0362488 ]
 [0.14493636 0.29480124 0.10558955 0.45467285]]
[3 2 0 2]
[2 2 0 3]
Log loss value: 0.883168191288062
Prediciton accuracy based on log loss:  0.4134708795561825


**Training on keras**

In [12]:
# `keras.utils.to_categorical` is the import path that also works on newer
# Keras releases (the original used keras.utils.np_utils).
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Activation

# Derive the network's input and output widths from the data instead of
# hard-coding them. Labels are integer class ids, so the one-hot width is the
# largest id + 1 over both splits.
num_features = X_train.shape[1]
num_classes = int(max(np.max(y_train), np.max(y_test))) + 1

# Use the same num_classes for both splits so the one-hot matrices always
# have matching widths.
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)

print(y_train_cat)
model = Sequential([
    Dense(128, input_shape=(num_features,)),
    Activation('relu'),
    Dense(64),
    Activation('relu'),
    Dense(48),
    Activation('relu'),
    Dense(24),
    Activation('relu'),
    # One output unit per class; the original fixed this at 2, which does not
    # match the one-hot width when more than two classes are present.
    Dense(num_classes),
    Activation('softmax'),
])
# 'categorical_crossentropy' is the valid Keras loss name for one-hot targets;
# the original 'category_crossentropy' is not a known loss.
model.compile(optimizer='Adagrad',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
# NOTE(review): the test split doubles as validation data here, so the
# reported validation accuracy is not an independent estimate.
model.fit(X_train, y_train_cat, validation_data = (X_test, y_test_cat), epochs = 100)





  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


ImportError: Traceback (most recent call last):
  File "/home/vuthede/.local/lib64/python3.6/site-packages/tensorflow/python/pywrap_tensorflow.py", line 58, in <module>
    from tensorflow.python.pywrap_tensorflow_internal import *
  File "/home/vuthede/.local/lib64/python3.6/site-packages/tensorflow/python/pywrap_tensorflow_internal.py", line 28, in <module>
    _pywrap_tensorflow_internal = swig_import_helper()
  File "/home/vuthede/.local/lib64/python3.6/site-packages/tensorflow/python/pywrap_tensorflow_internal.py", line 24, in swig_import_helper
    _mod = imp.load_module('_pywrap_tensorflow_internal', fp, pathname, description)
  File "/usr/lib64/python3.6/imp.py", line 243, in load_module
    return load_dynamic(name, filename, file)
  File "/usr/lib64/python3.6/imp.py", line 343, in load_dynamic
    return _load(spec)
ImportError: libcusolver.so.9.1: cannot open shared object file: No such file or directory


Failed to load the native TensorFlow runtime.

See https://www.tensorflow.org/install/install_sources#common_installation_problems

for some common reasons and solutions.  Include the entire stack trace
above this error message when asking for help.

In [None]:
# Sequential.predict_classes is no longer available in current Keras; taking
# the argmax over the softmax output yields the same class ids and works on
# old and new versions alike.
predicts = np.argmax(model.predict(X_test), axis=-1)
#print(predicts)
#print(y_test)
# Fraction of test samples whose predicted class equals the true label.
acc = predicts == y_test
print(np.count_nonzero(acc) / len(predicts))