# Load libraries and models

In [2]:
# Directory holding the interview recordings; later cells append "/<name>.wav" to it.
AUDIO_FILES_PATH = "auditary_emotion_recognition/data_interview/Audio/Audio"
# Path to the interview label CSV. NOTE(review): not read by any cell visible here;
# presumably holds the per-interview annotator ("turker") scores - confirm.
AUDIO_LABEL_PATH = "auditary_emotion_recognition/data_interview/Labels/turker_scores_full_interview.csv"
import pickle
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras import backend as K
import tensorflow as tf
from keras.utils.np_utils import to_categorical
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
import sys
sys.path.insert(0, "../")
from acousticFeatures import getAllFeatures
import parselmouth 
import numpy as np
from pydub import AudioSegment
from IPython.display import Audio
import matplotlib.pyplot as plt

def _load_pickle(path):
    """Unpickle and return the object stored at ``path``, closing the file afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code; only use it on trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Element-wise feature bounds/statistics; the min/max vectors are compared
# against extracted feature vectors in the cells below.
# NOTE(review): presumably computed over the training features - confirm.
max_acoustic_features = _load_pickle('processed-data/max_acoustic_features.obj')
min_acoustic_features = _load_pickle('processed-data/min_acoustic_features.obj')
mean_acoustic_features = _load_pickle('processed-data/mean_acoustic_features.obj')
std_acoustic_features = _load_pickle('processed-data/std_acoustic_features.obj')

#Load model
# A throw-away MLPClassifier(hidden_layer_sizes=(100,), learning_rate_init=0.0001)
# used to be constructed here and was overwritten on the very next line, so only
# the unpickled model is kept.
clf = _load_pickle("model/model_classification.sav")
model_regression_arouse = _load_pickle("model/model_regression_arouse.sav")

Using TensorFlow backend.


# Check the acoustic feature values in the interview data set

In [4]:
# Cut a segment of audio for analyzing (milliseconds 6000-8000 of recording P11)
sound1 = AudioSegment.from_file(AUDIO_FILES_PATH + "/" + "P11.wav", frame_rate= 44100)
sound2 = sound1[6000:8000]
left, right = sound2.split_to_mono()
# Hand parselmouth the clip's own sample rate instead of relying on the library
# default, so the Sound's time axis matches the audio (same call shape as the
# "Test acoustic features" cell).
sound = parselmouth.Sound(left.get_array_of_samples(), sound2.frame_rate)

# Compare every extracted feature with the stored [min, max] bounds.
acoustic_features_interview = np.array(getAllFeatures(sound))
for e in zip(min_acoustic_features, max_acoustic_features, acoustic_features_interview):
    lower, upper, value = e
    # True when the feature lies inside the stored range (inclusive on both ends).
    belong = bool(lower <= value and value <= upper)
    #print (e,"          .Belong:", belong)

# Check the acoustic feature values in EMO (france) data set

In [5]:
# Load the pickled EMO feature vectors and their labels; the `with` blocks close
# both files as soon as they have been read.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('processed-data/input-EMO.obj', 'rb') as file_handler_input:
    input_EMO = np.array(pickle.load(file_handler_input))
with open('processed-data/output-EMO.obj', 'rb') as file_handler_output:
    output_EMO = np.array(pickle.load(file_handler_output))
# Report both sizes (the second value used to repeat len(input_EMO)).
print("Size input, output: {}, {}".format(len(input_EMO), len(output_EMO)))
LABELS = ['F', 'T']
EMOTION = {'L': 'bordom', 'F': 'happiness', 'T': 'sadness', 'N': 'neutral'}
# Keep only the samples whose label is one of LABELS.
in_out_EMO = [pair for pair in zip(input_EMO, output_EMO) if pair[1] in LABELS]
input_EMO_filtered, output_EMO_filtered = zip(*in_out_EMO)
print("Size after filtering input: {}, output: {}".format(len(input_EMO_filtered), len(output_EMO_filtered)))

input_EMO_filtered = np.array(input_EMO_filtered)
print(input_EMO_filtered.shape)
# Compare one filtered sample (index 3) with the stored [min, max] bounds.
for e in zip(min_acoustic_features, max_acoustic_features, input_EMO_filtered[3]):
    lower, upper, value = e
    # True when the feature lies inside the stored range (inclusive on both ends).
    belong = bool(lower <= value and value <= upper)
    #print (e,"          .Belong:", belong)



Size input, output: 535, 535
Size after filtering input: 133, output: 133
(133, 88)


# Test acoustic features

In [52]:
import pydub 
import numpy as np
import sys
import parselmouth 

def getStatistic(numpy_arr):
    """Summarise the non-zero entries of ``numpy_arr``.

    Entries equal to zero are dropped before anything is computed.

    Returns:
        A NumPy array with, in order: maximum, minimum, range (max - min),
        mean, median, 25th percentile, 75th percentile and standard deviation
        (``np.std`` with its default arguments).
    """
    values = np.array(numpy_arr)
    nonzero = values[values != 0]

    highest = np.max(nonzero)
    lowest = np.min(nonzero)
    spread = highest - lowest
    average = np.mean(nonzero)
    middle = np.median(nonzero)
    lower_quartile, upper_quartile = (np.percentile(nonzero, q) for q in (25, 75))
    deviation = np.std(nonzero)

    summary = [highest, lowest, spread, average, middle,
               lower_quartile, upper_quartile, deviation]
    return np.array(summary)

def _relative_mean_abs_diff(data):
    """Return mean(|x[i] - x[i-1]|) / mean(x) over the non-zero entries of ``data``.

    Zeros are removed first. When fewer than two non-zero values remain there
    is no consecutive pair to compare, so NaN is returned instead of dividing
    by zero.
    """
    values = np.asarray(data, dtype=float)
    values = values[values != 0]
    if values.size < 2:
        return float('nan')
    return np.mean(np.abs(np.diff(values))) / np.mean(values)


def calculateJitter(data):
    """Relative mean absolute difference between consecutive values of ``data``.

    Args:
        data: sequence of numbers; zero entries are ignored. The notebook
            passes pitch periods in milliseconds (``1000 / f0``).

    Returns:
        Mean absolute difference of consecutive non-zero values divided by
        their mean, or NaN when fewer than two non-zero values are available.
    """
    return _relative_mean_abs_diff(data)


#Get Shimmer
def calculateShimmer(data):
    """Relative mean absolute difference between consecutive values of ``data``.

    Same formula as ``calculateJitter``; the notebook applies it to the
    per-frame ``'strength'`` values of the pitch object.

    Args:
        data: sequence of numbers; zero entries are ignored.

    Returns:
        Mean absolute difference of consecutive non-zero values divided by
        their mean, or NaN when fewer than two non-zero values are available.
    """
    return _relative_mean_abs_diff(data)

# Load recording P1, keep its first 51952 ms and wrap the left channel in a
# parselmouth Sound that uses the clip's own sample rate.
audio = AudioSegment.from_file("{}/{}".format(AUDIO_FILES_PATH, "P1.wav"))
audio_q1 = audio[0:51952]
left, right = audio_q1.split_to_mono()
sound = parselmouth.Sound(left.get_array_of_samples(), audio_q1.frame_rate)

# Whole-clip scalars: duration, energy and power.
duration = sound.duration
energy = sound.get_energy()
power = sound.get_power()
print("Duration:", duration)
print("Energy:", energy)
print("Power:", power)

# Pitch analysis with a 0.01 time step; collect every frame together with its
# time stamp (frame numbers run from 1 to num_frames inclusive).
pitch = sound.to_pitch(time_step=0.01)
num_frames = pitch.get_number_of_frames()
frames = []
times = []
for frame_number in range(1, num_frames + 1):
    frames.append(pitch.get_frame(frame_number))
    times.append(pitch.get_time_from_frame_number(frame_number))
formants = sound.to_formant_burg()

# Intensity statistics over the pitch frames (zeros are ignored by getStatistic).
intensity_arr = [pitch_frame.intensity for pitch_frame in frames]
intensity_stat = getStatistic(intensity_arr)


def _formant_track(formant_number):
    """Sample formant ``formant_number`` at every pitch-frame time."""
    return [formants.get_value_at_time(formant_number, moment) for moment in times]


# Mean of the first three formants, sampled at the pitch-frame times.
f1_arr = _formant_track(1)
print("neam f1:", np.mean(f1_arr))

f2_arr = _formant_track(2)
print("neam f2:", np.mean(f2_arr))

f3_arr = _formant_track(3)
print("neam f3:", np.mean(f3_arr))


# Jitter: computed on 1000 / F0 for the frames whose frequency is non-zero.
f0_arr = np.array(pitch.selected_array['frequency'])
f0_arr = f0_arr[f0_arr != 0]
periods = 1000 / f0_arr
jitter = calculateJitter(periods)
print("jitter:", jitter)

# Shimmer: computed on the per-frame 'strength' values of the pitch object.
# NOTE(review): 'strength' stands in for an amplitude series here - confirm
# that this is the intended input for shimmer.
amplitude_arr = pitch.selected_array['strength']
shimmer = calculateShimmer(amplitude_arr)
print("shimmer:", shimmer)






Duration: 51.952
Energy: 19246096.017249998
Power: 370459.19343336154
neam f1: 651.7347966638118
neam f2: 1834.245626660736
neam f3: 2761.7925726076214
jitter: 0.025611750905398262
shimmer: 0.09486117702767627


In [61]:
from acousticFeatures import getAllFeatures

# Extract the feature vector for `sound` with the project helper and show only
# its last three entries.
f = getAllFeatures(sound)
#print(f)
print(f[-3:])

[0.36424412 0.49       0.13844262]
