In [None]:
from os import listdir
from os.path import isfile, join
import os
import subprocess
import sys

old_path = 'D:/Audio Analysis Notebook/data'
new_path = 'D:/Audio Analysis Notebook/new data'

files = []
data_folders = os.listdir(old_path)

#Separating audio clips on the basis of 8 emotions and storing them together accordingly
#Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised)
for i in data_folders:
    temp_join = join(old_path,i)
    for j in listdir(temp_join):
        if isfile(join(temp_join,j)):
            files.append(j)
            temp = j.split('.')[0].split('-')
            subprocess.call("mv %s %s" % (join(temp_join,j),join(new_path,temp[2])),shell=True)

In [2]:
import collections
import contextlib
import sys
import wave
import webrtcvad
import librosa

In [3]:
def read_wave(path):
    """Reads a .wav file.
    Takes the path, and returns (PCM audio data, sample rate).
    """
    with contextlib.closing(wave.open(path, 'rb')) as wf:
        num_channels = wf.getnchannels()
        assert num_channels == 1
        sample_width = wf.getsampwidth()
        assert sample_width == 2
        sample_rate = wf.getframerate()
        assert sample_rate in (8000, 16000, 32000, 48000)
        pcm_data = wf.readframes(wf.getnframes())
        return pcm_data, sample_rate


def write_wave(path, audio, sample_rate):
    """Writes a .wav file.
    Takes path, PCM audio data, and sample rate.
    """
    with contextlib.closing(wave.open(path, 'wb')) as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(sample_rate)
        wf.writeframes(audio)


class Frame(object):
    """Represents a "frame" of audio data."""
    def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration


def frame_generator(frame_duration_ms, audio, sample_rate):
    """Generates audio frames from PCM audio data.
    Takes the desired frame duration in milliseconds, the PCM data, and
    the sample rate.
    Yields Frames of the requested duration.
    """
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    offset = 0
    timestamp = 0.0
    duration = (float(n) / sample_rate) / 2.0
    while offset + n < len(audio):
        yield Frame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration
        offset += n


def vad_collector(sample_rate, frame_duration_ms,
                  padding_duration_ms, vad, frames):
    """Filters out non-voiced audio frames.
    Given a webrtcvad.Vad and a source of audio frames, yields only
    the voiced audio.
    Uses a padded, sliding window algorithm over the audio frames.
    When more than 90% of the frames in the window are voiced (as
    reported by the VAD), the collector triggers and begins yielding
    audio frames. Then the collector waits until 90% of the frames in
    the window are unvoiced to detrigger.
    The window is padded at the front and back to provide a small
    amount of silence or the beginnings/endings of speech around the
    voiced frames.
    Arguments:
    sample_rate - The audio sample rate, in Hz.
    frame_duration_ms - The frame duration in milliseconds.
    padding_duration_ms - The amount to pad the window, in milliseconds.
    vad - An instance of webrtcvad.Vad.
    frames - a source of audio frames (sequence or generator).
    Returns: A generator that yields PCM audio data.
    """
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    # We use a deque for our sliding window/ring buffer.
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    # We have two states: TRIGGERED and NOTTRIGGERED. We start in the
    # NOTTRIGGERED state.
    triggered = False

    voiced_frames = []
    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)

        sys.stdout.write('1' if is_speech else '0')
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            # If we're NOTTRIGGERED and more than 90% of the frames in
            # the ring buffer are voiced frames, then enter the
            # TRIGGERED state.
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                sys.stdout.write('+(%s)' % (ring_buffer[0][0].timestamp,))
                # We want to yield all the audio we see from now until
                # we are NOTTRIGGERED, but we have to start with the
                # audio that's already in the ring buffer.
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            # We're in the TRIGGERED state, so collect the audio data
            # and add it to the ring buffer.
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            # If more than 90% of the frames in the ring buffer are
            # unvoiced, then enter NOTTRIGGERED and yield whatever
            # audio we've collected.
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
                triggered = False
                yield b''.join([f.bytes for f in voiced_frames])
                ring_buffer.clear()
                voiced_frames = []
    if triggered:
        sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
    sys.stdout.write('\n')
    # If we have any leftover voiced audio when we run out of input,
    # yield it.
    if voiced_frames:
        yield b''.join([f.bytes for f in voiced_frames])

In [4]:
from sklearn import preprocessing
import numpy as np
from sklearn.mixture import GaussianMixture
from copy import deepcopy
from sklearn.cluster import SpectralClustering

audio, sample_rate = read_wave('D:/Audio Analysis Notebook/Recordings/20211201-083202_8008512300_1003_TOLLFREE-all.wav')
vad = webrtcvad.Vad(2)
frames = frame_generator(30, audio, sample_rate)
frames = list(frames)
segments = vad_collector(sample_rate, 30, 300, vad, frames)
c = 0
for i, segment in enumerate(segments):
    path = 'chunk-%002d.wav' % (i,)
    print(' Writing %s' % (path,))
    write_wave(path, segment, sample_rate)
    c +=1
#count of chunks
# c = 14

sampling_rate = 8000
n_mfcc = 13
n_fft = 0.032
hop_length = 0.010

components = 16

cov_type = 'full'

00000000000000000000000000000000000000000000000000000000000000000000000000011111111001111111111+(2.549999999999998)111111111111001111111111111111110000000000-(4.10999999999999) Writing chunk-00.wav
000000000001111111111+(4.439999999999992)1111111000000000111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111101111110000000111111111111111111111111111111111111111111111111111111111111111111111110000000000-(15.56999999999986) Writing chunk-01.wav
000000000111110000000000000001111111111+(16.439999999999866)11111111111111111111111111110000000000-(17.87999999999992) Writing chunk-02.wav
0000000000000000000000000000000000000000000000000000000000111111100111111110001110001111111111+(20.400000000000016)1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111

In [5]:

########################### Global GMM i.e UBM ###########################

test_file_path = 'D:/Audio Analysis Notebook/Recordings/20211201-083202_8008512300_1003_TOLLFREE-all.wav'
y,sr = librosa.load(test_file_path)
print(np.shape(y))

mfcc = librosa.feature.mfcc(np.array(y),sr,hop_length=int(hop_length * sr),n_fft=int(n_fft*sr),n_mfcc=n_mfcc,dct_type=2)
mfcc_delta = librosa.feature.delta(mfcc)
mfcc_delta_second_order = librosa.feature.delta(mfcc,order=2)
temp = librosa.feature.delta(mfcc_delta)
inter = np.vstack((mfcc,mfcc_delta,mfcc_delta_second_order))
ubm_feature = inter.T
#ubm_feature = preprocessing.scale(ubm_feature)

# ubm_feature -= np.mean(ubm_feature)
# ubm_feature /= np.std(ubm_feature)

ubm_model = GaussianMixture(n_components = components, covariance_type = cov_type)
ubm_model.fit(ubm_feature)

print(ubm_model.score(ubm_feature))
print(ubm_model.means_)


def MAP_Estimation(model,data,m_iterations):

    N = data.shape[0]
    D = data.shape[1]
    K = model.n_components


    mu_new = np.zeros((K,D))
    n_k = np.zeros((K,1))

    mu_k = model.means_
    
    pi_k = model.weights_

    old_likelihood = model.score(data)
    new_likelihood = 0
    iterations = 0
    while(iterations < m_iterations):
        iterations += 1
        old_likelihood = new_likelihood
        z_n_k = model.predict_proba(data)
        n_k = np.sum(z_n_k,axis = 0)
        n_k = n_k.reshape(np.shape(n_k)[0],1)

        mu_new = np.dot(z_n_k.T,data)
        n_k[n_k == 0] = 1e-20
        mu_new = mu_new / n_k

        adaptation_coefficient = n_k/(n_k + relevance_factor)
        I = np.ones(shape=np.shape(n_k))
        # for k in range(K):
        #     mu_k[k] = (adaptation_coefficient[k] * mu_new[k]) + ((1 - adaptation_coefficient[k]) * mu_k[k])
        mu_k = (adaptation_coefficient*mu_new) + (( I - adaptation_coefficient) * mu_k)
        model.means_ = mu_k

        log_likelihood = model.score(data)

        new_likelihood = log_likelihood

        if abs(old_likelihood - new_likelihood) < 1e-20:
            break
        print(log_likelihood)
    return model



Total = []
relevance_factor = 16
for i in range(c):
    fname='chunk-%002d.wav' % (i,)
    print('MAP adaptation for {0}'.format(fname))
    temp_y,sr_temp = librosa.load(fname,sr=None)
    
    temp_mfcc = librosa.feature.mfcc(np.array(temp_y),sr_temp,hop_length=int(hop_length * sr_temp),n_fft=int(n_fft*sr_temp),n_mfcc=n_mfcc,dct_type=2)
    temp_mfcc_delta = librosa.feature.delta(temp_mfcc)
    temp_mfcc_delta_second_order = librosa.feature.delta(temp_mfcc,order=2)
    temp_inter = np.vstack((temp_mfcc,temp_mfcc_delta,temp_mfcc_delta_second_order))
    temp_gmm_feature = temp_inter.T
    #data = preprocessing.scale(temp_gmm_feature)

    gmm  = deepcopy(ubm_model)

    gmm = MAP_Estimation(gmm,temp_gmm_feature,m_iterations =1)
    
    sv = gmm.means_.flatten()
    #sv = preprocessing.scale(sv)
    Total.append(sv)

N_CLUSTERS = 2

def rearrange(labels, n):
    seen = set()
    distinct = [x for x in labels if x not in seen and not seen.add(x)]
    correct = [i for i in range(n)]
    dict_ = dict(zip(distinct, correct))
    return [x if x not in dict_ else dict_[x] for x in labels]

sc = SpectralClustering(n_clusters=N_CLUSTERS, affinity='cosine')

#Labels help us identify between chunks of customer and call center agent
labels = sc.fit_predict(Total)
labels = rearrange(labels, N_CLUSTERS)
print(labels)

#Since there is no way to identify the voice of a customer just from the audio
#we have assumed that customer is the one who speaks 2nd
#Normally the call center agent is the first one to speak and then the customer
#If that is not the case for a specific audio, change the condition from 'x==1' to 'x==0'
print([i for i, x in enumerate(labels) if x == 1])

(2708446,)
-29.559303948184127
[[-5.56621278e+02  1.05131404e+02  2.27655504e+01  1.46333235e+01
   1.77650447e+01  6.61958577e+00  8.55127364e+00  6.34394177e+00
  -1.60606648e-01  3.14688676e+00  1.38520897e-01 -3.94708474e+00
   1.82335051e+00 -9.69751687e-01 -6.18997197e-01  4.11447889e-01
   4.26958872e-01  6.60999847e-02  2.06966301e-02  6.64566335e-02
   2.61113608e-02 -4.20451190e-02 -4.03467753e-02  5.24076874e-02
   5.51030654e-02 -1.32705005e-02  2.78081783e-01  1.92665218e-01
  -5.90249949e-02 -5.86755319e-02 -2.61258075e-02 -5.24293144e-02
  -4.71707214e-02 -4.25137752e-02 -3.32234967e-02 -8.43542689e-03
  -1.38019126e-02 -2.94477548e-03  3.31042194e-02]
 [-3.68091082e+02  1.90953295e+02 -6.69177489e+01 -1.59496686e-01
   2.34129439e+01 -3.97317458e+01 -8.25411535e+00 -1.19958596e+00
  -2.18714096e+01  8.28293517e+00  1.73828161e+00 -1.77960697e+01
   3.80878338e+00 -1.10035176e+00 -5.49074759e-01  7.07995235e-01
   4.29907411e-01 -5.71723241e-02  2.90728380e-01  3.3985224

In [6]:
! pip install xgboost





In [10]:
pip install tensorflow

Collecting tensorflowNote: you may need to restart the kernel to use updated packages.
  Using cached tensorflow-2.7.0-cp39-cp39-win_amd64.whl (430.8 MB)
Collecting keras<2.8,>=2.7.0rc0
  Using cached keras-2.7.0-py2.py3-none-any.whl (1.3 MB)
Installing collected packages: keras, tensorflow
  Attempting uninstall: keras
    Found existing installation: Keras 1.2.0
    Uninstalling Keras-1.2.0:
      Successfully uninstalled Keras-1.2.0
Successfully installed keras-2.7.0 tensorflow-2.7.0





In [8]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


In [None]:
import numpy as np
import os
import sys
import librosa
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers import TimeDistributed
from keras.models import Sequential
from keras.layers import Dense, Activation,Dropout, Flatten, Embedding
import pickle
from xgboost import XGBClassifier

In [12]:
#Path to the folder consisting different Emotions folders
path_data = 'D:/Audio Analysis Notebook/new data'

mslen = 22050

data = []

max_fs = 0
labels = []

emotions = ['neutral','calm','happy','sad','angry','fearful','disgust','surprised']
directories = os.listdir(path_data)

print(directories)

[]


In [13]:
f2 = open('D:/Audio Analysis Notebook/Audio-Sentiment-Analysis-master/model/feature.pkl','rb')
feature_all = pickle.load(f2)
f3 = open('D:/Audio Analysis Notebook/Audio-Sentiment-Analysis-master/model/label.pkl','rb')
labels = pickle.load(f3)
from copy import deepcopy
y = deepcopy(labels)
for i in range(len(y)):
    y[i] = int(y[i])


n_labels = len(y)
n_unique_labels = len(np.unique(y))
one_hot_encode = np.zeros((n_labels,n_unique_labels))
f = np.arange(n_labels)
for i in range(len(f)):
    one_hot_encode[f[i],y[i]-1]=1


X_train,X_test,y_train,y_test = train_test_split(feature_all,one_hot_encode,test_size = 0.3,random_state=20)

In [16]:
########################### MODEL 1 ###########################
model = Sequential()

model.add(Dense(X_train.shape[1],input_dim =X_train.shape[1],kernel_initializer = 'random_uniform',activation ='relu'))

model.add(Dense(400,kernel_initializer = 'random_uniform',activation ='relu'))

model.add(Dropout(0.2))

model.add(Dense(200,kernel_initializer = 'random_uniform',activation ='relu'))

model.add(Dropout(0.2))

model.add(Dense(100,kernel_initializer = 'random_uniform',activation ='relu'))

model.add(Dropout(0.2))

model.add(Dense(y_train.shape[1],kernel_initializer = 'random_uniform',activation ='softmax'))

model.compile(loss = 'categorical_crossentropy',optimizer='adadelta',metrics=['accuracy'])

model.fit(X_train,y_train,epochs=200,batch_size = 5,verbose=1)


model.evaluate(X_test,y_test)

mlp_model = model.to_json()
with open('mlp_model_relu_adadelta.json','w') as j:
    j.write(mlp_model)
model.save_weights("mlp_relu_adadelta_model.h5")

y_pred_model1 = model.predict(X_test)
y2 = np.argmax(y_pred_model1,axis=1)
y_test2 = np.argmax(y_test , axis = 1)

count = 0
for i in range(y2.shape[0]):
    if y2[i] == y_test2[i]:
        count+=1

print('Accuracy for model 1 : ' + str((count / y2.shape[0]) * 100))


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [18]:

########################### MODEL 2 ###########################
model2 = Sequential()

model2.add(Dense(X_train.shape[1],input_dim =X_train.shape[1],kernel_initializer = 'random_uniform',activation ='relu'))

model2.add(Dense(400,kernel_initializer = 'random_uniform',activation ='tanh'))

model2.add(Dropout(0.2))

model2.add(Dense(200,kernel_initializer = 'random_uniform',activation ='tanh'))

model2.add(Dropout(0.2))

model2.add(Dense(100,kernel_initializer = 'random_uniform',activation ='sigmoid'))

model2.add(Dropout(0.2))

model2.add(Dense(y_train.shape[1],kernel_initializer = 'random_uniform',activation ='softmax'))

model2.compile(loss = 'categorical_crossentropy',optimizer='adadelta',metrics=['accuracy'])

model2.fit(X_train,y_train,epochs=200,batch_size = 5,verbose=1)

model2.evaluate(X_test, y_test)


mlp_model2 = model2.to_json()
with open('mlp_model_tanh_adadelta.json','w') as j:
    j.write(mlp_model2)
model2.save_weights("mlp_tanh_adadelta_model.h5")


y_pred_model2 = model2.predict(X_test)
y22 = np.argmax(y_pred_model2,axis=1)
y_test22 = np.argmax(y_test , axis = 1)

count = 0
for i in range(y22.shape[0]):
    if y22[i] == y_test22[i]:
        count+=1
        
print('Accuracy for model 2 : ' + str((count / y22.shape[0]) * 100))


X_train2,X_test2,y_train2,y_test2 = train_test_split(feature_all,y,test_size = 0.3,random_state=20)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [28]:
########################### MODEL 3 ###########################
model3 = XGBClassifier()

evals_result = {}
eval_s = [(X_train2, y_train2)]

model3.fit(X_train2,y_train2,eval_set=eval_s)
model3.evals_result()
score = cross_val_score(model3, X_train2, y_train2, cv=5)
y_pred3 = model3.predict(X_test)

count = 0
for i in range(y_pred3.shape[0]):
    if y_pred3[i] == y_test2[i]:
        count+=1   
        
print('Accuracy for model 3 : ' + str((count / y_pred3.shape[0]) * 100))


[0]	validation_0-mlogloss:1.53040
[1]	validation_0-mlogloss:1.19694
[2]	validation_0-mlogloss:0.95765
[3]	validation_0-mlogloss:0.78845
[4]	validation_0-mlogloss:0.63909
[5]	validation_0-mlogloss:0.53076
[6]	validation_0-mlogloss:0.44669
[7]	validation_0-mlogloss:0.37746
[8]	validation_0-mlogloss:0.32135
[9]	validation_0-mlogloss:0.27689
[10]	validation_0-mlogloss:0.24327
[11]	validation_0-mlogloss:0.21211
[12]	validation_0-mlogloss:0.18402
[13]	validation_0-mlogloss:0.15976
[14]	validation_0-mlogloss:0.14240
[15]	validation_0-mlogloss:0.12492
[16]	validation_0-mlogloss:0.11215
[17]	validation_0-mlogloss:0.09975
[18]	validation_0-mlogloss:0.08961
[19]	validation_0-mlogloss:0.08074
[20]	validation_0-mlogloss:0.07281
[21]	validation_0-mlogloss:0.06615
[22]	validation_0-mlogloss:0.06038
[23]	validation_0-mlogloss:0.05538
[24]	validation_0-mlogloss:0.05137
[25]	validation_0-mlogloss:0.04765
[26]	validation_0-mlogloss:0.04413
[27]	validation_0-mlogloss:0.04117
[28]	validation_0-mlogloss:0.0



















Accuracy for model 3 : 56.25
