In [1]:
import numpy as np
import pandas as pd
import pickle
import joblib
import matplotlib.pyplot as plt
import librosa, librosa.display
from IPython.display import Audio, FileLink
from pydub import AudioSegment
import os
import gc
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MaxAbsScaler

from tensorflow.keras.layers import Input, Dense, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.applications.vgg19 import VGG19
from tensorflow.keras.models import Sequential
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras import regularizers
from tensorflow.python.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.activations import softmax
from tensorflow.keras.optimizers import Adam



In [2]:
iemo_data = pd.read_csv('/kaggle/input/iemocap-transcriptions-english-french/iemocapTrans.csv')

In [3]:
iemo_data.head()

Unnamed: 0,_id,activation,dominance,emotion,end_time,start_time,titre,to_translate,translated,valence
0,625682441da7a5c1eaef3689,2.5,3.5,sad,6.0541,3.9987,Ses02M_impro02_F000,I don't want you to go.,Je ne veux pas que tu partes.,2.5
1,625682441da7a5c1eaef368a,3.0,4.0,sad,15.1,7.0366,Ses02M_impro02_M000,"I know, I know. I don't want to go either bab...",Je sais je sais. Je ne veux pas y aller non pl...,2.0
2,625682441da7a5c1eaef368b,2.5,4.5,sad,23.3599,15.5524,Ses02M_impro02_F001,I'm going to miss you too; I don't know what ...,Tu vas me manquer aussi; Je ne sais pas ce que...,1.5
3,625682441da7a5c1eaef368c,2.5,4.0,sad,26.4151,23.579,Ses02M_impro02_F002,I don't want to be a single mom.,Je ne veux pas être une mère célibataire.,1.5
4,625682441da7a5c1eaef368d,3.0,3.5,sad,31.4253,26.7598,Ses02M_impro02_M001,You won't be. I'll be back; I'll be back befo...,Vous ne le serez pas. Je reviendrai; Je serai ...,3.5


In [4]:
iemo_data = iemo_data[['emotion', 'titre']]
iemo_data['filepath'] = '/kaggle/input/iemocap-transcriptions-english-french/Iemocap_audio/iemocap_audio/IEMOCAP_wav/' + iemo_data['titre'] + '.wav'

In [5]:
# ravdess dataset
# emotions -> 01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised
# third part of the name

emo_dict = {
    '01': 'neu',
    '02': 'neu',
    '03': 'hap',
    '04': 'sad',
    '05': 'ang',
    '06': 'fea',
    '07': 'dis',
    '08': 'sur'
}

ravdess_base = "/kaggle/input/ravdess-emotional-speech-audio/"

rav_data = pd.DataFrame(columns=['emotion', 'titre', 'filepath'])

for dirname, _, filenames in os.walk(ravdess_base):
    for filename in filenames:
        
        info_list = filename.split('-')
        emotion = emo_dict[info_list[2]]
                
        new_row = {
            'emotion': [emotion],
            'titre': [filename[:-4]],
            'filepath': [os.path.join(dirname, filename)]
        }
        rav_data = pd.concat([rav_data, pd.DataFrame(new_row)], ignore_index=True)
rav_data.head()

Unnamed: 0,emotion,titre,filepath
0,sur,03-01-08-01-01-01-02,/kaggle/input/ravdess-emotional-speech-audio/A...
1,neu,03-01-01-01-01-01-02,/kaggle/input/ravdess-emotional-speech-audio/A...
2,dis,03-01-07-02-01-02-02,/kaggle/input/ravdess-emotional-speech-audio/A...
3,dis,03-01-07-01-01-02-02,/kaggle/input/ravdess-emotional-speech-audio/A...
4,neu,03-01-01-01-02-01-02,/kaggle/input/ravdess-emotional-speech-audio/A...


In [6]:
data = pd.concat([iemo_data, rav_data], ignore_index=True)

In [7]:
data = data.sample(frac=1)

In [8]:
def show_cat(df):
    print('sad', df.emotion.loc[df.emotion == 'sad'].count())
    print('fru', df.emotion.loc[df.emotion == 'fru'].count())
    print('neu', df.emotion.loc[df.emotion == 'neu'].count())
    print('hap', df.emotion.loc[df.emotion == 'hap'].count())
    print('exc', df.emotion.loc[df.emotion == 'exc'].count())
    print('sur', df.emotion.loc[df.emotion == 'sur'].count())
    print('ang', df.emotion.loc[df.emotion == 'ang'].count())
    print('fea', df.emotion.loc[df.emotion == 'fea'].count())
    print('oth', df.emotion.loc[df.emotion == 'oth'].count())
    print('dis', df.emotion.loc[df.emotion == 'dis'].count())
    
show_cat(data)

sad 1634
fru 2917
neu 2302
hap 1040
exc 1976
sur 494
ang 1653
fea 491
oth 26
dis 386


In [9]:
data = pd.get_dummies(data, columns=['emotion'], dtype='int')
data.head()

Unnamed: 0,titre,filepath,emotion_ang,emotion_dis,emotion_exc,emotion_fea,emotion_fru,emotion_hap,emotion_neu,emotion_oth,emotion_sad,emotion_sur
2647,Ses03F_script01_2_M001,/kaggle/input/iemocap-transcriptions-english-f...,0,0,0,0,0,0,1,0,0,0
2545,Ses01F_script01_1_M036,/kaggle/input/iemocap-transcriptions-english-f...,0,0,0,0,1,0,0,0,0,0
11864,03-01-06-01-01-02-12,/kaggle/input/ravdess-emotional-speech-audio/a...,0,0,0,1,0,0,0,0,0,0
7388,Ses01F_impro05_M027,/kaggle/input/iemocap-transcriptions-english-f...,0,0,0,0,0,0,1,0,0,0
2961,Ses02M_script02_1_M038,/kaggle/input/iemocap-transcriptions-english-f...,0,0,1,0,0,0,0,0,0,0


In [10]:
def add_noise(data, noise_factor):
    noise = np.random.randn(len(data))
    noice_data = data + noise_factor * noise
    noice_data = noice_data.astype(type(data[0]))
    return noice_data

def change_pitch(data, sampling_rate, n_steps=3):
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=n_steps)

In [11]:
def audio_to_stft(filepath, emotions, sample_rate):
    
    arr_len = 3*sample_rate
    arr, sr = librosa.load(filepath, sr=sample_rate)
    
    audios = []
    
    while (arr.shape[0] >= sample_rate):
        
        if arr.shape[0] < arr_len:
            
            arr = np.pad(arr, (0, arr_len-arr.shape[0]), 'constant')
            
#             if ((emotions['emotion_sur'] == 1) | (emotions['emotion_fea'] == 1) | (emotions['emotion_dis'] == 1)):
    #                 # create noice audio
    #                 noise_audio = add_noise(arr, 0.0001)
    #                 audios.append(librosa.amplitude_to_db(np.abs(librosa.core.stft(noise_audio, n_fft=1024, hop_length=512))))
#                 # change pitch of audio
#                 pitch_audio = change_pitch(arr, sr)
#                 audios.append(librosa.amplitude_to_db(np.abs(librosa.core.stft(pitch_audio, n_fft=1024, hop_length=512))))
            audios.append(librosa.amplitude_to_db(np.abs(librosa.core.stft(arr, n_fft=1024, hop_length=512))))
            arr = np.zeros(0)

        else:
                    
            seg = arr[:arr_len]
#             if ((emotions['emotion_sur'] == 1) | (emotions['emotion_fea'] == 1) | (emotions['emotion_dis'] == 1)):
#                 # create noice audio
#                 noise_audio = add_noise(seg, 0.0001)
#                 audios.append(librosa.amplitude_to_db(np.abs(librosa.core.stft(noise_audio, n_fft=1024, hop_length=512))))
#                 # change pitch of audio
#                 pitch_audio = change_pitch(seg, sr)
#                 audios.append(librosa.amplitude_to_db(np.abs(librosa.core.stft(pitch_audio, n_fft=1024, hop_length=512))))
            audios.append(librosa.amplitude_to_db(np.abs(librosa.core.stft(seg, n_fft=1024, hop_length=512))))
            arr = arr[arr_len:]
            
    return audios

In [12]:
X = []
y = []

for _, row in data.iterrows():

    audios = audio_to_stft(row['filepath'], row.drop(['filepath', 'titre']), 22500)
    
    for audio in audios:
        
        # adding normal audio
        X.append(audio.reshape(171, -1, 3))
        y.append(row.drop(['filepath', 'titre']))

In [13]:
del data, iemo_data, rav_data
gc.collect()

0

In [14]:
max = 0
for x in X:
    max1 = np.max(np.abs(x))
    if max < max1:
        max = max1

In [15]:
X_np = np.array(X, dtype=np.float16) / max
y_np = np.array(y, dtype=np.float16)

In [16]:
SPEC_SHAPE = X_np.shape[1:-1]
SPEC_SHAPE

(171, 132)

In [17]:
size = X_np.shape[0]

In [18]:
X_train = np.copy(X_np[:int(size*0.9)])
y_train = np.copy(y_np[:int(size*0.9)])
X_test = np.copy(X_np[int(size*0.9):])
y_test = np.copy(y_np[int(size*0.9):])

In [19]:
del X_np
del y_np
del X
del y
gc.collect()

0

In [20]:
# Xtr, Xte, ytr, yte = train_test_split(X_np, y_np, test_size=0.2, random_state=42)

In [21]:
resnet50 = VGG19(
    include_top = False, 
    weights = 'imagenet',
    input_shape=SPEC_SHAPE + (3,),
)

# freeze layers
for layer in resnet50.layers:
    layer.trainable = False
    
x = Flatten()(resnet50.output)
pred_layer = Dense(10, activation='sigmoid')(x)

model = Model(inputs=resnet50.input, outputs=pred_layer)
model.summary()
        

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 171, 132, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 171, 132, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 171, 132, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 85, 66, 64)        0         
                                                                 
 block2_conv1 (Conv2D)       (None, 85, 66, 128)       73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 85, 66, 128)       147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 42, 33, 128)       0     

In [22]:
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=1e-3),
    metrics=['accuracy']
)

In [23]:
# early_stopper = EarlyStopping(monitor = 'val_loss', patience = 3)
# model_save = ModelCheckpoint(filepath='models/', monitor='val_losss')

In [33]:
fit_history = model.fit(X_train, y_train, epochs=1, validation_data=(X_test, y_test))



In [25]:
# model.evaluate(X_test, y_test)

In [30]:
model.save('modelvgg19-without-noice-0.44.h5') # 0.44 -> 0.40
joblib.dump(model, 'modelvgg19-without-noice-0.44.pkl')

['modelvgg19-without-noice-0.44.pkl']

In [31]:
FileLink('modelvgg19-without-noice-0.44.h5')

In [32]:
FileLink('modelvgg19-without-noice-0.44.pkl')