In [1]:
import pickle
import wave

import librosa
import numpy as np
import pandas as pd
import pyaudio
from keras.models import model_from_json
from scipy.io.wavfile import write
from sklearn.preprocessing import StandardScaler

### Define function to record audio

In [2]:
# Define constants for audio recording.
# These module-level names are read by record_audio() at call time.
# NOTE: `format` shadows the builtin format(); the name is kept because
# record_audio() looks it up under exactly this name.
format = pyaudio.paInt16
channels = 1
filename = "test/output.wav"   # path the recorded WAV file is written to
duration=5                     # recording length in seconds
sample_rate=44100              # samples per second requested from the input stream
chunk_size=1024                # frames read from the stream per call

# Function to record audio
def record_audio():
    """Record audio from an input stream and save it as a WAV file.

    Reads the module-level settings ``format``, ``channels``, ``sample_rate``,
    ``chunk_size``, ``duration`` and ``filename`` at call time. While
    recording, a percentage marker is printed roughly every tenth of the
    total number of chunks. The captured frames are then written to
    ``filename``.

    Note: because ``filename`` is looked up when the function runs, rebinding
    that global elsewhere changes where the recording is written.
    """
    audio = pyaudio.PyAudio()
    try:
        # Query the sample width while the PyAudio instance is still alive.
        sample_width = audio.get_sample_size(format)

        stream = audio.open(format=format, channels=channels,
                            rate=sample_rate, input=True,
                            frames_per_buffer=chunk_size)
        try:
            print("Recording...")
            frames = []

            # Calculate total number of chunks
            total_chunks = int(sample_rate / chunk_size * duration)
            # Chunks between two progress markers; clamped to 1 so recordings
            # shorter than 10 chunks do not trigger a modulo-by-zero.
            chunks_per_marker = max(1, total_chunks // 10)

            for i in range(total_chunks):
                frames.append(stream.read(chunk_size))
                if (i + 1) % chunks_per_marker == 0:
                    percentage = int((i + 1) / total_chunks * 100)
                    # Print the progress marker without newline and flush buffer
                    print(f" -- {percentage}%", end='', flush=True)

            print("\nFinished recording.")
        finally:
            # Release the stream even if reading failed part-way through.
            stream.stop_stream()
            stream.close()
    finally:
        audio.terminate()

    # The context manager closes the file even if writing raises.
    with wave.open(filename, 'wb') as wavefile:
        wavefile.setnchannels(channels)
        wavefile.setsampwidth(sample_width)
        wavefile.setframerate(sample_rate)
        wavefile.writeframes(b''.join(frames))

### Define function to preprocess audio

In [3]:
#sample feature
#librosa.core.load(path, sr=22050, mono=True, offset=0.0, duration=None, dtype=<class 'numpy.float32'>, res_type='kaiser_best')

#Mfcc
#librosa.feature.mfcc(y=None, sr=22050, S=None, n_mfcc=20, dct_type=2, norm='ortho', lifter=0, **kwargs)
# MFCC extraction settings. preprocess_audio() carries its own defaults and
# does not read these names.
mfcc_sample_rate, n_mfcc, axis_mfcc = 22050, 40, 1

# Function to preprocess audio

def preprocess_audio(filename, sample_rate=22050, n_mfcc=40, offset_s=0.5):
    """Turn an audio file into a single MFCC feature vector.

    Parameters
    ----------
    filename : path of the audio file handed to ``librosa.load``.
    sample_rate : value passed as ``sr`` to ``librosa.load``.
    n_mfcc : number of MFCC coefficients to extract.
    offset_s : value passed as ``offset`` to ``librosa.load``
        (start position within the file).

    Returns
    -------
    numpy.ndarray
        The MFCC matrix averaged along axis 1, i.e. one value per coefficient.
    """
    signal, loaded_sr = librosa.load(
        filename,
        sr=sample_rate,
        mono=True,
        offset=offset_s,
    )

    # Coefficient matrix for the loaded signal
    coefficients = librosa.feature.mfcc(y=signal, sr=loaded_sr, n_mfcc=n_mfcc)

    # Collapse axis 1 so each coefficient is represented by its mean
    return coefficients.mean(axis=1)

### Load Model

In [4]:
# Load the saved model
def load_model(model_json_path, model_weights_path):
    """Rebuild a Keras model from its JSON description and load its weights.

    Parameters
    ----------
    model_json_path : path of the JSON file passed to ``model_from_json``.
    model_weights_path : path of the weights file passed to ``load_weights``.

    Returns
    -------
    The model returned by ``model_from_json`` with the weights loaded.

    Raises
    ------
    OSError (e.g. FileNotFoundError) if the JSON file cannot be opened.
    """
    # `with` guarantees the file handle is closed even if read() raises.
    with open(model_json_path, 'r') as json_file:
        loaded_model_json = json_file.read()

    loaded_model = model_from_json(loaded_model_json)
    loaded_model.load_weights(model_weights_path)
    print("Loaded model from disk")
    return loaded_model

In [5]:
# Paths handed to load_model(): the model's JSON description and its weights file
model_json_path, model_weights_path = (
    'models/train_fit/final_model.json',
    'models/train_fit/final_model.h5',
)

In [6]:
# Record audio
#record_audio()

In [58]:
# Preprocess audio
# The sample path gets its own name: `filename` is the output path that
# record_audio() reads at call time, so rebinding it here would make a later
# record_audio() call overwrite the sample clip.
sample_path = "test/sad.wav"
processed_audio = preprocess_audio(sample_path)

In [59]:
processed_audio.shape

(40,)

In [60]:
# Prepend a batch axis to the feature vector: (40,) -> (1, 40)
au = processed_audio[np.newaxis, ...]

In [61]:
au.shape

(1, 40)

In [62]:
# Start from an empty frame with a single 'feature' column, then add row 0
# whose only cell holds the whole MFCC vector (see the displayed frame).
df = pd.DataFrame(columns=['feature'])
df.loc[0] = [processed_audio]  # row label 0 does not exist yet; .loc creates it

In [63]:
df

Unnamed: 0,feature
0,"[-687.9938, 83.8148, 1.5396544, 13.143715, 8.0..."


In [64]:
#Turn array into dataframe
# Build the one-row frame (one column per MFCC coefficient) straight from the
# feature vector. Deriving it from the previous `df` made this cell
# self-referential: a second run failed because the 'feature' column no longer
# existed after the first run.
df = pd.DataFrame([processed_audio])

In [65]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,-687.993774,83.814796,1.539654,13.143715,8.098424,-2.608693,-5.921618,-12.662955,-7.82617,-2.332629,...,-4.221829,-1.125348,-0.966688,-0.903053,-1.418783,-1.934486,-4.277811,-0.757094,-1.361607,3.472135


In [66]:
df.shape

(1, 40)

In [67]:
# Load the model: load_model() rebuilds the architecture from the JSON file,
# loads the weights file into it and prints a confirmation message.
model = load_model(model_json_path, model_weights_path)

Loaded model from disk


In [68]:
# Load the pickled scaler from disk (presumably the scaler fitted on the
# training features — confirm against the training notebook).
# NOTE(security): pickle.load can execute arbitrary code contained in the
# file; only load scaler files that come from a trusted source.
with open('meta/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

In [69]:
# Scale the (1, 40) feature frame with the scaler loaded from meta/scaler.pkl.
# NOTE(review): assumes the scaler was fitted on features in the same column
# order as `df` — confirm.
x_cnn = scaler.transform(df)

In [70]:
#Add dimension for CNN
# Append a trailing axis so the scaled (1, 40) features become (1, 40, 1);
# the model summary shows the first Conv1D layer working over 40 steps.
x = np.expand_dims(x_cnn, axis=2)

# The leftover code that used to follow was removed:
#  * it referenced `input_audio`, which is never defined (NameError);
#  * it replaced the loaded `scaler` with a fresh StandardScaler fitted on this
#    single sample and overwrote `x_cnn`, although only `x` is used for the
#    prediction.

#Check shape of the model input
print(x.shape)

In [71]:
# Predict emotion using the model
# `x` is the scaled feature batch with a trailing axis added.
# NOTE(review): the output presumably holds one score per emotion class for the
# single input sample — confirm against the model's final layer.
predictions = model.predict(x)

In [72]:
# Class index -> emotion label; the position in the list is the class index
emotions = dict(enumerate(
    ['neutral', 'calm', 'happy', 'sad', 'angry', 'fear', 'disgust', 'surprised']
))

# Pick the label of the highest-scoring entry and capitalise it for display
predicted_emotion = emotions[predictions.argmax()].title()

print(f"Predicted Emotion: {predicted_emotion}")

Predicted Emotion: Angry


In [28]:
predictions.argmax()

4

In [29]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_4 (Conv1D)            (None, 40, 16)            96        
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 40, 32)            2592      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 10, 32)            0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 10, 64)            10304     
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 10, 128)           41088     
_________________________________________________________________
dropout_2 (Dropout)          (None, 10, 128)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 1280)             