Importing the required libraries



In [2]:
import tkinter as tk
from tkinter import filedialog, messagebox
import pyaudio
import wave
import os
from sklearn.model_selection import train_test_split
from keras.layers import  Flatten, Dense, Dropout
import librosa
import numpy as np
from tensorflow.keras.applications import VGG16
import keras
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

Model building, training, and testing for emotion detection on female voice audio

In [4]:
# Emotion classes; each one is also the name of a sub-folder of DATASET_DIR.
# They must stay in alphabetical order: LabelEncoder sorts its classes, and the
# predicted integer id is later used to index straight back into this list.
classes = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'pleasant', 'sad']

# Root folder of the dataset (one sub-folder per class). It can be overridden
# with the EMOTION_DATASET_DIR environment variable so the notebook is not tied
# to one machine; the original location is kept as the default.
DATASET_DIR = os.environ.get(
    'EMOTION_DATASET_DIR',
    r'C:\Users\Shivam Sharma\Documents\Python Scripts\task4\datasets',
)

# Loading the audio data
audio_data, labels = [], []
for folder_name in classes:
    folder_path = os.path.join(DATASET_DIR, folder_name)
    # sorted(): os.listdir() returns entries in arbitrary order, which would make
    # the seeded train/test split below differ from run to run.
    for file in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file)
        if not os.path.isfile(file_path):
            continue  # skip nested directories

        y, sr = librosa.load(file_path)

        # Extracting Mel spectrogram features (not MFCCs)
        mels = librosa.feature.melspectrogram(y=y, sr=sr)
        mels = mels.T  # Transpose to (frames, n_mels) to match image format

        # Putting features and labels into audio_data and labels respectively
        audio_data.append(mels)
        labels.append(folder_name)

if not audio_data:
    raise RuntimeError("No audio files were found under " + DATASET_DIR)

# Clips may differ in length, so zero-pad every spectrogram along the time axis
# to the longest one; otherwise the list cannot be stacked into a single array.
n_frames = max(m.shape[0] for m in audio_data)
audio_data = [np.pad(m, ((0, n_frames - m.shape[0]), (0, 0))) for m in audio_data]
features = np.stack(audio_data)  # (n_samples, n_frames, n_mels)

# Loading pre-trained VGG16 model (exclude top layers). The input shape is taken
# from the stacked feature array rather than from the last loop variable.
base_model = VGG16(weights='imagenet', include_top=False,
                   input_shape=(features.shape[1], features.shape[2], 3))

# Freezing layers of the pre-trained model
for layer in base_model.layers:
    layer.trainable = False

# Adding custom layers for classification
x = base_model.output
x = Flatten()(x)
x = Dense(1024, activation='relu')(x)
x = Dropout(0.5)(x)
predictions = Dense(len(classes), activation='softmax')(x)

# Creating the final model
model = keras.Model(inputs=base_model.input, outputs=predictions)

# Compiling the model (adjust optimizer, learning rate etc. based on your needs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Splitting data into training and test sets
train_audio, test_audio, train_labels, test_labels = train_test_split(
    features, np.array(labels), test_size=0.2, random_state=42)

# Initializing the encoder and fitting it on the full class list, so the integer
# ids do not depend on which classes happen to land in the training split.
le = LabelEncoder()
le.fit(classes)
assert list(le.classes_) == classes, "`classes` must be sorted to match LabelEncoder ids"

train_labels = le.transform(train_labels)
test_labels = le.transform(test_labels)

# Converting labels to categorical (one-hot encoding); num_classes pins the
# width so train and test always have len(classes) columns.
train_labels = to_categorical(train_labels, num_classes=len(classes))
test_labels = to_categorical(test_labels, num_classes=len(classes))

# Repeating the single-channel data to create a three-channel image
train_audio = np.repeat(train_audio[..., np.newaxis], 3, -1)
test_audio = np.repeat(test_audio[..., np.newaxis], 3, -1)


# Training the model
model.fit(train_audio, train_labels, epochs=5)

# Testing the model
pred_label = model.predict(test_audio)

# Converting predicted and test probabilities to class labels
pred_label = np.argmax(pred_label, axis=1)
test_labels = np.argmax(test_labels, axis=1)

# Calculating the accuracy
print("Accuracy:", accuracy_score(test_labels, pred_label))


Epoch 1/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 468ms/step - accuracy: 0.4248 - loss: 1.5718
Epoch 2/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 619ms/step - accuracy: 0.8032 - loss: 0.4991
Epoch 3/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 579ms/step - accuracy: 0.9038 - loss: 0.2863
Epoch 4/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 524ms/step - accuracy: 0.8921 - loss: 0.2725
Epoch 5/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 512ms/step - accuracy: 0.9413 - loss: 0.1942
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 322ms/step
Accuracy: 0.9396984924623115


building window for user interface

In [7]:
# Root Tk window that hosts every widget of the recorder/uploader UI.
window = tk.Tk()
window.geometry("500x500")
window.title("Audio Recorder and Uploader")

''

main function 

In [5]:
def main():
    """Analyse the selected or recorded audio and report the result in a message box.

    The audio is read from the module-level ``filename`` when it is set and
    non-empty, otherwise from ``output.wav`` (the file written by the recorder).
    The average detected pitch gates the analysis:

    * ``<= 180`` -> message asking for a female voice;
    * ``>= 225`` -> message asking for English audio;
    * otherwise the mel spectrogram is classified with the module-level
      ``model`` and the matching entry of ``classes`` is shown.

    NOTE(review): average pitch is only a rough heuristic for speaker gender and
    is not an indicator of spoken language; the thresholds are kept unchanged.
    """
    female_min_pitch = 180    # avg_pitch of female voice
    english_max_pitch = 225   # avg_pitch for female english language audio

    # `filename` is only created by browseFiles(); if recording happens first it
    # does not exist yet, so look it up without raising NameError.
    audio_path = globals().get('filename', '') or 'output.wav'

    try:
        y, sr = librosa.load(audio_path)
    except Exception as exc:  # report to the user instead of failing silently in the Tk callback
        messagebox.showerror("Error", "Could not read the audio file '" + str(audio_path) + "': " + str(exc), parent=window)
        return

    # Extracting Mel spectrogram features, transposed to (frames, n_mels)
    mels = librosa.feature.melspectrogram(y=y, sr=sr).T

    # Extracting the pitch; only non-zero pitch estimates are averaged
    pitches, _ = librosa.piptrack(y=y, sr=sr)
    detected = pitches[np.nonzero(pitches)]
    if detected.size == 0:
        # Averaging an empty array would give NaN and fall into the wrong branch.
        messagebox.showinfo("Message", "No pitch could be detected in the audio, please try another recording!", parent=window)
        return
    average_pitch = np.average(detected)

    if average_pitch <= female_min_pitch:
        messagebox.showinfo("Message", " The voice audio should be of a female and not of a male, please upload female voice audio!", parent=window)
        return
    if average_pitch >= english_max_pitch:
        messagebox.showinfo("Message", " The voice audio should be in english langauge!", parent=window)
        return

    # The model was built for a fixed number of frames: zero-pad or truncate the
    # time axis so the clip fits, then add the batch axis and three channels.
    target_frames = model.input_shape[1]
    if target_frames is not None:
        if mels.shape[0] < target_frames:
            mels = np.pad(mels, ((0, target_frames - mels.shape[0]), (0, 0)))
        else:
            mels = mels[:target_frames]
    batch = np.repeat(mels[np.newaxis, ..., np.newaxis], 3, axis=-1)

    prediction = model.predict(batch)
    class_label = int(np.argmax(prediction, axis=1)[0])
    predicted_class_name = classes[class_label]
    messagebox.showinfo("Message", "the emotion type of the audio file is:" + predicted_class_name, parent=window)


building button for file explorer

In [8]:

# Initially empty label; browseFiles() updates its text after a file dialog.
label_file_explorer = tk.Label(master=window, text="")
label_file_explorer.pack()

function for browsing the files

In [9]:
def browseFiles():
    """Ask the user for an audio file and analyse it.

    Stores the chosen path in the module-level ``filename``, shows it in
    ``label_file_explorer`` and calls ``main()``. When the dialog is cancelled,
    ``filename`` is reset to ``''`` and no analysis is started.
    """
    global filename
    selected = filedialog.askopenfilename(
        initialdir="/",
        title="Select an audio file",
        # Tk expects glob patterns; several patterns for one entry are space-separated.
        filetypes=(("Audio files", "*.wav *.mp3"), ("All files", "*.*")),
    )
    # A cancelled dialog yields an empty value; normalise it to ''.
    filename = selected if selected else ''

    if not filename:
        label_file_explorer.configure(text="No file selected")
        return

    label_file_explorer.configure(text="File Opened: " + filename)
    main()

building upload button

In [10]:
# Caption plus "Browse" button; clicking the button runs browseFiles().
label_upload = tk.Label(master=window, text="Upload an audio file:")
button_upload = tk.Button(master=window, text="Browse", command=browseFiles)

# Packed in the same order as before: caption first, then the button.
label_upload.pack()
button_upload.pack()

function for recording audio

In [11]:
def record_audio():
    """Record 5 seconds from the default input device into ``output.wav`` and analyse it.

    Audio is captured as 16-bit, 2-channel, 44.1 kHz PCM in chunks of 1024
    frames. Once the file is written, the module-level ``filename`` is reset to
    ``''`` so that ``main()`` reads the fresh recording rather than a file
    chosen earlier through the Browse button.

    The capture loop is blocking, so the window does not respond while
    recording is in progress.
    """
    global filename

    CHUNK = 1024
    FORMAT = pyaudio.paInt16
    CHANNELS = 2
    RATE = 44100
    RECORD_SECONDS = 5
    WAVE_OUTPUT_FILENAME = "output.wav"

    frames = []
    p = pyaudio.PyAudio()
    try:
        # Query the sample width while the PyAudio instance is still alive.
        sample_width = p.get_sample_size(FORMAT)

        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
        try:
            print("* recording")
            for _ in range(int(RATE / CHUNK * RECORD_SECONDS)):
                frames.append(stream.read(CHUNK))
            print("* done recording")
        finally:
            # Release the device even if reading fails part-way through.
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

    # The context manager closes the file even if writing raises.
    with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))

    # An empty filename tells main() to load output.wav.
    filename = ''
    main()

building the record button

In [12]:
# Caption plus "Record" button; clicking the button runs record_audio().
label_record = tk.Label(master=window, text="Record an audio file:")
button_record = tk.Button(master=window, text="Record", command=record_audio)

# Packed in the same order as before: caption first, then the button.
label_record.pack()
button_record.pack()

# Hand control to Tk's event loop; this call blocks until the window is closed.
window.mainloop()