In [1]:
import zipfile
import os

def unzip_folder(zip_path, extract_to):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

# Specify the path to the zip file and the directory where you want to extract the contents
zip_file_path = '/content/drive/MyDrive/AudioWAV.zip'
extracted_folder_path = 'Audio'

# Create the destination folder if it doesn't exist
os.makedirs(extracted_folder_path, exist_ok=True)

# Call the function to unzip the folder
unzip_folder(zip_file_path, extracted_folder_path)

print(f"Folder '{zip_file_path}' has been successfully extracted to '{extracted_folder_path}'.")

Folder '/content/drive/MyDrive/AudioWAV.zip' has been successfully extracted to 'Audio'.


In [2]:
import os
# Specify the path for the new folder
folder_path = "/content/AudioWithCategorisedWAV/"

folders = ['Anger','Disgust','Fear','Happy','Neutral','Sad']

for i in folders:
  folder_path1 = folder_path + i
  if not os.path.exists(folder_path1):
    os.makedirs(folder_path1)

import os

import shutil
#sourcepath = '/content/AudioInput'
sourcepath = '/content/Audio/AudioWAV'
destinationpath = '/content/AudioWithCategorisedWAV'
files = os.listdir(sourcepath)

for file in files:
    source_file = os.path.join(sourcepath, file)
    if not '.wav' in file:
      continue
    if 'ANG' in file:
      destination_file = os.path.join(destinationpath+'/Anger', file)
      shutil.move(source_file, destination_file)
    elif 'DIS' in file:
      destination_file = os.path.join(destinationpath+'/Disgust', file)
      shutil.move(source_file, destination_file)
    elif 'FEA' in file:
      destination_file = os.path.join(destinationpath+'/Fear', file)
      shutil.move(source_file, destination_file)
    elif 'HAP' in file:
      destination_file = os.path.join(destinationpath+'/Happy', file)
      shutil.move(source_file, destination_file)
    elif 'NEU' in file:
      destination_file = os.path.join(destinationpath+'/Neutral', file)
      shutil.move(source_file, destination_file)
    elif 'SAD' in file:
      destination_file = os.path.join(destinationpath+'/Sad', file)
      shutil.move(source_file, destination_file)

In [3]:
def noise(data):
    noise_amp = 0.04*np.random.uniform()*np.amax(data)
    data = data + noise_amp*np.random.normal(size=data.shape[0])
    return data

def stretch(data, rate=0.70):
    return librosa.effects.time_stretch(data, rate)

def shift(data):
    shift_range = int(np.random.uniform(low=-5, high = 5)*1000)
    return np.roll(data, shift_range)

def pitch(data, sampling_rate, pitch_factor=0.8):
    return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)

def higher_speed(data, speed_factor = 1.25):
    return librosa.effects.time_stretch(data, rate = speed_factor)

def lower_speed(data, speed_factor = 0.75):
    return librosa.effects.time_stretch(data, rate = speed_factor)

In [10]:
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import librosa
import numpy as np
import os

# Function to extract audio features using librosa
def extract_features(audio, sample_rate, mfcc=True, chroma=True, mel=True):
    result = np.array([])
    if mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=13), axis=1)
        result = np.hstack((result, mfccs))
    if chroma:
        chroma = np.mean(librosa.feature.chroma_stft(y=audio, sr=sample_rate), axis=1)
        result = np.hstack((result, chroma))
    if mel:
        mel = np.mean(librosa.feature.melspectrogram(y=audio, sr=sample_rate), axis=1)
        result = np.hstack((result, mel))
    return result

# Function to load audio data and labels
def load_data(data_path):
    features, labels = [], []
    for folder in os.listdir(data_path):
        label = folder
        for file_name in os.listdir(os.path.join(data_path, folder)):
            file_path = os.path.join(data_path, folder, file_name)
            audio, sample_rate = librosa.load(file_path)
            feature = extract_features(audio,sample_rate)
            features.append(feature)
            labels.append(label)
            #noised
            noise_data = noise(audio)
            feature = extract_features(noise_data,sample_rate)
            features.append(feature)
            labels.append(label)
            #stretched
            #stretch_data = stretch(audio)
            #feature = extract_features(stretch_data,sample_rate)
            #features.append(feature)
            #labels.append(label)
            #pitched
            #pitch_data = pitch(data = audio, sampling_rate = sample_rate)
            #feature = extract_features(pitch_data,sample_rate)
            #features.append(feature)
            #labels.append(label)

            #speed up
            higher_speed_data = higher_speed(audio)
            feature = extract_features(higher_speed_data,sample_rate)
            features.append(feature)
            labels.append(label)

            #speed down
            lower_speed_data = higher_speed(audio)
            feature = extract_features(lower_speed_data,sample_rate)
            features.append(feature)
            labels.append(label)
    return np.array(features), np.array(labels)

# Load data and preprocess
data_path = "/content/AudioWithCategorisedWAV"
features, labels = load_data(data_path)

# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Split the data into training and testing sets



  return pitch_tuning(


In [14]:
X_train, X_test, y_train, y_test = train_test_split(features, encoded_labels, test_size=0.1, random_state=42)
model = models.Sequential()

# Convolutional layers
model.add(layers.Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling1D(pool_size=2))

model.add(layers.Conv1D(128, kernel_size=3, activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling1D(pool_size=2))

model.add(layers.Conv1D(256, kernel_size=3, activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling1D(pool_size=2))

# Recurrent layers
model.add(layers.Bidirectional(layers.LSTM(128, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(128)))

# Fully connected layers
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dropout(0.5))

# Output layer
model.add(layers.Dense(6, activation='softmax'))

# Compile the model
optimiser = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimiser, loss='sparse_categorical_crossentropy', metrics=['accuracy'],run_eagerly=True)


In [15]:
from sklearn.utils.class_weight import compute_class_weight
class_labels = np.unique(labels)
class_indices = {label: index for index, label in enumerate(class_labels)}
Y = np.array([class_indices[label] for label in labels])

# Calculate class weights
class_weights = compute_class_weight(class_weight ='balanced',classes = np.unique(Y),y= Y)

# Convert class weights to a dictionary for class_weight parameter in model.fit
class_weights_dict = {class_index: weight for class_index, weight in zip(np.unique(Y), class_weights)}
# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), class_weight=class_weights_dict)

# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test accuracy: {test_acc}')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test accuracy: 0.8290225267410278


In [13]:
# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test accuracy: {test_acc}')

Test accuracy: 0.7823312282562256


In [None]:
model.save("emotion_classification_Audio_model_With_Augmentation_With_More_Features.h5")

  saving_api.save_model(


In [None]:
!pip install nlpaug
import nlpaug.augmenter.word as naw

# Create an augmentation pipeline
augmenter = naw.SynonymAug(aug_src='wordnet')
sentdict = {'IEO':"It's eleven o'clock",'TIE':"That is exactly what happened",'IOM':"I'm on my way to the meeting",'IWW':"I wonder what this is about",'TAI':"The airplane is almost full",'MTI':"Maybe tomorrow it will be cold",
            'IWL':"I would like a new alarm clock",'ITH':"I think I have a doctor's appointment",'DFA':"Don't forget a jacket",'ITS':"I think I've seen this before",'TSI':"The surface is slick",'WSI':"We'll stop in a couple of minutes"}
testsentences = []
finalemotion=[]
def create(sentence):
  text = sentence + ' 😠'
  augmented_text1 = augmenter.augment(text)
  augmented_text2 = augmenter.augment(text)
  testsentences.append(text)
  testsentences.append(augmented_text1[0])
  testsentences.append(augmented_text2[0])
  finalemotion.append(0)
  finalemotion.append(0)
  finalemotion.append(0)
  text = sentence + ' 😖'
  augmented_text1 = augmenter.augment(text)
  augmented_text2 = augmenter.augment(text)
  testsentences.append(text)
  testsentences.append(augmented_text1[0])
  testsentences.append(augmented_text2[0])
  finalemotion.append(1)
  finalemotion.append(1)
  finalemotion.append(1)
  text = sentence + ' 😱'
  augmented_text1 = augmenter.augment(text)
  augmented_text2 = augmenter.augment(text)
  testsentences.append(text)
  testsentences.append(augmented_text1[0])
  testsentences.append(augmented_text2[0])
  finalemotion.append(2)
  finalemotion.append(2)
  finalemotion.append(2)
  text = sentence + ' 😊'
  augmented_text1 = augmenter.augment(text)
  augmented_text2 = augmenter.augment(text)
  testsentences.append(text)
  testsentences.append(augmented_text1[0])
  testsentences.append(augmented_text2[0])
  finalemotion.append(3)
  finalemotion.append(3)
  finalemotion.append(3)
  text = sentence + ' 😐'
  augmented_text1 = augmenter.augment(text)
  augmented_text2 = augmenter.augment(text)
  testsentences.append(text)
  testsentences.append(augmented_text1[0])
  testsentences.append(augmented_text2[0])
  finalemotion.append(4)
  finalemotion.append(4)
  finalemotion.append(4)
  text = sentence + ' 😢'
  augmented_text1 = augmenter.augment(text)
  augmented_text2 = augmenter.augment(text)
  testsentences.append(text)
  testsentences.append(augmented_text1[0])
  testsentences.append(augmented_text2[0])
  finalemotion.append(5)
  finalemotion.append(5)
  finalemotion.append(5)
for k in sentdict.keys():
  create(sentdict[k])

In [None]:
print(len(testsentences))
print(len(finalemotion))

216
216


In [None]:
import re
import random

augmentedemojitext=[]
emotions=[]

def emoji_augmentation(text):
    # Define a dictionary of emoji replacements
    emoji_replacements = {
        "😠": ["😠", "😡", "😤", "😾"],
        "😖": ["😖", "😣", "😞", "😷"],
        "😱": ["😱", "😨", "😰", "😲"],
        "😊": ["😊", "😄", "😁", "😆"],
        "😐": ["😐", "😑", "😶", "😏"],
        "😢": ["😢", "😭", "😓", "😥"],
        # Add more emojis and their possible replacements
    }

    emoji_dict = {"😠":0,"😖": 1,"😱": 2,"😊": 3,"😐": 4,"😢": 5}

    # Use regular expression to find emojis in the text
    emoji_pattern = re.compile(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0\U000024C2-\U0001F251]+')
    matches = emoji_pattern.findall(text)

    # Perform augmentation by randomly replacing emojis
    for match in matches:
      if match in emoji_replacements:
        for replacement in emoji_replacements[match]:
          augmented_text = text
          augmented_text = augmented_text.replace(match, replacement)
          augmentedemojitext.append(augmented_text)
          emotions.append(emoji_dict[match])

for text in testsentences:
  emoji_augmentation(text)


In [None]:
print(len(augmentedemojitext))
print(len(finalemotion))

864
216


In [None]:
import warnings
warnings.filterwarnings('ignore')

import random
import re
import string

import nltk
from nltk import pos_tag
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from transformers import TFRobertaModel, RobertaTokenizerFast
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline

from keras.models import Model
from keras.layers import Dense, Dropout, Input
from keras.optimizers import Adam
from keras.losses import CategoricalCrossentropy
from keras.metrics import CategoricalAccuracy
from keras.callbacks import EarlyStopping, ModelCheckpoint

from tabulate import tabulate


In [None]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(augmentedemojitext,emotions, test_size=0.1, random_state=42)

In [None]:
from transformers import TFRobertaModel, RobertaTokenizerFast
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from keras.losses import CategoricalCrossentropy
from keras.metrics import CategoricalAccuracy
from keras.optimizers import Adam
tokenizer_roberta = RobertaTokenizerFast.from_pretrained('cardiffnlp/twitter-roberta-base-emotion')

MAX_LEN=128

def tokenize_roberta(data, max_len=MAX_LEN) :
    input_ids = []
    attention_masks = []
    for i in range(len(data)):
        encoded = tokenizer_roberta.encode_plus(
            data[i],
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids),np.array(attention_masks)

def create_model(bert_model, max_len=MAX_LEN):
    inputs = Input(shape=(max_len,), dtype='int32')
    masks = Input(shape=(max_len,), dtype='int32')

    bert_output = bert_model([inputs, masks])[1]

    dense_1 = Dense(128, activation='relu')(bert_output)
    dropout_1 = Dropout(0.5)(dense_1)

    dense_2 = Dense(64, activation='relu')(dropout_1)
    dropout_2 = Dropout(0.5)(dense_2)

    output = Dense(6, activation='softmax')(dropout_2)

    model = Model(inputs=[inputs, masks], outputs=output)

    model.compile(optimizer=Adam(learning_rate=1e-5),
                  loss=CategoricalCrossentropy(),
                  metrics=CategoricalAccuracy())
    return model

roberta_model = TFRobertaModel.from_pretrained('cardiffnlp/twitter-roberta-base-emotion')
model = create_model(roberta_model, MAX_LEN)


Some layers from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion were not used when initializing TFRobertaModel: ['classifier']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [None]:
train_inputs, train_masks = tokenize_roberta(X_train1, MAX_LEN)#tokenize_roberta(X_train, MAX_LEN)
history = model.fit([train_inputs, train_masks],  OneHotEncoder().fit_transform(np.array(y_train1).reshape(-1, 1)).toarray(),  epochs=4,  batch_size=32)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [None]:
%%capture
textemo=[]
for s in X_test1:
  textemo.append(np.argmax(model.predict([tokenize_roberta([s])])[0]))

In [None]:
te=0
for i in range(0,len(X_test1)):
  if y_test1[i] == textemo[i]:
    te= te+1
print(te/len(X_test1))

0.9080459770114943


In [None]:
from keras.models import load_model
Audio_model = load_model("/content/emotion_classification_Audio_model_With_Augmentation.h5")
Video_model = load_model("/content/drive/MyDrive/emotion_classification_Video_model.h5")
text_model = model

In [None]:
import zipfile
import os

def unzip_folder(zip_path, extract_to):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

# Specify the path to the zip file and the directory where you want to extract the contents
zip_file_path = '/content/drive/MyDrive/VideoFlash.zip'
extracted_folder_path = 'Video'

# Create the destination folder if it doesn't exist
os.makedirs(extracted_folder_path, exist_ok=True)

# Call the function to unzip the folder
unzip_folder(zip_file_path, extracted_folder_path)

print(f"Folder '{zip_file_path}' has been successfully extracted to '{extracted_folder_path}'.")

Folder '/content/drive/MyDrive/VideoFlash.zip' has been successfully extracted to 'Video'.


In [None]:
import os
VideosData = []
data_path = '/content/Video/VideoFlash'
for folder in os.listdir(data_path):
    file_path = os.path.join(data_path, folder)
    VideosData.append(folder)

AudiosData = []
data_path = '/content/Audio/AudioWAV'
for folder in os.listdir(data_path):
    file_path = os.path.join(data_path, folder)
    AudiosData.append(folder)

print(len(VideosData))
print(len(AudiosData))

7442
7442


In [None]:
from sklearn.model_selection import train_test_split
vd, VideosDataTest = train_test_split(VideosData, test_size=0.4, random_state=42)
ad, AudiosDataTest = train_test_split(AudiosData, test_size=0.4, random_state=42)
testdata=[]
for data in VideosDataTest:
  if data[:-3]+'wav' in AudiosDataTest:
    testdata.append(data[:-3])

for data in AudiosDataTest:
  if data[:-3]+'flv' in VideosDataTest:
    testdata.append(data[:-3])

testdata = set(testdata)
testdata = list(testdata)

In [None]:
finalemotion = []
for file in testdata:
  if 'ANG' in file:
    finalemotion.append(0)
  elif 'DIS' in file:
    finalemotion.append(1)
  elif 'FEA' in file:
    finalemotion.append(2)
  elif 'HAP' in file:
    finalemotion.append(3)
  elif 'NEU' in file:
    finalemotion.append(4)
  elif 'SAD' in file:
    finalemotion.append(5)

In [None]:
len(testdata)

1216

In [None]:
emoji_replacements = {
        "😠": ["😠", "😡", "😤", "😾"],
        "😖": ["😖", "😣", "😞", "😷"],
        "😱": ["😱", "😨", "😰", "😲"],
        "😊": ["😊", "😄", "😁", "😆"],
        "😐": ["😐", "😑", "😶", "😏"],
        "😢": ["😢", "😭", "😓", "😥"],
        # Add more emojis and their possible replacements
    }
def CreateSentences(file,sentence,testsentences):
  if 'ANG' in file:
    testsentences.append(sentence + random.choice(emoji_replacements['😠']))
  elif 'DIS' in file:
    testsentences.append(sentence + random.choice(emoji_replacements['😖']))
  elif 'FEA' in file:
    testsentences.append(sentence + random.choice(emoji_replacements['😱']))
  elif 'HAP' in file:
    testsentences.append(sentence + random.choice(emoji_replacements['😊']))
  elif 'NEU' in file:
    testsentences.append(sentence + random.choice(emoji_replacements['😐']))
  elif 'SAD' in file:
    testsentences.append(sentence + random.choice(emoji_replacements['😢']))

testsentences=[]
sentdict = {'IEO':"It's eleven o'clock",'TIE':"That is exactly what happened",'IOM':"I'm on my way to the meeting",'IWW':"I wonder what this is about",'TAI':"The airplane is almost full",'MTI':"Maybe tomorrow it will be cold",
            'IWL':"I would like a new alarm clock",'ITH':"I think I have a doctor's appointment",'DFA':"Don't forget a jacket",'ITS':"I think I've seen this before",'TSI':"The surface is slick",'WSI':"We'll stop in a couple of minutes"}
for file in testdata :
  CreateSentences(file,sentdict[file[5:8]],testsentences)


In [None]:
import numpy as np
import cv2
import librosa

# Function to extract features from video
def extract_video_features(video_path):
    # Your video feature extraction code here
    # Example: Using OpenCV to extract color histogram features
    cap = cv2.VideoCapture(video_path)
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        processed_frame = cv2.resize(frame, (224, 224))
        processed_frame = processed_frame / 255.0  # Normalize pixel values

    cap.release()
    return processed_frame

# Function to extract features from audio
def extract_audio_features(file_path, mfcc=True, chroma=True, mel=True):
    # Your audio feature extraction code here
    # Example: Using librosa to extract MFCC features
    audio, sample_rate = librosa.load(file_path)
    result = np.array([])
    if mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=13), axis=1)
        result = np.hstack((result, mfccs))
    #if chroma:
    #    chroma = np.mean(librosa.feature.chroma_stft(y=audio, sr=sample_rate), axis=1)
    #    result = np.hstack((result, chroma))
    #if mel:
    #    mel = np.mean(librosa.feature.melspectrogram(y=audio, sr=sample_rate), axis=1)
    #    result = np.hstack((result, mel))
    return result

# Load your pre-trained text model (replace with your actual model loading code)
def predict_text_emotion(text):
    # Your text prediction code here
    # Example: Using a simple RandomForestClassifier
    # You should replace this with your actual text classification model
    return text_model.predict([tokenize_roberta([text])])[0]

# Load your pre-trained video model (replace with your actual model loading code)
def predict_video_emotion(video_path):
    # Your video prediction code here
    # Example: Extract video features and use a simple RandomForestClassifier
    video_features = extract_video_features(video_path)
    return Video_model.predict(np.array([video_features]))

# Load your pre-trained audio model (replace with your actual model loading code)
def predict_audio_emotion(audio_path):
    # Your audio prediction code here
    # Example: Extract audio features and use a simple RandomForestClassifier
    audio_features = extract_audio_features(audio_path)
    return Audio_model.predict(np.array([audio_features]))

textemotion_pred=[]
videoemotion_pred=[]
audioemotion_pred=[]
# Example usage
for i in range(0,len(testdata)):
  text = testsentences[i]
  video_path = '/content/Video/VideoFlash/' + testdata[i] + 'flv' #"/content/Video/1064_IEO_SAD_HI.flv"
  audio_path = '/content/Audio/AudioWAV/'+ testdata[i] + 'wav' #"/content/Audio/1064_IEO_SAD_HI.wav"

    # Let's assume your text, video, and audio models have predicted the following emotions
  textemotion_pred.append(predict_text_emotion(text))#[0.2, 0.3, 0.1, 0.1, 0.2, 0.1]  # Example prediction from the text model
  videoemotion_pred.append(predict_video_emotion(video_path))#[0.1, 0.4, 0.1, 0.1, 0.1, 0.2]  # Example prediction from the video model
  audioemotion_pred.append(predict_audio_emotion(audio_path))#[0.3, 0.2, 0.2, 0.1, 0.1, 0.1]  # Example prediction from the audio model

textemotions=[]
videoemotions=[]
audioemotions=[]
finalemotions=[]
def predict(tw,vw,aw):
  textemotions.clear()
  videoemotions.clear()
  audioemotions.clear()
  finalemotions.clear()
  for i in range(0,len(testdata)):
    # Set custom weights for each modality
    text_weight = tw
    video_weight = vw
    audio_weight = aw

    # Apply custom weights to each modality's prediction
    weighted_text_emotion = text_weight * np.array(textemotion_pred[i])
    weighted_video_emotion = video_weight * np.array(videoemotion_pred[i])
    weighted_audio_emotion = audio_weight * np.array(audioemotion_pred[i])

    # Combine the weighted predictions (you can choose a different method, e.g., averaging)
    final_emotion = np.argmax(weighted_text_emotion + weighted_video_emotion + weighted_audio_emotion)

    textemotions.append(np.argmax(weighted_text_emotion))
    videoemotions.append(np.argmax(weighted_video_emotion))
    audioemotions.append(np.argmax(weighted_audio_emotion))
    finalemotions.append(final_emotion)

In [None]:
#equal weightage to audio and video
def getAccuracy():
  te=0
  ae=0
  ve=0
  fe=0
  for i in range(0,1216):
    if finalemotion[i] == textemotions[i]:
      te= te+1
    if finalemotion[i] == audioemotions[i]:
      ae= ae+1
    if finalemotion[i] == videoemotions[i]:
      ve= ve+1
    if finalemotion[i] == finalemotions[i]:
      fe= fe+1
  print(' final emotion : ' + str(fe/1216))

In [None]:
predict(1,1,1)
getAccuracy()

 final emotion : 0.9712171052631579
 final emotion : 0.9358552631578947
 final emotion : 0.625
 final emotion : 0.9498355263157895


In [None]:
predict(1,1,1)
getAccuracy()

 final emotion : 0.9761513157894737
 final emotion : 0.8338815789473685
 final emotion : 0.625
 final emotion : 0.9498355263157895


In [None]:
predict(1,1,1)
getAccuracy()
predict(0,1,1.2) # aw =1.2
getAccuracy()
predict(0,1,1.5) # aw =1.5
getAccuracy()
predict(0,1,1.8) # aw =1.8
getAccuracy()
predict(0,1.2,1) # vw =1.2
getAccuracy()
predict(0,1.5,1) # vw =1.5
getAccuracy()
predict(0,1.8,1) # vw =1.8
getAccuracy()
predict(1,0,1.2) # aw =1.2
getAccuracy()
predict(1,0,1.5) # aw =1.5
getAccuracy()
predict(1,0,1.8) # aw =1.8
getAccuracy()
predict(1,1.2,0) # vw =1.2
getAccuracy()
predict(1,1.5,0) # vw =1.5
getAccuracy()
predict(1,1.8,0) # vw =1.8
getAccuracy()
predict(1,1,1.2) # aw =1.2
getAccuracy()
predict(1,1,1.5) # aw =1.5
getAccuracy()
predict(1,1,1.8) # aw =1.8
getAccuracy()
predict(1,1.2,1) # vw =1.2
getAccuracy()
predict(1,1.5,1) # vw =1.5
getAccuracy()
predict(1,1.8,1) # vw =1.8
getAccuracy()
predict(1.2,1,1) # tw =1.2
getAccuracy()
predict(1.5,1,1) # tw =1.5
getAccuracy()
predict(1.8,1,1) # tw =1.8
getAccuracy()
predict(1.2,1.5,1) # tw =1.2,vw =1.5
getAccuracy()
predict(1.5,1.5,1) # tw =1.5,vw = 1.5
getAccuracy()
predict(1.8,1.5,1) # tw =1.8,vw =1.5
getAccuracy()
predict(1.2,1,1.5) # tw =1.2
getAccuracy()
predict(1.5,1,1.5) # tw =1.5
getAccuracy()
predict(1.8,1,1.5) # tw =1.8
getAccuracy()
predict(1,1.5,1.2) # aw =1.2
getAccuracy()
predict(1,1.2,1.5) # aw =1.5
getAccuracy()
predict(1,1.2,1.8) # aw =1.8
getAccuracy()
predict(1,1.8,1.2) # aw =1.8
getAccuracy()
predict(1,1.5,1.8) # aw =1.8
getAccuracy()
predict(1,1.8,1.5) # aw =1.8
getAccuracy()

 final emotion : 0.9712171052631579
 final emotion : 0.953125
 final emotion : 0.9588815789473685
 final emotion : 0.959703947368421
 final emotion : 0.8634868421052632
 final emotion : 0.7976973684210527
 final emotion : 0.7532894736842105
 final emotion : 0.962171052631579
 final emotion : 0.959703947368421
 final emotion : 0.9580592105263158
 final emotion : 0.7006578947368421
 final emotion : 0.671875
 final emotion : 0.6661184210526315
 final emotion : 0.975328947368421
 final emotion : 0.9761513157894737
 final emotion : 0.9736842105263158
 final emotion : 0.9506578947368421
 final emotion : 0.9004934210526315
 final emotion : 0.8379934210526315
 final emotion : 0.9761513157894737
 final emotion : 0.977796052631579
 final emotion : 0.9786184210526315
 final emotion : 0.9095394736842105
 final emotion : 0.9243421052631579
 final emotion : 0.9391447368421053
 final emotion : 0.9761513157894737
 final emotion : 0.9802631578947368
 final emotion : 0.9810855263157895
 final emotion : 

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

predicted_labels = label_encoder.inverse_transform(audioemotions)
true_labels = label_encoder.inverse_transform(finalemotion)
classification_metrics = classification_report(predicted_labels, true_labels)
confusion_mtx = confusion_matrix(predicted_labels, true_labels)

print('Classification Report:')
print(classification_metrics)

print('Confusion Matrix:')
print(confusion_mtx)

Classification Report:
              precision    recall  f1-score   support

       Angry       0.99      0.97      0.98       197
     Disgust       0.90      0.97      0.93       184
        Fear       0.97      0.91      0.94       204
       Happy       0.95      0.99      0.97       219
     Neutral       0.94      0.96      0.95       206
         Sad       0.94      0.90      0.92       206

    accuracy                           0.95      1216
   macro avg       0.95      0.95      0.95      1216
weighted avg       0.95      0.95      0.95      1216

Confusion Matrix:
[[191   0   2   4   0   0]
 [  0 179   0   2   2   1]
 [  0   5 185   1   5   8]
 [  0   3   0 216   0   0]
 [  0   3   1   2 198   2]
 [  1   9   3   2   5 186]]


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

predicted_labels = label_encoder.inverse_transform(videoemotions)
true_labels = label_encoder.inverse_transform(finalemotion)
classification_metrics = classification_report(predicted_labels, true_labels)
confusion_mtx = confusion_matrix(predicted_labels, true_labels)

print('Classification Report:')
print(classification_metrics)

print('Confusion Matrix:')
print(confusion_mtx)

Classification Report:
              precision    recall  f1-score   support

       Angry       0.55      0.55      0.55       192
     Disgust       0.73      0.64      0.68       229
        Fear       0.58      0.57      0.57       196
       Happy       0.79      0.89      0.84       203
     Neutral       0.63      0.60      0.61       223
         Sad       0.43      0.49      0.46       173

    accuracy                           0.62      1216
   macro avg       0.62      0.62      0.62      1216
weighted avg       0.63      0.62      0.63      1216

Confusion Matrix:
[[105  18  30   9  15  15]
 [ 13 146  12  18  10  30]
 [ 31  13 111   8  14  19]
 [  3   4   2 180   6   8]
 [ 19   8  14   9 133  40]
 [ 21  10  22   3  32  85]]


In [None]:
predicted_labels = label_encoder.inverse_transform(textemotions)
true_labels = label_encoder.inverse_transform(finalemotion)
classification_metrics = classification_report(predicted_labels, true_labels)
confusion_mtx = confusion_matrix(predicted_labels, true_labels)

print('Classification Report:')
print(classification_metrics)

print('Confusion Matrix:')
print(confusion_mtx)

Classification Report:
              precision    recall  f1-score   support

       Angry       1.00      1.00      1.00       192
     Disgust       0.84      0.78      0.81       213
        Fear       0.77      1.00      0.87       148
       Happy       1.00      1.00      1.00       227
     Neutral       0.99      1.00      0.99       207
         Sad       1.00      0.86      0.92       229

    accuracy                           0.94      1216
   macro avg       0.93      0.94      0.93      1216
weighted avg       0.94      0.94      0.94      1216

Confusion Matrix:
[[192   0   0   0   0   0]
 [  0 167  43   0   3   0]
 [  0   0 148   0   0   0]
 [  0   0   0 227   0   0]
 [  0   0   0   0 207   0]
 [  0  32   0   0   0 197]]


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

predicted_labels = label_encoder.inverse_transform(finalemotions)
true_labels = label_encoder.inverse_transform(finalemotion)
classification_metrics = classification_report(predicted_labels, true_labels)
confusion_mtx = confusion_matrix(predicted_labels, true_labels)

print('Classification Report:')
print(classification_metrics)

print('Confusion Matrix:')
print(confusion_mtx)

Classification Report:
              precision    recall  f1-score   support

       Angry       1.00      0.97      0.99       197
     Disgust       0.94      0.97      0.96       194
        Fear       0.95      0.94      0.95       192
       Happy       1.00      1.00      1.00       228
     Neutral       0.96      0.98      0.97       206
         Sad       0.97      0.96      0.96       199

    accuracy                           0.97      1216
   macro avg       0.97      0.97      0.97      1216
weighted avg       0.97      0.97      0.97      1216

Confusion Matrix:
[[192   0   2   0   2   1]
 [  0 188   2   0   2   2]
 [  0   6 181   0   3   2]
 [  0   1   0 227   0   0]
 [  0   1   2   0 202   1]
 [  0   3   4   0   1 191]]


In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(['Angry','Disgust','Fear','Happy','Neutral','Sad'])