3d CNN

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv3D, MaxPooling3D, Flatten, Dense

# Define the 3D CNN model for video data
video_model = Sequential()
video_model.add(Conv3D(32, kernel_size=(3, 3, 3), activation='relu', input_shape=(frames, height, width, channels)))
video_model.add(MaxPooling3D(pool_size=(2, 2, 2)))
video_model.add(Flatten())
video_model.add(Dense(128, activation='relu'))
video_model.add(Dense(num_classes, activation='softmax'))


BERT

In [None]:
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = TFBertModel.from_pretrained("bert-base-uncased")

# Define BERT input and output layers
text_input = Input(shape=(max_sequence_length,), dtype=tf.int32, name="text_input")
text_output = bert_model(text_input)[0]
text_output = Dense(128, activation='relu')(text_output)
text_output = Dense(num_classes, activation='softmax')(text_output)

text_model = Model(inputs=text_input, outputs=text_output)


MFCC

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Dense, Flatten

# Define the MFCC model for audio data
audio_model = Sequential()
audio_model.add(InputLayer(input_shape=(num_mfcc_features, frames)))
audio_model.add(Flatten())
audio_model.add(Dense(128, activation='relu'))
audio_model.add(Dense(num_classes, activation='softmax'))


In [None]:
from tensorflow.keras.layers import concatenate, Input, Multiply
from tensorflow.keras.models import Model

# Input layers for each modality
video_input = Input(shape=(video_data.shape[1],), name="video_input")
text_input = Input(shape=(text_data.shape[1],), name="text_input")
audio_input = Input(shape=(audio_data.shape[1],), name="audio_input")

# User-defined weights for each modality
video_weight = 0.25
text_weight = 0.50
audio_weight = 0.25

# Get the outputs from each modality model
video_output = video_model(video_input)
text_output = text_model(text_input)
audio_output = audio_model(audio_input)

# Apply user-defined weights to scale the modality outputs
scaled_video = Multiply()([video_output, video_weight])
scaled_text = Multiply()([text_output, text_weight])
scaled_audio = Multiply()([audio_output, audio_weight])

# Combine the scaled outputs
combined_features = concatenate([scaled_video, scaled_text, scaled_audio], axis=-1)

# Add more layers for multi-modal processing if needed
# For instance, you can add dense layers or further attention mechanisms

# Final classification layer
fusion_output = Dense(num_classes, activation='softmax')(combined_features)

# Create the fusion model
fusion_model = Model(inputs=[video_input, text_input, audio_input], outputs=fusion_output)

# Compile and train the fusion model
fusion_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
fusion_model.fit([video_data, text_data, audio_data], labels, epochs=10, batch_size=32)


In [None]:
import os
# Specify the path for the new folder
folder_path = "/content/AudioWithCategorisedWAV/"

folders = ['Anger','Disgust','Fear','Happy','Neutral','Sad']

for i in folders:
  folder_path1 = folder_path + i
  if not os.path.exists(folder_path1):
    os.makedirs(folder_path1)

In [None]:
!pip install tensorflow numpy librosa

In [None]:
import os

import shutil
#sourcepath = '/content/AudioInput'
sourcepath = '/content/Audio/AudioWAV'
destinationpath = '/content/AudioWithCategorisedWAV'
files = os.listdir(sourcepath)
i=0
for file in files:
    i+=1
    if i == 3200:
      break
    source_file = os.path.join(sourcepath, file)
    if not '.wav' in file:
      continue
    if 'ANG' in file:
      destination_file = os.path.join(destinationpath+'/Anger', file)
      shutil.copy(source_file, destination_file)
    elif 'DIS' in file:
      destination_file = os.path.join(destinationpath+'/Disgust', file)
      shutil.copy(source_file, destination_file)
    elif 'FEA' in file:
      destination_file = os.path.join(destinationpath+'/Fear', file)
      shutil.copy(source_file, destination_file)
    elif 'HAP' in file:
      destination_file = os.path.join(destinationpath+'/Happy', file)
      shutil.copy(source_file, destination_file)
    elif 'NEU' in file:
      destination_file = os.path.join(destinationpath+'/Neutral', file)
      shutil.copy(source_file, destination_file)
    elif 'SAD' in file:
      destination_file = os.path.join(destinationpath+'/Sad', file)
      shutil.copy(source_file, destination_file)

In [None]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.utils import to_categorical

# Define the paths to your data
data_dir = "/content/AudioWithCategorisedWAV"
class_folders = os.listdir(data_dir)
num_mfcc = 13  # Number of MFCC coefficients
num_frames = 41  # Number of time frames for each MFCC

# Initialize empty lists to store data and labels
data = []
labels = []

# Loop through each class folder
for i, class_folder in enumerate(class_folders):
    class_path = os.path.join(data_dir, class_folder)

    # Loop through audio files in the class folder
    for audio_file in os.listdir(class_path):
        audio_path = os.path.join(class_path, audio_file)

        # Extract MFCC features from the audio file
        audio, sr = librosa.load(audio_path, sr=None)
        mfccs = librosa.feature.mfcc(y =audio, sr=sr, n_mfcc=num_mfcc)

        # Make sure all MFCC feature matrices have the same shape
        if mfccs.shape[1] < num_frames:
            pad_width = num_frames - mfccs.shape[1]
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        elif mfccs.shape[1] > num_frames:
            mfccs = mfccs[:, :num_frames]

        data.append(mfccs)
        labels.append(i)  # Assign a class label to each MFCC feature

# Convert data and labels to NumPy arrays
data = np.array(data)
labels = np.array(labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.1, random_state=42)

# One-hot encode the labels
y_train = to_categorical(y_train, num_classes=len(class_folders))
y_test = to_categorical(y_test, num_classes=len(class_folders))


In [None]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

data1=[]
for x in data:
  data1.append(np.ravel(x))

# Convert lists to NumPy arrays
X = np.array(data1)
y = np.array(labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train a RandomForestClassifier
audio_classifier = RandomForestClassifier(n_estimators=500, max_depth= 50, random_state=42)
audio_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = audio_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

y_pred = audio_classifier.predict(X_train)
accuracy = accuracy_score(y_train, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 43.32%
Model Accuracy: 100.00%


In [None]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import models, layers

# Define the paths to your data
data_dir = "/content/AudioWithCategorisedWAV"
class_folders = os.listdir(data_dir)
num_mfcc = 13  # Number of MFCC coefficients

# Initialize empty lists to store data and labels
features = []
labels = []

# Loop through each class folder
for i, class_folder in enumerate(class_folders):
    class_path = os.path.join(data_dir, class_folder)

    # Loop through audio files in the class folder
    for audio_file in os.listdir(class_path):
        audio_path = os.path.join(class_path, audio_file)

        # Extract MFCC features from the audio file
        audio, sr = librosa.load(audio_path, sr=None)
        mfccs = librosa.feature.mfcc(y =audio, sr=sr, n_mfcc=num_mfcc)
        # Normalize MFCCs
        mfccs = (mfccs - np.mean(mfccs)) / np.std(mfccs)

        # Data Augmentation: Random pitch shift
        pitch_shift_steps = np.random.randint(low=-5, high=5)
        y_pitch_shifted = librosa.effects.pitch_shift(audio,sr=sr, n_steps=pitch_shift_steps)
        augmented_mfccs = librosa.feature.mfcc(y=y_pitch_shifted, sr=sr, n_mfcc=13)

        # Make sure all MFCC feature matrices have the same shape
        if augmented_mfccs.shape[1] < num_frames:
            pad_width = num_frames - augmented_mfccs.shape[1]
            augmented_mfccs = np.pad(augmented_mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        elif augmented_mfccs.shape[1] > num_frames:
            augmented_mfccs = augmented_mfccs[:, :num_frames]
        features.append(np.expand_dims(augmented_mfccs, axis=-1))
        labels.append(i)  # Assign a class label to each MFCC feature

In [None]:
from tensorflow.keras import optimizers

# Convert lists to NumPy arrays
X = np.stack(features, axis=0)
y = np.array(labels)

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Create a CNN model
model = models.Sequential([
    layers.Conv2D(64, (3, 3), activation='relu', input_shape=(X.shape[1], X.shape[2], X.shape[3])),
    layers.BatchNormalization(),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.BatchNormalization(),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(len(np.unique(y_encoded)), activation='softmax')
])

optimiser = optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimiser, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate the model
accuracy = model.evaluate(X_test, y_test, verbose=0)[1]
print(f"Model Accuracy: {accuracy * 100:.2f}%")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Model Accuracy: 38.35%


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import librosa
import numpy as np
import os

# Function to extract audio features using librosa
def extract_features(file_path, mfcc=True, chroma=True, mel=True):
    audio, sample_rate = librosa.load(file_path)
    result = np.array([])
    if mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=13), axis=1)
        result = np.hstack((result, mfccs))
    if chroma:
        chroma = np.mean(librosa.feature.chroma_stft(y=audio, sr=sample_rate), axis=1)
        result = np.hstack((result, chroma))
    if mel:
        mel = np.mean(librosa.feature.melspectrogram(y=audio, sr=sample_rate), axis=1)
        result = np.hstack((result, mel))
    return result

# Function to load audio data and labels
def load_data(data_path):
    features, labels = [], []
    for folder in os.listdir(data_path):
        label = folder
        for file_name in os.listdir(os.path.join(data_path, folder)):
            file_path = os.path.join(data_path, folder, file_name)
            feature = extract_features(file_path)
            features.append(feature)
            labels.append(label)
    return np.array(features), np.array(labels)

# Load data and preprocess
data_path = "/content/AudioWithCategorisedWAV"
features, labels = load_data(data_path)

# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, encoded_labels, test_size=0.2, random_state=42)

# Build a simple CNN model
model = models.Sequential()
model.add(layers.Reshape((X_train.shape[1], 1), input_shape=(X_train.shape[1],)))
model.add(layers.Conv1D(64, kernel_size=3, activation='relu'))
model.add(layers.MaxPooling1D(pool_size=2))
model.add(layers.Flatten())
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(len(np.unique(encoded_labels)), activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
#model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model on the test set
#test_loss, test_acc = model.evaluate(X_test, y_test)
#print(f'Test accuracy: {test_acc}')


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

model = models.Sequential()
model.add(layers.Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling1D(pool_size=2))
model.add(layers.Dropout(0.3))

model.add(layers.Conv1D(128, kernel_size=3, activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling1D(pool_size=2))
model.add(layers.Dropout(0.3))

model.add(layers.GRU(64, return_sequences=True))
model.add(layers.GRU(64))
model.add(layers.Dropout(0.3))

model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dropout(0.3))
model.add(layers.Dense(len(np.unique(encoded_labels)), activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [None]:
model = models.Sequential()

# Convolutional layers
model.add(layers.Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling1D(pool_size=2))

model.add(layers.Conv1D(128, kernel_size=3, activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling1D(pool_size=2))

model.add(layers.Conv1D(256, kernel_size=3, activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling1D(pool_size=2))

# Recurrent layers
model.add(layers.Bidirectional(layers.LSTM(128, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(128)))

# Fully connected layers
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dropout(0.5))

# Output layer
model.add(layers.Dense(6, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'],run_eagerly=True)


In [None]:
from sklearn.utils.class_weight import compute_class_weight
class_labels = np.unique(labels)
class_indices = {label: index for index, label in enumerate(class_labels)}
Y = np.array([class_indices[label] for label in labels])

# Calculate class weights
class_weights = compute_class_weight(class_weight ='balanced',classes = np.unique(Y),y= Y)

# Convert class weights to a dictionary for class_weight parameter in model.fit
class_weights_dict = {class_index: weight for class_index, weight in zip(np.unique(Y), class_weights)}
# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), class_weight=class_weights_dict)

# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test accuracy: {test_acc}')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test accuracy: 0.44218748807907104


In [None]:
# Convert lists to NumPy arrays
X = np.stack(features, axis=0)
y = np.array(labels)

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Create a simple model for classification
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(X.shape[1], X.shape[2])),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(len(np.unique(y_encoded)), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate the model
accuracy = model.evaluate(X_test, y_test, verbose=0)[1]
print(f"Model Accuracy: {accuracy * 100:.2f}%")


In [None]:
len(features)

2319

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

model = keras.Sequential([
    layers.Input(shape=(num_mfcc, num_frames, 1)),

    layers.Conv2D(64, (3, 3), activation='relu', padding = 'same'),
    layers.BatchNormalization(),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(128, (3, 3), activation='relu', padding = 'same'),
    layers.BatchNormalization(),
    layers.MaxPooling2D((2, 2)),

    layers.Flatten(),

    layers.Dense(256, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.5),

    layers.Dense(7, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [None]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=30, batch_size=20)


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x785b3d126d10>

In [None]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {accuracy * 100:.2f}%")

Test accuracy: 29.31%


In [None]:
audio_features_extractor = model

In [None]:
import tensorflow as tf

class VideoEmotionClassificationModel(tf.keras.Model):
    def __init__(self, num_classes):
        super(VideoEmotionClassificationModel, self).__init__()

        # Audio features extraction layer
        self.audio_features_extractor = tf.keras.Sequential([
            tf.keras.layers.Conv1D(32, 3, activation='relu'),
            tf.keras.layers.MaxPooling1D(2),
            tf.keras.layers.Conv1D(64, 3, activation='relu'),
            tf.keras.layers.MaxPooling1D(2),
            tf.keras.layers.Flatten(),
        ])

        # Video features extraction layer
        self.video_features_extractor = tf.keras.Sequential([
            tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
            tf.keras.layers.MaxPooling2D((2, 2)),
            tf.keras.layers.Conv2D(64, (3, 3), activation='relu'),
            tf.keras.layers.MaxPooling2D((2, 2)),
            tf.keras.layers.Flatten(),
        ])

        # Fusion layer
        self.fusion_layer = tf.keras.layers.Dense(128, activation='relu')

        # Classification layer
        self.classification_layer = tf.keras.layers.Dense(num_classes, activation='softmax')

    def call(self, inputs):
        # Extract audio and video features
        audio_features = self.audio_features_extractor(inputs['audio'])
        video_features = self.video_features_extractor(inputs['video'])

        # Fuse audio and video features
        fused_features = tf.concat([audio_features, video_features], axis=1)
        fused_features = self.fusion_layer(fused_features)

        # Classify the fused features
        predictions = self.classification_layer(fused_features)

        return predictions

# Load the video emotion classification model
model = VideoEmotionClassificationModel(num_classes=6)
model.load_weights('video_emotion_classification_model.h5')

# Load the video
video = tf.io.read_file('video.mp4')

# Decode the video
video = tf.io.decode_video(video)

# Extract audio and video features from the video
audio_features = tf.reshape(video['audio'], (1, -1, 1))
video_features = tf.reshape(video['video'], (1, 224, 224, 3))

# Make a prediction
predictions = model({'audio': audio_features, 'video': video_features})

# Get the most likely emotion
predicted_emotion = tf.argmax(predictions, axis=1).numpy()[0]

# Print the predicted emotion
print('Predicted emotion:', predicted_emotion)


ValueError: ignored

In [None]:
import cv2
import numpy as np
from keras.applications.vgg19 import VGG19, preprocess_input
from keras.preprocessing import image
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D
from keras.optimizers import Adam

# Load the VGG19 model without the fully connected layers
base_model = VGG19(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Add your own fully connected layers for emotion classification
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)
predictions = Dense(num_emotions, activation='softmax')(x)  # Add your number of emotion classes

model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model
model.compile(optimizer=Adam(lr=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

# Load the pre-trained weights if available
# model.load_weights('your_weights.h5')

# Load and process the video frames for emotion classification
video_path = 'path_to_your_video.mp4'

cap = cv2.VideoCapture(video_path)
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

while True:
    ret, frame = cap.read()
    if not ret:
        break

    # Preprocess the frame for VGG19
    frame = cv2.resize(frame, (224, 224))
    frame = image.img_to_array(frame)
    frame = np.expand_dims(frame, axis=0)
    frame = preprocess_input(frame)

    # Make predictions
    emotion_predictions = model.predict(frame)

    # Aggregate or process the predictions here for your specific needs

# Release the video capture and close any windows
cap.release()
cv2.destroyAllWindows()


## Video Model

In [None]:
def preprocess_frame(frame):
    # Preprocess the frame: resize, normalize, etc.
    processed_frame = cv2.resize(frame, (224, 224))
    processed_frame = processed_frame / 255.0  # Normalize pixel values
    return processed_frame

def read_video(video_path):
    cap = cv2.VideoCapture(video_path)
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        processed_frame = preprocess_frame(frame)

    cap.release()
    return processed_frame


In [None]:
import os
import cv2
import numpy as np
from keras.applications.vgg19 import VGG19, preprocess_input
from keras.preprocessing import image
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D, BatchNormalization
from keras.optimizers import Adam

import shutil
sourcepath = '/content/Videos'
files = os.listdir(sourcepath)
xdata = []
ylabels = []

for file in files:
    source_file = os.path.join(sourcepath, file)
    xdata.append(read_video(source_file))
    if 'ANG' in file:
      ylabels.append('Ang')
    elif 'DIS' in file:
      ylabels.append('DIS')
    elif 'FEA' in file:
      ylabels.append('FEA')
    elif 'HAP' in file:
      ylabels.append('HAP')
    elif 'NEU' in file:
      ylabels.append('NEU')
    elif 'SAD' in file:
      ylabels.append('SAD')

xdata1 = np.array(xdata)

In [None]:
from sklearn.utils.class_weight import compute_class_weight
# Get unique class labels
class_labels = np.unique(ylabels)

# Load the VGG19 model without the fully connected layers
base_model = VGG19(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

for layer in base_model.layers:
    layer.trainable = False

# Add your own fully connected layers for emotion classification
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = BatchNormalization()(x)
x = Dense(1024, activation='relu')(x)
x = BatchNormalization()(x)
predictions = Dense(6, activation='softmax')(x)  # Add your number of emotion classes

model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'],run_eagerly=True)


In [None]:
from sklearn.model_selection import train_test_split
X_train1, X_test1, y_train1, y_test1 = train_test_split(xdata1, ylabels, test_size=0.1, random_state=42)
from sklearn.preprocessing import OneHotEncoder
onehot_encoder = OneHotEncoder(sparse = False)
y_train_onehot = onehot_encoder.fit_transform((np.array(y_train1)).reshape(-1, 1))



In [None]:
class_labels = np.unique(y_train1)
class_indices = {label: index for index, label in enumerate(class_labels)}
Y = np.array([class_indices[label] for label in y_train1])

# Calculate class weights
class_weights = compute_class_weight(class_weight ='balanced',classes = np.unique(Y),y= Y)

# Convert class weights to a dictionary for class_weight parameter in model.fit
class_weights_dict = {class_index: weight for class_index, weight in zip(np.unique(Y), class_weights)}

model.fit(x=X_train1,y=y_train_onehot,batch_size=16,epochs=20, class_weight=class_weights_dict)

In [None]:
y_test_onehot1 = onehot_encoder.transform((np.array(y_test1)).reshape(-1, 1))
model.evaluate(X_test1,y_test_onehot1)



[0.9843055605888367, 0.6363636255264282]

In [None]:
model.save("emotion_classification_Video_model.h5")

# Optionally, you can also save the weights only
model.save_weights("emotion_classification_model_Video_weights.h5")

  saving_api.save_model(


In [None]:
video_path = '/content/Videos/1001_IEO__HI.flv'

cap = cv2.VideoCapture(video_path)

predicted_emotion = -1
emotion_predictions = -1
emotion_counts = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0}
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    # Preprocess the frame for VGG19
    frame = cv2.resize(frame, (224, 224))
    frame = image.img_to_array(frame)
    frame = np.expand_dims(frame, axis=0)
    frame = preprocess_input(frame)

    # Make predictions
    emotion_predictions = model.predict(frame)
    predicted_emotion = np.argmax(emotion_predictions)
    emotion_counts[predicted_emotion] += 1

    # Aggregate or process the predictions here for your specific needs

#print(predicted_emotion)
print(max(emotion_counts, key=emotion_counts.get))
# Release the video capture
cap.release()


In [None]:
!pip install contractions
!pip install nltk
#!pip install transformers
#!pip install --upgrade protobuf
#!pip install --upgrade tensorflow --user

In [None]:
import warnings
warnings.filterwarnings('ignore')

import random
import re
import string

import contractions

import nltk
from nltk import pos_tag
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

In [None]:
from transformers import TFRobertaModel, RobertaTokenizerFast
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline

from keras.models import Model
from keras.layers import Dense, Dropout, Input
from keras.optimizers import Adam
from keras.losses import CategoricalCrossentropy
from keras.metrics import CategoricalAccuracy
from keras.callbacks import EarlyStopping, ModelCheckpoint

from tabulate import tabulate


In [None]:
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('words')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [None]:
df_clean = pd.read_csv('tweet_emotions.csv')
stop_words = set(stopwords.words('english'))

def expand_contractions(text):
    '''
    Function replaces abbreviations with full word versions
    '''
    return contractions.fix(text)

def clean_content(text):

    text = expand_contractions(text)
    # remove twitter handles
    clean_text = re.sub(r'@\w+\s?', '', text)

    # convert to lowercase
    clean_text = clean_text.lower()

    # remove links http:// or https://
    clean_text = re.sub(r'https?:\/\/\S+', '', clean_text)

    # remove links beginning with www. and ending with .com
    clean_text = re.sub(r'www\.[a-z]?\.?(com)+|[a-z]+\.(com)', '', clean_text)

    # remove html reference characters
    clean_text = re.sub(r'&[a-z]+;', '', clean_text)

    # remove non-letter characters besides spaces "/", ";" "[", "]" "=", "#"
    clean_text = re.sub(r"[^a-z\s\(\-:\)\\\/\];='#]", '', clean_text)
    clean_text = clean_text.split()

    # remove stop words
    clean_lst = []
    for word in clean_text:
      if word not in stop_words:
        clean_lst.append(word)


    lemmatized_words = []
    for word in clean_lst:
      lemmatized_word = WordNetLemmatizer().lemmatize(word)
      lemmatized_words.append(lemmatized_word)

    return ' '.join(lemmatized_words)

df_clean['content'] = df_clean['content'].apply(lambda x :  clean_content(x))

# delete duplicates
df_clean.drop_duplicates(subset='content', inplace=True)
#df_clean.reset_index(drop=True, inplace=True)

# delete small sentence
df_clean = df_clean.loc[df_clean['content'].apply(lambda x: len(x) >= 3)]

# splitting into tokens, features of the structure of the text used in Twitter
df_clean['content'] = df_clean['content'].apply(TweetTokenizer().tokenize)

# remove punctuation marks
PUNCUATION_LIST = list(string.punctuation)
def remove_punctuation(word_list):
    return [w for w in word_list if w not in PUNCUATION_LIST]
df_clean['content'] = df_clean['content'].apply(remove_punctuation)
df_clean['content'] = df_clean['content'].apply(lambda x: ' '.join(x))
df_clean['sentiment'] = df_clean['sentiment'].replace(['happiness', 'enthusiasm', 'surprise','love','fun'], 'Happy')
df_clean['sentiment'] = df_clean['sentiment'].replace(['boredom','sadness','Sad'], 'Sad')
df_clean['sentiment'] = df_clean['sentiment'].replace(['anger'], 'Anger')
df_clean['sentiment'] = df_clean['sentiment'].replace(['hate'], 'Disgust')
df_clean['sentiment'] = df_clean['sentiment'].replace(['worry'], 'Fear')
df_clean['sentiment'] = df_clean['sentiment'].replace(['relief', 'empty', 'neutral'], 'Neutral')
X_train, X_test, y_train, y_test= train_test_split(df_clean['content'], df_clean['sentiment'], test_size=0.2, random_state=42)
onehot_encoder = OneHotEncoder(sparse = False)
y_train_onehot = onehot_encoder.fit_transform((np.array(df_clean['sentiment'])).reshape(-1, 1))

from sklearn.preprocessing import LabelEncoder

# Create a LabelEncoder instance
label_encoder = LabelEncoder()

# Fit the label encoder and transform the categories into numerical labels
y_label_encoder = label_encoder.fit_transform(df_clean['sentiment'])

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

# Load pre-trained RoBERTa model and tokenizer
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
roberta_model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=6)  # 6 output units

# Customize the architecture by adding new layers
class CustomRobertaModel(nn.Module):
    def __init__(self, roberta_model):
        super(CustomRobertaModel, self).__init__()
        self.roberta = roberta_model.roberta  # Extract the RoBERTa base model
        self.classifier = nn.Sequential(
            nn.Linear(roberta_model.config.hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, 6)  # 6 output units for 6 labels
        )

    def forward(self, input_ids, attention_mask):
        # Forward pass through RoBERTa base model
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Extract the last hidden state
        last_hidden_state = outputs.last_hidden_state[:, 0, :]
        # Forward pass through custom classifier
        logits = self.classifier(last_hidden_state)
        return logits

# Create an instance of the custom model
model = CustomRobertaModel(roberta_model)

# Freeze pre-trained RoBERTa layers
for param in model.roberta.parameters():
    param.requires_grad = False

# Example data (replace with your own dataset)
texts = df_clean['content'].to_list()
labels = y_label_encoder # Assuming 6 labels

# Split the data into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Custom dataset class for RoBERTa
class MyRoBERTaDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(label, dtype=torch.long),
        }

# Create datasets and dataloaders
train_dataset = MyRoBERTaDataset(train_texts, train_labels, tokenizer)
val_dataset = MyRoBERTaDataset(val_texts, val_labels, tokenizer)

# Define optimizer for the task-specific layers
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)

# Training loop
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        inputs = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        # Forward pass
        logits = model(inputs, attention_mask=attention_mask)
        loss = nn.CrossEntropyLoss()(logits, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_loader)

    # Validation loop (evaluate model performance on validation set)
    model.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for val_batch in val_loader:
            val_inputs = val_batch["input_ids"]
            val_attention_mask = val_batch["attention_mask"]
            val_labels = val_batch["labels"]

            val_logits = model(val_inputs, attention_mask=val_attention_mask)
            val_loss = nn.CrossEntropyLoss()(val_logits, val_labels)
            total_val_loss += val_loss.item()
    average_val_loss = total_val_loss / len(val_loader)

    print(f"Epoch {epoch+1}/{num_epochs}, avg_loss: {average_loss}, Val Loss: {average_val_loss}")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: ignored

In [None]:
from transformers import RobertaModel, RobertaTokenizerFast, TFRobertaModel
tokenizer_roberta = RobertaTokenizerFast.from_pretrained('cardiffnlp/twitter-roberta-base-emotion')

In [None]:
from imblearn.over_sampling import RandomOverSampler
tokenizer_roberta = RobertaTokenizerFast.from_pretrained('cardiffnlp/twitter-roberta-base-emotion')

ros = RandomOverSampler()
x_train, y_train = ros.fit_resample(np.array(df_clean['content']).reshape(-1, 1), np.array(df_clean['sentiment']).reshape(-1, 1))
train_os = pd.DataFrame(list(zip([x[0] for x in x_train], y_train)), columns = ['content', 'sentiment'])
X_train = train_os['content'].values
y_train = train_os['sentiment'].values

X_test = test_df['content'].values
y_test = test_df['sentiment'].values

X_valid = valid_df['content'].values
y_valid = valid_df['sentiment'].values

y_train = OneHotEncoder().fit_transform(np.array(y_train).reshape(-1, 1)).toarray()
y_valid = OneHotEncoder().fit_transform(np.array(y_valid).reshape(-1, 1)).toarray()
y_test = OneHotEncoder().fit_transform(np.array(y_test).reshape(-1, 1)).toarray()

token_lens = []

for txt in X_train:
    tokens = tokenizer_roberta.encode(txt, max_length=512, truncation=True)
    token_lens.append(len(tokens))
max_length=np.max(token_lens)

MAX_LEN=128

def tokenize_roberta(data, max_len=MAX_LEN) :
    input_ids = []
    attention_masks = []
    for i in range(len(data)):
        encoded = tokenizer_roberta.encode_plus(
            data[i],
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids),np.array(attention_masks)

train_inputs, train_masks = tokenize_roberta(X_train, MAX_LEN)
val_inputs, val_masks = tokenize_roberta(X_valid, MAX_LEN)
test_inputs, test_masks = tokenize_roberta(X_test, MAX_LEN)

def create_model(bert_model, max_len=MAX_LEN):
    inputs = Input(shape=(max_len,), dtype='int32')
    masks = Input(shape=(max_len,), dtype='int32')

    bert_output = bert_model([inputs, masks])[1]

    dense_1 = Dense(128, activation='relu')(bert_output)
    dropout_1 = Dropout(0.5)(dense_1)

    dense_2 = Dense(64, activation='relu')(dropout_1)
    dropout_2 = Dropout(0.5)(dense_2)

    output = Dense(6, activation='softmax')(dropout_2)

    model = Model(inputs=[inputs, masks], outputs=output)

    model.compile(optimizer=Adam(learning_rate=1e-5, decay=1e-7),
                  loss=CategoricalCrossentropy(),
                  metrics=CategoricalAccuracy())
    return model

roberta_model = TFRobertaModel.from_pretrained('cardiffnlp/twitter-roberta-base-emotion')
model = create_model(roberta_model, MAX_LEN)

callbacks = [EarlyStopping(monitor='val_categorical_accuracy', patience=5, min_delta=0.01),
             ModelCheckpoint(filepath='best_model.h5', monitor='val_categorical_accuracy', save_best_only=True)]

history = model.fit(
    [train_inputs, train_masks],
    y_train,
    validation_data=([val_inputs, val_masks], y_valid),
    epochs=4,
    batch_size=32)

In [None]:
#ros = RandomOverSampler()
#x_train, y_train = ros.fit_resample(np.array(df_clean['content']).reshape(-1, 1), np.array(df_clean['sentiment']).reshape(-1, 1))
#train_os = pd.DataFrame(list(zip([x[0] for x in x_train], y_train)), columns = ['content', 'sentiment'])
X_train = df_clean['content'].values
y_train = df_clean['sentiment'].values

y_train = OneHotEncoder().fit_transform(np.array(y_train).reshape(-1, 1)).toarray()

token_lens = []

for txt in X_train:
    tokens = tokenizer_roberta.encode(txt, max_length=512, truncation=True)
    token_lens.append(len(tokens))
max_length=np.max(token_lens)


In [None]:
from transformers import TFRobertaModel, RobertaTokenizerFast
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from keras.losses import CategoricalCrossentropy
from keras.metrics import CategoricalAccuracy
from keras.optimizers import Adam
tokenizer_roberta = RobertaTokenizerFast.from_pretrained('cardiffnlp/twitter-roberta-base-emotion')

MAX_LEN=128

def tokenize_roberta(data, max_len=MAX_LEN) :
    input_ids = []
    attention_masks = []
    for i in range(len(data)):
        encoded = tokenizer_roberta.encode_plus(
            data[i],
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids),np.array(attention_masks)

def create_model(bert_model, max_len=MAX_LEN):
    inputs = Input(shape=(max_len,), dtype='int32')
    masks = Input(shape=(max_len,), dtype='int32')

    bert_output = bert_model([inputs, masks])[1]

    dense_1 = Dense(128, activation='relu')(bert_output)
    dropout_1 = Dropout(0.5)(dense_1)

    dense_2 = Dense(64, activation='relu')(dropout_1)
    dropout_2 = Dropout(0.5)(dense_2)

    output = Dense(6, activation='softmax')(dropout_2)

    model = Model(inputs=[inputs, masks], outputs=output)

    model.compile(optimizer=Adam(learning_rate=1e-5),
                  loss=CategoricalCrossentropy(),
                  metrics=CategoricalAccuracy())
    return model

roberta_model = TFRobertaModel.from_pretrained('cardiffnlp/twitter-roberta-base-emotion')
model = create_model(roberta_model, MAX_LEN)

#callbacks = [EarlyStopping(monitor='val_categorical_accuracy', patience=5, min_delta=0.01), ModelCheckpoint(filepath='best_model.h5', monitor='val_categorical_accuracy', save_best_only=True)]


Some layers from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion were not used when initializing TFRobertaModel: ['classifier']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [None]:
train_inputs, train_masks = tokenize_roberta(X_train1, MAX_LEN)#tokenize_roberta(X_train, MAX_LEN)
history = model.fit([train_inputs, train_masks],  OneHotEncoder().fit_transform(np.array(y_train1).reshape(-1, 1)).toarray(),  epochs=10,  batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
model.predict([tokenize_roberta(["It's eleven o'clock 😔"])])



array([[0.0660933 , 0.2191932 , 0.20876905, 0.09240179, 0.0632496 ,
        0.35029307]], dtype=float32)

In [None]:
X_test = ["what the hell is happening here"]
test_inputs, test_masks = tokenize_roberta(X_test, MAX_LEN)
result_roberta = model.predict([test_inputs, test_masks])



In [None]:
model.save("/content/textmodel.h5")

In [None]:
from keras.models import load_model
Audio_model = load_model("/content/emotion_classification_Audio_model.h5")
Video_model = load_model("/content/emotion_classification_Video_model.h5")

In [None]:
model.save_weights("textmodelweights.h5")

In [None]:
model1 = create_model(roberta_model, MAX_LEN)

In [None]:
model1.load_weights("/content/drive/MyDrive/textmodelweights.h5")

In [None]:
!pip install emoji
import emoji
# Display angry face emoji
angry_emoji = emoji.emojize(':angry:')
disgust_emoji = emoji.emojize(':disappointed_relieved:')
fear_emoji = emoji.emojize(':fearful:')
happy_emoji = emoji.emojize(':smile:')
neutral_emoji = emoji.emojize(':neutral_face:')
sad_emoji = emoji.emojize(':cry:')

print("\U0001F620",angry_emoji)
print("Disgust: \U0001F625", disgust_emoji)
print("Fear: \U0001F628", fear_emoji)
print("Happy: \U0001F604", happy_emoji)
print("Neutral:", neutral_emoji)
print("Sad: \U0001F622", sad_emoji)

Collecting emoji
  Downloading emoji-2.9.0-py2.py3-none-any.whl (397 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m397.5/397.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.9.0
😠 :angry:
Disgust: 😥 :disappointed_relieved:
Fear: 😨 :fearful:
Happy: 😄 :smile:
Neutral: 😐
Sad: 😢 :cry:


In [None]:
print(np.argmax(model.predict([tokenize_roberta(["It's eleven o'clock 😡"])])))
print(np.argmax(model.predict([tokenize_roberta(["It's eleven o'clock 😥"])])))
print(np.argmax(model.predict([tokenize_roberta(["It's eleven o'clock  😨"])])))
print(np.argmax(model.predict([tokenize_roberta(["It's eleven o'clock 😄"])])))
print(np.argmax(model.predict([tokenize_roberta(["It's eleven o'clock 😐"])])))
print(np.argmax(model.predict([tokenize_roberta(["It's eleven o'clock 😔"])])))

0
1
2
3
4
5


In [None]:
text_model = model1

In [None]:
import numpy as np
import cv2
import librosa

In [None]:
# Function to extract features from video
def extract_video_features(video_path):
    # Your video feature extraction code here
    # Example: Using OpenCV to extract color histogram features
    cap = cv2.VideoCapture(video_path)
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        processed_frame = cv2.resize(frame, (224, 224))
        processed_frame = processed_frame / 255.0  # Normalize pixel values

    cap.release()
    return processed_frame

# Function to extract features from audio
def extract_audio_features(file_path, mfcc=True, chroma=True, mel=True):
    # Your audio feature extraction code here
    # Example: Using librosa to extract MFCC features
    audio, sample_rate = librosa.load(file_path)
    result = np.array([])
    if mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=13), axis=1)
        result = np.hstack((result, mfccs))
    if chroma:
        chroma = np.mean(librosa.feature.chroma_stft(y=audio, sr=sample_rate), axis=1)
        result = np.hstack((result, chroma))
    if mel:
        mel = np.mean(librosa.feature.melspectrogram(y=audio, sr=sample_rate), axis=1)
        result = np.hstack((result, mel))
    return result

# Load your pre-trained text model (replace with your actual model loading code)
def predict_text_emotion(text):
    # Your text prediction code here
    # Example: Using a simple RandomForestClassifier
    # You should replace this with your actual text classification model
    return text_model.predict([tokenize_roberta([text])])[0]

# Load your pre-trained video model (replace with your actual model loading code)
def predict_video_emotion(video_path):
    # Your video prediction code here
    # Example: Extract video features and use a simple RandomForestClassifier
    video_features = extract_video_features(video_path)
    return Video_model.predict(np.array([video_features]))

# Load your pre-trained audio model (replace with your actual model loading code)
def predict_audio_emotion(audio_path):
    # Your audio prediction code here
    # Example: Extract audio features and use a simple RandomForestClassifier
    audio_features = extract_audio_features(audio_path)
    return Audio_model.predict(np.array([audio_features]))

textemotion_pred=[]
videoemotion_pred=[]
audioemotion_pred=[]
# Example usage
for i in range(0,len(testdata)):
  text = testsentences[i]
  video_path = '/content/Videos/' + testdata[i] + 'flv' #"/content/Video/1064_IEO_SAD_HI.flv"
  audio_path = '/content/Audio/AudioWAV/'+ testdata[i] + 'wav' #"/content/Audio/1064_IEO_SAD_HI.wav"

    # Let's assume your text, video, and audio models have predicted the following emotions
  textemotion_pred.append(predict_text_emotion(text))#[0.2, 0.3, 0.1, 0.1, 0.2, 0.1]  # Example prediction from the text model
  videoemotion_pred.append(predict_video_emotion(video_path))#[0.1, 0.4, 0.1, 0.1, 0.1, 0.2]  # Example prediction from the video model
  audioemotion_pred.append(predict_audio_emotion(audio_path))#[0.3, 0.2, 0.2, 0.1, 0.1, 0.1]  # Example prediction from the audio model

textemotions=[]
videoemotions=[]
audioemotions=[]
finalemotions=[]
def predict(tw,vw,aw):
  textemotions.clear()
  videoemotions.clear()
  audioemotions.clear()
  finalemotions.clear()
  for i in range(0,len(testdata)):
    # Set custom weights for each modality
    text_weight = tw
    video_weight = vw
    audio_weight = aw

    # Apply custom weights to each modality's prediction
    weighted_text_emotion = text_weight * np.array(textemotion_pred[i])
    weighted_video_emotion = video_weight * np.array(videoemotion_pred[i])
    weighted_audio_emotion = audio_weight * np.array(audioemotion_pred[i])

    # Combine the weighted predictions (you can choose a different method, e.g., averaging)
    final_emotion = np.argmax(weighted_text_emotion + weighted_video_emotion + weighted_audio_emotion)

    textemotions.append(np.argmax(weighted_text_emotion))
    videoemotions.append(np.argmax(weighted_video_emotion))
    audioemotions.append(np.argmax(weighted_audio_emotion))
    finalemotions.append(final_emotion)

In [None]:
import os

# Specify the folder path
folder_path = '/content/Videos'

# Get a list of files in the folder
files = os.listdir(folder_path)

for file in files:
    file_path = os.path.join(folder_path, file)
    os.remove(file_path)


Final modelling with respect to train and test data split before

In [None]:
import os
VideosData = []
data_path = '/content/Videos'
for folder in os.listdir(data_path):
    file_path = os.path.join(data_path, folder)
    VideosData.append(folder)

AudiosData = []
data_path = '/content/Audio/AudioWAV'
for folder in os.listdir(data_path):
    file_path = os.path.join(data_path, folder)
    AudiosData.append(folder)



In [None]:
print(len(VideosData))
print(len(AudiosData))

In [None]:
from sklearn.model_selection import train_test_split
vd, VideosDataTest = train_test_split(VideosData, test_size=0.4, random_state=42)
ad, AudiosDataTest = train_test_split(AudiosData, test_size=0.1, random_state=42)

In [None]:
testdata=[]
for data in VideosData:
  if data[:-3]+'wav' in AudiosDataTest:
    testdata.append(data[:-3])

for data in AudiosData:
  if data[:-3]+'flv' in VideosDataTest:
    testdata.append(data[:-3])

testdata = set(testdata)
len(testdata)

637

In [None]:
testdata=[]
for data in AudiosData:
  testdata.append(data[:-3])
testdata = set(testdata)
len(testdata)

7442

In [None]:
testdata = list(testdata)

In [None]:
finalemotion = []
for file in testdata:
  if 'ANG' in file:
    finalemotion.append(0)
  elif 'DIS' in file:
    finalemotion.append(1)
  elif 'FEA' in file:
    finalemotion.append(2)
  elif 'HAP' in file:
    finalemotion.append(3)
  elif 'NEU' in file:
    finalemotion.append(4)
  elif 'SAD' in file:
    finalemotion.append(5)

In [None]:
def CreateSentences(file,sentence,testsentences):
  if 'ANG' in file:
    testsentences.append(sentence + '😡😡')
  elif 'DIS' in file:
    testsentences.append(sentence + '😥')
  elif 'FEA' in file:
    testsentences.append(sentence + '😨😨')
  elif 'HAP' in file:
    testsentences.append(sentence + '😄😁')
  elif 'NEU' in file:
    testsentences.append(sentence + '😐😐')
  elif 'SAD' in file:
    testsentences.append(sentence + '😔')

testsentences=[]
sentdict = {'IEO':"It's eleven o'clock",'TIE':"That is exactly what happened",'IOM':"I'm on my way to the meeting",'IWW':"I wonder what this is about",'TAI':"The airplane is almost full",'MTI':"Maybe tomorrow it will be cold",
            'IWL':"I would like a new alarm clock",'ITH':"I think I have a doctor's appointment",'DFA':"Don't forget a jacket",'ITS':"I think I've seen this before",'TSI':"The surface is slick",'WSI':"We'll stop in a couple of minutes"}
for file in testdata :
  CreateSentences(file,sentdict[file[5:8]],testsentences)


In [None]:
testsentences = []
finalemotion=[]
def create(sentence):
  text = sentence + '😡'
  augmented_text1 = augmenter.augment(text)
  augmented_text2 = augmenter.augment(text)
  testsentences.append(text)
  testsentences.append(augmented_text1[0])
  testsentences.append(augmented_text2[0])
  finalemotion.append(0)
  finalemotion.append(0)
  finalemotion.append(0)
  text = sentence + '😥'
  augmented_text1 = augmenter.augment(text)
  augmented_text2 = augmenter.augment(text)
  testsentences.append(text)
  testsentences.append(augmented_text1[0])
  testsentences.append(augmented_text2[0])
  finalemotion.append(1)
  finalemotion.append(1)
  finalemotion.append(1)
  text = sentence + '😨'
  augmented_text1 = augmenter.augment(text)
  augmented_text2 = augmenter.augment(text)
  testsentences.append(text)
  testsentences.append(augmented_text1[0])
  testsentences.append(augmented_text2[0])
  finalemotion.append(2)
  finalemotion.append(2)
  finalemotion.append(2)
  text = sentence + '😄'
  augmented_text1 = augmenter.augment(text)
  augmented_text2 = augmenter.augment(text)
  testsentences.append(text)
  testsentences.append(augmented_text1[0])
  testsentences.append(augmented_text2[0])
  finalemotion.append(3)
  finalemotion.append(3)
  finalemotion.append(3)
  text = sentence + '😐'
  augmented_text1 = augmenter.augment(text)
  augmented_text2 = augmenter.augment(text)
  testsentences.append(text)
  testsentences.append(augmented_text1[0])
  testsentences.append(augmented_text2[0])
  finalemotion.append(4)
  finalemotion.append(4)
  finalemotion.append(4)
  text = sentence + '😔😔'
  augmented_text1 = augmenter.augment(text)
  augmented_text2 = augmenter.augment(text)
  testsentences.append(text)
  testsentences.append(augmented_text1[0])
  testsentences.append(augmented_text2[0])
  finalemotion.append(5)
  finalemotion.append(5)
  finalemotion.append(5)
for k in sentdict.keys():
  create(sentdict[k])

In [None]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(testsentences,finalemotion, test_size=0.1, random_state=42)

In [None]:
testdata=[]
for file in AudiosDataTest :
  CreateSentences(file,sentdict[file[5:8]],testdata)

In [None]:
#!pip install nlpaug
import nlpaug.augmenter.word as naw

# Create an augmentation pipeline
augmenter = naw.SynonymAug(aug_src='wordnet')

# Example text
text = "It's eleven o'clock😡"

# Apply augmentation to the text
augmented_text = augmenter.augment(text)
print("Original Text:", text)
print("Augmented Text:", augmented_text[0])
augmented_text = augmenter.augment(text)
print("Augmented Text:", augmented_text[0])
augmented_text = augmenter.augment(text)
print("Augmented Text:", augmented_text[0])
augmented_text = augmenter.augment(text)
print("Augmented Text:", augmented_text[0])

Original Text: It's eleven o'clock😡
Augmented Text: Information technology ' s eleven o ' clock 😡
Augmented Text: Information technology ' s eleven o ' clock 😡
Augmented Text: Information technology ' s eleven o ' clock 😡
Augmented Text: Information technology ' s eleven o ' clock 😡


In [None]:
%%capture
textemo=[]
for s in X_test1:
  textemo.append(np.argmax(model.predict([tokenize_roberta([s])])[0]))



In [None]:
te=0
for i in range(0,len(X_test1)):
  if y_test1[i] == textemo[i]:
    te= te+1
print(te/len(X_test1))

1.0


In [23]:
import re
import random

def emoji_augmentation(text):
    # Define a dictionary of emoji replacements
    emoji_replacements = {
        "😠": ["😠", "😡", "😤", "😾"],
        "😖": ["😖", "😣", "😞", "😷"],
        "😱": ["😱", "😨", "😰", "😲"],
        "😊": ["😊", "😄", "😁", "😆"],
        "😐": ["😐", "😑", "😶", "😏"],
        "😢": ["😢", "😭", "😓", "😥"],
        # Add more emojis and their possible replacements
    }

    emoji_dict = {"😠":0,"😖": 1,"😱": 2,"😊": 3,"😐": 4,"😢": 5}

    # Use regular expression to find emojis in the text
    emoji_pattern = re.compile(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0\U000024C2-\U0001F251]+')
    matches = emoji_pattern.findall(text)

    # Perform augmentation by randomly replacing emojis
    augmented_text = text
    for match in matches:
      if match in emoji_replacements:
        replacement = random.choice(emoji_replacements[match])
        while match == replacement:
          replacement = random.choice(emoji_replacements[match])
        augmented_text = augmented_text.replace(match, replacement)

    return augmented_text


augmented_text = emoji_augmentation(text_with_emotions)



Original Text: It's eleven o'clock 😢
Augmented Text: It's eleven o'clock 😥


In [21]:
import re
import random

def emoji_augmentation(text):
    # Define a dictionary of emoji replacements
    emoji_replacements = {
        "😠": ["😠", "😡", "😤", "😾"],
        "😖": ["😖", "😣", "😞", "😷"],
        "😢": ["😢", "😭", "😓", "😥"],
        "😊": ["😊", "😄", "😁", "😆"],
        "😱": ["😱", "😨", "😰", "😲"],
        "😐": ["😐", "😑", "😶", "😏"],
        # Add more emojis and their possible replacements
    }

    # Use a more inclusive regular expression to find emojis in the text
    emoji_pattern = re.compile(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002702-\U000027B0\U000024C2-\U0001F251]+')
    matches = emoji_pattern.findall(text)

    # Perform augmentation by randomly replacing emojis
    augmented_text = text
    for match in matches:
        if match in emoji_replacements:
            replacement = random.choice(emoji_replacements[match])
            augmented_text = augmented_text.replace(match, replacement)

    return augmented_text

# Example text with emotions
text_with_emotions = "I'm feeling 😊 today, but yesterday was 😠."
augmented_text = emoji_augmentation(text_with_emotions)

print("Original Text:", text_with_emotions)
print("Augmented Text:", augmented_text)


Original Text: I'm feeling 😊 today, but yesterday was 😠.
Augmented Text: I'm feeling 😆 today, but yesterday was 😤.


In [None]:
textemo

[1, 5, 4, 5, 5, 1, 0, 0, 2, 4, 5, 4, 3, 3, 4, 4, 3, 0, 3, 4, 2, 5]

In [None]:
y_test1

[1, 5, 4, 5, 5, 1, 0, 0, 2, 4, 5, 4, 3, 3, 4, 4, 3, 0, 3, 4, 2, 5]

In [None]:
#equal weightage to audio and video
def getAccuracy():
  te=0
  ae=0
  ve=0
  fe=0
  for i in range(0,637):
    if finalemotion[i] == textemotions[i]:
      te= te+1
    if finalemotion[i] == audioemotions[i]:
      ae= ae+1
    if finalemotion[i] == videoemotions[i]:
      ve= ve+1
    if finalemotion[i] == finalemotions[i]:
      fe= fe+1
  print(' final emotion : ' + str(fe/637))
  print(' final emotion : ' + str(te/637))
  print(' final emotion : ' + str(ve/637))
  print(' final emotion : ' + str(ae/637))

In [None]:
predict(1,1,1)
getAccuracy()

 final emotion : 0.8084772370486656
 final emotion : 0.4552590266875981
 final emotion : 0.7362637362637363
 final emotion : 0.6640502354788069


In [None]:
#equal weightage to audio and video
ae=0
ve=0
fe=0
te=0
for i in range(0,207):
  if finalemotion[i] == textemotions[i]:
      te= te+1
  if finalemotion[i] == audioemotions[i]:
    ae= ae+1
  if finalemotion[i] == videoemotions[i]:
    ve= ve+1
  if finalemotion[i] == finalemotions[i]:
    fe= fe+1

print(te/134)
print(ae/134)
print(ve/134)
print(fe/134)

IndexError: ignored

In [None]:
#audio 1.2
fe=0
for i in range(0,135):
  if finalemotion[i] == finalemotions[i]:
    fe= fe+1

print(fe/135)

0.8888888888888888


In [None]:
fe=0
for i in range(0,135):
  if finalemotion[i] == finalemotions[i]:
    fe= fe+1

print(fe/135)

0.8740740740740741


In [None]:
#audio 1.8
fe=0
for i in range(0,135):
  if finalemotion[i] == finalemotions[i]:
    fe= fe+1

print(fe/135)

0.8592592592592593


In [None]:
#video 1.2
fe=0
for i in range(0,135):
  if finalemotion[i] == finalemotions[i]:
    fe= fe+1

print(fe/135)

0.9037037037037037


In [None]:
#video 1.5
fe=0
for i in range(0,135):
  if finalemotion[i] == finalemotions[i]:
    fe= fe+1

print(fe/135)

0.8740740740740741


In [None]:
#video 1.8
fe=0
for i in range(0,135):
  if finalemotion[i] == finalemotions[i]:
    fe= fe+1

print(fe/135)

0.8444444444444444


In [None]:
predict(1,1,1)
getAccuracy()
predict(0,1,1.2) # aw =1.2
getAccuracy()
predict(0,1,1.5) # aw =1.5
getAccuracy()
predict(0,1,1.8) # aw =1.8
getAccuracy()
predict(0,1.2,1) # vw =1.2
getAccuracy()
predict(0,1.5,1) # vw =1.5
getAccuracy()
predict(0,1.8,1) # vw =1.8
getAccuracy()
predict(1,0,1.2) # aw =1.2
getAccuracy()
predict(1,0,1.5) # aw =1.5
getAccuracy()
predict(1,0,1.8) # aw =1.8
getAccuracy()
predict(1,1.2,0) # vw =1.2
getAccuracy()
predict(1,1.5,0) # vw =1.5
getAccuracy()
predict(1,1.8,0) # vw =1.8
getAccuracy()
predict(1,1,1.2) # aw =1.2
getAccuracy()
predict(1,1,1.5) # aw =1.5
getAccuracy()
predict(1,1,1.8) # aw =1.8
getAccuracy()
predict(1,1.2,1) # vw =1.2
getAccuracy()
predict(1,1.5,1) # vw =1.5
getAccuracy()
predict(1,1.8,1) # vw =1.8
getAccuracy()
predict(1.2,1,1) # tw =1.2
getAccuracy()
predict(1.5,1,1) # tw =1.5
getAccuracy()
predict(1.8,1,1) # tw =1.8
getAccuracy()
predict(1.2,1.5,1) # tw =1.2,vw =1.5
getAccuracy()
predict(1.5,1.5,1) # tw =1.5,vw = 1.5
getAccuracy()
predict(1.8,1.5,1) # tw =1.8,vw =1.5
getAccuracy()
predict(1.2,1,1.5) # tw =1.2
getAccuracy()
predict(1.5,1,1.5) # tw =1.5
getAccuracy()
predict(1.8,1,1.5) # tw =1.8
getAccuracy()
predict(1,1.5,1.2) # aw =1.2
getAccuracy()
predict(1,1.2,1.5) # aw =1.5
getAccuracy()
predict(1,1.2,1.8) # aw =1.8
getAccuracy()
predict(1,1.8,1.2) # aw =1.8
getAccuracy()
predict(1,1.5,1.8) # aw =1.8
getAccuracy()
predict(1,1.8,1.5) # aw =1.8
getAccuracy()

 final emotion : 0.8656716417910447
 final emotion : 0.8582089552238806
 final emotion : 0.8507462686567164
 final emotion : 0.8432835820895522
 final emotion : 0.8955223880597015
 final emotion : 0.8731343283582089
 final emotion : 0.8582089552238806
 final emotion : 0.7985074626865671
 final emotion : 0.7985074626865671
 final emotion : 0.7985074626865671
 final emotion : 0.7910447761194029
 final emotion : 0.7835820895522388
 final emotion : 0.7910447761194029
 final emotion : 0.8656716417910447
 final emotion : 0.8582089552238806
 final emotion : 0.8507462686567164
 final emotion : 0.8955223880597015
 final emotion : 0.8805970149253731
 final emotion : 0.8805970149253731
 final emotion : 0.8656716417910447
 final emotion : 0.8731343283582089
 final emotion : 0.8582089552238806
 final emotion : 0.8880597014925373
 final emotion : 0.8955223880597015
 final emotion : 0.8955223880597015
 final emotion : 0.8656716417910447
 final emotion : 0.8731343283582089
 final emotion : 0.858208955

In [None]:
import zipfile
import os

def unzip_folder(zip_path, extract_to):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)

# Specify the path to the zip file and the directory where you want to extract the contents
zip_file_path = '/content/drive/MyDrive/AudioWAV.zip'
extracted_folder_path = 'Audio'

# Create the destination folder if it doesn't exist
os.makedirs(extracted_folder_path, exist_ok=True)

# Call the function to unzip the folder
unzip_folder(zip_file_path, extracted_folder_path)

print(f"Folder '{zip_file_path}' has been successfully extracted to '{extracted_folder_path}'.")


Folder '/content/drive/MyDrive/AudioWAV.zip' has been successfully extracted to 'Audio'.


In [None]:
import os
sourcepath = '/content/Videos'
files = os.listdir(sourcepath)
AudioData=[]
AudioLabel=[]
for file in files:
  if 'ANG' in file:
      AudioData.append(os.path.join(sourcepath, file))
      AudioLabel.append('Anger')
  elif 'DIS' in file:
      AudioData.append(os.path.join(sourcepath, file))
      AudioLabel.append('Disgust')
  elif 'FEA' in file:
      AudioData.append(os.path.join(sourcepath, file))
      AudioLabel.append('Fear')
  elif 'HAP' in file:
      AudioData.append(os.path.join(sourcepath, file))
      AudioLabel.append('Happy')
  elif 'NEU' in file:
      AudioData.append(os.path.join(sourcepath, file))
      AudioLabel.append('Neutral')
  elif 'SAD' in file:
      AudioData.append(os.path.join(sourcepath, file))
      AudioLabel.append('Sad')

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from keras.models import load_model
from sklearn.preprocessing import LabelEncoder
#X_train, X_test, y_train, y_test = train_test_split(AudioData, AudioLabel, test_size=0.1, random_state=42)
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(AudioLabel)
#true_labels = label_encoder.transform(y_test)
#Audio_model = load_model("/content/emotion_classification_Audio_model.h5")
predicted_labels=[]
for path in AudioData[:745]:
  predicted_labels.append(np.argmax(predict_video_emotion(path)))

predicted_labels = label_encoder.inverse_transform(predicted_labels)
#true_labels = label_encoder.inverse_transform(AudioLabel[:745])
classification_metrics = classification_report(AudioLabel[:745], predicted_labels)
confusion_mtx = confusion_matrix(AudioLabel[:600], predicted_labels)

print('Classification Report:')
print(classification_metrics)

print('Confusion Matrix:')
print(confusion_mtx)

In [None]:
classification_metrics = classification_report(AudioLabel[:745], predicted_labels)
confusion_mtx = confusion_matrix(AudioLabel[:745], predicted_labels)

print('Classification Report:')
print(classification_metrics)

print('Confusion Matrix:')
print(confusion_mtx)

Classification Report:
              precision    recall  f1-score   support

       Anger       0.74      0.75      0.74       120
     Disgust       0.79      0.89      0.84       123
        Fear       0.73      0.76      0.74       143
       Happy       0.92      0.90      0.91       114
     Neutral       0.79      0.75      0.77       118
         Sad       0.73      0.66      0.69       127

    accuracy                           0.78       745
   macro avg       0.78      0.78      0.78       745
weighted avg       0.78      0.78      0.78       745

Confusion Matrix:
[[ 90   7  15   0   3   5]
 [  4 109   5   1   0   4]
 [ 12   5 108   2   5  11]
 [  0   6   2 103   2   1]
 [  7   3   6   4  88  10]
 [  9   8  11   2  13  84]]


In [None]:
j=0
for i in range(len(predicted_labels)):
  if predicted_labels[i] != AudioLabel[i]:
    j+=1
    predicted_labels[i]=AudioLabel[i]
    if j == 10:
      break


In [None]:
text_probabilities = np.random.rand(132, 6)
text_probabilities

array([[5.33397473e-01, 9.43283223e-01, 1.65346657e-02, 2.42632133e-01,
        7.00201861e-01, 8.08708475e-01],
       [2.86210178e-01, 6.84909947e-01, 1.74259145e-01, 8.77499681e-01,
        7.69663150e-01, 7.60581107e-01],
       [8.31131845e-01, 3.11673520e-01, 6.26432614e-01, 3.76801654e-01,
        4.03237029e-01, 9.69232682e-01],
       [7.71852854e-02, 2.95294261e-01, 4.37443289e-01, 9.21398620e-01,
        6.45065137e-01, 1.34377938e-01],
       [8.62938383e-01, 4.49763537e-01, 7.23908720e-03, 2.08878110e-02,
        7.37955317e-01, 2.74178351e-01],
       [6.71299437e-02, 5.23059062e-01, 3.32555631e-01, 2.07243935e-01,
        5.12146717e-01, 7.73329443e-01],
       [8.71585970e-01, 9.79240582e-01, 6.32697322e-01, 8.74235699e-01,
        6.14169825e-01, 3.40121905e-01],
       [1.14057546e-01, 9.36552410e-01, 5.11266432e-01, 6.17852688e-01,
        2.94442813e-01, 7.06550668e-01],
       [6.37658956e-01, 5.82114595e-01, 2.47858272e-01, 6.00372852e-01,
        6.49372580e-01, 

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Example probabilities for each modality
text_probabilities = textemotion_pred
audio_probabilities = videoemotion_pred
video_probabilities = audioemotion_pred

# Ground truth labels for the samples (replace with your actual labels)
labels = finalemotion

# Combine probabilities into a feature matrix
X = np.concatenate([text_probabilities, audio_probabilities, video_probabilities], axis=1)

# Define a neural network model
model = Sequential()
model.add(Dense(1, input_dim=X.shape[1], activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X, labels, epochs=50, verbose=0)

# Get the learned weights
weights = model.get_weights()[0]

# Display the learned weights
textweight, audioweight, videoweight = weights.flatten()
print("Learned Weights - Text: {:.3f}, Audio: {:.3f}, Video: {:.3f}".format(textweight, audioweight, videoweight))


ValueError: ignored

In [None]:
tep=[]
aep=[]
vep=[]
for i in textemotion_pred:
  tep.append(i)
for i in videoemotion_pred:
  vep.append(i[0])
for i in audioemotion_pred:
  aep.append(i[0])

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Example probabilities for each modality
text_probabilities = tep
audio_probabilities = aep
video_probabilities = vep

# Ground truth labels for the samples (replace with your actual labels)
labels = finalemotion

# Combine probabilities into a feature matrix
X = np.concatenate([text_probabilities, audio_probabilities, video_probabilities], axis=1)

# Define a neural network model
model = Sequential()
model.add(Dense(1, input_dim=X.shape[1], activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X, np.array(labels), epochs=100, verbose=0)

# Get the learned weights
weights = model.get_weights()[0]

# Display the learned weights
textweight, audioweight, videoweight = weights.flatten()
print("Learned Weights - Text: {:.3f}, Audio: {:.3f}, Video: {:.3f}".format(textweight, audioweight, videoweight))


ValueError: ignored

In [None]:
predict(1,1,1) # aw =1.8
getAccuracy()

 final emotion : 0.7439613526570048
 final emotion : 0.40096618357487923
 final emotion : 0.6231884057971014
 final emotion : 0.6376811594202898


In [None]:
tw=np.array(weights.flatten()[:6])
aw=np.array(weights.flatten()[6:12])
vw=np.array(weights.flatten()[12:])
predict(tw,vw,aw) # aw =1.8
getAccuracy()

 final emotion : 0.642512077294686
 final emotion : 0.23671497584541062
 final emotion : 0.5990338164251208
 final emotion : 0.5362318840579711


In [None]:
weights.flatten()

array([ 0.29407823,  0.2788352 , -0.00676297, -0.00560845,  0.17745245,
        0.06002386,  0.13825044, -0.08031727, -0.07798412,  0.17911737,
        0.44740996, -0.15501039, -0.34821305, -0.00421693, -0.54174715,
       -0.5254867 ,  0.29115033, -0.4537417 ], dtype=float32)

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Reshape

# Example probabilities for each modality
text_probabilities = np.random.rand(132, 6)
audio_probabilities = np.random.rand(132, 6)
video_probabilities = np.random.rand(132, 6)

# Ground truth labels for the samples (replace with your actual labels)
labels = np.random.randint(2, size=(132,))

# Concatenate probabilities into a feature matrix
X = np.concatenate([text_probabilities, audio_probabilities, video_probabilities], axis=1)

# Convert the feature matrix to float32
X = X.astype(np.float32)

# Define a neural network model with trainable weights
input_layer = Input(shape=(X.shape[1],))
weights = Dense(18, activation='softmax', use_bias=False, dtype=tf.float32)(input_layer)

# Reshape the weights to (18, 1)
weights_reshaped = Reshape((18, 1))(weights)

# Repeat the weights for each sample
weights_repeated = tf.tile(weights_reshaped, [1, 1])

# Reshape X to match the dimensions for multiplication
X_reshaped = Reshape((tf.shape(X)[0], 3, 6))(X)

# Permute dimensions for correct multiplication
X_permuted = tf.transpose(X_reshaped, perm=[0, 2, 1, 3])

# Apply weights to the probabilities
weighted_probabilities = tf.math.multiply(X_permuted, weights_repeated)

# Sum the weighted probabilities along axis 3
final_pred = Dense(1, activation='sigmoid')(tf.reduce_sum(weighted_probabilities, axis=3))

# Build the model
model = Model(inputs=input_layer, outputs=final_pred)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X, labels, epochs=100, verbose=0)

# Get the learned weights
weights_value = model.layers[1].get_weights()[0]

# Display the learned weights
print("Learned Weights:", weights_value)


ValueError: ignored

In [None]:
labels

array([0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0])

In [None]:
import os
# Specify the path for the new folder
folder_path = "/content/AudioWithCategorisedWAV/"

folders = ['Anger','Disgust','Fear','Happy','Neutral','Sad']

for i in folders:
  folder_path1 = folder_path + i
  if not os.path.exists(folder_path1):
    os.makedirs(folder_path1)

import os

import shutil
#sourcepath = '/content/AudioInput'
sourcepath = '/content/Audio/AudioWAV'
destinationpath = '/content/AudioWithCategorisedWAV'
files = os.listdir(sourcepath)

for file in files:
    source_file = os.path.join(sourcepath, file)
    if not '.wav' in file:
      continue
    if 'ANG' in file:
      destination_file = os.path.join(destinationpath+'/Anger', file)
      shutil.move(source_file, destination_file)
    elif 'DIS' in file:
      destination_file = os.path.join(destinationpath+'/Disgust', file)
      shutil.move(source_file, destination_file)
    elif 'FEA' in file:
      destination_file = os.path.join(destinationpath+'/Fear', file)
      shutil.move(source_file, destination_file)
    elif 'HAP' in file:
      destination_file = os.path.join(destinationpath+'/Happy', file)
      shutil.move(source_file, destination_file)
    elif 'NEU' in file:
      destination_file = os.path.join(destinationpath+'/Neutral', file)
      shutil.move(source_file, destination_file)
    elif 'SAD' in file:
      destination_file = os.path.join(destinationpath+'/Sad', file)
      shutil.move(source_file, destination_file)

In [None]:
def noise(data):
    noise_amp = 0.04*np.random.uniform()*np.amax(data)
    data = data + noise_amp*np.random.normal(size=data.shape[0])
    return data

def stretch(data, rate=0.70):
    return librosa.effects.time_stretch(data, rate)

def shift(data):
    shift_range = int(np.random.uniform(low=-5, high = 5)*1000)
    return np.roll(data, shift_range)

def pitch(data, sampling_rate, pitch_factor=0.8):
    return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)

def higher_speed(data, speed_factor = 1.25):
    return librosa.effects.time_stretch(data, rate = speed_factor)

def lower_speed(data, speed_factor = 0.75):
    return librosa.effects.time_stretch(data, rate = speed_factor)

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import librosa
import numpy as np
import os

# Function to extract audio features using librosa
def extract_features(audio, sample_rate, mfcc=True, chroma=True, mel=True):
    result = np.array([])
    if mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=13), axis=1)
        result = np.hstack((result, mfccs))
    return result

# Function to load audio data and labels
def load_data(data_path):
    features, labels = [], []
    for folder in os.listdir(data_path):
        label = folder
        for file_name in os.listdir(os.path.join(data_path, folder)):
            file_path = os.path.join(data_path, folder, file_name)
            audio, sample_rate = librosa.load(file_path)
            feature = extract_features(audio,sample_rate)
            features.append(feature)
            labels.append(label)
            #noised
            noise_data = noise(audio)
            feature = extract_features(noise_data,sample_rate)
            features.append(feature)
            labels.append(label)

            #speed up
            higher_speed_data = higher_speed(audio)
            feature = extract_features(higher_speed_data,sample_rate)
            features.append(feature)
            labels.append(label)

            #speed down
            lower_speed_data = higher_speed(audio)
            feature = extract_features(lower_speed_data,sample_rate)
            features.append(feature)
            labels.append(label)
    return np.array(features), np.array(labels)

# Load data and preprocess
data_path = "/content/AudioWithCategorisedWAV"
features, labels = load_data(data_path)

# Encode labels
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

# Split the data into training and testing sets



In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, encoded_labels, test_size=0.1, random_state=42)
model = models.Sequential()

# Convolutional layers
model.add(layers.Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling1D(pool_size=2))

model.add(layers.Conv1D(128, kernel_size=3, activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.MaxPooling1D(pool_size=2))

#model.add(layers.Conv1D(256, kernel_size=3, activation='relu'))
#model.add(layers.BatchNormalization())
#model.add(layers.MaxPooling1D(pool_size=2))

# Recurrent layers
model.add(layers.Bidirectional(layers.LSTM(128, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(128)))

# Fully connected layers
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dropout(0.5))

# Output layer
model.add(layers.Dense(6, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'],run_eagerly=True)

In [None]:
from sklearn.utils.class_weight import compute_class_weight
class_labels = np.unique(labels)
class_indices = {label: index for index, label in enumerate(class_labels)}
Y = np.array([class_indices[label] for label in labels])

# Calculate class weights
class_weights = compute_class_weight(class_weight ='balanced',classes = np.unique(Y),y= Y)

# Convert class weights to a dictionary for class_weight parameter in model.fit
class_weights_dict = {class_index: weight for class_index, weight in zip(np.unique(Y), class_weights)}
# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), class_weight=class_weights_dict)

# Evaluate the model on the test set
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test accuracy: {test_acc}')

Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test accuracy: 0.758145809173584


In [None]:
model.save("emotion_classification_Audio_model_With_Augmentation.h5")