In [4]:
import torch
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.preprocessing import LabelEncoder
from collections import Counter

In [5]:
# ================================================================================================= TEXT MODALITY ==========================================================================================================

# Load text dataset
text_csv_path = r'C:\Users\ssid7\Desktop\Projects\MCA_Major_Project-Multimodal_Emotion_Recognition_System\tweet_emotions.csv'
df = pd.read_csv(
    text_csv_path,
    usecols=['content', 'sentiment'],  # Only load necessary columns
    dtype={'content': 'string', 'sentiment': 'category'}
)
df = df.rename(columns={'content': 'tweet', 'sentiment': 'label'})

# Filter and preprocess labels
labels = ['neutral', 'sadness', 'happiness']
df = df.query('label in @labels')
df['label'] = df['label'].replace({'sadness': 'negative', 'happiness': 'positive'})

# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
text_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)  # 3 classes: negative, neutral, positive
text_model.eval()

# Encode labels for text
label_encoder_text = LabelEncoder()
label_encoder_text.fit(['negative', 'neutral', 'positive'])

# Text prediction function
def predict_emotion_from_text(text):
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    with torch.no_grad():
        outputs = text_model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    predicted_class = torch.argmax(logits, dim=1).item()
    return label_encoder_text.inverse_transform([predicted_class])[0]




# ================================================================================================= IMAGE MODALITY =======================================================================================================


# Define dataset paths
train_dir = r'C:\Users\ssid7\Desktop\Projects\MCA_Major_Project-Multimodal_Emotion_Recognition_System\data\train'
test_dir = r'C:\Users\ssid7\Desktop\Projects\MCA_Major_Project-Multimodal_Emotion_Recognition_System\data\test'

# Image data generators
train_datagen = ImageDataGenerator(rescale=1.0/255.0, rotation_range=10, width_shift_range=0.1, height_shift_range=0.1, horizontal_flip=True)

test_datagen = ImageDataGenerator(rescale=1.0/255.0)

# Load training and testing image data
train_data = train_datagen.flow_from_directory(
    train_dir,
    target_size=(48, 48),
    color_mode="rgb",
    batch_size=64,
    class_mode='categorical'
)

test_data = test_datagen.flow_from_directory(
    test_dir,
    target_size=(48, 48),
    color_mode="rgb",
    batch_size=64,
    class_mode='categorical'
)

# Define VGG16 model for image emotion recognition
def create_vgg_model(num_classes):
    base_model = VGG16(weights='imagenet', include_top=False, input_shape=(48, 48, 3))
    x = Flatten()(base_model.output)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(num_classes, activation='softmax')(x)  # Adjust output layer to match number of classes
    model = Model(inputs=base_model.input, outputs=x)
    return model

# Number of classes in the dataset
num_classes = train_data.num_classes

# Initialize and train the model
image_model = create_vgg_model(num_classes)
image_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train and save the model weights
image_model.fit(
    train_data,
    validation_data=test_data,
    epochs=2,
)
image_model.save_weights("vgg_emotion_model.weights.h5")

# Load pre-trained image model
image_model.load_weights("vgg_emotion_model.weights.h5")

# Encode labels for images
label_encoder_image = LabelEncoder()
label_encoder_image.fit(list(train_data.class_indices.keys()))



# Image prediction function
def predict_emotion_from_image(image_path):
    from tensorflow.keras.preprocessing import image
    img = image.load_img(image_path, target_size=(48, 48))
    img_array = image.img_to_array(img) / 255.0
    img_array = np.expand_dims(img_array, axis=0)

    with tf.device('/CPU:0'):  # Ensure the prediction runs on the CPU
        predictions = image_model.predict(img_array)
    predicted_class = np.argmax(predictions, axis=1)[0]
    return label_encoder_image.inverse_transform([predicted_class])[0]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Found 28709 images belonging to 7 classes.
Found 7178 images belonging to 7 classes.
Epoch 1/2


  self._warn_if_super_not_called()


[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1364s[0m 3s/step - accuracy: 0.2191 - loss: 1.9402 - val_accuracy: 0.2471 - val_loss: 1.8203
Epoch 2/2
[1m449/449[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m657s[0m 1s/step - accuracy: 0.2437 - loss: 1.8222 - val_accuracy: 0.2471 - val_loss: 1.8146


In [6]:
# ======================================================================================== MULTIMODAL FUSION=====================================================================================================================

# Fusion function combining text and image predictions
image_path = r'C:\Users\ssid7\Desktop\Projects\MCA_Major_Project-Multimodal_Emotion_Recognition_System\data\test\happy\PrivateTest_647018.jpg'
text = "Buying new clothes in discount"
def predict_multimodal_emotion_batch(texts, image_paths):
    all_predictions = []

    # Process texts
    for text in texts:
        try:
            text_emotion = predict_emotion_from_text(text)
            print(f"[Text] '{text}' -> {text_emotion}")
            all_predictions.append(text_emotion)
        except Exception as e:
            print(f"Error in text prediction: {e}")

    # Process images
    for img_path in image_paths:
        try:
            image_emotion = predict_emotion_from_image(img_path)
            print(f"[Image] '{img_path}' -> {image_emotion}")
            all_predictions.append(image_emotion)
        except Exception as e:
            print(f"Error in image prediction: {e}")

    # Count most frequent emotion(s)
    counter = Counter(all_predictions)
    most_common = counter.most_common()

    # Handle tie
    if len(most_common) > 1 and most_common[0][1] == most_common[1][1]:
        top_emotions = [emotion for emotion, count in most_common if count == most_common[0][1]]
        return f"Tie between: {', '.join(top_emotions)}"
    else:
        return f"Final Predicted Emotion: {most_common[0][0]}"

In [7]:
# ========================================================================================= EXAMPLE USAGE ======================================================================================================================

texts = [
    "I just got a promotion at work!",
    "Feeling a bit low today.",
    "Enjoying a relaxing weekend.",
    "So frustrated with the traffic!",
    "Watching my favorite movie."
]

image_paths = [
    r'data\test\fear\PrivateTest_2159049.jpg',
    r'data\test\happy\PrivateTest_258543.jpg',
    r'C:\Users\ssid7\Desktop\Projects\MCA_Major_Project-Multimodal_Emotion_Recognition_System\data\test\happy\PublicTest_99753173.jpg',
    r'C:\Users\ssid7\Desktop\Projects\MCA_Major_Project-Multimodal_Emotion_Recognition_System\data\test\happy\PublicTest_98334368.jpg',
    r'C:\Users\ssid7\Desktop\Projects\MCA_Major_Project-Multimodal_Emotion_Recognition_System\data\test\happy\PublicTest_99849498.jpg'
]

final_result = predict_multimodal_emotion_batch(texts, image_paths)
print(final_result)

[Text] 'I just got a promotion at work!' -> positive
[Text] 'Feeling a bit low today.' -> positive
[Text] 'Enjoying a relaxing weekend.' -> positive
[Text] 'So frustrated with the traffic!' -> positive
[Text] 'Watching my favorite movie.' -> positive
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 388ms/step
[Image] 'data\test\fear\PrivateTest_2159049.jpg' -> happy
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
[Image] 'data\test\happy\PrivateTest_258543.jpg' -> happy
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[Image] 'C:\Users\ssid7\Desktop\Projects\MCA_Major_Project-Multimodal_Emotion_Recognition_System\data\test\happy\PublicTest_99753173.jpg' -> happy
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[Image] 'C:\Users\ssid7\Desktop\Projects\MCA_Major_Project-Multimodal_Emotion_Recognition_System\data\test\happy\PublicTest_98334368.jpg' -> happy
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m