# **Capstone Project: Sentiment Analysis using Image & Text Data**

In [None]:

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.layers import Embedding, LSTM, GRU
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
# Fetch the NLTK data used by preprocess_text: the stop-word list, the Punkt
# tokenizer models and WordNet. 'punkt_tab' is requested alongside 'punkt'
# because newer NLTK releases load the Punkt data for word_tokenize from that
# resource and raise LookupError when it is missing.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)


# **Data Preprocessing**

**Text Data Preprocessing**

In [None]:
# Text Data Preprocessing
# Lazily-built NLTK helpers shared by every preprocess_text call, so the
# lemmatizer and stop-word set are constructed once instead of once per row.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the (lemmatizer, stop-word set) pair, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _TEXT_RESOURCES['stop_words'] = set(stopwords.words('english'))
    return _TEXT_RESOURCES['lemmatizer'], _TEXT_RESOURCES['stop_words']


def preprocess_text(text):
    """Normalize one raw text value into a cleaned, space-joined token string.

    Steps: lowercase, strip every character that is not a word character or
    whitespace, strip digit runs, tokenize with NLTK, drop English stop words
    and lemmatize the remaining tokens.

    Args:
        text: The raw text. Non-string values (e.g. NaN/None produced by
            empty CSV cells) are treated as missing text.

    Returns:
        The cleaned tokens joined by single spaces; "" for non-string input.
    """
    # Empty cells arrive as float NaN / None and have no .lower(); map them
    # to an empty document instead of raising AttributeError.
    if not isinstance(text, str):
        return ""
    lemmatizer, stop_words = _get_text_resources()
    text = text.lower()
    # NOTE(review): punctuation is removed before the stop-word filter, so
    # contractions such as "don't" reach it as "dont" and may no longer match
    # the stop-word list - confirm this is intended.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(words)

# Load the raw text dataset and store a cleaned copy of its text column.
text_data = pd.read_csv('text_data.csv')
text_data['cleaned_text'] = text_data['text_column'].apply(preprocess_text)

# Tokenization: fit a vocabulary limited by num_words=max_words on the cleaned
# text, encode every document as a list of integer ids, then pad/truncate each
# sequence to a fixed length of 100.
max_words = 5000
cleaned_corpus = text_data['cleaned_text']
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(cleaned_corpus)
sequences = tokenizer.texts_to_sequences(cleaned_corpus)
text_padded = pad_sequences(sequences, maxlen=100)


**Image Data Preprocessing**

In [None]:
# Image Data Preprocessing
image_dir = 'image_dataset/'
image_data = []
labels = []
# Class folder name -> integer class id.
label_map = {"positive": 0, "negative": 1, "neutral": 2}

# Walk the dataset in sorted order: os.listdir returns entries in arbitrary
# order, which would make the sample order (and therefore the seeded
# train/test split below) differ between runs and machines.
for label in sorted(os.listdir(image_dir)):
    label_dir = os.path.join(image_dir, label)
    # Skip stray entries (hidden files, checkpoint folders, unknown names)
    # that would otherwise raise NotADirectoryError in os.listdir or KeyError
    # in label_map.
    if label not in label_map or not os.path.isdir(label_dir):
        continue
    for image_file in sorted(os.listdir(label_dir)):
        img_path = os.path.join(label_dir, image_file)
        # Ignore hidden files and sub-directories inside a class folder.
        if image_file.startswith('.') or not os.path.isfile(img_path):
            continue
        img = load_img(img_path, target_size=(128, 128))
        # Scale pixel values from [0, 255] to [0, 1].
        img_array = img_to_array(img) / 255.0
        image_data.append(img_array)
        labels.append(label_map[label])

image_data = np.array(image_data)
labels = np.array(labels)

# data split
X_train_img, X_test_img, y_train_img, y_test_img = train_test_split(image_data, labels, test_size=0.2, random_state=42)


# **Model Architecture**

**CNN Model for Images**

In [None]:
# CNN model for image sentiment analysis: two Conv2D/MaxPooling2D stages
# followed by a dense head with a 3-way softmax output.
cnn_model = Sequential()
cnn_model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(128, 128, 3)))
cnn_model.add(MaxPooling2D((2, 2)))
cnn_model.add(Conv2D(64, (3, 3), activation='relu'))
cnn_model.add(MaxPooling2D((2, 2)))
cnn_model.add(Flatten())
cnn_model.add(Dense(128, activation='relu'))
cnn_model.add(Dropout(0.5))
cnn_model.add(Dense(3, activation='softmax'))

# Labels are integer class ids, hence the sparse categorical loss.
cnn_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
cnn_model.summary()

# Fit on the image training split, holding out 20% of it for validation.
cnn_history = cnn_model.fit(
    X_train_img,
    y_train_img,
    validation_split=0.2,
    epochs=10,
    batch_size=32,
)


**RNN Model for Text**

In [None]:
# RNN model for text sentiment analysis: an embedding layer feeding two
# stacked LSTM layers, then a dense head with a 3-way softmax output.
rnn_model = Sequential()
rnn_model.add(Embedding(input_dim=max_words, output_dim=128, input_length=100))
# The first LSTM returns the full sequence so the second LSTM can consume it.
rnn_model.add(LSTM(128, return_sequences=True))
rnn_model.add(Dropout(0.2))
rnn_model.add(LSTM(64))
rnn_model.add(Dense(128, activation='relu'))
rnn_model.add(Dropout(0.5))
rnn_model.add(Dense(3, activation='softmax'))

# Labels are integer class ids, hence the sparse categorical loss.
rnn_model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
rnn_model.summary()

# Train RNN Model
# Convert the string labels to integer class ids with the shared label_map.
text_labels = text_data['label_column'].map(label_map)
# Series.map yields NaN for any value missing from label_map (e.g. a
# differently-cased or misspelled label); fail loudly instead of silently
# training on NaN targets.
unmapped_rows = text_labels.isna()
if unmapped_rows.any():
    unknown_labels = sorted(text_data.loc[unmapped_rows, 'label_column'].astype(str).unique())
    raise ValueError(f"Labels not present in label_map: {unknown_labels}")
text_labels = text_labels.astype('int64').values
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(text_padded, text_labels, test_size=0.2, random_state=42)
# Fit on the text training split, holding out 20% of it for validation.
rnn_history = rnn_model.fit(X_train_text, y_train_text, validation_split=0.2, epochs=10, batch_size=32)


# **Model Evaluation**

In [None]:
# Evaluate the CNN on the held-out image split: take the highest-probability
# class per sample, then print the confusion matrix and per-class report.
img_class_probs = cnn_model.predict(X_test_img)
y_pred_img = np.argmax(img_class_probs, axis=1)
print("Confusion Matrix for Images:", confusion_matrix(y_test_img, y_pred_img), sep="\n")
print("Classification Report for Images:", classification_report(y_test_img, y_pred_img), sep="\n")


In [None]:
# Evaluate the RNN on the held-out text split: take the highest-probability
# class per sample, then print the confusion matrix and per-class report.
text_class_probs = rnn_model.predict(X_test_text)
y_pred_text = np.argmax(text_class_probs, axis=1)
print("Confusion Matrix for Text:", confusion_matrix(y_test_text, y_pred_text), sep="\n")
print("Classification Report for Text:", classification_report(y_test_text, y_pred_text), sep="\n")


# **Saving Models**

In [None]:
# Save Trained Models
# An explicit '.keras' extension is given because Keras 3 (shipped with recent
# TensorFlow releases) raises ValueError for extension-less paths in
# Model.save; '.keras' selects the native single-file Keras format.
cnn_model.save('cnn_sentiment_model.keras')
rnn_model.save('rnn_sentiment_model.keras')
