<a href="https://colab.research.google.com/github/priyavratamohan/Sentiment-Analysis-using-CNN-and-LSTM/blob/main/SentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Vocabulary and sequence-length settings for the IMDB reviews
max_words = 10000  # keep only the top 10,000 words of the dataset
maxlen = 100  # every review is padded/truncated to 100 tokens

# Load the integer-encoded reviews, restricted to the vocabulary above
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_words)

# Bring both splits to one uniform length so they can be batched
x_train, x_test = (
    pad_sequences(reviews, maxlen=maxlen) for reviews in (x_train, x_test)
)

In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK corpora needed by the preprocessing step
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Single lemmatizer instance reused by every preprocessing call
lemmatizer = WordNetLemmatizer()

# English stopwords, read from the NLTK corpus once and kept as a set.
# Calling stopwords.words('english') inside the filtering loop re-reads the
# corpus for every single token and tests membership against a list.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


# Sample preprocessing function
def preprocess_text(text):
    """Normalize raw review text into a space-separated string of lemmas.

    Steps, in order: lowercase, strip every character that is not an ASCII
    letter or whitespace, split on whitespace, drop English stopwords, and
    lemmatize each remaining word with the module-level ``lemmatizer``.

    Args:
        text: The raw input string.

    Returns:
        The cleaned words joined by single spaces ('' when nothing remains).
    """
    # Lowercase text
    text = text.lower()

    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Tokenize, drop stopwords and lemmatize in a single pass
    words = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in _ENGLISH_STOPWORDS
    ]

    return ' '.join(words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
import os
import zipfile
import requests

# Define the URL for downloading GloVe embeddings
url = "http://nlp.stanford.edu/data/glove.6B.zip"
glove_zip_path = "glove.6B.zip"

# Download GloVe embeddings
if not os.path.exists(glove_zip_path):
    print("Downloading GloVe embeddings...")
    # Write to a temporary name first: an interrupted download must not leave
    # a truncated file at glove_zip_path, which later runs would treat as
    # a complete archive and fail to unzip.
    partial_zip_path = glove_zip_path + ".part"
    # Stream the body to disk in chunks instead of holding the whole archive
    # in memory via r.content; fail loudly on HTTP errors or a stalled server.
    with requests.get(url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(partial_zip_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    os.replace(partial_zip_path, glove_zip_path)

# Unzip the file, skipping members that are already fully extracted so that
# re-running this cell does not rewrite every embedding file each time
with zipfile.ZipFile(glove_zip_path, 'r') as zip_ref:
    pending_members = [
        info.filename
        for info in zip_ref.infolist()
        if not (
            os.path.isfile(info.filename)
            and os.path.getsize(info.filename) == info.file_size
        )
    ]
    zip_ref.extractall(members=pending_members)

print("GloVe embeddings downloaded and extracted!")

GloVe embeddings downloaded and extracted!


In [4]:
import numpy as np

# Load pre-trained GloVe embeddings: each line is a word followed by its
# vector components, separated by whitespace
embedding_index = {}
with open('glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = vector

# Create an embedding matrix for the words in our dataset
embedding_dim = 100  # Dimension of GloVe embeddings
embedding_matrix = np.zeros((max_words, embedding_dim))

# imdb.load_data() is called with its default index_from=3, so a word whose
# rank in get_word_index() is i appears in the encoded reviews as i + 3
# (ids 0, 1 and 2 are reserved for padding, start-of-sequence and
# out-of-vocabulary). The matrix row must use the shifted id; otherwise every
# vector ends up attached to a different word than the one the model sees.
index_from = 3

# Create the embedding matrix for our dataset's words; rows for reserved ids
# and for words missing from GloVe stay all-zero
word_index = imdb.get_word_index()
for word, i in word_index.items():
    token_id = i + index_from
    if token_id < max_words:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[token_id] = embedding_vector

In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Dropout

# Define the model: frozen GloVe embeddings -> Conv1D -> LSTM -> Dense head
model = Sequential()

# 1. Embedding Layer
# The sizes come from the same constants that were used to build
# embedding_matrix and to pad the reviews, so the layer cannot silently drift
# out of sync with them if one of those settings is changed.
model.add(Embedding(input_dim=max_words,  # Vocabulary size
                    output_dim=embedding_dim,  # Embedding size (GloVe vectors)
                    weights=[embedding_matrix],  # Pre-trained GloVe embeddings
                    input_length=maxlen,  # Length of each padded review
                    trainable=False))  # Keep embeddings static (non-trainable)

# 2. Convolutional Layer
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))

# 3. LSTM Layer (only the final state is passed on)
model.add(LSTM(units=128, return_sequences=False))

# 4. Dense Layer with Dropout
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))

# 5. Output Layer (Sigmoid for binary classification)
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()



In [6]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Dataset settings
max_words = 10000  # vocabulary limited to the top 10,000 most frequent words
max_len = 100  # sequences are padded to a max length of 100 words

# Fetch the IMDB reviews and unpack the two splits
train_split, test_split = imdb.load_data(num_words=max_words)
x_train, y_train = train_split
x_test, y_test = test_split

# Give every review the same length
x_train = pad_sequences(x_train, maxlen=max_len)
x_test = pad_sequences(x_test, maxlen=max_len)

# Hold out 20% of the training reviews for validation (fixed seed)
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Report the resulting shapes
print(f"x_train shape: {x_train.shape}, y_train shape: {y_train.shape}")
print(f"x_val shape: {x_val.shape}, y_val shape: {y_val.shape}")

x_train shape: (20000, 100), y_train shape: (20000,)
x_val shape: (5000, 100), y_val shape: (5000,)


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Define the model with increased dropout: a trainable embedding followed by
# two stacked LSTM layers and a sigmoid output
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len))
model.add(LSTM(128, return_sequences=True))  # full sequence feeds the next LSTM
model.add(Dropout(0.6))  # Increased dropout
model.add(LSTM(128))
model.add(Dropout(0.6))  # Increased dropout
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Callbacks
# The scheduler's patience must be shorter than early stopping's. With both
# set to 2, the learning rate is lowered on the very epoch training stops,
# so the reduced rate is never actually used.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=1, min_lr=0.0001)

# Train the model
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=128,
                    validation_data=(x_val, y_val),
                    callbacks=[early_stop, reduce_lr])  # Added learning rate scheduling

Epoch 1/10
[1m 39/157[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m1:55[0m 976ms/step - accuracy: 0.5051 - loss: 0.6880

In [None]:
import matplotlib.pyplot as plt

# Side-by-side training curves: accuracy on the left, loss on the right
fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Training & validation accuracy per epoch
accuracy_ax.plot(history.history['accuracy'], label='Train Accuracy')
accuracy_ax.plot(history.history['val_accuracy'], label='Validation Accuracy')
accuracy_ax.set_title('Model Accuracy')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.set_xlabel('Epoch')
accuracy_ax.legend(loc='lower right')

# Training & validation loss per epoch
loss_ax.plot(history.history['loss'], label='Train Loss')
loss_ax.plot(history.history['val_loss'], label='Validation Loss')
loss_ax.set_title('Model Loss')
loss_ax.set_ylabel('Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.legend(loc='upper right')

fig.tight_layout()
plt.show()

In [None]:
# Score the trained model on the held-out test split; the model was compiled
# with a single metric, so the result unpacks into loss and accuracy
evaluation = model.evaluate(x_test, y_test)
test_loss, test_accuracy = evaluation

# Report both values
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

In [None]:
# Persist the trained model to disk in the native Keras format
saved_model_path = "sentiment_analysis_model.keras"
model.save(saved_model_path)

In [None]:
from tensorflow.keras.models import load_model
# Restore the model from the file written by the save cell
saved_model_path = "sentiment_analysis_model.keras"
model = load_model(saved_model_path)


In [None]:
def predict_sentiment(text):
    """Classify a raw review string as "Positive" or "Negative".

    The text is cleaned with ``preprocess_text``, encoded with the same
    integer scheme that ``imdb.load_data`` applies to the training reviews,
    padded to ``max_len`` and scored by the global ``model``.

    Args:
        text: The raw review text.

    Returns:
        "Positive" when the predicted probability is above 0.5, otherwise
        "Negative".
    """
    # NOTE(review): the training reviews are used exactly as imdb.load_data
    # returns them (no stopword removal or lemmatization), so this cleaning
    # step makes inference inputs differ from what the model was trained on.
    preprocessed_text = preprocess_text(text)
    word_index = imdb.get_word_index()

    # Defaults of imdb.load_data: 0 is padding, 1 marks the start of a review,
    # 2 stands for out-of-vocabulary words, and real words are shifted by 3.
    start_char, oov_char, index_from = 1, 2, 3

    # Convert the preprocessed text to a sequence of indices
    sequence = [start_char]
    for word in preprocessed_text.split():
        rank = word_index.get(word)
        token_id = oov_char if rank is None else rank + index_from
        # Ids outside the model's vocabulary would index past the embedding
        # table, so they are mapped to the OOV id as load_data does.
        if token_id >= max_words:
            token_id = oov_char
        sequence.append(token_id)

    padded_sequence = pad_sequences([sequence], maxlen=max_len)
    prediction = model.predict(padded_sequence)

    # predict() returns a (1, 1) array; pull out the scalar probability
    # instead of relying on the truthiness of an array comparison.
    probability = float(prediction[0][0])
    return "Positive" if probability > 0.5 else "Negative"

In [None]:
# Quick smoke test of the prediction helper on a short sentence
sample_review = "I am happy"
print(predict_sentiment(sample_review))