In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
# Load the IMDB dataset
df = pd.read_csv('IMDB Dataset.csv')


In [3]:
# Preprocess data
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
X = df['review'].values
y = df['sentiment'].values

In [4]:
# Use TensorFlow tokenizer to tokenize and pad sequences
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_sequences, padding='post', maxlen=200)


In [5]:
# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X_padded, y, test_size=0.2, random_state=42)


In [6]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout

# Build the CNN model
model = Sequential([
    Embedding(input_dim=124254, output_dim=128, input_length=200, input_shape=(200,)),  # Embedding layer with updated input_dim
    Conv1D(128, 5, activation='relu'),  # Convolution layer with 128 filters
    MaxPooling1D(pool_size=4),  # Max pooling layer
    Conv1D(128, 5, activation='relu'),  # Another convolution layer
    MaxPooling1D(pool_size=4),  # Max pooling layer
    Flatten(),  # Flatten the output to feed into the fully connected layer
    Dense(64, activation='relu'),  # Fully connected layer with 64 neurons
    Dropout(0.5),  # Dropout to avoid overfitting
    Dense(1, activation='sigmoid')  # Output layer (sigmoid for binary classification)
])

# Compile the model with a simple optimizer and loss function (just for the summary)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary to check the layers and parameters
model.summary()



  super().__init__(**kwargs)


In [7]:
# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [8]:
# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_val, y_val), verbose=1)


Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 179ms/step - accuracy: 0.6619 - loss: 0.5458 - val_accuracy: 0.8917 - val_loss: 0.2549
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 178ms/step - accuracy: 0.9218 - loss: 0.2102 - val_accuracy: 0.8957 - val_loss: 0.2543
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 198ms/step - accuracy: 0.9599 - loss: 0.1180 - val_accuracy: 0.8895 - val_loss: 0.3181
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 190ms/step - accuracy: 0.9846 - loss: 0.0506 - val_accuracy: 0.8825 - val_loss: 0.4010
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 188ms/step - accuracy: 0.9914 - loss: 0.0285 - val_accuracy: 0.8787 - val_loss: 0.6465
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 188ms/step - accuracy: 0.9929 - loss: 0.0212 - val_accuracy: 0.8733 - val_loss: 0.5931
Epoc

In [9]:
# Save the trained model
model.save("imdb_sentiment_model_cnn.h5")
print("Model saved as imdb_sentiment_model_cnn.h5")



Model saved as imdb_sentiment_model_cnn.h5


In [10]:
# Evaluate the model
loss, accuracy = model.evaluate(X_val, y_val)
print(f'Validation Loss: {loss:.4f}')
print(f'Validation Accuracy: {accuracy:.4f}')

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.8831 - loss: 0.9343
Validation Loss: 0.9747
Validation Accuracy: 0.8845


In [11]:
# Prediction function for RNN
def predict_sentiment_rnn(review):
    sequence = tokenizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequence, maxlen=200)
    prediction = model.predict(padded_sequence)
    return "positive" if prediction[0][0] > 0.5 else "negative"

In [12]:
# Example usage
new_review = "This movie had so much potential, but it fell flat in every way. The characters were one-dimensional, and the ending was predictable."
print(f"RNN Prediction: {predict_sentiment_rnn(new_review)}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 189ms/step
RNN Prediction: negative


To calculate the unique words in the datasets

In [13]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer  # Updated import path

# Load the CSV file into a pandas DataFrame
# Replace 'your_file.csv' with the actual path to your CSV file
df = pd.read_csv('IMDB Dataset.csv')

# Extract the review column from the DataFrame
reviews = df['review'].astype(str).tolist()

# Initialize the tokenizer
tokenizer = Tokenizer()

# Fit the tokenizer on the review data
tokenizer.fit_on_texts(reviews)

# Get the number of unique words in the dataset
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 for padding token

# Print the vocabulary size
print("Vocabulary size:", vocab_size)



Vocabulary size: 124253
