In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

In [3]:
# Load dataset
dataset = pd.read_csv('IMDB Dataset.csv')

# Split dataset into features and labels
X = dataset['review']
y = dataset['sentiment']

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X)
X = tokenizer.texts_to_sequences(X)

# Pad sequences to make them of equal length
X = pad_sequences(X, maxlen=100)

# Convert labels to numerical values (0 for negative, 1 for positive)
y = (y == 'positive').astype(int)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
# Define the model
model = Sequential()
model.add(Embedding(10000, 32, input_length=100))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)




Epoch 1/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - accuracy: 0.7113 - loss: 0.5085 - val_accuracy: 0.8652 - val_loss: 0.3216
Epoch 2/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - accuracy: 0.9417 - loss: 0.1616 - val_accuracy: 0.8470 - val_loss: 0.4227
Epoch 3/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - accuracy: 0.9910 - loss: 0.0320 - val_accuracy: 0.8350 - val_loss: 0.6453
Epoch 4/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - accuracy: 0.9987 - loss: 0.0068 - val_accuracy: 0.8378 - val_loss: 0.8456
Epoch 5/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - accuracy: 0.9990 - loss: 0.0037 - val_accuracy: 0.8325 - val_loss: 1.0125
Epoch 6/10
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - accuracy: 0.9960 - loss: 0.0122 - val_accuracy: 0.8240 - val_loss: 0.9883
Epoch 7/10
[1m1

<keras.src.callbacks.history.History at 0x11fc3f521d0>

In [6]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print('Test Accuracy:', accuracy)


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8332 - loss: 1.0699
Test Accuracy: 0.8317000269889832


In [8]:
# Load dataset
dataset = pd.read_csv('IMDB Dataset.csv')

# Assume 'review_text' is the column containing the reviews
review_to_predict = dataset['review'].iloc[0]  # Selecting the first review for prediction
original_sentiment = dataset['sentiment'].iloc[0]  # Original sentiment label

# Tokenize and pad the review
review_sequence = tokenizer.texts_to_sequences([review_to_predict])
review_sequence = pad_sequences(review_sequence, maxlen=100)

# Predict sentiment
predicted_sentiment = model.predict(review_sequence)

# Convert predicted sentiment to 'positive' or 'negative'
predicted_sentiment_label = 'positive' if predicted_sentiment > 0.5 else 'negative'

# Print original and predicted sentiment
print("Original Sentiment:", original_sentiment)
print("Predicted Sentiment:", predicted_sentiment_label)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step
Original Sentiment: positive
Predicted Sentiment: positive
