In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("./IMDB Dataset.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
data['sentiment'] = np.where(data['sentiment'] ==  'positive', 1, 0)
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [4]:
reviews = data['review'].to_numpy()
labels = data['sentiment'].to_numpy()

In [5]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(reviews, labels, test_size=0.2, random_state=0)

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_train)

train_sequence = tokenizer.texts_to_sequences(x_train)
train_padded = pad_sequences(train_sequence, maxlen=200, padding="post", truncating="post")

test_sequence = tokenizer.texts_to_sequences(x_test)
test_padded = pad_sequences(test_sequence, maxlen=200, padding="post", truncating="post")

In [8]:
lstm_model = tf.keras.models.Sequential([
    # 10000: vocablulary size, 32: embedding dimension, 200: sequence length, input length
    tf.keras.layers.Embedding(10000, 32, input_length=200),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(10, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])
lstm_model.compile(optimizer="adam", loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])

In [9]:
bi_lstm_model = tf.keras.models.Sequential([
    # 10000: vocablulary size, 32: embedding dimension, 200: sequence length, input length
    tf.keras.layers.Embedding(10000, 32, input_length=200),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(10, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])
bi_lstm_model.compile(optimizer="adam", loss=tf.keras.losses.BinaryCrossentropy(), metrics=['accuracy'])

In [10]:
lstm_model.fit(train_padded, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1eb72b23b90>

In [11]:
bi_lstm_model.fit(train_padded, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1eb050428d0>

In [12]:
lstm_model.evaluate(test_padded, y_test, verbose=2)

313/313 - 6s - loss: 0.3407 - accuracy: 0.8613 - 6s/epoch - 18ms/step


[0.34065714478492737, 0.861299991607666]

In [14]:
bi_lstm_model.evaluate(test_padded, y_test, verbose=2)

313/313 - 6s - loss: 0.6221 - accuracy: 0.8496 - 6s/epoch - 21ms/step


[0.6221182942390442, 0.8496000170707703]