# RNN (Simple or Standard)

## problem definition
- text classification

### import required packages

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

### load the data

In [4]:
from tensorflow.keras.datasets import imdb

# maxinum words per review = 10000
(input_train, y_train), (input_test, y_test) = imdb.load_data(num_words=10000)

In [8]:
# get the embeddings
embeddings = imdb.get_word_index()

In [28]:
input_train.shape

(25000, 500)

In [19]:
y_train

array([1, 0, 0, ..., 0, 1, 0])

In [11]:
embeddings.get('product')

2217

### pre-processing

In [13]:
from tensorflow.keras.preprocessing import sequence

# reshaping the input_train with 500 values
input_train = sequence.pad_sequences(input_train, maxlen=500)

In [14]:
input_train

array([[   0,    0,    0, ...,   19,  178,   32],
       [   0,    0,    0, ...,   16,  145,   95],
       [   0,    0,    0, ...,    7,  129,  113],
       ...,
       [   0,    0,    0, ...,    4, 3586,    2],
       [   0,    0,    0, ...,   12,    9,   23],
       [   0,    0,    0, ...,  204,  131,    9]], dtype=int32)

### train the model

In [18]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, SimpleRNN

# create a model
model = Sequential()

# add embedding layer
# 10000: max length of the words
# 32: output length
model.add(Embedding(10000, 32))

# add simple RNN layer to connect all hidden layers to get output from previous layer
model.add(SimpleRNN(32))

# add hidden layer
model.add(Dense(128))
model.add(Dense(64))
model.add(Dense(32))
model.add(Dense(16))

# add the output layer
model.add(Dense(1, activation="sigmoid"))

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [17]:
# train the model
model.fit(input_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x3051a0550>

### model evaluation

In [25]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize

# get the words and their embeddings
word_index = imdb.get_word_index()

def preprocess_review(review):
    # Tokenize the review
    tokens = word_tokenize(review.lower())
    sequences = [word_index.get(word, 0) for word in tokens]
    
    # Pad the sequence
    padded_sequence = pad_sequences([sequences], maxlen=500)
    return padded_sequence

In [27]:
# new reviews to predict the sentiment
reviews = [
    "I really loved this movie! It was fantastic.",
    "This was the worst film I have ever seen.",
    "this product worst",
    "this a very bad product"
]


# convert all reviews to their embeddings
preprocessed_reviews = [preprocess_review(review) for review in reviews]

# predict each review using model
predictions = [model.predict(review) for review in preprocessed_reviews]
print(predictions)

for review, pred in zip(reviews, predictions):
    sentiment = 'Positive' if pred[0][0] > 0.5 else 'Negative'
    print(f'Review: "{review}" => Sentiment: {sentiment} (Probability: {pred[0][0]:.4f})')

[array([[0.54553884]], dtype=float32), array([[0.5546291]], dtype=float32), array([[0.5489311]], dtype=float32), array([[0.5323883]], dtype=float32)]
Review: "I really loved this movie! It was fantastic." => Sentiment: Positive (Probability: 0.5455)
Review: "This was the worst film I have ever seen." => Sentiment: Positive (Probability: 0.5546)
Review: "this product worst" => Sentiment: Positive (Probability: 0.5489)
Review: "this a very bad product" => Sentiment: Positive (Probability: 0.5324)
