In [94]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/simple-rnn/embedding.ipynb
/kaggle/input/simple-rnn/main.py
/kaggle/input/simple-rnn/simple_rnn_imdb.h5
/kaggle/input/simple-rnn/prediction.ipynb
/kaggle/input/simple-rnn/requirements.txt
/kaggle/input/simple-rnn/simplernn.ipynb


In [95]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
import numpy as np

# Checking what one_hot encoding does to sentences

In [96]:
sent = [
    "The quick brown fox jumps over the lazy dog.",
    "Python is a versatile and powerful programming language.",
    "The sun always rises in the east and sets in the west.",
    "Artificial intelligence is rapidly changing our world.",
    "Reading a good book is a wonderful way to relax."
]

In [97]:
voc_size = 10000

In [98]:
one_hot_rep = [one_hot(_, voc_size) for _ in sent]
one_hot_rep

[[6378, 7221, 5532, 5801, 2817, 9238, 6378, 1459, 581],
 [1827, 4734, 71, 5031, 6547, 2896, 5441, 5809],
 [6378, 2138, 7678, 3626, 8509, 6378, 6555, 6547, 5049, 8509, 6378, 2605],
 [8787, 5014, 4734, 1642, 3968, 1912, 5722],
 [2630, 71, 1977, 4420, 4734, 71, 6152, 6365, 3373, 4771]]

In [99]:
sent_length = 10
embedd_doc = pad_sequences(one_hot_rep , padding='pre' , maxlen= sent_length)
embedd_doc

array([[   0, 6378, 7221, 5532, 5801, 2817, 9238, 6378, 1459,  581],
       [   0,    0, 1827, 4734,   71, 5031, 6547, 2896, 5441, 5809],
       [7678, 3626, 8509, 6378, 6555, 6547, 5049, 8509, 6378, 2605],
       [   0,    0,    0, 8787, 5014, 4734, 1642, 3968, 1912, 5722],
       [2630,   71, 1977, 4420, 4734,   71, 6152, 6365, 3373, 4771]],
      dtype=int32)

# Loading Data

In [100]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.utils import pad_sequences

In [101]:
max_features = 10000
(X_train , y_train) , (X_test, y_test) = imdb.load_data(num_words=max_features)
print(f"{X_train.shape} , {X_test.shape}")
print(f"{y_train.shape} , {y_test.shape}")

(25000,) , (25000,)
(25000,) , (25000,)


In [102]:
word_index = imdb.get_word_index()
word_index

{'fawn': 34701,
 'tsukino': 52006,
 'nunnery': 52007,
 'sonja': 16816,
 'vani': 63951,
 'woods': 1408,
 'spiders': 16115,
 'hanging': 2345,
 'woody': 2289,
 'trawling': 52008,
 "hold's": 52009,
 'comically': 11307,
 'localized': 40830,
 'disobeying': 30568,
 "'royale": 52010,
 "harpo's": 40831,
 'canet': 52011,
 'aileen': 19313,
 'acurately': 52012,
 "diplomat's": 52013,
 'rickman': 25242,
 'arranged': 6746,
 'rumbustious': 52014,
 'familiarness': 52015,
 "spider'": 52016,
 'hahahah': 68804,
 "wood'": 52017,
 'transvestism': 40833,
 "hangin'": 34702,
 'bringing': 2338,
 'seamier': 40834,
 'wooded': 34703,
 'bravora': 52018,
 'grueling': 16817,
 'wooden': 1636,
 'wednesday': 16818,
 "'prix": 52019,
 'altagracia': 34704,
 'circuitry': 52020,
 'crotch': 11585,
 'busybody': 57766,
 "tart'n'tangy": 52021,
 'burgade': 14129,
 'thrace': 52023,
 "tom's": 11038,
 'snuggles': 52025,
 'francesco': 29114,
 'complainers': 52027,
 'templarios': 52125,
 '272': 40835,
 '273': 52028,
 'zaniacs': 52130,

In [103]:
max_len = 500
X_train = sequence.pad_sequences(X_train, maxlen = max_len)
X_test = sequence.pad_sequences(X_test, maxlen = max_len)
X_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

# Train Simple RNN

In [104]:
model = Sequential()
model.add(Embedding(max_features , 128,input_length=max_len)) # Embedding layer 
model.add(SimpleRNN(128,activation='relu'))
model.add(Dense(1,activation='sigmoid'))

In [105]:
model.build(input_shape=(None, max_len))
model.summary()

In [106]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'] )

In [107]:
from tensorflow.keras.callbacks import EarlyStopping
earlystopping = EarlyStopping(monitor='val_loss' , patience=5 ,restore_best_weights=True)
earlystopping

<keras.src.callbacks.early_stopping.EarlyStopping at 0x7ecb0a36b010>

In [108]:
history = model.fit(
    X_train,y_train,epochs=10,
    batch_size=32,
    validation_split= 0.3,
    callbacks=[earlystopping]
)

Epoch 1/10
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 44ms/step - accuracy: 0.5778 - loss: 40337141760.0000 - val_accuracy: 0.5937 - val_loss: 0.6446
Epoch 2/10
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 41ms/step - accuracy: 0.6693 - loss: 0.6054 - val_accuracy: 0.6949 - val_loss: 0.5798
Epoch 3/10
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 41ms/step - accuracy: 0.8036 - loss: 0.4744 - val_accuracy: 0.6984 - val_loss: 0.5397
Epoch 4/10
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 41ms/step - accuracy: 0.8540 - loss: 0.3602 - val_accuracy: 0.7932 - val_loss: 0.4508
Epoch 5/10
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 41ms/step - accuracy: 0.9017 - loss: 0.2603 - val_accuracy: 0.7967 - val_loss: 0.4482
Epoch 6/10
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 41ms/step - accuracy: 0.9179 - loss: 0.2166 - val_accuracy: 0.7925 - val_loss: 0.4792
Epoch 

# Model Save

In [109]:
model.save('simple_rnn.h5')

# Model Predictions

In [110]:
from tensorflow.keras.models import load_model

In [111]:
word_index = imdb.get_word_index()
reverse_word_index= {value:key for key , value in word_index.items()}

In [112]:
model = load_model('/kaggle/working/simple_rnn.h5')
model.summary()

In [113]:
model.get_weights()

[array([[ 0.01031565,  0.5066088 ,  0.1652798 , ..., -0.48290038,
         -0.45486096, -0.2791064 ],
        [-0.07264405,  0.07480893,  0.04717711, ..., -0.08136766,
         -0.05114775, -0.02493075],
        [-0.0288228 ,  0.1861178 , -0.12359876, ..., -0.1718672 ,
         -0.12255639, -0.18061198],
        ...,
        [ 0.0086747 ,  0.0086452 ,  0.08603252, ..., -0.009713  ,
          0.05308983, -0.0252499 ],
        [ 0.04090451,  0.06847464,  0.01871277, ..., -0.05820893,
          0.0647435 , -0.06157221],
        [ 0.0021202 , -0.05052185, -0.09516062, ...,  0.11013264,
         -0.02169516, -0.05956076]], dtype=float32),
 array([[ 0.12204533, -0.18442635, -0.06280167, ..., -0.05893873,
         -0.1108468 ,  0.03163798],
        [ 0.00207872, -0.04468548,  0.05143514, ..., -0.05277706,
          0.17383881, -0.18644015],
        [ 0.02615026,  0.09490366, -0.07995023, ..., -0.06702261,
          0.06994058,  0.16564047],
        ...,
        [-0.13302717,  0.03916568,  0.1

In [114]:
def decode_review(encoded_reviews):
    return ' '.join([reverse_word_index.get(i-3, "?")] for i in encoded_review)

def preprocess_text(text):
    words = text.lower().split()
    encoded_review = [word_index.get(word,2)+3 for word in words]
    padded_review = sequence.pad_sequences([encoded_review] , maxlen=1000)
    return padded_review

In [115]:
def predict_sentiment(review):
    preprocessed_review = preprocess_text(review)
    prediction = model.predict(preprocessed_review)
    sentiment = 'Positive' if prediction[0][0] > 0.5 else 'Negative'
    return sentiment, prediction[0][0]

In [116]:
example_review = "This movie was trash"
sentiment, prediction = predict_sentiment(example_review)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 372ms/step


In [117]:
sentiment , prediction

('Negative', 0.24240339)

In [118]:
example_review = "This movie was awesome"
sentiment, prediction = predict_sentiment(example_review)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step


In [119]:
sentiment , prediction

('Positive', 0.8234848)