<a href="https://colab.research.google.com/github/rhaymisonbetini/KERAS_IMDB_50K/blob/main/SENTIMENTAL_IMDB_50K.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
import tensorflow as tf
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
# Make every GPU that TensorFlow detects visible to the runtime, and keep
# tf.function-decorated code from being forced to run eagerly.
physical_devices = tf.config.list_physical_devices(device_type='GPU')
tf.config.set_visible_devices(devices=physical_devices, device_type='GPU')
tf.config.run_functions_eagerly(run_eagerly=False)

In [3]:
# Install the Kaggle API token, then download and extract the IMDB 50k reviews dataset.
# ~/.kaggle may not exist on a fresh runtime, so create it before copying the token into it.
!mkdir -p ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
# -o overwrites already-extracted files so re-running this cell does not block on a prompt.
!unzip -o imdb-dataset-of-50k-movie-reviews.zip

Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 93% 24.0M/25.7M [00:02<00:00, 20.0MB/s]
100% 25.7M/25.7M [00:02<00:00, 11.5MB/s]
Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


In [14]:
# Load the extracted reviews CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/IMDB Dataset.csv')

In [15]:
# Compiled once at definition time instead of on every call.
_HTML_TAG_PATTERN = re.compile('<.*?>')


def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span replaced by a single space.

    A space (rather than an empty string) is substituted so that words on
    either side of a tag such as ``<br />`` stay separate tokens instead of
    being fused together (``"good<br />bad"`` -> ``"good bad"``).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Text without any tags is returned unchanged.
    """
    return _HTML_TAG_PATTERN.sub(' ', text)

# Clean every review by running it through remove_html_tags, overwriting the column.
df['review'] = df['review'].map(remove_html_tags)

In [16]:
# Preview the first five rows of the cleaned data.
df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [18]:
# Step 3: Tokenization
# Fit a word index on all reviews, then encode each review as a list of
# integer word ids (vocabulary capped via num_words=20000).
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(texts=df['review'])
sequences = tokenizer.texts_to_sequences(texts=df['review'])

In [19]:
# Bring every encoded review to exactly 200 ids; short reviews are zero-padded
# at the front and long ones are truncated from the front (the library defaults,
# spelled out here).
data = pad_sequences(sequences, maxlen=200, padding='pre', truncating='pre')

In [20]:
# Binary targets: 1 where the sentiment is 'positive', 0 for anything else.
labels = (df['sentiment'] == 'positive').astype('int64').values

In [21]:
# Train on the first 45,000 rows and validate on rows 45,000-49,999.
X_train, y_train = data[:45000], labels[:45000]
X_val, y_val = data[45000:50000], labels[45000:50000]

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

In [24]:
# Binary sentiment classifier: embedding -> bidirectional LSTM -> dropout ->
# LSTM -> single sigmoid unit. input_dim matches the tokenizer's num_words and
# input_length matches the padded sequence length.
model = Sequential([
    Embedding(input_dim=20000, output_dim=128, input_length=200),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.3),
    LSTM(64),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 200, 128)          2560000   
                                                                 
 bidirectional_1 (Bidirecti  (None, 200, 256)          263168    
 onal)                                                           
                                                                 
 dropout_1 (Dropout)         (None, 200, 256)          0         
                                                                 
 lstm_3 (LSTM)               (None, 64)                82176     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 2905409 (11.08 MB)
Trainable params: 2905409 (11.08 MB)
Non-trainable params: 0 (0.00 Byte)
______________

In [25]:
import pickle

In [26]:
# Stop training once validation loss fails to improve for one epoch and roll the
# model back to its best weights. Patience must be smaller than the number of
# epochs, otherwise the callback can never trigger during the run.
early_stopping = EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)

model.fit(
    X_train,
    y_train,
    epochs=3,
    batch_size=64,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
)

# Persist the fitted tokenizer so new text can later be encoded with the same word index.
with open('/content/sample_data/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [27]:
# Reload the tokenizer that was pickled after training.
with open('/content/sample_data/tokenizer.pickle', mode='rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [31]:
# Hand-written reviews that a good model should classify as positive.
positive_reviews = [
    "The direction and cinematography were absolutely stunning.",
    "I was deeply moved by the incredible performances.",
    "What an original concept, executed to perfection!",
    "The film's score was haunting and beautifully orchestrated.",
    "It's a masterpiece that stands the test of time."
]

# Encode and score all reviews in a single predict call. Cell-specific variable
# names are used so the notebook-level `sequences` built in the tokenization
# cell is not overwritten (which would corrupt a later re-run of the padding cell).
positive_sequences = tokenizer.texts_to_sequences(positive_reviews)
positive_padded = pad_sequences(positive_sequences, maxlen=200)
positive_scores = model.predict(positive_padded)[:, 0]

# Scores above 0.5 are reported as positive sentiment.
for score in positive_scores:
    sentiment = "Positive" if score > 0.5 else "Negative"
    print(f"Review sentiment: {sentiment} (Score: {score})")

Review sentiment: Negative (Score: 0.37391674518585205)
Review sentiment: Positive (Score: 0.7835500240325928)
Review sentiment: Positive (Score: 0.7811959385871887)
Review sentiment: Positive (Score: 0.832990825176239)
Review sentiment: Positive (Score: 0.5512983798980713)


In [32]:
# Hand-written reviews that a good model should classify as negative.
negative_reviews = [
    "The plot was thin and completely unoriginal.",
    "I found the acting to be lackluster and uninspired.",
    "The pacing was sluggish and the film dragged on.",
    "There were plot holes big enough to drive a truck through.",
    "The special effects were cheap-looking and took me out of the experience."
]

# Encode and score all reviews in a single predict call. Cell-specific variable
# names are used so the notebook-level `sequences` built in the tokenization
# cell is not overwritten (which would corrupt a later re-run of the padding cell).
negative_sequences = tokenizer.texts_to_sequences(negative_reviews)
negative_padded = pad_sequences(negative_sequences, maxlen=200)
negative_scores = model.predict(negative_padded)[:, 0]

# Scores above 0.5 are reported as positive sentiment.
for score in negative_scores:
    sentiment = "Positive" if score > 0.5 else "Negative"
    print(f"Review sentiment: {sentiment} (Score: {score})")

Review sentiment: Negative (Score: 0.044506631791591644)
Review sentiment: Negative (Score: 0.11296072602272034)
Review sentiment: Negative (Score: 0.236013263463974)
Review sentiment: Positive (Score: 0.6113855838775635)
Review sentiment: Negative (Score: 0.36765921115875244)
