<a href="https://colab.research.google.com/github/seuha516/practice-machine-learning/blob/main/IMDB_%EA%B0%90%EC%84%B1%EB%B6%84%EB%A5%98.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from tensorflow.keras.datasets import imdb
(train_input, train_target), (test_input, test_target) = imdb.load_data(num_words = 1000)

In [2]:
from sklearn.model_selection import train_test_split
train_input, val_input, train_target, val_target = train_test_split(train_input, train_target, test_size = 0.2, stratify = train_target, random_state = 36)

In [3]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
train_seq = pad_sequences(train_input, maxlen = 200)
val_seq = pad_sequences(val_input, maxlen = 200)

In [None]:
from tensorflow import keras
model = keras.Sequential()
model.add(keras.layers.Embedding(1000, 64, input_length = 200))
model.add(keras.layers.LSTM(8, dropout = 0.3, return_sequences = True))
model.add(keras.layers.LSTM(8, dropout = 0.3))
model.add(keras.layers.Dense(1, activation = 'sigmoid'))
model.summary()

In [None]:
rmsprop=keras.optimizers.RMSprop(learning_rate = 1e-4)
model.compile(optimizer = rmsprop, loss = 'binary_crossentropy', metrics = ['accuracy'])
checkpoint_cb = keras.callbacks.ModelCheckpoint('best-model.h5')
early_stopping_cb = keras.callbacks.EarlyStopping(patience = 3, restore_best_weights = True)
history = model.fit(train_seq, train_target, epochs = 100, batch_size = 64,
                    validation_data = (val_seq, val_target),
                    callbacks = [checkpoint_cb, early_stopping_cb])

In [6]:
import numpy as np
rnn_model = keras.models.load_model('best-model.h5')
positive_input = []
positive_target = np.array([1] * 12500)
negative_input = []
negative_target = np.array([0] * 12500)

for i in range(25000):
  if test_target[i] == 1:
    positive_input.append(test_input[i])
  else:
    negative_input.append(test_input[i])

test_seq = pad_sequences(test_input, maxlen = 200)
positive_seq = pad_sequences(positive_input, maxlen = 200)
negative_seq = pad_sequences(negative_input, maxlen = 200)
positive_input=np.array(positive_input)
negative_input=np.array(negative_input)

print("=== 성능 평가 ===")
print("전체 테스트")
rnn_model.evaluate(test_seq, test_target)
print("긍정 문장 테스트")
rnn_model.evaluate(positive_seq, positive_target)
print("부정 문장 테스트")
rnn_model.evaluate(negative_seq, negative_target)



=== 성능 평가 ===
전체 테스트
긍정 문장 테스트
부정 문장 테스트


[0.2992817461490631, 0.8808000087738037]

In [27]:
import re
word_to_index = imdb.get_word_index()
def predict(sentence):
  print(sentence)
  sentence = re.sub('[^0-9a-zA-Z ]', '', sentence).lower()
  encoded = []
  for x in sentence.split():
    try :
      if word_to_index[x] <= 1000:
        encoded.append(word_to_index[x] + 3)
      else:
        encoded.append(2)
    except KeyError:
      encoded.append(2)
  data = pad_sequences([encoded], maxlen = 200)
  score = float(rnn_model.predict(data))
  if(score > 0.5):
    print("{:.3f}% 확률로 긍정\n".format(score * 100))
  else:
    print("{:.3f}% 확률로 부정\n".format((1 - score) * 100))

test_sentences = [
                  "Terrible plot and script, boring and pointless",
                  "Dr Stupid in the Mashed Potatoes",
                  "This is why we go to the movies",
                  "Tom Cruise really knows how a blockbuster film is made.",
                  "Either I am growing older Or MCU is becoming more Childish",
                  "Watched it twice and will watch it again and again",
                  "I thought I was watching a Thor parody.",
                  "Can't remember the last time I smiled so much in the cinema!",

                  "I love this movie. It's wonderful!",
                  "I lov this movie. It's wonderfull!",
                  "One of the worst superhero movies out there",
                  "One of the wrost superhero movies out there",
                  "A perfect storm",
                  "Frankly, I think this is a wild goose chase.",
                  "I don't know why everyone hates this movie."
                  ]

for sentence in test_sentences:
  predict(sentence)

Terrible plot and script, boring and pointless
92.702% 확률로 부정

Dr Stupid in the Mashed Potatoes
83.318% 확률로 부정

This is why we go to the movies
60.113% 확률로 부정

Tom Cruise really knows how a blockbuster film is made.
67.803% 확률로 긍정

Either I am growing older Or MCU is becoming more Childish
67.356% 확률로 부정

Watched it twice and will watch it again and again
71.477% 확률로 긍정

I thought I was watching a Thor parody.
51.389% 확률로 부정

Can't remember the last time I smiled so much in the cinema!
63.911% 확률로 긍정

I love this movie. It's wonderful!
70.926% 확률로 긍정

I lov this movie. It's wonderfull!
55.801% 확률로 부정

One of the worst superhero movies out there
89.323% 확률로 부정

One of the wrost superhero movies out there
51.870% 확률로 부정

A perfect storm
65.659% 확률로 긍정

Frankly, I think this is a wild goose chase.
51.089% 확률로 긍정

I don't know why everyone hates this movie.
50.949% 확률로 긍정

