In [None]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re

In [None]:
data = pd.read_csv('IMDB Dataset.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
data.shape

(50000, 2)

In [None]:
data["sentiment"].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
english_stops = set(stopwords.words('english'))

In [None]:
def load_dataset():
    df = pd.read_csv('IMDB Dataset.csv')
    x_data = df['review']       # Reviews/Input
    y_data = df['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case
    
    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

In [None]:
x_data, y_data = load_dataset()

In [None]:
x_data.head()

0    [one, reviewers, mentioned, watching, oz, epis...
1    [a, wonderful, little, production, the, filmin...
2    [i, thought, wonderful, way, spend, time, hot,...
3    [basically, family, little, boy, jake, thinks,...
4    [petter, mattei, love, time, money, visually, ...
Name: review, dtype: object

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

In [None]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

In [None]:
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

In [None]:
print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[   23 12223     3 ...     0     0     0]
 [    8     3  1116 ...     0     0     0]
 [  170 15390   285 ...    63   196     1]
 ...
 [ 2025    32  1121 ...  1563  1020    12]
 [ 5678 27883   332 ...     0     0     0]
 [    1   422  2765 ...  2240    60  1911]] 

Encoded X Test
 [[    8    45   581 ...     0     0     0]
 [   50   909   277 ...     0     0     0]
 [   51     5 54657 ...     0     0     0]
 ...
 [  452    99    63 ...  9536   181    68]
 [    8  5201  2121 ...     0     0     0]
 [    1   772     3 ...     0     0     0]] 

Maximum review length:  131


In [None]:
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 131, 32)           2959424   
                                                                 
 lstm_2 (LSTM)               (None, 64)                24832     
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 2,984,321
Trainable params: 2,984,321
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [None]:
model.fit(x_train, y_train, batch_size = 128, epochs = 10, callbacks=[checkpoint])


Epoch 1/10
Epoch 1: accuracy improved from -inf to 0.77043, saving model to models/LSTM.h5
Epoch 2/10
Epoch 2: accuracy improved from 0.77043 to 0.92930, saving model to models/LSTM.h5
Epoch 3/10
Epoch 3: accuracy improved from 0.92930 to 0.96172, saving model to models/LSTM.h5
Epoch 4/10
Epoch 4: accuracy improved from 0.96172 to 0.97380, saving model to models/LSTM.h5
Epoch 5/10
Epoch 5: accuracy improved from 0.97380 to 0.98462, saving model to models/LSTM.h5
Epoch 6/10
Epoch 6: accuracy improved from 0.98462 to 0.98760, saving model to models/LSTM.h5
Epoch 7/10
Epoch 7: accuracy improved from 0.98760 to 0.99018, saving model to models/LSTM.h5
Epoch 8/10
Epoch 8: accuracy did not improve from 0.99018
Epoch 9/10
Epoch 9: accuracy improved from 0.99018 to 0.99252, saving model to models/LSTM.h5
Epoch 10/10
Epoch 10: accuracy improved from 0.99252 to 0.99395, saving model to models/LSTM.h5


<keras.callbacks.History at 0x7fe386bab410>

In [None]:
y_pred = model.predict(x_test, batch_size = 128)

In [None]:
y_pred

array([[0.9990225 ],
       [0.00295034],
       [0.00262812],
       ...,
       [0.99617434],
       [0.9987327 ],
       [0.00246713]], dtype=float32)

In [None]:


true = 0
for i, y in enumerate(y_test):
    if y_pred[i]<=0.5:
       pred = 0
    else:
      pred = 1
    if y == pred:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

Correct Prediction: 8613
Wrong Prediction: 1387
Accuracy: 86.13


In [None]:
model.save("models/LSTM.h5")

In [None]:
loaded_model = load_model('models/LSTM.h5')

In [None]:
review = str(input('Movie Review: '))

Movie Review: Nothing was typical about this. Everything was beautifully done in this movie, the story, the flow, the scenario, everything. I highly recommend it for mystery lovers, for anyone who wants to watch a good movie!


In [None]:
# Pre-process input
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print('Cleaned: ', review)

words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]

print('Filtered: ', filtered)

Cleaned:  Nothing was typical about this Everything was beautifully done in this movie the story the flow the scenario everything I highly recommend it for mystery lovers for anyone who wants to watch a good movie
Filtered:  ['nothing typical everything beautifully done movie story flow scenario everything i highly recommend mystery lovers anyone wants watch good movie']


In [None]:
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')
print(tokenize_words)

[[  76  690  171 1198  129    3   13 2772 2605  171    1  438  284  703
  1678  153  393   33    9    3    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0]]


In [None]:
result = loaded_model.predict(tokenize_words)
print(result)

[[0.9988941]]


In [None]:
if result >= 0.7:
    print('positive')
else:
    print('negative')

positive


In [55]:
dict1 = {
    "name" : "subhash",
    "city" : "Bareilly"
}

In [56]:
dict1

{'city': 'Bareilly', 'name': 'subhash'}