<a href="https://colab.research.google.com/github/saikiranchetti18/sentiment_analysis/blob/main/MovieReview.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import pandas as pd 
import numpy as np 
import re    
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Embedding, LSTM, Dense 
from tensorflow.keras.models import load_model 
from tensorflow.keras.callbacks import ModelCheckpoint  


In [29]:

data = pd.read_csv('/content/drive/MyDrive/IMDB Dataset.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [30]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
english_stops =set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
def load_dataset():
    df = pd.read_csv('/content/drive/MyDrive/IMDB Dataset.csv')
    x_data = df['review']      
    y_data = df['sentiment']  
    x_data = x_data.replace({'<.*?>': ''}, regex = True)         
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)   
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  
    x_data = x_data.apply(lambda review: [w.lower() for w in review])
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()


In [32]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)


In [33]:
def get_max_length():
    length_reviews= []
    for review in x_train:
        length_reviews.append(len(review))
    return int(np.ceil(np.mean(length_reviews)))

In [34]:

token = Tokenizer(lower=False)    
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1 

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[1088  278    8 ...    0    0    0]
 [   8    3   21 ...    0    0    0]
 [  39   21 2027 ...    0    0    0]
 ...
 [ 611 1745   69 ...    0    0    0]
 [   2 1691 1220 ...    0    0    0]
 [ 499  315    3 ...    0    0    0]] 

Encoded X Test
 [[2251  616   70 ...    0    0    0]
 [   2  615 7015 ...    0    0    0]
 [   8 1153 2528 ...    0    0    0]
 ...
 [3624   23  878 ...    0    0    0]
 [ 394 1457  965 ...    0    0    0]
 [ 204   92    9 ... 1845  148  123]] 

Maximum review length:  130


In [35]:

EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT)) 
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 130, 32)           2944800   
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 2,969,697
Trainable params: 2,969,697
Non-trainable params: 0
_________________________________________________________________
None


In [37]:
checkpoint = ModelCheckpoint('models/LSTM.h5',monitor='accuracy',save_best_only=True,verbose=1)

In [38]:
model.fit(x_train, y_train, batch_size = 128, epochs =10, callbacks=[checkpoint])

Epoch 1/10

Epoch 00001: accuracy improved from -inf to 0.74240, saving model to models/LSTM.h5
Epoch 2/10

Epoch 00002: accuracy improved from 0.74240 to 0.92475, saving model to models/LSTM.h5
Epoch 3/10

Epoch 00003: accuracy improved from 0.92475 to 0.96145, saving model to models/LSTM.h5
Epoch 4/10

Epoch 00004: accuracy improved from 0.96145 to 0.97798, saving model to models/LSTM.h5
Epoch 5/10

Epoch 00005: accuracy improved from 0.97798 to 0.98595, saving model to models/LSTM.h5
Epoch 6/10

Epoch 00006: accuracy improved from 0.98595 to 0.98985, saving model to models/LSTM.h5
Epoch 7/10

Epoch 00007: accuracy did not improve from 0.98985
Epoch 8/10

Epoch 00008: accuracy did not improve from 0.98985
Epoch 9/10

Epoch 00009: accuracy improved from 0.98985 to 0.99142, saving model to models/LSTM.h5
Epoch 10/10

Epoch 00010: accuracy improved from 0.99142 to 0.99238, saving model to models/LSTM.h5


<keras.callbacks.History at 0x7f10ba782790>

In [40]:

predict_x=model.predict(x_test) 
y_pred=np.argmax(predict_x,axis=1)
true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

Correct Prediction: 5007
Wrong Prediction: 4993
Accuracy: 50.07


In [None]:
loaded_model = load_model('models/LSTM.h5')

In [46]:
def preprocess(review):
    regex = re.compile(r'[^a-zA-Z\s]')
    review = regex.sub('', review)
    print('Cleaned: ', review)

    words = review.split(' ')
    filtered = [w for w in words if w not in english_stops]
    filtered = ' '.join(filtered)
    filtered = [filtered.lower()]

    print('Filtered: ', filtered)
    tokenize_words = token.texts_to_sequences(filtered)
    tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')
    return (tokenize_words)

In [49]:

result=model.predict(preprocess(input()))
print(result)

the movie is good
Cleaned:  the movie is good
Filtered:  ['movie good']
[[0.9985483]]


In [50]:
print ("positive_review") if result>=0.6 else print("negative_review")


positive_review
