In [1]:
import pandas as pd
import numpy as np
from keras.datasets import imdb

In [2]:
(X_train,y_train), (X_test,y_test) = imdb.load_data()

  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


**Exploring the Data and Vocabulary**

In [3]:
word_to_id = imdb.get_word_index()

In [4]:
id_to_word = {id:word for word,id in word_to_id.items()}

In [5]:
all_data = X_train + X_test

In [6]:
vocab = set()
for rev in all_data:
  vocab = vocab.union(set(rev))

In [7]:
len(vocab)

88585

In [8]:
max([i for word,i in word_to_id.items()])

88584

In [9]:
vocab_len = len(vocab) + 2 
#1 is added to hold 0 for keras pad_sequences

In [10]:
print([id_to_word[i] for i in X_train[6]])

['the', 'boiled', 'full', 'involving', 'to', 'impressive', 'boring', 'this', 'as', 'murdering', 'naschy', 'br', 'villain', 'council', 'suggestion', 'need', 'has', 'of', 'costumes', 'b', 'message', 'to', 'may', 'of', 'props', 'this', 'echoed', 'concentrates', 'concept', 'issue', 'skeptical', 'to', "god's", 'he', 'is', 'dedications', 'unfolds', 'movie', 'women', 'like', "isn't", 'surely', "i'm", 'rocketed', 'to', 'toward', 'in', "here's", 'for', 'from', 'did', 'having', 'because', 'very', 'quality', 'it', 'is', "captain's", 'starship', 'really', 'book', 'is', 'both', 'too', 'worked', 'carl', 'of', 'mayfair', 'br', 'of', 'reviewer', 'closer', 'figure', 'really', 'there', 'will', 'originals', 'things', 'is', 'far', 'this', 'make', 'mistakes', "kevin's", 'was', "couldn't", 'of', 'few', 'br', 'of', 'you', 'to', "don't", 'female', 'than', 'place', 'she', 'to', 'was', 'between', 'that', 'nothing', 'dose', 'movies', 'get', 'are', '498', 'br', 'yes', 'female', 'just', 'its', 'because', 'many', '

In [11]:
print(y_train[6])

1


In [12]:
max_rev_len = max([len(rev) for rev in all_data])

In [13]:
max_rev_len

2697

In [14]:
from keras.preprocessing.sequence import pad_sequences
X_train = pad_sequences(X_train,maxlen=max_rev_len)
X_test = pad_sequences(X_test,maxlen = max_rev_len)

In [15]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout

In [16]:

model = Sequential()
max_embedding_len = 32
vocabulary = len(vocab)
model.add(Embedding(input_dim = 88587,output_dim=max_embedding_len,input_length=max_rev_len))
model.add(Dropout(0.25))
#Randomly turninig off 25 % of neurons
model.add(LSTM(100))
model.add(Dropout(0.2))

model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [17]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 2697, 32)          2834784   
_________________________________________________________________
dropout (Dropout)            (None, 2697, 32)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               53200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 2,888,085
Trainable params: 2,888,085
Non-trainable params: 0
_________________________________________________________________


In [18]:
from keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss',patience=3)

In [33]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=20,callbacks=[early_stop])
#this shell should be run on machine with good GPU as the data is quit large.
#Google colab is recommended

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20


<keras.callbacks.History at 0x7fee9599bdd0>

In [34]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [35]:
pred = model.predict_classes(X_test)



In [36]:
print(classification_report(pred,y_test))

              precision    recall  f1-score   support

           0       0.91      0.80      0.85     14092
           1       0.78      0.89      0.83     10908

    accuracy                           0.84     25000
   macro avg       0.84      0.85      0.84     25000
weighted avg       0.85      0.84      0.84     25000



In [37]:
print(accuracy_score(pred,y_test))

0.84264


In [38]:
model.save('sent_model.h5')

In [19]:
from tensorflow.keras.models import load_model

In [20]:
model = load_model('sent_model.h5')

In [22]:
import re

In [23]:
def pred_on_new_data(model,string,word_to_id=word_to_id):
    max_len = 2697
    strip_special_chars = re.compile("[^A-Za-z0-9 ]+")
    
    string = string.lower().replace("<br />", " ")
    
    string=re.sub(strip_special_chars, "", string)
    
    words = string.split()
    
    test = [[word_to_id[word] if word in word_to_id else 0 for word in words]]
    
    test = pad_sequences(test,maxlen = max_len)
    
    pred_prob = model.predict(test)
    pred_class = model.predict_classes(test)
    
    return (pred_prob[0][0],pred_class[0][0])

In [27]:
string = 'This is absolutely disgusting. I do not like this very much.Negativity only'

In [28]:
pred_prob,pred_class = pred_on_new_data(model,string)

In [29]:
pred_class

0