# Learnt From

In [3]:
#https://towardsdatascience.com/sentiment-analysis-using-lstm-step-by-step-50d074f09948
#and
#udemy "mosh"
#and
#https://www.kaggle.com/lakshmi25npathi/sentiment-analysis-of-imdb-movie-reviews

# Loading the data

In [4]:
import pandas as pd
imdb_data=pd.read_csv('IMDB_Dataset.csv')
print(imdb_data.shape)
imdb_data.head(5)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Analysis

In [5]:
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


# Sentiment Count

In [6]:
imdb_data.sentiment.value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

# Removing html stripps

In [7]:
from bs4 import BeautifulSoup

def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

imdb_data.review=imdb_data.review.apply(strip_html)

In [8]:
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# All reviews in lower case

In [9]:
imdb_data.review=imdb_data.review.str.lower()
imdb_data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


# Remove punchuations

In [10]:
from string import punctuation
print(punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [11]:
def remove_punc(text):
    all_text=''.join([c for c in text if c not in punctuation])
    return all_text
imdb_data.review=imdb_data.review.apply(remove_punc)

In [12]:
imdb_data.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


# Word dictionary creation

In [13]:
def total_words(imdb_data):
    all_text=''.join([c for c in imdb_data.review])
    return all_text

In [14]:
all_text=total_words(imdb_data)
all_text=all_text.split(" ")

In [15]:
all_text[:10]

['one',
 'of',
 'the',
 'other',
 'reviewers',
 'has',
 'mentioned',
 'that',
 'after',
 'watching']

In [16]:
from collections import Counter
count_words=Counter(all_text)
print(list(count_words.items())[:20])

[('one', 49428), ('of', 287970), ('the', 647301), ('other', 17737), ('reviewers', 500), ('has', 32858), ('mentioned', 1024), ('that', 135525), ('after', 13999), ('watching', 8737), ('just', 34470), ('1', 2234), ('oz', 240), ('episode', 3005), ('youll', 2596), ('be', 52615), ('hooked', 272), ('they', 41288), ('are', 58120), ('right', 6121)]


In [17]:
from operator import itemgetter
total_words=len(all_text)
sorted_words = sorted(count_words.items(),key=itemgetter(1),reverse=True)
print(sorted_words[:20])

[('the', 647301), ('and', 319230), ('a', 317795), ('of', 287970), ('to', 266062), ('is', 209832), ('in', 182408), ('it', 148754), ('this', 137960), ('i', 136392), ('that', 135525), ('was', 95113), ('as', 89462), ('with', 86373), ('for', 85760), ('movie', 82143), ('but', 80231), ('film', 73471), ('on', 66100), ('not', 59203)]


In [18]:
vocab_to_int = {w:i+1 for i, (w,c) in enumerate(sorted_words)}
print(list(vocab_to_int.items())[:20])

[('the', 1), ('and', 2), ('a', 3), ('of', 4), ('to', 5), ('is', 6), ('in', 7), ('it', 8), ('this', 9), ('i', 10), ('that', 11), ('was', 12), ('as', 13), ('with', 14), ('for', 15), ('movie', 16), ('but', 17), ('film', 18), ('on', 19), ('not', 20)]


In [19]:
from sklearn.externals import joblib
filename='vocab_to_int.pkl'
joblib.dump(vocab_to_int,filename)



['vocab_to_int.pkl']

# Tokenize

In [20]:
def tokenize(text):
    token_list=[]
    for w in text.split():
        if w in vocab_to_int:
            token_list.append(vocab_to_int[w])
        else:
            pass
    return token_list

In [21]:
imdb_data.review=imdb_data.review.apply(tokenize)

In [22]:
imdb_data.head()

Unnamed: 0,review,sentiment
0,"[27, 4, 1, 78, 1919, 43, 1057, 11, 102, 147, 4...",positive
1,"[3, 382, 112, 355, 1, 1341, 2965, 6, 51, 17955...",positive
2,"[10, 190, 9, 12, 3, 382, 98, 5, 1103, 63, 19, ...",positive
3,"[684, 223, 3, 235, 113, 3, 112, 433, 3622, 117...",negative
4,"[34394, 110, 7, 1, 63, 4, 296, 6, 3, 2161, 136...",positive


In [23]:
def binary(text):
    if text=="positive":
        return 1
    else:
        return 0
imdb_data.sentiment=imdb_data.sentiment.apply(binary)

In [24]:
imdb_data.head()

Unnamed: 0,review,sentiment
0,"[27, 4, 1, 78, 1919, 43, 1057, 11, 102, 147, 4...",1
1,"[3, 382, 112, 355, 1, 1341, 2965, 6, 51, 17955...",1
2,"[10, 190, 9, 12, 3, 382, 98, 5, 1103, 63, 19, ...",1
3,"[684, 223, 3, 235, 113, 3, 112, 433, 3622, 117...",0
4,"[34394, 110, 7, 1, 63, 4, 296, 6, 3, 2161, 136...",1


# Train Test split

In [25]:
from sklearn.model_selection import train_test_split
train,test=train_test_split(imdb_data,test_size=0.20,shuffle=True)

In [26]:
X_train=train.review
y_train=train.sentiment
X_test=test.review
y_test=test.sentiment

In [27]:
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(40000,) (40000,)
(10000,) (10000,)


# Text for feeding

In [28]:
from tensorflow.keras.preprocessing import sequence
X_train = sequence.pad_sequences(X_train, maxlen=200)
X_test = sequence.pad_sequences(X_test, maxlen=200)

In [29]:
len(X_train[20])

200

In [30]:
import numpy as np

In [31]:
#np.reshape(X_train,)
X_train.shape

(40000, 200)

In [32]:
X_test

array([[  1773,     17,      6, ...,    199,      2,  17635],
       [     0,      0,      0, ...,      4,      1,     18],
       [     0,      0,      0, ...,      5,      9,     16],
       ...,
       [   156,   3697,      7, ...,      3,    399,    361],
       [    13, 161935,  12768, ...,     46,   1175,   1134],
       [     0,      0,      0, ...,    308,      9,     16]])

In [33]:
#X_train.shape

# Training model

In [34]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [35]:
model = Sequential()
vocabulay_size=X_train.max()+1
#model.add(Embedding(vocabulay_size, 256))
#model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.2,return_sequences=True))
#model.add(LSTM(1000,dropout=0.2))
#model.add(Dense(512,activation='relu'))
#model.add(Dense(128,activation='relu'))
#model.add(Dense(1, activation='sigmoid'))

model.add(Embedding(vocabulay_size, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [36]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         31696256  
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 31,827,969
Trainable params: 31,827,969
Non-trainable params: 0
_________________________________________________________________


In [37]:
#learning_rate=1e-3,decay=1e-5
optimizer=Adam(lr=1e-3,decay=1e-5)
model.compile(loss='binary_crossentropy',optimizer=optimizer,metrics=['accuracy'])
callback=EarlyStopping(monitor='val_acc',mode='max',restore_best_weights=True)

In [38]:
model.fit(X_train, y_train,batch_size=32,epochs=15,verbose=2,validation_data=(X_test, y_test),
         callbacks=[callback])


Train on 40000 samples, validate on 10000 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/15
 - 1412s - loss: 0.4826 - acc: 0.7692 - val_loss: 0.4523 - val_acc: 0.7856
Epoch 2/15
 - 1377s - loss: 0.2764 - acc: 0.8918 - val_loss: 0.3337 - val_acc: 0.8749
Epoch 3/15
 - 1386s - loss: 0.1549 - acc: 0.9435 - val_loss: 0.3360 - val_acc: 0.8814
Epoch 4/15
 - 1393s - loss: 0.0892 - acc: 0.9695 - val_loss: 0.4055 - val_acc: 0.8714


<tensorflow.python.keras.callbacks.History at 0x2b21a848588>

# Testing model

In [39]:
score, acc = model.evaluate(X_test, y_test,batch_size=32,verbose=2)
print('Test score:', score)
print('Test accuracy:', acc)


 - 21s - loss: 0.3360 - acc: 0.8814
Test score: 0.3359849332809448
Test accuracy: 0.8814


In [40]:
type(X_train)

numpy.ndarray

In [41]:
X_train.shape

(40000, 200)

In [48]:
import keras

Using TensorFlow backend.


In [49]:
model.save('sentimentModel.h5')