<a href="https://colab.research.google.com/github/sonudoo/DSA/blob/master/Machine%20Learning/SentimentAnalysisUsingKeras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Embedding, Input, Dropout
from tensorflow.keras.models import Model
import nltk
import string
import re
import numpy as np
from google.colab import drive

In [0]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Read the Word2Vec data set and parse it to vectors map

words = open("/content/drive/My Drive/Colab Notebooks/embeddings/glove.6B.50d.txt", "r", encoding="utf8").read().split("\n")

word_to_vec_map = {}
word_to_index_map = {}

j = 0
for i in words:
    l = i.split()
    if len(l) == 0:
        continue
    key, val = l[0], list(map(float, l[1:]))
    word_to_vec_map[key] = np.array(val)
    word_to_index_map[key] = j
    j += 1

# Add a dummy blank word of all zeros

word_to_vec_map[''] = np.zeros(word_to_vec_map['the'].shape)
word_to_index_map[''] = j

In [0]:
pos = open("/content/drive/My Drive/Colab Notebooks/datasets/ROTTEN_TOMATOES/rt-polarity.pos.txt").read().split("\n")[:-1]
neg = open("/content/drive/My Drive/Colab Notebooks/datasets/ROTTEN_TOMATOES/rt-polarity.neg.txt").read().split("\n")[:-1]
statements = pos + neg
labels = [1 for i in range(len(pos))] + [0 for i in range(len(neg))]

In [0]:
# Change case to lower case

statements = [statement.lower() for statement in statements]

# Remove all punctuations

statements = ["".join([' ' if y in string.punctuation else y for y in list(statement)]) for statement in statements]

# Tokenize 

statements = [re.split('\s+', statement) for statement in statements]

# Remove all words not in words_to_vec_map

statements = [[word for word in statement if word in word_to_vec_map] for statement in statements]

In [0]:
# Find the length of the longest statement

max_len = 0

for statement in statements:
    max_len = max(max_len, len(statement))

print("Maximum states is:", max_len)

Maximum states is: 54


In [0]:
# Applying padding

statements = [[word for word in statement] + ['' for y in range(max_len - len(statement))] for statement in statements]

In [0]:
# Replace word with word index

statements = [[word_to_index_map[word] for word in statement] for statement in statements]

In [0]:
X_train = np.array(statements)
Y_train = np.array(labels)

In [0]:
# We create an input layer. As a reminder, please note that we don't specify batch_size while specifying the input shape
# It is assumed to be the first argument

X_input = Input(shape=(max_len))

In [0]:
# Create an embedding layer now
# Embedding layer will map each index (index of a word from input layer) to a vector. This vector would represent
# the features of a word

vocab_len = len(word_to_vec_map) + 1 # 1 added to fit keras embedding requirement. This is the total number of words
embed_dimension = word_to_vec_map["is"].shape[0] # This is the number of feature in vector representation of a word

embedding_matrix = np.zeros((vocab_len, embed_dimension))

for word in word_to_vec_map:
    embedding_matrix[word_to_index_map[word], :] = word_to_vec_map[word]
    
embedding_layer = Embedding(vocab_len, embed_dimension, trainable=False)

# As the layer is supposed to be non-trainable and directly be used, so we will call build() and then set_weights
embedding_layer.build((None,))
    
embedding_layer.set_weights([embedding_matrix])

X = embedding_layer(X_input)


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [0]:
# Now we add a LSTM layer. The hidden state size is 128. We return all the sequences. Hence output is triple dimensional

X = LSTM(128, return_sequences=True)(X)

# Next we add some dropout

X = Dropout(rate=0.5)(X)

# Add another LSTM layer. But this time we only take the output of last state

X = LSTM(128)(X)

# Feed the 128 dimensional vector to Dense layer

X = Dense(10, activation='sigmoid')(X)

# Finally get a sigmoid output from another Dense layer

Y = Dense(1, activation='sigmoid')(X)

In [0]:
# Create the model

model = Model(inputs = X_input, outputs = Y)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 54)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 54, 50)            20000100  
_________________________________________________________________
lstm (LSTM)                  (None, 54, 128)           91648     
_________________________________________________________________
dropout (Dropout)            (None, 54, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 10)                1290      
_________________________________

In [0]:
model.fit(X_train, Y_train, epochs=400, batch_size=8192)

Train on 10662 samples
Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
Epoch 42/400
Epoch 43/400
Epoch 44/400
Epoch 45/400
Epoch 46/400
Epoch 47/400
Epoch 48/400
Epoch 49/400
Epoch 50/400
Epoch 51/400
Epoch 52/400
Epoch 53/400
Epoch 54/400
Epoch 55/400
Epoch 56/400
Epoch 57/400
Epoch 58/400
Epoch 59/400
Epoch 60/400
Epoch 61/400
Epoch 62/400
Epoch 63/400
Epoch 64/400
Epoch 65/400
Epoch 66/400
Epoch 67/400
Epoch 68/400
Epoch 69/400
Epoch 70/400
Epoch 71/400
Epoch 72/400
Epoch 73/400
Epoch 74/400
Epoch 75/400
Epoch 76/40

<tensorflow.python.keras.callbacks.History at 0x7f98177b9f28>

In [0]:
while True:
    statement = input("Enter statement:")
    if statement == "":
        break
    statement = statement.lower()
    statement = "".join([' ' if y in string.punctuation else y for y in list(statement)])
    statement = re.split('\s+', statement)
    statement = [word for word in statement if word in word_to_vec_map]
    statement = [word for word in statement] + ['' for y in range(max_len - len(statement))]
    statement = [word_to_index_map[word] for word in statement]
    predicted_label = model.predict(np.array(statement).reshape(1, max_len))
    if predicted_label >= 0.5:
        print("Positive")
    else:
        print("Negative")

Enter statement:i loved it
Positive
Enter statement:not worth the money
Positive
Enter statement:good film
Positive
Enter statement:i did not like it at all
Negative
Enter statement:brilliant acting
Positive
Enter statement:good direction
Positive
Enter statement:bad cinematography
Negative
Enter statement:good watch
Positive
Enter statement:worth the money
Positive
Enter statement:i hate this film
Negative
Enter statement:
