# NLP text sentiment analysis using tenorFlow

In [2]:
import json
import tensorflow as tf

import pandas as pd
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

ModuleNotFoundError: No module named 'tensorflow'

In [0]:
vocab_size = 10000
embedding_dim = 16
max_length = 100
trunc_type='post'
padding_type='post'
training_size = 20000

In [0]:
#load preprocessed data
data = pd.read_csv("/content/data_WithOutProductTitleWords_WithoutLessFrequentWords.tsv", sep='\t')

data.dropna(axis='rows', inplace=True)

def convertTextToList(x):
    text = list(x.split(" ")) 
    return text

data['review_body_cleaned'].apply(lambda x: convertTextToList(x))

#split data set before vectorization
train, test = train_test_split(data, test_size = 0.2, stratify = data['class'])



In [0]:
#Tokenize

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train['review_body_cleaned'])

word_index = tokenizer.word_index

print(len(word_index))

#train words
training_sequences = tokenizer.texts_to_sequences(train['review_body_cleaned'])
#training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type)

#test words
testing_sequences = tokenizer.texts_to_sequences(test['review_body_cleaned'])
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type)

3703


In [0]:
#Model Simple NN
model_nn = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 25, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model_nn.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [0]:
print(model_nn.summary())

num_epochs = 10
history = model_nn.fit(training_padded, train['class'], epochs=num_epochs, validation_data=(testing_padded,  test['class']), verbose=2)

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 25)           92600     
_________________________________________________________________
global_average_pooling1d (Gl (None, 25)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 24)                624       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 25        
Total params: 93,249
Trainable params: 93,249
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
2547/2547 - 7s - loss: 0.4136 - accuracy: 0.8166 - val_loss: 0.3314 - val_accuracy: 0.8527
Epoch 2/10
2547/2547 - 7s - loss: 0.3148 - accuracy: 0.8650 - val_loss: 0.3210 - val_accuracy: 0.8607
Epoch 3/10
2547/2547 - 7s - loss: 0.302

In [0]:
sentence = ["like it"]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=100, padding=padding_type )
print(model_nn.predict(padded))

[[0.7143575]]


In [0]:
#Model RNN 

vocab_size = len(word_index)+1
max_length = 100

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 100, input_length=max_length),
    tf.keras.layers.GRU(units=125),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
#model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])



In [0]:
print(model.summary())

num_epochs = 10
history = model.fit(training_padded, train['class'], epochs=num_epochs, validation_data=(testing_padded,  test['class']), verbose=2)

In [0]:
sentence = ["doesn't fit the product is not good at all"]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=100, padding=padding_type )
print(model.predict(padded))

In [0]:
#Model RNN with GloVe embedding

import numpy as np 

vocab_size = len(word_index)+1

embeddings_index = dict()
f = open('glove.6B.100d.txt',  encoding="utf-8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

embedding_matrix = np.zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    if index > vocab_size - 1:
        break
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector
            

In [0]:
#Model RNN - GRU 

vocab_size = len(word_index)+1
max_length = 100

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 100, input_length=max_length, weights=[embedding_matrix], trainable=True),
    tf.keras.layers.GRU(units=250),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
#model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])



In [0]:
print(model.summary())

num_epochs = 15
history = model.fit(training_padded, train['class'], epochs=num_epochs, validation_data=(testing_padded,  test['class']), verbose=2)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          370400    
_________________________________________________________________
gru_1 (GRU)                  (None, 250)               264000    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 251       
Total params: 634,651
Trainable params: 634,651
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/15
2547/2547 - 689s - loss: 0.5467 - accuracy: 0.7654 - val_loss: 0.5476 - val_accuracy: 0.7657
Epoch 2/15
2547/2547 - 682s - loss: 0.5107 - accuracy: 0.7785 - val_loss: 0.3235 - val_accuracy: 0.8562
Epoch 3/15
2547/2547 - 683s - loss: 0.3031 - accuracy: 0.8672 - val_loss: 0.2997 - val_accuracy: 0.8680
Epoch 4/15
2547/2547 - 686s - loss: 0.2755 - accuracy: 0.8818

In [0]:
sentence = ["dont like it"]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=100, padding=padding_type )
print(model.predict(padded))

[[0.33747125]]


In [0]:
model_json = model.to_json()
with open("GRU_vocab_size_100_maxlength_TrainableTrue_250_Sigmoid_epoch15.json", "w") as json_file:
    json_file.write(model_json)

# serialize weights to HDF5
model.save_weights("GRU_vocab_size_100_maxlength_TrainableTrue_250_Sigmoid_epoch15.h5")

In [0]:
#Model RNN - LSTM 

vocab_size = len(word_index)+1
max_length = 100

model_lstm = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 100, input_length=max_length, weights=[embedding_matrix], trainable=True),
    tf.keras.layers.LSTM(units=250),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model_lstm.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
#model.compile(loss='sparse_categorical_crossentropy',optimizer='adam',metrics=['accuracy'])



In [0]:
print(model_lstm.summary())

num_epochs = 15
history = model_lstm.fit(training_padded, train['class'], epochs=num_epochs, validation_data=(testing_padded,  test['class']), verbose=2)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 100)          370400    
_________________________________________________________________
lstm (LSTM)                  (None, 250)               351000    
_________________________________________________________________
dense (Dense)                (None, 1)                 251       
Total params: 721,651
Trainable params: 721,651
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/15
2547/2547 - 833s - loss: 0.5467 - accuracy: 0.7655 - val_loss: 0.5466 - val_accuracy: 0.7657
Epoch 2/15
2547/2547 - 839s - loss: 0.5450 - accuracy: 0.7657 - val_loss: 0.5442 - val_accuracy: 0.7657
Epoch 3/15
2547/2547 - 839s - loss: 0.5448 - accuracy: 0.7657 - val_loss: 0.5442 - val_accuracy: 0.7657
Epoch 4/15
2547/2547 - 834s - loss: 0.5444 - accuracy: 0.7658 -

In [0]:
sentence = [""]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=100, padding=padding_type )
print(model_lstm.predict(padded))


[[0.17820473]]


In [0]:
model_json = model_lstm.to_json()
with open("LSTM_vocab_size_100_maxlength_TrainableTrue_250_Sigmoid_epoch15.json", "w") as json_file:
    json_file.write(model_json)

# serialize weights to HDF5
model_lstm.save_weights("LSTM_vocab_size_100_maxlength_TrainableTrue_250_Sigmoid_epoch15.h5")