<a href="https://colab.research.google.com/github/rajkumar2004725/Sentiment-Analysis-Using-LSTM/blob/main/amazon_reviews_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [2]:
!kaggle datasets download bittlingmayer/amazonreviews

Dataset URL: https://www.kaggle.com/datasets/bittlingmayer/amazonreviews
License(s): unknown
Downloading amazonreviews.zip to /content
 96% 475M/493M [00:00<00:00, 1.22GB/s]
100% 493M/493M [00:00<00:00, 1.22GB/s]


In [3]:
!unzip /content/amazonreviews.zip

Archive:  /content/amazonreviews.zip
  inflating: test.ft.txt.bz2         
  inflating: train.ft.txt.bz2        


In [4]:
import numpy as np
import pandas as pd
import bz2

In [5]:
train_file = bz2.BZ2File('/content/train.ft.txt.bz2')
test_file = bz2.BZ2File('/content/test.ft.txt.bz2')

In [6]:
train_file_lines = train_file.readlines()
test_file_lines = test_file.readlines()

In [7]:
train_file_lines = [x.decode('utf-8') for x in train_file_lines]
test_file_lines = [x.decode('utf-8') for x in test_file_lines]

In [8]:
train_file_lines[90]

'__label__1 No instructions included - do not trust seller: Promised with this item are "Complete Instructions" and the additional pledge that "Sweet Graces will email you with the Supply List and Instruction sheets on purchase - so you can be ready ahead of time!" I received none of this - only a plastic figurine and bracelet. To boot, Amazon claims they can do nothing to help me contact the seller. All I got was a phone number for the manufacturer. Let\'s hope that yields some results. Meanwhile, I\'m wishing I had listened to previous feedback about this unreliable seller :/\n'

In [9]:
train_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in train_file_lines]
train_sentences = [x.split(' ', 1)[1][:-1].lower() for x in train_file_lines]

In [10]:
test_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in test_file_lines]
test_sentences = [x.split(' ', 1)[1][:-1].lower() for x in test_file_lines]

In [11]:
test_sentences[157]

'terrific reference for identifying butterflies: is that a monarch, a viceroy or a queen butterfly? to identify butterflies look through the field guide to butterflies of texas. the next time, you can tell that it was a tiger swallowtail and not a zebra swallowtail or a spicebush swallowtail.children are fascinated by insects and butterflies as well. teach them to appreciate nature. learning the names of butterflies is a good way to start.'

In [12]:
import re

for i in range(len(train_sentences)):
    if 'www.' in train_sentences[i] or 'http:' in train_sentences[i] or 'https:' in train_sentences[i] or '.com' in train_sentences[i]:
        train_sentences[i] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", train_sentences[i])
for i in range(len(test_sentences)):
    if 'www.' in test_sentences[i] or 'http:' in test_sentences[i] or 'https:' in test_sentences[i] or '.com' in test_sentences[i]:
        test_sentences[i] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", test_sentences[i])


In [14]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting google_pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting tensorboard~=2.20.0 (from tensorflow)
  Downloading tensorboard-2.20.0-py3-none-any.whl.metadata (1.8 kB)
Collecting wheel<1.0,>=0.23.0 (from astunparse>=1.6.0->tensorflow)
  Downloading wheel-0.45.1-py3-none-any.whl.metadata (2.3 kB)
Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard~=2.20.0->tensorflow)
  Downloading tensorboard_data_server-0.

In [15]:
from keras.models import Model, Sequential
from keras.layers import Dense, Embedding, Input, Conv1D, GlobalMaxPool1D, Dropout, concatenate, Layer, InputSpec, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras import activations, initializers, regularizers, constraints
from keras.regularizers import l2
from tensorflow.keras.constraints import MaxNorm

In [16]:
max_features = 20000
maxlen = 100
tokenizer = Tokenizer(num_words=max_features)

In [17]:
tokenizer.fit_on_texts(train_sentences)
tokenized_train = tokenizer.texts_to_sequences(train_sentences)
tokenized_test = tokenizer.texts_to_sequences(test_sentences)

In [18]:
X_train = sequence.pad_sequences(tokenized_train, maxlen=maxlen)
X_test = sequence.pad_sequences(tokenized_test, maxlen=maxlen)

In [19]:
!kaggle datasets download bertcarremans/glovetwitter27b100dtxt

Dataset URL: https://www.kaggle.com/datasets/bertcarremans/glovetwitter27b100dtxt
License(s): CC0-1.0
Downloading glovetwitter27b100dtxt.zip to /content
 88% 351M/397M [00:00<00:00, 1.22GB/s]
100% 397M/397M [00:00<00:00, 1.22GB/s]


In [20]:
!unzip /content/glovetwitter27b100dtxt.zip

Archive:  /content/glovetwitter27b100dtxt.zip
  inflating: glove.twitter.27B.100d.txt  


In [21]:
EMBEDDING_FILE="/content/glove.twitter.27B.100d.txt"

In [22]:
def get_coefs(word, *arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE))

In [23]:
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
#change below line if computing normal stats is too slow
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size)) #embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [34]:
batch_size = 512
epochs = 7
embed_size = 100

In [35]:
def lstm_model(conv_layers = 2, max_dilation_rate = 3):
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=True)(inp)
    x = Dropout(0.25)(x)
    x = Conv1D(2*embed_size, kernel_size = 3)(x)
    prefilt = Conv1D(2*embed_size, kernel_size = 3)(x)
    x = prefilt
    for strides in [1, 1, 2]:
        x = Conv1D(128*2**(strides), strides = strides, kernel_regularizer=l2(4e-6), bias_regularizer=l2(4e-6), kernel_size=3, kernel_constraint=MaxNorm(10), bias_constraint=MaxNorm(10))(x)
    x_f = LSTM(512, kernel_regularizer=l2(4e-6), bias_regularizer=l2(4e-6), kernel_constraint=MaxNorm(10), bias_constraint=MaxNorm(10))(x)
    x_b = LSTM(512, kernel_regularizer=l2(4e-6), bias_regularizer=l2(4e-6), kernel_constraint=MaxNorm(10), bias_constraint=MaxNorm(10))(x)
    x = concatenate([x_f, x_b])
    x = Dropout(0.5)(x)
    x = Dense(64, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                optimizer='adam',
                  metrics=['binary_accuracy'])
    return model

lstm_model = lstm_model()
lstm_model.summary()


In [36]:
weight_path="early_weights.keras"
checkpoint = ModelCheckpoint(weight_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=5)
callbacks = [checkpoint, early_stopping]

In [37]:
import numpy as np

X_train = np.array(X_train)
train_labels = np.array(train_labels)

In [38]:
len(X_train)

3600000

In [39]:
X_t=X_train[:20000]

In [40]:
Y_t=train_labels[:20000]

In [41]:
type(X_t)

numpy.ndarray

In [42]:
lstm_model.fit(X_t, Y_t, batch_size=batch_size, epochs=epochs, shuffle = True, validation_split=0.20, callbacks=callbacks)

Epoch 1/7
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17s/step - binary_accuracy: 0.5055 - loss: 0.7867 
Epoch 1: val_loss improved from inf to 0.65191, saving model to early_weights.keras
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m595s[0m 18s/step - binary_accuracy: 0.5060 - loss: 0.7849 - val_binary_accuracy: 0.6428 - val_loss: 0.6519
Epoch 2/7
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17s/step - binary_accuracy: 0.6470 - loss: 0.6386 
Epoch 2: val_loss improved from 0.65191 to 0.46211, saving model to early_weights.keras
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m584s[0m 18s/step - binary_accuracy: 0.6484 - loss: 0.6373 - val_binary_accuracy: 0.7847 - val_loss: 0.4621
Epoch 3/7
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17s/step - binary_accuracy: 0.7784 - loss: 0.4890 
Epoch 3: val_loss improved from 0.46211 to 0.41666, saving model to early_weights.keras
[1m32/32[0m [32m━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7d306859fb00>

In [45]:
X_test = np.array(X_test)
test_labels = np.array(test_labels)

In [47]:
len(X_test)

400000

In [48]:
X_test=X_test[:5000]
test_labels=test_labels[:5000]

In [49]:
score, acc = lstm_model.evaluate(X_test, test_labels, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)


[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 5s/step - binary_accuracy: 0.8563 - loss: 0.3586
Test score: 0.36598774790763855
Test accuracy: 0.854200005531311
