<a href="https://colab.research.google.com/github/saptarshidatta96/Sentiment-Analysis/blob/main/Sentiment_Analysis_with_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import random
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text
import tensorflow as tf
from tensorflow import keras
from keras import models
from keras import initializers
from keras import regularizers
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.layers import CuDNNLSTM
from keras.layers import Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

random.seed(42)

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
!tar -xvf "/content/gdrive/MyDrive/aclImdb_v1.tar.gz" -C "/content/"  

In [4]:
def load_dataset(dataset):
  data = []
  label = []
  for item in os.listdir('/content/aclImdb/{}/'.format(dataset)):
    if item == 'pos':
      tweet_txt = os.path.join('/content/aclImdb/{}/'.format(dataset), item)
      for tweets in os.listdir(tweet_txt):
        if tweets.endswith('.txt'):
          with open(os.path.join(tweet_txt, tweets)) as f:
            data.append(f.read())
          label.append(1)

    elif item == 'neg':
      tweet_txt = os.path.join('/content/aclImdb/{}/'.format(dataset), item)
      for tweets in os.listdir(tweet_txt):
        if tweets.endswith('.txt'):
          with open(os.path.join(tweet_txt, tweets)) as f:
            data.append(f.read())
          label.append(0)

  return data, label

In [5]:
train_data, train_label = load_dataset('train')
test_data, test_label = load_dataset('test')

In [6]:
def split_training_and_validation_sets(data, label, validation_split):

    num_training_samples = int((1 - validation_split) * len(data))
    return ((data[:num_training_samples], label[:num_training_samples]),
            (data[num_training_samples:], label[num_training_samples:]))

In [7]:
(train_data, train_label), (valid_data, valid_label) = split_training_and_validation_sets(train_data, train_label, 0.1)

In [8]:
random.seed(42)
random.shuffle(train_data)
random.seed(42)
random.shuffle(train_label)

In [9]:
train_label = tf.convert_to_tensor(train_label, dtype=tf.float32)
valid_label = tf.convert_to_tensor(valid_label, dtype=tf.float32)

In [10]:
def sequence_vectorizer(train_data, valid_data):

    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=20000)
    tokenizer.fit_on_texts(train_data)

    # Vectorize training and validation texts.
    x_train = tokenizer.texts_to_sequences(train_data)
    x_val = tokenizer.texts_to_sequences(valid_data)

    # Get max sequence length.
    max_length = len(max(x_train, key=len))
    if max_length > 500:
        max_length = 500

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the beginning and sequences longer are truncated
    # at the beginning.
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)

    x_train = tf.convert_to_tensor(x_train, dtype=tf.float32)
    x_val = tf.convert_to_tensor(x_val, dtype=tf.float32)

    return x_train, x_val, tokenizer.word_index

In [11]:
x_train, x_val, word_index = sequence_vectorizer(train_data, valid_data)

In [12]:
def LSTM_Model():
  model = models.Sequential()
  model.add(Embedding(20000, 120, input_length=500))
  model.add(SpatialDropout1D(0.4))
  model.add(CuDNNLSTM(176, return_sequences=True))
  model.add(Dropout(0.8))
  model.add(CuDNNLSTM(32))
  model.add(Dropout(0.8))
  model.add(Dense(1,activation='sigmoid'))
  return model
  

In [13]:
model = LSTM_Model()
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy'])


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 120)          2400000   
                                                                 
 spatial_dropout1d (SpatialD  (None, 500, 120)         0         
 ropout1D)                                                       
                                                                 
 cu_dnnlstm (CuDNNLSTM)      (None, 500, 176)          209792    
                                                                 
 dropout (Dropout)           (None, 500, 176)          0         
                                                                 
 cu_dnnlstm_1 (CuDNNLSTM)    (None, 32)                26880     
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                        

In [14]:
callbacks = [EarlyStopping(monitor='val_loss', patience=2)]

model.fit(x_train, train_label,
            epochs=20,
            callbacks=callbacks,
            validation_data=(x_val, valid_label),
            verbose=2,
            batch_size=512)


Epoch 1/20
44/44 - 35s - loss: 0.6575 - accuracy: 0.6011 - val_loss: 0.3952 - val_accuracy: 0.9324 - 35s/epoch - 794ms/step
Epoch 2/20
44/44 - 28s - loss: 0.3934 - accuracy: 0.8454 - val_loss: 0.3575 - val_accuracy: 0.8692 - 28s/epoch - 645ms/step
Epoch 3/20
44/44 - 28s - loss: 0.2521 - accuracy: 0.9196 - val_loss: 0.3345 - val_accuracy: 0.8648 - 28s/epoch - 643ms/step
Epoch 4/20
44/44 - 28s - loss: 0.1988 - accuracy: 0.9394 - val_loss: 0.2806 - val_accuracy: 0.8900 - 28s/epoch - 643ms/step
Epoch 5/20
44/44 - 28s - loss: 0.1590 - accuracy: 0.9529 - val_loss: 0.3712 - val_accuracy: 0.8648 - 28s/epoch - 644ms/step
Epoch 6/20
44/44 - 29s - loss: 0.1437 - accuracy: 0.9594 - val_loss: 0.3350 - val_accuracy: 0.8916 - 29s/epoch - 651ms/step


<keras.callbacks.History at 0x7f911d3d2c10>

In [15]:
model.save('/content/gdrive/MyDrive/models/sentiment_analysis_LSTM_trained_model.h5',save_format= 'tf')

Load Model

In [16]:
loaded_model = keras.models.load_model('/content/gdrive/MyDrive/models/sentiment_analysis_LSTM_trained_model.h5')

In [17]:
x_test, _, _ = sequence_vectorizer(test_data, valid_data)
predictions = loaded_model.predict(x_test)
pred = [1 if a>0.5 else 0 for a in predictions]

In [18]:
print(pred)

[1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 

In [19]:
print(test_label)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [20]:
accuracy_score(pred, test_label)

0.54356

In [21]:
print(classification_report(pred, test_label))

              precision    recall  f1-score   support

           0       0.61      0.54      0.57     14057
           1       0.48      0.55      0.51     10943

    accuracy                           0.54     25000
   macro avg       0.54      0.54      0.54     25000
weighted avg       0.55      0.54      0.55     25000



In [22]:
confusion_matrix(pred, test_label)

array([[7573, 6484],
       [4927, 6016]])