<a href="https://colab.research.google.com/github/saptarshidatta96/Sentiment-Analysis/blob/main/Sentiment_Analysis_with_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [160]:
import os
import random
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text
import tensorflow as tf
from tensorflow import keras
from keras import models
from keras import initializers
from keras import regularizers
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.layers import CuDNNLSTM
from keras.layers import Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

random.seed(42)

In [161]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
!tar -xvf "/content/gdrive/MyDrive/aclImdb_v1.tar.gz" -C "/content/"  

In [163]:
def load_dataset(dataset):
  data = []
  label = []
  for item in os.listdir('/content/aclImdb/{}/'.format(dataset)):
    if item == 'pos':
      tweet_txt = os.path.join('/content/aclImdb/{}/'.format(dataset), item)
      for tweets in os.listdir(tweet_txt):
        if tweets.endswith('.txt'):
          with open(os.path.join(tweet_txt, tweets)) as f:
            data.append(f.read())
          label.append(1)

    elif item == 'neg':
      tweet_txt = os.path.join('/content/aclImdb/{}/'.format(dataset), item)
      for tweets in os.listdir(tweet_txt):
        if tweets.endswith('.txt'):
          with open(os.path.join(tweet_txt, tweets)) as f:
            data.append(f.read())
          label.append(0)

  return data, label

In [164]:
train_data, train_label = load_dataset('train')
test_data, test_label = load_dataset('test')

In [165]:
def split_training_and_validation_sets(data, label, validation_split):

    num_training_samples = int((1 - validation_split) * len(data))
    return ((data[:num_training_samples], label[:num_training_samples]),
            (data[num_training_samples:], label[num_training_samples:]))

In [166]:
(train_data, train_label), (valid_data, valid_label) = split_training_and_validation_sets(train_data, train_label, 0.1)

In [167]:
random.seed(42)
random.shuffle(train_data)
random.shuffle(train_label)

In [168]:
train_label = tf.convert_to_tensor(train_label, dtype=tf.float32)
valid_label = tf.convert_to_tensor(valid_label, dtype=tf.float32)

In [169]:
def sequence_vectorizer(train_data, valid_data):

    # Create vocabulary with training texts.
    tokenizer = text.Tokenizer(num_words=20000)
    tokenizer.fit_on_texts(train_data)

    # Vectorize training and validation texts.
    x_train = tokenizer.texts_to_sequences(train_data)
    x_val = tokenizer.texts_to_sequences(valid_data)

    # Get max sequence length.
    max_length = len(max(x_train, key=len))
    if max_length > 500:
        max_length = 500

    # Fix sequence length to max value. Sequences shorter than the length are
    # padded in the beginning and sequences longer are truncated
    # at the beginning.
    x_train = sequence.pad_sequences(x_train, maxlen=max_length)
    x_val = sequence.pad_sequences(x_val, maxlen=max_length)

    x_train = tf.convert_to_tensor(x_train, dtype=tf.float32)
    x_val = tf.convert_to_tensor(x_val, dtype=tf.float32)

    return x_train, x_val, tokenizer.word_index

In [170]:
x_train, x_val, word_index = sequence_vectorizer(train_data, valid_data)

In [172]:
def LSTM_Model():
  model = models.Sequential()
  model.add(Embedding(20000, 120, input_length=500))
  model.add(SpatialDropout1D(0.4))
  model.add(CuDNNLSTM(176))
  model.add(Dense(1,activation='sigmoid'))
  return model
  

In [173]:
model = LSTM_Model()
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy'])


Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 500, 120)          2400000   
                                                                 
 spatial_dropout1d_9 (Spatia  (None, 500, 120)         0         
 lDropout1D)                                                     
                                                                 
 cu_dnnlstm_9 (CuDNNLSTM)    (None, 176)               209792    
                                                                 
 dense_9 (Dense)             (None, 1)                 177       
                                                                 
Total params: 2,609,969
Trainable params: 2,609,969
Non-trainable params: 0
_________________________________________________________________


In [174]:
callbacks = [EarlyStopping(monitor='val_loss', patience=2)]

model.fit(x_train, train_label,
            epochs=20,
            callbacks=callbacks,
            validation_data=(x_val, valid_label),
            verbose=2,
            batch_size=512)


Epoch 1/20
44/44 - 17s - loss: 0.6874 - accuracy: 0.5539 - val_loss: 0.8682 - val_accuracy: 0.0000e+00 - 17s/epoch - 375ms/step
Epoch 2/20
44/44 - 15s - loss: 0.6783 - accuracy: 0.5689 - val_loss: 0.9020 - val_accuracy: 0.0664 - 15s/epoch - 336ms/step
Epoch 3/20
44/44 - 15s - loss: 0.6263 - accuracy: 0.6460 - val_loss: 0.8267 - val_accuracy: 0.4096 - 15s/epoch - 340ms/step
Epoch 4/20
44/44 - 15s - loss: 0.5291 - accuracy: 0.7315 - val_loss: 1.1089 - val_accuracy: 0.3592 - 15s/epoch - 339ms/step
Epoch 5/20
44/44 - 15s - loss: 0.4315 - accuracy: 0.7942 - val_loss: 1.1665 - val_accuracy: 0.4216 - 15s/epoch - 339ms/step


<keras.callbacks.History at 0x7f510acf0610>

In [175]:
model.save('/content/gdrive/MyDrive/models/sentiment_analysis_LSTM_trained_model.h5',save_format= 'tf')

Load Model

In [176]:
loaded_model = keras.models.load_model('/content/gdrive/MyDrive/models/sentiment_analysis_LSTM_trained_model.h5')

In [177]:
x_test, _, _ = sequence_vectorizer(test_data, valid_data)
predictions = loaded_model.predict(x_test)
pred = [1 if a>0.5 else 0 for a in predictions]

In [178]:
print(pred)

[0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 

In [179]:
print(test_label)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [181]:
count = 0
for i, j in zip(pred, test_label):
  if i==j:
    count += 1

print(count/len(pred))

0.49612
