<a href="https://colab.research.google.com/github/saptarshidatta96/Sentiment-Analysis/blob/main/Sentiment_Analysis_with_MLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import random
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import models
from tensorflow.keras import initializers
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Seed every random number generator this notebook relies on (Python, NumPy and
# TensorFlow) so that shuffling, weight initialisation and dropout are as
# repeatable as possible across "Restart & Run All".
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
# Colab-specific: mount Google Drive at /content/gdrive. Later cells read the
# dataset archive from, and save the trained model to, paths under
# /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!tar -xvf "/content/gdrive/MyDrive/aclImdb_v1.tar.gz" -C "/content/"  

In [None]:
def load_dataset(dataset, base_dir='/content/aclImdb'):
  """Load the labelled reviews of one split of the aclImdb dataset.

  Args:
    dataset: Name of the split sub-directory, e.g. 'train' or 'test'.
    base_dir: Directory that contains the split sub-directories. Defaults to
      the location the archive is extracted to in this notebook.

  Returns:
    A (data, label) tuple of parallel lists: the text of every '.txt' file
    under '<base_dir>/<dataset>/pos' (label 1) followed by every '.txt' file
    under '<base_dir>/<dataset>/neg' (label 0). Files of each class are read
    in sorted file-name order so the result does not depend on the
    file-system's listing order. Any other sub-directory is ignored.

  Raises:
    FileNotFoundError: If '<base_dir>/<dataset>' is not an existing directory.
  """
  dataset_dir = os.path.join(base_dir, dataset)
  if not os.path.isdir(dataset_dir):
    raise FileNotFoundError('Dataset directory not found: {}'.format(dataset_dir))

  data = []
  label = []
  for class_name, class_label in (('pos', 1), ('neg', 0)):
    class_dir = os.path.join(dataset_dir, class_name)
    # A split without one of the classes simply contributes no samples for it.
    if not os.path.isdir(class_dir):
      continue
    for file_name in sorted(os.listdir(class_dir)):
      if not file_name.endswith('.txt'):
        continue
      # Explicit encoding: do not rely on the platform default when reading text.
      with open(os.path.join(class_dir, file_name), encoding='utf-8') as f:
        data.append(f.read())
      label.append(class_label)

  return data, label

In [None]:
# Read the labelled reviews of the 'train' and 'test' splits. Each call returns
# two parallel lists: the review texts and their labels (1 for 'pos', 0 for 'neg').
train_data, train_label = load_dataset('train')
test_data, test_label = load_dataset('test')

In [None]:
# Shuffle the reviews and their labels with ONE shared permutation. Calling
# random.shuffle() separately on each list draws a different permutation for
# each, which detaches every label from its review (the symptom is a model that
# never gets past chance-level validation accuracy).
random.seed(42)
shuffle_order = list(range(len(train_data)))
random.shuffle(shuffle_order)
train_data = [train_data[i] for i in shuffle_order]
train_label = [train_label[i] for i in shuffle_order]

In [None]:
def split_training_and_validation_sets(data, label, validation_split):
    """Cut parallel sequences into a leading training part and a trailing validation part.

    Args:
        data: Sliceable sequence of samples.
        label: Sliceable sequence of labels aligned with `data`.
        validation_split: Fraction of the samples reserved for validation.

    Returns:
        ((train_data, train_label), (valid_data, valid_label)), where the
        training part is the first int((1 - validation_split) * len(data))
        items and the validation part is everything after them.
    """
    train_fraction = 1 - validation_split
    cutoff = int(train_fraction * len(data))

    training_set = (data[:cutoff], label[:cutoff])
    validation_set = (data[cutoff:], label[cutoff:])
    return training_set, validation_set

In [None]:
# Hold out the last 10% of the (shuffled) training samples as a validation set.
# NOTE(review): train_ngram_model() applies its own 10% split to whatever it is
# given, so passing this already-reduced train_data means the model is fitted on
# roughly 81% of the original training reviews — confirm this is intended.
(train_data, train_label), (valid_data, valid_label) = split_training_and_validation_sets(train_data, train_label, 0.1)

In [None]:
def ngram_vectorizer(train_data, train_label, valid_data):
    """Turn raw texts into dense TF-IDF n-gram feature tensors.

    The vectorizer and the feature selector are fitted on the training texts
    only and are then applied unchanged to the validation texts.

    Args:
        train_data: Iterable of training texts (each item is passed through str()).
        train_label: Labels aligned with `train_data`; used to score features.
        valid_data: Iterable of texts to transform with the fitted pipeline.

    Returns:
        An (x_train, x_val) tuple of float32 tensors with one row per text and
        at most 20000 columns (the best-scoring uni-/bi-gram features
        according to f_classif).
    """
    kwargs = {
            'ngram_range': (1, 2),
            # TF-IDF weights are real-valued: TfidfVectorizer only supports float
            # dtypes (an integer dtype is rejected with a warning and replaced by
            # float64), so ask for float64 explicitly.
            'dtype': np.float64,
            'strip_accents': 'unicode',
            'decode_error': 'replace',
            'analyzer': 'word',
            'min_df': 2,
    }
    vectorizer = TfidfVectorizer(**kwargs)

    train_data = [str(item) for item in train_data]
    valid_data = [str(item) for item in valid_data]

    # Learn the vocabulary from the training texts only.
    x_train = vectorizer.fit_transform(train_data)
    x_val = vectorizer.transform(valid_data)

    # Keep the k features that best separate the classes.
    selector = SelectKBest(f_classif, k=min(20000, x_train.shape[1]))
    selector.fit(x_train, train_label)
    x_train = selector.transform(x_train)
    x_val = selector.transform(x_val)

    # Down-cast while still sparse so the dense copy is only ever materialised
    # in float32, and use toarray() (ndarray) rather than todense() (np.matrix).
    x_train = tf.convert_to_tensor(x_train.astype(np.float32).toarray(), dtype=tf.float32)
    x_val = tf.convert_to_tensor(x_val.astype(np.float32).toarray(), dtype=tf.float32)

    return x_train, x_val

In [None]:
def create_mlp_model(input_dim=20000):
  """Build the (uncompiled) multi-layer perceptron used as sentiment classifier.

  Args:
    input_dim: Number of input features per sample. Defaults to 20000, the
      maximum number of features ngram_vectorizer() keeps; pass the actual
      feature count when the vocabulary yields fewer columns.

  Returns:
    A Keras Sequential model: input dropout, four Dense(relu) layers each
    followed by dropout, three further Dense(relu) layers, one more dropout
    and a single sigmoid output unit.
  """
  model = models.Sequential()
  model.add(Dropout(rate=0.02, input_shape=(input_dim,)))

  # Wide layers, each followed by its own dropout layer.
  for units in (10000, 6000, 6000, 2000):
    model.add(Dense(units=units, activation='relu'))
    model.add(Dropout(rate=0.02))

  # Narrowing layers that share a single trailing dropout layer.
  for units in (512, 256, 64):
    model.add(Dense(units=units, activation='relu'))
  model.add(Dropout(rate=0.02))

  # One sigmoid unit: a single value in (0, 1) per sample.
  model.add(Dense(units=1, activation='sigmoid'))

  return model

In [None]:
def train_ngram_model(train_data, train_label, learning_rate=1e-3, epochs=1000, batch_size=128,
                      model_path='/content/gdrive/MyDrive/models/sentiment_analysis_trained_model.h5'):
    """Train the n-gram MLP on the given texts and save it to disk.

    The last 10% of the samples are held out for validation; the remaining
    90% are vectorized and used to fit the model.

    Args:
        train_data: List of training texts.
        train_label: List of labels aligned with `train_data`.
        learning_rate: Learning rate handed to the Adam optimizer.
        epochs: Upper bound on the number of training epochs; early stopping
            usually ends training sooner.
        batch_size: Number of samples per gradient update.
        model_path: File the trained model is written to. The format is
            derived from the file suffix ('.h5').

    Returns:
        A (val_acc, val_loss) tuple holding the validation accuracy and loss
        of the last epoch that was run.
    """
    (train_data, train_label), (valid_data, valid_label) = split_training_and_validation_sets(train_data, train_label, 0.1)

    # Vectorize texts.
    x_train, x_val = ngram_vectorizer(train_data, train_label, valid_data)

    # Convert labels to tensors.
    train_label = tf.convert_to_tensor(train_label, dtype=tf.float32)
    valid_label = tf.convert_to_tensor(valid_label, dtype=tf.float32)

    # Create model instance.
    model = create_mlp_model()

    # Compile model with learning parameters. The caller's learning rate is
    # honoured, and the current `learning_rate` keyword replaces the
    # deprecated `lr` alias.
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['acc'])

    # Create callback for early stopping on validation loss. If the loss does
    # not decrease in two consecutive tries, stop training.
    callbacks = [EarlyStopping(monitor='val_loss', patience=2)]

    # Train and validate model.
    history = model.fit(
            x_train,
            train_label,
            epochs=epochs,
            callbacks=callbacks,
            validation_data=(x_val, valid_label),
            verbose=2,
            batch_size=batch_size)

    # Print results.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(
            acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # Save model. Make sure the target directory exists first, and let the
    # file suffix select the format instead of passing a contradictory
    # save_format argument.
    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    model.save(model_path)
    return history['val_acc'][-1], history['val_loss'][-1]


In [None]:
# Train the MLP and save it to Drive. The returned (val_acc, val_loss) tuple of
# the last epoch is shown as the cell output.
train_ngram_model(train_data, train_label, learning_rate=1e-3, epochs=1000, batch_size=128)



tf.Tensor([1. 1. 0. ... 1. 1. 0.], shape=(20250,), dtype=float32)


  super(Adam, self).__init__(name, **kwargs)


Epoch 1/1000
159/159 - 29s - loss: 0.5248 - acc: 0.6876 - val_loss: 1.4293 - val_acc: 0.5107 - 29s/epoch - 183ms/step
Epoch 2/1000
159/159 - 26s - loss: 0.1485 - acc: 0.9404 - val_loss: 1.5456 - val_acc: 0.5124 - 26s/epoch - 166ms/step
Epoch 3/1000
159/159 - 27s - loss: 0.0564 - acc: 0.9770 - val_loss: 3.1653 - val_acc: 0.5036 - 27s/epoch - 167ms/step
Validation accuracy: 0.5035555362701416, loss: 3.1652894020080566


(0.5035555362701416, 3.1652894020080566)

Load Saved Model

In [None]:
# Reload the model from the .h5 file that train_ngram_model() saved on Drive.
loaded_model = keras.models.load_model('/content/gdrive/MyDrive/models/sentiment_analysis_trained_model.h5')

In [None]:
# Build the test features with the SAME vocabulary and feature selection the
# model was trained on. train_ngram_model() fits the vectorizer on the first 90%
# of the train_data it receives, so re-create that subset (the split is
# deterministic) and fit on it; the test reviews are only transformed. Fitting
# the vectorizer on test_data instead would yield columns that mean something
# different from the ones the network saw during training.
(fit_data, fit_label), _unused_split = split_training_and_validation_sets(train_data, train_label, 0.1)
_x_fit, x_test = ngram_vectorizer(fit_data, fit_label, test_data)
del _x_fit  # the dense training features are not needed here; free the memory

predictions = loaded_model.predict(x_test)
# predict() yields one sigmoid score per review; threshold at 0.5 for the class.
pred = (predictions.ravel() > 0.5).astype(int).tolist()



In [None]:
# Test accuracy: the share of predictions that equal the true label.
count = sum(1 for predicted, actual in zip(pred, test_label) if predicted == actual)

print(count / len(pred))

0.49704
