# Sentiment Analysis with Keras Tokenization & Embeddings + Multilayer Bidirectional LSTM

Import required libs

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import random
import os

In [None]:
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in names)
    for full_path in full_paths:
        print(full_path)

Ensure TensorFlow is using the GPU, and enable eager execution if needed.

In [None]:
print("TF version: ", tf.__version__)

# Compare the numeric major version. Comparing the raw version strings is
# lexicographic, so a version such as "10.0.0" would sort below "2.0.0".
tf_major_version = int(tf.__version__.split(".")[0])
if tf_major_version < 2:
    tf.enable_eager_execution()
    print("Eager execution enabled.")
else:
    print("Eager execution enabled by default.")

# Query the device name once and reuse it for both the check and the message.
gpu_device_name = tf.test.gpu_device_name()
if gpu_device_name:
    print('Default GPU Device: {}'.format(gpu_device_name))
else:
    print("Please install GPU version of TF")

Set the random seed so we get consistent results while improving our model

In [None]:
def setseed(seed = 0):
    """Seed the Python, NumPy and TensorFlow random number generators.

    Also exports PYTHONHASHSEED (set to ``seed``) and TF_DETERMINISTIC_OPS
    (set to '1') into the process environment.

    Args:
        seed: Value handed to every seeding function; defaults to 0.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up,
    # so setting it here presumably only affects child processes - confirm.
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'TF_DETERMINISTIC_OPS': '1',
    })

    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)
    
# Single source of truth for the seed used throughout the notebook.
SEED = 0

# Seed every RNG once; a second default call would only repeat the same seeding.
setseed(SEED)

Load dataset

In [None]:
# Both archives are tab-separated; read them with identical options.
train, test = (
    pd.read_csv(archive_path, sep='\t')
    for archive_path in (
        '../input/sentiment-analysis-on-movie-reviews/train.tsv.zip',
        '../input/sentiment-analysis-on-movie-reviews/test.tsv.zip',
    )
)

Take a look at the data

In [None]:
# Print the shapes of the training and test frames side by side.
print(train.shape, test.shape)

In [None]:
# Display the first rows of the training frame.
train.head()

In [None]:
# Display the first rows of the test frame.
test.head()

In [None]:
# Print a per-column summary of the training frame.
train.info()

**Text Preprocessing**

Normalization:
* converting numbers into words or removing numbers
* expanding abbreviations
* removing stop words
* remove sparse terms and particular words (Stemming/Lemmatization)

Available in the tokenizer:
* converting all letters to lower case
* removing punctuation, accent marks and other diacritics
* removing white spaces

In [None]:
from tqdm import tqdm

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import re

In [None]:
def clean_sentences(df):
    """Turn every phrase in ``df['Phrase']`` into a list of lemmatized tokens.

    Each phrase has its non-alphabetic characters replaced by spaces, is
    lower-cased, tokenized with ``word_tokenize`` and then lemmatized word by
    word with ``WordNetLemmatizer``.

    Args:
        df: Object exposing a 'Phrase' column of strings.

    Returns:
        A list with one entry per phrase, each entry being that phrase's list
        of lemmatized tokens.
    """
    # Build the loop-invariant helpers once instead of once per phrase.
    lemmatizer = WordNetLemmatizer()
    non_alphabetic = re.compile(r"[^a-zA-Z]")

    reviews = []
    for sent in tqdm(df['Phrase']):
        # Replace anything that is not a letter with a space.
        review_text = non_alphabetic.sub(" ", sent)

        # Tokenize the lower-cased sentence.
        words = word_tokenize(review_text.lower())

        # Reduce each word to its lemma.
        reviews.append([lemmatizer.lemmatize(word) for word in words])

    return reviews

In [None]:
%%time
# Run the same cleaning pipeline over both splits (train first, then test).
train_sentences, test_sentences = (
    clean_sentences(frame) for frame in (train, test)
)

# Report how many cleaned sentences each split produced.
for cleaned in (train_sentences, test_sentences):
    print(len(cleaned))

In [None]:
# Show the first raw phrase followed by its cleaned tokens joined back together.
first_raw_phrase = train['Phrase'][0]
first_clean_phrase = ' '.join(train_sentences[0])
print(first_raw_phrase)
print(first_clean_phrase)

Convert the labels to one-hot encoded (OHE) format

In [None]:
# Import from tensorflow.keras so this matches the rest of the notebook,
# which builds and trains the model through tf.keras.
from tensorflow.keras.utils import to_categorical

# One-hot encode the integer sentiment labels.
target = train.Sentiment.values
y_target = to_categorical(target)

# The number of classes equals the width of the one-hot matrix.
num_classes = y_target.shape[1]

In [None]:
# Print the number of columns in the one-hot label matrix.
print(num_classes)

The sentiment labels are:

* 0 - negative
* 1 - somewhat negative
* 2 - neutral
* 3 - somewhat positive
* 4 - positive

Split the data into training and validation sets (80/20)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the cleaned sentences for validation, stratified on the labels.
# random_state pins the shuffle to SEED so the split is identical on every run,
# instead of depending on how much of the global NumPy RNG was consumed earlier.
X_train, X_val, y_train, y_val = train_test_split(train_sentences,
                                                  y_target,
                                                  test_size = 0.2,
                                                  stratify = y_target,
                                                  random_state = SEED)

In [None]:
# Display the first training sample (a list of tokens at this point).
X_train[0]

Get vocab sizes and max length

In [None]:
# Scan the training sentences once, collecting the distinct tokens and
# tracking the length of the longest sentence.
unique_words = set()
len_max = 0

for tokens in tqdm(X_train):
    unique_words |= set(tokens)
    len_max = max(len_max, len(tokens))

# Report the vocabulary size and the longest sentence length.
print('Number of vocabs: ', len(unique_words))
print('Max length of text is: ', len_max)

Tokenize the dataset

In [None]:
# The Keras Tokenizer numbers words from 1, and the OOV token takes index 1;
# index 0 is left for padding. Real words therefore occupy indices
# 2..len(unique_words)+1, so two extra slots are needed; without them the two
# rarest training words would be mapped to OOV.
vocab_size = len(unique_words) + 2
embedding_dim = 300
max_length = len_max
# Pad and truncate at the end of each sequence.
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"

In [None]:
%%time
# Word-level tokenizer capped at vocab_size entries; unknown words map to oov_tok.
tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov_tok, char_level = False)

# Learn the word index from the training sentences only.
tokenizer.fit_on_texts(X_train)


def _to_padded_ids(token_lists):
    """Convert token lists to integer ids and pad/truncate them to max_length."""
    id_sequences = tokenizer.texts_to_sequences(token_lists)
    return pad_sequences(id_sequences,
                         maxlen = max_length,
                         padding = padding_type,
                         truncating = trunc_type)


# Encode the training, validation and test splits the same way.
X_train = _to_padded_ids(X_train)
X_val = _to_padded_ids(X_val)
X_test = _to_padded_ids(test_sentences)

In [None]:
# Print the shape of each encoded split under an aligned label.
for label, encoded in (("X_training shape   : ", X_train),
                       ("X_validation shape : ", X_val),
                       ("X_testing shape    : ", X_test)):
    print(label, encoded.shape)

Train a Sentiment Model

In [None]:
# Two stacked bidirectional LSTMs on top of a trainable embedding, followed by
# a small dense head that outputs one probability per sentiment class.
model = tf.keras.Sequential()

# Token ids -> embedding_dim-sized vectors.
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_length))

# The first recurrent layer returns the whole sequence so the second can read it.
model.add(tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(128, dropout = 0.8, recurrent_dropout=0.8, return_sequences=True)))

# The second recurrent layer returns only its final output.
model.add(tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(64, dropout = 0.5, recurrent_dropout=0.5, return_sequences=False)))

# Dense head with dropout, then softmax over the classes.
model.add(tf.keras.layers.Dense(128, activation = 'relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(num_classes, activation = 'softmax'))

model.compile(optimizer = 'adam',
              loss = 'categorical_crossentropy',
              metrics = ['accuracy'])
model.summary()

In [None]:
# Import from tensorflow.keras so the callback matches the tf.keras model it is
# passed to.
from tensorflow.keras.callbacks import EarlyStopping

# Stop once validation accuracy has failed to improve by at least min_delta for
# `patience` consecutive epochs.
# The model is compiled with metrics=['accuracy'], so the logged key is
# 'val_accuracy' (the same key the plotting cell reads); monitoring 'val_acc'
# would never find the metric and early stopping would never trigger.
early_stopping = EarlyStopping(min_delta = 0.001,
                               mode = 'max',
                               monitor = 'val_accuracy',
                               patience = 2)
callback = [early_stopping]

In [None]:
%%time

num_epochs = 5

# Gather the training options in one place, then launch the run.
fit_options = dict(
    validation_data = (X_val, y_val),
    epochs = num_epochs,
    batch_size = 256,
    verbose = 1,
    callbacks = callback,
)

history = model.fit(X_train, y_train, **fit_options)

Visualize the training graph

In [None]:
import matplotlib.pyplot as plt

def plot_graphs(history, string):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value sequences (as returned by ``model.fit``).
        string: Name of the training metric; the validation curve is read
            from the key ``'val_' + string``.
    """
    curve_names = [string, 'val_'+string]

    # Draw the training curve first, then the validation curve.
    for curve_name in curve_names:
        plt.plot(history.history[curve_name])

    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend(curve_names)
    plt.show()
  
# One figure per tracked metric: accuracy first, then loss.
for metric_name in ('accuracy', 'loss'):
    plot_graphs(history, metric_name)

Prepare for submission

In [None]:
# Keep the PhraseId column so each prediction can be paired with its id
# when the submission frame is built.
test_id = test['PhraseId']

In [None]:
%%time

# Predict per-class scores for every test phrase, then keep the index of the
# highest score along the last axis as the predicted label.
class_scores = model.predict(X_test)
y_pred = np.argmax(class_scores, axis=-1)

In [None]:
# Pair every phrase id with its predicted sentiment.
submission_columns = {'PhraseId': test_id, 'Sentiment': y_pred}
submission = pd.DataFrame(submission_columns)

# Write the submission file without the index column, then preview it.
submission_path = 'movie_review_prediction_5EP_MLBDLSTM_submission.csv'
submission.to_csv(submission_path, index=False)
submission.head()