# Sentiment Analysis of IMDB Reviews

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
# Peek at raw lines 14206-14210 of the CSV (first 14210 lines, then the last 5 of those).
!head -n 14210 IMDBDataset.csv | tail -n 5

"Any Batman fan will know just how great the films are, they've been a major success. Batman Returns however is by far the best film in the series. A combination of excellent directing, brilliant acting and settings makes this worthy of watching on a night in.<br /><br />Tim Burton, who directed this movie, has specifically made sure that this film gives a realistic atmosphere and he's done a great job. Danny Devito (Penguin man) is a man who has inherited penguin characteristics as a baby, and grown up to become a hideous and ugly...thing! Michelle Pfiffer plays the sleek and very seducing 'Catwoman' after cats had given her there genes from being bitten. The result in both the character changes is excellent and both Catwoman and Penguin man play a very important role in this excellent film. The mysterious Catwoman is great fun to watch - her classic sayings and a funny part in which skips with her whip in a jewelry shop adds such fun to the film. Danny Devito also does well, his abil

In [None]:
# Load only the first 15,000 rows of the dataset.
df = pd.read_csv('IMDBDataset.csv', nrows=15000)

## 1. Data preprocessing

In [None]:
# Display the loaded frame (pandas renders a truncated view of the rows).
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
14995,Bobcat Goldthwait should be commended for atte...,negative
14996,"And it's not because since her days on ""Claris...",positive
14997,A traveling couple (Horton and Hamilton)stumbl...,negative
14998,This film is deeply disappointing. Not only th...,negative


In [None]:
# Encode the sentiment labels as integers: 'positive' -> 1, 'negative' -> 0.
# Only the `sentiment` column is touched (a frame-wide replace would also rewrite any
# review whose whole text is exactly 'positive' or 'negative'), and the result is cast
# to int64 explicitly instead of relying on pandas' implicit downcast of object columns.
# Values that are already 0/1 pass through unchanged, so re-running this cell is safe;
# any other label makes the int cast fail loudly instead of slipping through.
df = df.assign(
    sentiment=df['sentiment']
    .map(lambda label: {'positive': 1, 'negative': 0}.get(label, label))
    .astype('int64')
)

In [None]:
# Check that the sentiment column now holds 0/1 instead of text labels.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [None]:
# Features are the raw review strings; the target is the encoded sentiment column.
X, y = df['review'], df['sentiment']

In [None]:
# Features and labels must have the same number of rows.
X.shape, y.shape

((15000,), (15000,))

In [None]:
# Class balance: number of reviews per sentiment label.
y.value_counts()

sentiment
0    7609
1    7391
Name: count, dtype: int64

In [None]:
# Number of missing reviews.
X.isna().sum()

0

In [None]:
# Number of missing sentiment labels.
y.isna().sum()

0

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [None]:
# Sizes of the training and test partitions.
X_train.shape, X_test.shape

((12000,), (3000,))

## 2. Tokenization

In [None]:
# Average number of whitespace-separated tokens (words) per training review, rounded
round(sum(len(review.split()) for review in X_train) / len(X_train))

231

In [None]:
from tensorflow.keras.layers import TextVectorization

max_vocab_length = 50000  # upper bound on the vocabulary size kept by the vectorizer
max_length = 233  # every review is padded or truncated to this many tokens

# Maps raw review strings to fixed-length sequences of integer token ids.
text_vectorizer = TextVectorization(
    output_sequence_length=max_length,
    output_mode="int",
    max_tokens=max_vocab_length,
)

# Build the vocabulary from the training reviews only, so no test text leaks into it.
text_vectorizer.adapt(X_train)

In [None]:
# Size of the vocabulary learned by the vectorizer (capped by max_vocab_length).
len(text_vectorizer.get_vocabulary())

50000

## 3. Embedding layer

In [None]:
from tensorflow.keras.layers import Embedding

# Trainable lookup table mapping each token id to a 256-dimensional vector.
# `input_length` is intentionally not passed: it is deprecated in Keras 3 and is not
# needed here because the sequence length is already fixed by the TextVectorization
# layer that feeds this embedding.
# NOTE: this is a single layer instance; any models that call this same instance
# share, and keep training, the same weights.
embedding = Embedding(input_dim=max_vocab_length,
                      output_dim=256,
                      embeddings_initializer='uniform')

## 4. Creating Models

### Model 1 - Naive Bayes

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Baseline: TF-IDF features fed into a multinomial Naive Bayes classifier.
model_1 = Pipeline(steps=[
    ("tfidf", TfidfVectorizer()),
    ("clf", MultinomialNB()),
])

# Pipeline.fit returns the fitted pipeline itself, so history_1 is the same object as model_1.
history_1 = model_1.fit(X_train, y_train)

In [None]:
# Predict sentiment labels for the held-out reviews with the fitted pipeline.
model_1_preds = model_1.predict(X_test)
model_1_preds

array([1, 0, 1, ..., 1, 0, 1])

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred):
  """Summarise classification quality for one set of predictions.

  Args:
    y_true: ground-truth labels.
    y_pred: predicted labels for the same samples.

  Returns:
    dict with keys "Accuracy", "Precision", "Recall" and "F1 Score"; precision,
    recall and F1 are computed with average="weighted".
  """
  acc = accuracy_score(y_true, y_pred)
  # Keep the first three values (precision, recall, F1); the fourth (support) is not used.
  weighted_scores = precision_recall_fscore_support(y_true, y_pred, average="weighted")[:3]

  metric_names = ("Accuracy", "Precision", "Recall", "F1 Score")
  return dict(zip(metric_names, (acc, *weighted_scores)))

In [None]:
# Baseline metrics on the test set.
model_1_results = calculate_results(y_test, model_1_preds)
model_1_results

{'Accuracy': 0.857,
 'Precision': 0.8607754475703326,
 'Recall': 0.857,
 'F1 Score': 0.8562460203335098}

### Model 2 - LSTM

In [None]:
from tensorflow.keras.layers import LSTM, Input, Dense, Embedding
from tensorflow.keras.models import Model

# Seed Python, NumPy and TensorFlow so weight initialisation and shuffling are repeatable.
tf.keras.utils.set_random_seed(42)

# Model 2: two stacked unidirectional LSTMs on top of a learned embedding.
# The model gets its own embedding layer so its weights are not shared with the other
# models built in this notebook.
# TextVectorization pads short reviews with trailing zeros up to max_length;
# mask_zero=True lets the LSTMs skip those padded timesteps instead of processing
# them after the real text.
# NOTE(review): the unmasked version scored ~53% accuracy in the recorded run, which is
# consistent with the final LSTM state being washed out by padding - confirm on re-run.
model_2_embedding = Embedding(input_dim=max_vocab_length,
                              output_dim=256,
                              embeddings_initializer='uniform',
                              mask_zero=True,
                              name='embedding_model_2')

inputs = Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)           # raw string -> integer token ids
x = model_2_embedding(x)              # token ids -> dense vectors (+ padding mask)
x = LSTM(units=64, return_sequences=True)(x)  # full sequence is needed by the next LSTM
x = LSTM(units=32)(x)                 # final state only
x = Dense(32, activation='relu')(x)
outputs = Dense(1, activation='sigmoid')(x)   # probability of positive sentiment

model_2 = Model(inputs, outputs, name='LSTM_model')
model_2.summary()

Model: "LSTM_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVe  (None, 233)               0         
 ctorization)                                                    
                                                                 
 embedding (Embedding)       (None, 233, 256)          12800000  
                                                                 
 lstm (LSTM)                 (None, 233, 64)           82176     
                                                                 
 lstm_1 (LSTM)               (None, 32)                12416     
                                                                 
 dense (Dense)               (None, 32)                1056      
                                                        

In [None]:
# Single sigmoid output, so train with binary cross-entropy and track accuracy.
model_2.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])

In [None]:
# Train for 5 epochs, holding out 10% of the training data for validation.
history_2 = model_2.fit(X_train, y_train, validation_split=0.1, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Sigmoid outputs: one probability per test review, shape (n_samples, 1).
model_2_pred_probs = model_2.predict(X_test)
model_2_pred_probs[:5]



array([[0.20761377],
       [0.47577325],
       [0.32152534],
       [0.4364241 ],
       [0.4732588 ]], dtype=float32)

In [None]:
# Flatten the (n, 1) probabilities to 1-D and round them to hard 0/1 labels.
model_2_preds = tf.round(tf.squeeze(model_2_pred_probs))
model_2_preds[:5]

<tf.Tensor: shape=(5,), dtype=float32, numpy=array([0., 0., 0., 0., 0.], dtype=float32)>

In [None]:
# Test-set metrics for the stacked LSTM model.
model_2_results = calculate_results(y_test, model_2_preds)
model_2_results

{'Accuracy': 0.53,
 'Precision': 0.5581865163260512,
 'Recall': 0.53,
 'F1 Score': 0.4108159480049664}

**Note** - Let's try adding a few things to improve the accuracy of our LSTM model.

### Model 3 - Bidirectional LSTM

In [None]:
from tensorflow.keras.layers import Bidirectional, Dropout, Embedding
from tensorflow.keras.regularizers import l2

# Seed Python, NumPy and TensorFlow so weight initialisation and shuffling are repeatable.
tf.keras.utils.set_random_seed(42)

# Model 3: a single bidirectional LSTM on top of a learned embedding.
# A fresh embedding layer is created here rather than reusing the global `embedding`
# instance: a shared instance carries weights already trained by an earlier model, so
# the models would not start from the same point and training one would alter another.
# mask_zero=True makes the recurrent layer ignore the zero padding that
# TextVectorization appends to short reviews.
model_3_embedding = Embedding(input_dim=max_vocab_length,
                              output_dim=256,
                              embeddings_initializer='uniform',
                              mask_zero=True,
                              name='embedding_model_3')

inputs = Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)           # raw string -> integer token ids
x = model_3_embedding(x)              # token ids -> dense vectors (+ padding mask)
x = Bidirectional(LSTM(units=64))(x)  # forward and backward final states, concatenated
x = Dense(32, activation='relu')(x)
outputs = Dense(1, activation='sigmoid')(x)   # probability of positive sentiment

model_3 = Model(inputs, outputs, name='Bidirectional_LSTM')
model_3.summary()

Model: "Bidirectional_LSTM"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVe  (None, 233)               0         
 ctorization)                                                    
                                                                 
 embedding (Embedding)       (None, 233, 256)          12800000  
                                                                 
 bidirectional (Bidirection  (None, 128)               164352    
 al)                                                             
                                                                 
 dense_2 (Dense)             (None, 32)                4128      
                                                                 
 dense_3 (Dense)             (None, 1)          

In [None]:
# Single sigmoid output, so train with binary cross-entropy and track accuracy.
model_3.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])

In [None]:
# Train for 5 epochs, holding out 10% of the training data for validation.
history_3 = model_3.fit(X_train, y_train, validation_split=0.1, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Predicted probabilities, flattened to 1-D and rounded to hard 0/1 labels.
model_3_pred_probs = model_3.predict(X_test)
model_3_preds = tf.round(tf.squeeze(model_3_pred_probs))
model_3_preds[:5]



<tf.Tensor: shape=(5,), dtype=float32, numpy=array([0., 1., 1., 0., 1.], dtype=float32)>

In [None]:
# Test-set metrics for the bidirectional LSTM model.
model_3_results = calculate_results(y_test, model_3_preds)
model_3_results

{'Accuracy': 0.833,
 'Precision': 0.8340119444299996,
 'Recall': 0.833,
 'F1 Score': 0.8330653158762883}

### Model 4 - Conv1D

In [None]:
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Embedding

# Seed Python, NumPy and TensorFlow so weight initialisation and shuffling are repeatable.
tf.keras.utils.set_random_seed(42)

# Model 4: 1-D convolution over the token embeddings followed by global max pooling.
# A fresh embedding layer is created here rather than reusing the global `embedding`
# instance: a shared instance carries weights already trained by earlier models, so
# the models would not start from the same point and training one would alter another.
model_4_embedding = Embedding(input_dim=max_vocab_length,
                              output_dim=256,
                              embeddings_initializer='uniform',
                              name='embedding_model_4')

inputs = Input(shape=(1,), dtype="string")
x = text_vectorizer(inputs)           # raw string -> integer token ids
x = model_4_embedding(x)              # token ids -> dense vectors
# Each filter looks at windows of 5 consecutive tokens; 'valid' padding shortens the sequence by 4.
x = Conv1D(filters=128, kernel_size=5, strides=1, activation="relu", padding='valid')(x)
x = GlobalMaxPooling1D()(x)           # strongest response of each filter across the review
x = Dense(64, activation="relu")(x)
outputs = Dense(1, activation="sigmoid")(x)   # probability of positive sentiment
model_4 = Model(inputs, outputs, name='Conv1D_model')
model_4.summary()

Model: "Conv1D_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVe  (None, 233)               0         
 ctorization)                                                    
                                                                 
 embedding (Embedding)       (None, 233, 256)          12800000  
                                                                 
 conv1d (Conv1D)             (None, 229, 128)          163968    
                                                                 
 global_max_pooling1d (Glob  (None, 128)               0         
 alMaxPooling1D)                                                 
                                                                 
 dense_4 (Dense)             (None, 64)               

In [None]:
# Single sigmoid output, so train with binary cross-entropy and track accuracy.
model_4.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])

In [None]:
# Train for 5 epochs, holding out 10% of the training data for validation.
history_4 = model_4.fit(X_train, y_train, validation_split=0.1, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Predict on the test set, round the flattened probabilities to 0/1 labels, then score them.
model_4_pred_probs = model_4.predict(X_test)
model_4_preds = tf.round(tf.squeeze(model_4_pred_probs))
model_4_results = calculate_results(y_test, model_4_preds)
model_4_results



{'Accuracy': 0.851,
 'Precision': 0.8511127233001698,
 'Recall': 0.851,
 'F1 Score': 0.8510290229894328}

### Model 5 - From TensorFlow Hub

In [None]:
import tensorflow_hub as hub

# Pretrained Universal Sentence Encoder used as a frozen feature extractor:
# it takes raw strings (scalar string input) and its weights are not updated.
sentence_encoder_layer = hub.KerasLayer(
    "https://www.kaggle.com/models/google/universal-sentence-encoder/tensorFlow2/universal-sentence-encoder/2?tfhub-redirect=true",
    name='USE',
    trainable=False,
    dtype=tf.string,
    input_shape=[],
)

# Build the classifier layer by layer with the Sequential API:
# sentence encoder -> small dense head -> sigmoid probability of positive sentiment.
model_5 = tf.keras.Sequential(name='Tensorflow_Hub_model')
model_5.add(sentence_encoder_layer)
model_5.add(Dense(64, activation='relu'))
model_5.add(Dense(1, activation='sigmoid'))

model_5.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])

# Train for 5 epochs, holding out 10% of the training data for validation.
history_5 = model_5.fit(X_train, y_train, validation_split=0.1, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Predict on the test set, round the flattened probabilities to 0/1 labels, then score them.
model_5_pred_probs = model_5.predict(X_test)
model_5_preds = tf.round(tf.squeeze(model_5_pred_probs))
model_5_results = calculate_results(y_test, model_5_preds)
model_5_results



{'Accuracy': 0.8583333333333333,
 'Precision': 0.8595126179284595,
 'Recall': 0.8583333333333333,
 'F1 Score': 0.8583864702891485}