# Installing and Importing Required Libraries


In [1]:
!pip install bert-for-tf2
!pip install sentencepiece



In [54]:
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

import tensorflow_hub as hub

from tensorflow.keras import layers
import bert
import pandas as pd 
import re
import numpy as np
import random
import math 

# Importing and Preprocessing the Dataset


In [20]:
movie_reviews = pd.read_csv("/Users/zimingfang/Desktop/Animated GIFs/AwesomeGif/IMDB Dataset.csv")

movie_reviews.isnull().values.any()

movie_reviews.shape

(50000, 2)

Pandas is a dataframe
The output shows that our dataset has 50,000 rows and 2 columns.


In [21]:
def preprocess_text(sen):
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)

    # Removing multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)

    return sentence

In [22]:
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    return TAG_RE.sub('', text)

In [23]:
reviews = []
sentences = list(movie_reviews['review'])
for sen in sentences:
    reviews.append(preprocess_text(sen))

In [24]:
sentences

["One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the f

In [25]:
movie_reviews

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [26]:
print(movie_reviews.columns.values)


['review' 'sentiment']


In [27]:
movie_reviews.sentiment.unique()
# The following script displays unique values in the sentiment column:

array(['positive', 'negative'], dtype=object)

In [30]:
y = movie_reviews['sentiment']

y = np.array(list(map(lambda x: 1 if x=="positive" else 0, y)))

In [31]:
y

array([1, 1, 1, ..., 0, 0, 0])

Now the reviews variable contain text reviews while the y variable contains the corresponding labels.

# Creating a BERT Tokenizer


In [32]:
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)


In [33]:
tokenizer.tokenize("don't be so judgmental")


['don', "'", 't', 'be', 'so', 'judgment', '##al']

In [34]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize("dont be so judgmental"))


[2123, 2102, 2022, 2061, 8689, 2389]

In [35]:
def tokenize_reviews(text_reviews):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text_reviews))

In [36]:
tokenized_reviews = [tokenize_reviews(review) for review in reviews]
# this takes very long 

In [39]:
tokenized_reviews

[[2028,
  1997,
  1996,
  2060,
  15814,
  2038,
  3855,
  2008,
  2044,
  3666,
  2074,
  11472,
  2792,
  2017,
  2222,
  2022,
  13322,
  2027,
  2024,
  2157,
  2004,
  2023,
  2003,
  3599,
  2054,
  3047,
  2007,
  2033,
  1996,
  2034,
  2518,
  2008,
  4930,
  2033,
  2055,
  11472,
  2001,
  2049,
  24083,
  1998,
  4895,
  10258,
  2378,
  8450,
  5019,
  1997,
  4808,
  2029,
  2275,
  1999,
  2157,
  2013,
  1996,
  2773,
  2175,
  3404,
  2033,
  2023,
  2003,
  2025,
  2265,
  2005,
  1996,
  8143,
  18627,
  2030,
  5199,
  3593,
  2023,
  2265,
  8005,
  2053,
  17957,
  2007,
  12362,
  2000,
  5850,
  3348,
  2030,
  4808,
  2049,
  2003,
  13076,
  1999,
  1996,
  4438,
  2224,
  1997,
  1996,
  2773,
  2009,
  2003,
  2170,
  11472,
  2004,
  2008,
  2003,
  1996,
  8367,
  2445,
  2000,
  1996,
  17411,
  4555,
  3036,
  2110,
  7279,
  4221,
  12380,
  2854,
  2009,
  7679,
  3701,
  2006,
  14110,
  2103,
  2019,
  6388,
  2930,
  1997,
  1996,
  3827,
  2073,
  

# Prerparing Data For Training


To train the model, the input sentences should be of equal length.
To create sentences of equal length, one way is to pad the shorter sentences by 0s. However, this can result in a sparse matrix contain large number of 0s. The other way is to pad sentences within each batch. Since we will be training the model in batches, we can pad the sentences within the training batch locally depending upon the length of the longest sentence. To do so, we first need to find the length of each sentence.

In [40]:
reviews_with_len = [[review, y[i], len(review)]
                 for i, review in enumerate(tokenized_reviews)]

In [41]:
y[1]


1

In [42]:
len(tokenized_reviews[1])

167

In [43]:
tokenized_reviews[1]

[1037,
 6919,
 2210,
 2537,
 1996,
 7467,
 6028,
 2003,
 2200,
 14477,
 4757,
 24270,
 2200,
 2214,
 2051,
 4035,
 4827,
 1998,
 3957,
 16334,
 1998,
 2823,
 17964,
 2075,
 3168,
 1997,
 15650,
 2000,
 1996,
 2972,
 3538,
 1996,
 5889,
 2024,
 5186,
 2092,
 4217,
 2745,
 20682,
 2025,
 2069,
 2038,
 2288,
 2035,
 1996,
 11508,
 2072,
 2021,
 2002,
 2038,
 2035,
 1996,
 5755,
 2091,
 6986,
 2205,
 2017,
 2064,
 5621,
 2156,
 1996,
 25180,
 3238,
 9260,
 8546,
 2011,
 1996,
 7604,
 2000,
 3766,
 9708,
 10445,
 2025,
 2069,
 2003,
 2009,
 2092,
 4276,
 1996,
 3666,
 2021,
 2009,
 2003,
 27547,
 2135,
 2517,
 1998,
 2864,
 3538,
 3040,
 3993,
 2537,
 2055,
 2028,
 1997,
 1996,
 2307,
 3040,
 1997,
 4038,
 1998,
 2010,
 2166,
 1996,
 15650,
 2428,
 3310,
 2188,
 2007,
 1996,
 2210,
 2477,
 1996,
 5913,
 1997,
 1996,
 3457,
 2029,
 2738,
 2084,
 2224,
 1996,
 3151,
 3959,
 5461,
 3464,
 5024,
 2059,
 17144,
 2009,
 3248,
 2006,
 2256,
 3716,
 1998,
 2256,
 9456,
 3391,
 2007,
 1996,
 5019,
 

In our dataset, the first half of the reviews are positive while the last half contains negative reviews. Therefore, in order to have both positive and negative reviews in the training batches we need to shuffle the reviews. The following script shuffles the data randomly:



In [46]:
random.shuffle(reviews_with_len)


Once the data is shuffled, we will sort the data by the length of the reviews. To do so, we will use the sort() function of the list and will tell it that we want to sort the list with respect to the third item in the sublist i.e. the length of the review --> x[2] 

In [47]:
reviews_with_len.sort(key=lambda x: x[2])


Once the data is shuffled, we will sort the data by the length of the reviews. To do so, we will use the sort() function of the list and will tell it that we want to sort the list with respect to the third item in the sublist i.e. the length of the review.

In [48]:
sorted_reviews_labels = [(review_lab[0], review_lab[1]) for review_lab in reviews_with_len]


In [49]:
# convert the sorted dataset into a TensorFlow 2.0-compliant input dataset shape
processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_reviews_labels, output_types=(tf.int32, tf.int32))


In [51]:
BATCH_SIZE = 32
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [52]:
# print the first batch and see how padding has been applied to it:
next(iter(batched_dataset))

(<tf.Tensor: shape=(32, 21), dtype=int32, numpy=
 array([[ 3191,  1996,  2338,  5293,  1996,  3185,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [ 2054,  5896,  2054,  2466,  2054,  6752,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [ 3078,  5436,  3078,  3257,  3532,  7613,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [ 2062, 23873,  3993,  2062, 11259,  2172,  2172,  2062, 14888,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [ 2023,  3185,  2003,  6659,  2021,  2009,  2038,  2070,  2204,
          3896,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0],
        [ 1045,  2876,  9278,  2023,  2028,  2130,  2006,  7922, 12635,
    

From the last five reviews, you can see that the total number of words in the largest sentence were 21. Therefore, in the first five reviews the 0s are added at the end of the sentences so that their total length is also 21. The padding for the next batch will be different depending upon the size of the largest sentence in the batch.



In [55]:
TOTAL_BATCHES = math.ceil(len(sorted_reviews_labels) / BATCH_SIZE)
TEST_BATCHES = TOTAL_BATCHES // 10
batched_dataset.shuffle(TOTAL_BATCHES)
test_data = batched_dataset.take(TEST_BATCHES)
train_data = batched_dataset.skip(TEST_BATCHES)

# Creating the Model

Our model will consist of three convolutional neural network layers.

In [56]:
class TEXT_MODEL(tf.keras.Model):
    
    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super(TEXT_MODEL, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if model_output_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=model_output_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        l = self.embedding(inputs)
        l_1 = self.cnn_layer1(l) 
        l_1 = self.pool(l_1) 
        l_2 = self.cnn_layer2(l) 
        l_2 = self.pool(l_2)
        l_3 = self.cnn_layer3(l)
        l_3 = self.pool(l_3) 
        
        concatenated = tf.concat([l_1, l_2, l_3], axis=-1) # (batch_size, 3 * cnn_filters)
        concatenated = self.dense_1(concatenated)
        concatenated = self.dropout(concatenated, training)
        model_output = self.last_dense(concatenated)
        
        return model_output

In [57]:
VOCAB_LENGTH = len(tokenizer.vocab)
EMB_DIM = 200
CNN_FILTERS = 100
DNN_UNITS = 256
OUTPUT_CLASSES = 2

DROPOUT_RATE = 0.2

NB_EPOCHS = 5

In [58]:
text_model = TEXT_MODEL(vocabulary_size=VOCAB_LENGTH,
                        embedding_dimensions=EMB_DIM,
                        cnn_filters=CNN_FILTERS,
                        dnn_units=DNN_UNITS,
                        model_output_classes=OUTPUT_CLASSES,
                        dropout_rate=DROPOUT_RATE)

In [59]:
if OUTPUT_CLASSES == 2:
    text_model.compile(loss="binary_crossentropy",
                       optimizer="adam",
                       metrics=["accuracy"])
else:
    text_model.compile(loss="sparse_categorical_crossentropy",
                       optimizer="adam",
                       metrics=["sparse_categorical_accuracy"])

In [60]:
text_model.fit(train_data, epochs=NB_EPOCHS)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x15a0078d0>

In [62]:
results = text_model.evaluate(test_data)
print(results)

[0.4385887682437897, 0.8966346383094788]
