# Introduction


This notebook introduces the use of the BERT tokenizer as part of a text classification solution.

The dataset used here is [IMDB Dataset of 50K Movie Reviews](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)

# Analysis preparation

## Install and import libraries

In [None]:
!pip install bert-for-tf2
!pip install sentencepiece

In [None]:
import os
import pandas as pd
import numpy as np
import re
import random
import math

try:
    %tensorflow_version 2.x
except Exception as ex:
    print(ex)
    pass
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert

## Input data

In [None]:
# Load the IMDB reviews, then report whether any value is missing and the table shape.
movie_reviews = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
contains_nulls = movie_reviews.isnull().values.any()
print(f"Null: {contains_nulls}")
print(f"shape: {movie_reviews.shape}")

# Data preprocessing

We prepare the text for the classification. The text preprocessing includes the following:
* Remove html tags;  
* Remove punctuation and numbers;  
* Remove single character words;  
* Remove multiple spaces.  

In [None]:
TAG_RE = re.compile(r'<[^>]+>')
def remove_tags(text):
    """
    Return the text with every html tag ('<', one or more non-'>' characters, '>') deleted.
    """
    without_tags = TAG_RE.sub('', text)
    return without_tags

In [None]:
def preprocess_text(sen):
    """
    Clean a raw review so that only words separated by single spaces remain.

    Steps, in order:
    * Remove html tags (via ``remove_tags``)
    * Replace punctuation, numbers and every other non-letter with a space
    * Remove single character words
    * Collapse multiple spaces and trim the ends

    :param sen: raw review text
    :return: the cleaned text
    """
    # Removing html tags
    sentence = remove_tags(sen)

    # Remove punctuations and numbers
    sentence = re.sub(r'[^a-zA-Z]', ' ', sentence)

    # Single character removal. At this point the text contains only letters
    # and spaces, so \b isolates one-letter words. A "\s+x\s+" pattern would
    # miss one-letter words at the start/end of the text and would skip every
    # second one in a run such as "a b c", because the separating space is
    # consumed by the previous match.
    sentence = re.sub(r'\b[a-zA-Z]\b', ' ', sentence)

    # Removing multiple spaces (and the leftover leading/trailing space)
    sentence = re.sub(r'\s+', ' ', sentence).strip()

    return sentence

We apply the preprocessing to all sentences.

In [None]:
# Clean every raw review; the cleaned texts keep the order of the dataframe rows.
sentences = list(movie_reviews['review'])
reviews = [preprocess_text(raw_review) for raw_review in sentences]

In [None]:
print(movie_reviews.columns.values)

In [None]:
movie_reviews.sentiment.unique()

We transform the target feature, from {'positive', 'negative'} to {1, 0}

In [None]:
# Encode the sentiment column as integers: 1 for "positive", 0 for any other value.
sentiment_labels = movie_reviews['sentiment']
y = np.array([1 if label == "positive" else 0 for label in sentiment_labels])

### Check text and target features

In [None]:
print(f"Review sample:\n {reviews[10]}")
print(f"Review sentiment: {y[10]}")

# Model

## Tokenization using BERT

We use `BertTokenizer` (BERT uncased) from bert.
We initialize BertTokenizer with vocabulary file and option to lower case.

In [None]:
# Alias for the FullTokenizer class of the bert package.
BertTokenizer = bert.bert_tokenization.FullTokenizer
# Load the uncased English BERT module from TF Hub with frozen weights.
# In this section it is only used to obtain the tokenizer settings below.
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)
# Path of the vocabulary file shipped with the hub module.
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
# Lower-casing flag stored in the hub module.
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
# Tokenizer configured with the module's own vocabulary and casing option.
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

### Check tokenizer

In [None]:
print(tokenizer.tokenize("don't try to be so sentimental or so judgemental"))

In [None]:
print(tokenizer.convert_tokens_to_ids(tokenizer.tokenize("don't try to be so sentimental or so judgemental")))

### Apply tokenizer to data

In [None]:
def tokenize_reviews(text_reviews):
    """
    Tokenize one review text and return the vocabulary ids of its tokens.
    """
    tokens = tokenizer.tokenize(text_reviews)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    return token_ids

In [None]:
# Token ids of every cleaned review, in the same order as `reviews`.
tokenized_reviews = list(map(tokenize_reviews, reviews))

After tokenization, we store each review's token ids together with its label and its length (number of tokens).

In [None]:
# One [token_ids, label, token_count] entry per review.
reviews_with_len = [
    [token_ids, y[idx], len(token_ids)]
    for idx, token_ids in enumerate(tokenized_reviews)
]

In [None]:
# Shuffle before sorting: list.sort is stable, so after the length sort that
# follows, reviews with the same length stay in this random order.
# No seed is set, so the order differs between runs.
random.shuffle(reviews_with_len)

Sort the reviews by length (number of tokens).

In [None]:
# Order the entries by their stored token count (index 2), shortest first,
# so that neighbouring reviews have similar lengths when they are batched.
reviews_with_len.sort(key=lambda x: x[2])

In [None]:
# Drop the length column: keep (token_ids, label) pairs in the sorted order.
sorted_reviews_labels = [(token_ids, label) for token_ids, label, _ in reviews_with_len]

In [None]:
# Wrap the Python list of (token_ids, label) pairs in a tf.data.Dataset;
# both components are produced as int32 tensors.
processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_reviews_labels, output_types=(tf.int32, tf.int32))

In [None]:
# Number of reviews per batch.
BATCH_SIZE = 32
# Batch the reviews: the token id sequences (shape (None,)) are padded to the
# longest sequence of their batch, the labels are scalars (shape ()).
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [None]:
next(iter(batched_dataset))

## Train-test split


We split the dataset into train and test sets, reserving 10% of the batches for testing; the remaining 90% are used for training.

In [None]:
# Number of batches produced by the padded batching (the last one may be partial).
TOTAL_BATCHES = math.ceil(len(sorted_reviews_labels) / BATCH_SIZE)
# A tenth of the batches (rounded down) is reserved for testing.
TEST_BATCHES = TOTAL_BATCHES // 10
# Dataset.shuffle returns a new dataset instead of shuffling in place, so the
# result must be kept. Otherwise the batches stay sorted by review length and
# the test set would contain only the shortest reviews.
# reshuffle_each_iteration=False freezes the shuffled order, so take() and
# skip() see the same permutation on every pass and never overlap.
shuffled_batches = batched_dataset.shuffle(TOTAL_BATCHES,
                                           reshuffle_each_iteration=False)
test_data = shuffled_batches.take(TEST_BATCHES)
train_data = shuffled_batches.skip(TEST_BATCHES)

## Define model

In [None]:
class TextClassificationModel(tf.keras.Model):
    """
    Text classifier: embedding -> three parallel 1D convolutions -> dense head.

    The input token ids are embedded and convolved with kernel sizes 2, 3
    and 4. Each convolution output is reduced by a global max pooling, the
    three results are concatenated and passed through a dense layer, a
    dropout and the output layer.

    :param vocabulary_size: number of entries of the embedding table
    :param embedding_dimensions: size of each embedding vector
    :param cnn_filters: number of filters of each convolution
    :param dnn_units: number of units of the hidden dense layer
    :param model_output_classes: 2 builds a single sigmoid unit, any other
        value builds a softmax layer with that many units
    :param dropout_rate: rate of the dropout applied after the dense layer
    :param training: not used by the model; kept so that existing callers
        passing it keep working
    :param name: model name forwarded to ``tf.keras.Model``
    """

    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
        super().__init__(name=name)

        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
        # Three convolutions that differ only in the window size (2, 3 and 4
        # tokens). NOTE(review): with "valid" padding the input presumably
        # needs at least 4 tokens per sequence - confirm for very short texts.
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        # The same pooling layer is applied to every convolution output.
        self.pool = layers.GlobalMaxPool1D()

        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if model_output_classes == 2:
            # Binary case: one unit with sigmoid activation.
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            # Multi-class case: one softmax unit per class.
            self.last_dense = layers.Dense(units=model_output_classes,
                                           activation="softmax")

    def call(self, inputs, training=None):
        """
        Run the forward pass.

        :param inputs: batch of token id sequences fed to the embedding layer
        :param training: forwarded to the dropout layer; optional so that the
            method can also be called without it
        :return: output of the last dense layer
        """
        embedded = self.embedding(inputs)

        # Apply each convolution to the same embeddings and pool its output.
        conv_layers = (self.cnn_layer1, self.cnn_layer2, self.cnn_layer3)
        pooled = [self.pool(conv(embedded)) for conv in conv_layers]

        concatenated = tf.concat(pooled, axis=-1)  # (batch_size, 3 * cnn_filters)
        hidden = self.dense_1(concatenated)
        hidden = self.dropout(hidden, training=training)
        model_output = self.last_dense(hidden)

        return model_output

## Train model

In [None]:
# Number of entries in the tokenizer vocabulary; used as the embedding table size.
VOCAB_LENGTH = len(tokenizer.vocab)
# Size of each embedding vector.
EMB_DIM = 200
# Number of filters of each convolution.
CNN_FILTERS = 100
# Units of the hidden dense layer.
DNN_UNITS = 256
# Two classes: the sentiment labels are 0 and 1.
OUTPUT_CLASSES = 2

# Rate of the dropout layer.
DROPOUT_RATE = 0.2

# Number of training epochs.
NB_EPOCHS = 5

In [None]:
text_model = TextClassificationModel(vocabulary_size=VOCAB_LENGTH,
                        embedding_dimensions=EMB_DIM,
                        cnn_filters=CNN_FILTERS,
                        dnn_units=DNN_UNITS,
                        model_output_classes=OUTPUT_CLASSES,
                        dropout_rate=DROPOUT_RATE)

In [None]:
# Pick the loss/metric pair that matches the number of output classes,
# then compile once with the Adam optimizer.
binary_task = OUTPUT_CLASSES == 2
loss_name = "binary_crossentropy" if binary_task else "sparse_categorical_crossentropy"
metric_name = "accuracy" if binary_task else "sparse_categorical_accuracy"
text_model.compile(loss=loss_name,
                   optimizer="adam",
                   metrics=[metric_name])

In [None]:
text_model.fit(train_data, epochs=NB_EPOCHS)

## Validation using test set

In [None]:
results = text_model.evaluate(test_data)
print(f"Test evaluation results: {results}")