# TEXT SENTIMENT ANALYSIS USING BERT

Using Keras and TensorFlow 2

Applied to Quora Insincere Questions Competition Data

In [None]:
# Install the extra packages this notebook imports below: tensorflow-text
# (imported as `tensorflow_text`) and tf-models-official (imported as
# `official`). The -q flag keeps pip's output quiet.
!pip install -q tensorflow-text
!pip install -q tf-models-official

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow.keras import losses
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from official.nlp import optimization

from sklearn.metrics import classification_report

import numpy as np
import pandas as pd

import gc

In [None]:
# Load the competition training file, using its first column as the index,
# then print a summary of columns, dtypes and memory usage.
TRAIN_CSV = "/kaggle/input/quora-insincere-questions-classification/train.csv"
df = pd.read_csv(TRAIN_CSV, index_col=0)
df.info()

In [None]:
# Shuffle all rows once with a fixed seed so that the strided
# train/validation/test split built later is random yet reproducible.
SHUFFLE_SEED = 21
df = df.sample(frac=1, random_state=SHUFFLE_SEED)

In [None]:
# Class balance: raw counts per label, followed by the mean of the target
# column (presumably a 0/1 label, in which case the mean is the share of
# insincere questions).
label_counts = df['target'].value_counts()
print(label_counts)
target_mean = df['target'].mean()
print('Pct Insincere = {:.2%}'.format(target_mean))

### Question Examples by Label

In [None]:
# Keep the first 10 questions of each label (in the shuffled order). They are
# displayed here and scored again with the exported model at the end of the
# notebook.
rows_by_label = df.groupby("target")
test_examples = rows_by_label.head(10)

# Temporarily widen the column display to 400 characters so that long
# questions are shown in full.
with pd.option_context('display.max_colwidth', 400):
    display(test_examples)

# Build TensorFlow Datasets

In [None]:
def _strided_split(frame, offset, stride=3, batch_size=32):
    """Return a batched (question_text, target) dataset built from every
    `stride`-th row of `frame`, starting at row `offset`."""
    texts = frame.question_text.values[offset::stride]
    labels = frame.target.values[offset::stride]
    return tf.data.Dataset.from_tensor_slices((texts, labels)).batch(batch_size)


# Interleaved three-way split of the shuffled rows:
# rows 0, 3, 6, ... train; rows 1, 4, 7, ... validate; rows 2, 5, 8, ... test.
train_ds      = _strided_split(df, 0)
validation_ds = _strided_split(df, 1)
test_ds       = _strided_split(df, 2)

In [None]:
# Sentinel that lets tf.data pick the prefetch buffer size itself.
AUTOTUNE = tf.data.AUTOTUNE


def configure_dataset(dataset):
    """Return `dataset` with caching followed by prefetching applied.

    Args:
        dataset: a `tf.data.Dataset` (anything exposing `.cache()` and
            `.prefetch()`).

    Returns:
        The dataset wrapped with `.cache()` and then
        `.prefetch(buffer_size=AUTOTUNE)`.
    """
    cached = dataset.cache()
    return cached.prefetch(buffer_size=AUTOTUNE)

In [None]:
# Cap the training and validation sets before caching. The datasets are
# already batched, so these limits count batches rather than single examples.
# The test set is kept whole.
TRAIN_BATCHES = 6000
VALIDATION_BATCHES = 300

train_ds      = configure_dataset(train_ds.take(TRAIN_BATCHES))
validation_ds = configure_dataset(validation_ds.take(VALIDATION_BATCHES))
test_ds       = configure_dataset(test_ds)

In [None]:
# Drop the DataFrame name now that the tf.data datasets have been built from
# its values, then run a garbage-collection pass to reclaim what can be freed.
del df
gc.collect()

# Load BERT Model

In [None]:
# Stand-alone TF Hub layers: a small BERT encoder and the matching uncased
# English preprocessing model.
# NOTE(review): neither `bert_model` nor `bert_preprocess_model` is referenced
# again in the visible notebook; build_classifier_model() loads its own
# preprocessing and encoder layers (and a different encoder URL). Confirm
# whether these two loads are still needed.
SMALL_BERT_URL = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1'
BERT_PREPROCESS_URL = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

bert_model            = hub.KerasLayer(SMALL_BERT_URL)
bert_preprocess_model = hub.KerasLayer(BERT_PREPROCESS_URL)

In [None]:
def build_classifier_model(
    encoder_url='https://tfhub.dev/google/electra_small/2',
    preprocess_url='https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    dropout_rate=0.1,
):
    """Build a single-logit text classifier on top of a TF Hub encoder.

    The model takes raw strings, runs them through a TF Hub preprocessing
    layer and a trainable TF Hub encoder, applies dropout to the encoder's
    `pooled_output`, and finishes with a one-unit dense layer that has no
    activation. The output is therefore a raw logit, not a probability.

    Called with no arguments, this builds exactly the model that was
    previously hard-coded. Note that the default encoder is ELECTRA-small
    even though the encoder layer is named 'BERT_encoder'.

    Args:
        encoder_url: TF Hub handle of the encoder. Its output must be a dict
            containing a 'pooled_output' entry.
        preprocess_url: TF Hub handle of the preprocessing model that turns
            strings into the encoder's inputs; it must match the encoder's
            vocabulary and casing.
        dropout_rate: dropout rate applied to the pooled encoder output.

    Returns:
        An uncompiled `tf.keras.Model` mapping a batch of strings to one
        logit per string.
    """
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessing_layer = hub.KerasLayer(preprocess_url, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)
    # The layer name is kept as 'BERT_encoder' whichever encoder is loaded, so
    # layer and weight names in saved models stay stable.
    encoder = hub.KerasLayer(encoder_url, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)
    net = outputs['pooled_output']
    # Classification head: dropout for regularisation, then one linear unit.
    net = tf.keras.layers.Dropout(dropout_rate)(net)
    net = tf.keras.layers.Dense(1, name='classifier')(net)
    return tf.keras.Model(text_input, net)

In [None]:
# Instantiate the Keras classifier defined by build_classifier_model().
classifier_model = build_classifier_model()

In [None]:
# The classifier head emits a single raw logit (its Dense layer has no
# activation), so both the loss and the accuracy metric must work in logit
# space.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# BinaryAccuracy thresholds predictions at 0.5 by default, which is only
# right for probabilities. For logits the decision boundary is 0, matching
# the `> 0` cut-off used when predicting later in this notebook.
metrics = tf.metrics.BinaryAccuracy(threshold=0.0)

epochs = 1
# Number of batches per epoch (the training dataset is already batched).
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
# Optional cap on the epoch length for quick experiments:
#steps_per_epoch = min(steps_per_epoch, 200)

num_train_steps = steps_per_epoch * epochs
# Reserve the first 10% of the training steps for learning-rate warm-up.
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 1e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics,
                         )

In [None]:
# Fine-tune for `epochs` passes over the training batches, evaluating on the
# validation batches as part of each pass. The returned History object is
# kept in `history`.
history = classifier_model.fit(
    train_ds,
    validation_data=validation_ds,
    epochs=epochs,
)

### Evaluate and Compare Models

In [None]:
# Gather the ground-truth labels batch by batch, in the dataset's iteration
# order, and join them into one flat array.
label_batches = [labels.numpy() for _texts, labels in validation_ds]
y_true = np.concatenate(label_batches)

In [None]:
# The model outputs logits, so a positive score is read as the positive
# class (label 1) and anything else as label 0.
logits = classifier_model.predict(validation_ds)
y_pred = logits > 0
print(classification_report(y_true, y_pred))

# Export Model and Score New Texts

In [None]:
# Export the trained model to a local directory. The optimizer state is left
# out (include_optimizer=False), so the export carries only what is needed to
# run the model, not to resume training.
export_dir = "./qic_bert"
classifier_model.save(export_dir, include_optimizer=False)

In [None]:
# Reload the export with the low-level SavedModel loader. The loaded object
# is later called directly on a string tensor rather than through Keras'
# predict() API.
disk_model = tf.saved_model.load(export_dir)
print("Loaded model from disk")

In [None]:
def get_string_labels(predicted_scores_batch):
    """Map a batch of raw scores to 'sincere' / 'insincere' string labels.

    Args:
        predicted_scores_batch: an eager tensor (anything exposing
            `.numpy()`) of scores; a score above 0 is read as 'insincere'.

    Returns:
        A string tensor with the same shape as the input scores, holding
        'insincere' where the score is positive and 'sincere' elsewhere.
    """
    label_names = ['sincere', 'insincere']
    # Positive score -> index 1 ('insincere'); otherwise index 0 ('sincere').
    is_insincere = predicted_scores_batch.numpy() > 0
    return tf.gather(label_names, is_insincere.astype(int))

In [None]:
# Score the example questions with the reloaded model and print each
# predicted label next to the true one.
predicted_scores = disk_model(tf.constant(test_examples.question_text))
predicted_labels = get_string_labels(predicted_scores)
true_labels = test_examples.target
# The loop variable is `question`, not `input`: at notebook top level a
# variable named `input` would shadow the builtin for every later cell.
for question, plabel, label in zip(test_examples.question_text, predicted_labels, true_labels):
    print("Question: ", question)
    print("Predicted label: ", plabel.numpy(), 'True Label: ', 'insincere' if label else 'sincere')
    print()