# TEXT SENTIMENT ANALYSIS

Using Keras and TensorFlow 2

Applied to Quora Insincere Questions Competition Data

In [None]:
import os
# List every file available under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
import tensorflow as tf
from tensorflow.keras import losses
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

from sklearn.metrics import classification_report

import numpy as np
import pandas as pd

import gc

In [None]:
# Load the competition training data; the first CSV column becomes the index.
df = pd.read_csv(
    "/kaggle/input/quora-insincere-questions-classification/train.csv",
    index_col=0,
)
df.info()

In [None]:
# Shuffle all rows (frac=1 samples every row once); the fixed random_state
# keeps the order, and therefore the strided splits made later, reproducible.
df = df.sample(frac=1,random_state=21)

In [None]:
# Class balance: raw counts per label, then the share of insincere questions
# (the mean of the 0/1 target column).
print(df["target"].value_counts())
print(f'Pct Insincere = {df["target"].mean():.2%}')

### Question Examples by Label

In [None]:
# Keep the first 10 rows of each target class from the shuffled frame as
# sample questions to display here and to score at the end of the notebook.
# NOTE(review): these are drawn from the full frame before it is split, so
# some of them may also end up in the training dataset.
test_examples = df.groupby("target").head(10)

# Widen the column display temporarily so long questions are not truncated.
with pd.option_context('display.max_colwidth', 400):
    display(test_examples)

# Build TensorFlow Datasets

In [None]:
# Split the shuffled rows three ways by striding: rows 0, 3, 6, ... go to
# train, rows 1, 4, 7, ... to validation and rows 2, 5, 8, ... to test.
# Each split is served in batches of 32 (question, target) pairs.
train_ds, validation_ds, test_ds = (
    tf.data.Dataset.from_tensor_slices(
        (df.question_text.values[offset::3], df.target.values[offset::3])
    ).batch(32)
    for offset in range(3)
)

In [None]:
# Number of elements in train_ds; the dataset is batched, so this counts batches.
train_ds.cardinality()

In [None]:
# The datasets have been built from the frame's columns, so drop the frame
# and ask the garbage collector to reclaim memory.
del df
gc.collect()

In [None]:
# Peek at one raw batch: the question strings, then their targets.
for q, t in train_ds.take(1):
    print(q, t, sep="\n")

In [None]:
# Print the first 10 (question, label) pairs of the first training batch.
# Indexing up to 9 assumes the batch holds at least 10 examples.
for text_batch, label_batch in train_ds.take(1):
    for i in range(10):
        print("Question: ", text_batch[i])
        print("Label:", label_batch[i])

# Tokenize

In [None]:
# Vocabulary cap passed as max_tokens to both vectorization layers.
VOCAB_SIZE = 5000

# Vectorizer for the bag-of-words model ('binary' output mode).
# NOTE(review): presumably one 0/1 slot per vocabulary token; confirm
# against the TextVectorization documentation for the installed TF version.
binary_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='binary')

In [None]:
# Fixed output length requested from the integer vectorizer.
MAX_SEQUENCE_LENGTH = 75

# Vectorizer for the sequence model: 'int' output mode with a fixed
# output_sequence_length, sharing the same vocabulary cap as the binary one.
int_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH)

In [None]:
def binary_vectorize_text(text, label):
    """Vectorize `text` with the binary vectorizer, passing `label` through.

    A trailing axis is appended to `text` before it is handed to
    `binary_vectorize_layer`. Returns a `(vectorized_text, label)` pair,
    which makes the function usable with `Dataset.map`.
    """
    expanded = tf.expand_dims(text, -1)
    vectorized = binary_vectorize_layer(expanded)
    return vectorized, label

In [None]:
def int_vectorize_text(text, label):
    """Vectorize `text` with the integer vectorizer, passing `label` through.

    A trailing axis is appended to `text` before it is handed to
    `int_vectorize_layer`. Returns a `(vectorized_text, label)` pair,
    which makes the function usable with `Dataset.map`.
    """
    expanded = tf.expand_dims(text, -1)
    vectorized = int_vectorize_layer(expanded)
    return vectorized, label

In [None]:
# Pull one batch (32 questions with their labels) from the training set and
# keep its first example around for the vectorizer demos below.
text_batch, label_batch = next(iter(train_ds))
first_question = text_batch[0]
first_label = label_batch[0]
print("Question", first_question)
print("Label", first_label)

In [None]:
# Build a text-only view of the training set (labels dropped), then let each
# vectorizer build its vocabulary from the training questions.
train_text = train_ds.map(lambda question, _label: question)
binary_vectorize_layer.adapt(train_text)
int_vectorize_layer.adapt(train_text)

In [None]:
# Both vectorizers are adapted; the text-only dataset is no longer needed.
del train_text
gc.collect()

In [None]:
# Sanity check: run one hand-written question through the binary vectorizer
# and display the result as the cell output.
foo = binary_vectorize_layer(tf.expand_dims('How did Quebec nationalists see their province as a nation in the 1960s?', -1))
foo

In [None]:
# First 30 entries of the binary vectorizer's vocabulary.
print(binary_vectorize_layer.get_vocabulary()[:30])

In [None]:
# Last 30 entries of the binary vectorizer's vocabulary.
print(binary_vectorize_layer.get_vocabulary()[-30:])

In [None]:
# Same hand-written question through the integer vectorizer; the resulting
# tensor is displayed as the cell output.
txt = tf.expand_dims('How did Quebec nationalists see their province as a nation in the 1960s?', -1)
int_vectorize_layer(txt)

In [None]:
# First 30 entries of the integer vectorizer's vocabulary.
print(int_vectorize_layer.get_vocabulary()[:30])

In [None]:
# Last 30 entries of the integer vectorizer's vocabulary.
print(int_vectorize_layer.get_vocabulary()[-30:])

In [None]:
# Show the first training question as encoded by the binary vectorizer
# ([0] selects the vectorized text from the (text, label) pair).
print("'binary' vectorized question:", 
      binary_vectorize_text(first_question, first_label)[0])

In [None]:
# Show the first training question as encoded by the integer vectorizer
# ([0] selects the vectorized text from the (text, label) pair). The printed
# tag names the vectorizer actually used, so the two demo outputs can be
# told apart.
print("'int' vectorized question:",
      int_vectorize_text(first_question, first_label)[0])

In [None]:
# Upper bounds on how many batches of each split are used. Naming them keeps
# the binary and int pipelines in sync when the budget is changed.
TRAIN_BATCHES = 15000
EVAL_BATCHES = 1000

# Bag-of-words (binary) view of each split.
binary_train_ds = train_ds.take(TRAIN_BATCHES).map(binary_vectorize_text)
binary_valid_ds = validation_ds.take(EVAL_BATCHES).map(binary_vectorize_text)
binary_test_ds  = test_ds.take(EVAL_BATCHES).map(binary_vectorize_text)

# Integer-sequence view of the same batches.
int_train_ds    = train_ds.take(TRAIN_BATCHES).map(int_vectorize_text)
int_valid_ds    = validation_ds.take(EVAL_BATCHES).map(int_vectorize_text)
int_test_ds     = test_ds.take(EVAL_BATCHES).map(int_vectorize_text)

In [None]:
# Let tf.data choose the prefetch buffer size.
AUTOTUNE = tf.data.AUTOTUNE


def configure_dataset(dataset):
    """Return `dataset` with caching and autotuned prefetching applied."""
    cached = dataset.cache()
    return cached.prefetch(buffer_size=AUTOTUNE)

In [None]:
# Apply cache + prefetch to every vectorized split, rebinding each name.
binary_train_ds, binary_valid_ds, binary_test_ds = map(
    configure_dataset, (binary_train_ds, binary_valid_ds, binary_test_ds)
)

int_train_ds, int_valid_ds, int_test_ds = map(
    configure_dataset, (int_train_ds, int_valid_ds, int_test_ds)
)

# Build Sentiment Analysis Models

## Binary Tokenizer (Bag of Words Model)

In [None]:
# Bag-of-words classifier: one hidden ReLU layer, heavy dropout, then two
# output units (one per class) with no activation.
binary_model = tf.keras.Sequential(
    [
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.75),
        tf.keras.layers.Dense(2),
    ],
    name='binary_model',
)

# The output layer emits raw logits, hence from_logits=True in the loss.
binary_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-4),
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

# One pass over the training batches.
# Alternative schedule kept for reference: epochs=10, steps_per_epoch=1500.
history = binary_model.fit(
    binary_train_ds,
    validation_data=binary_valid_ds,
    epochs=1,
)

In [None]:
# Show the per-epoch training/validation metrics as a colour-graded table.
pd.DataFrame(history.history).style.background_gradient()

In [None]:
print("Linear model on binary vectorized data:")
# summary() prints the layer table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line after the table).
binary_model.summary()

## Convolution Net on Integer Tokenizer

In [None]:
def create_model(vocab_size, num_labels):
    """Build the (uncompiled) 1-D convolutional text classifier.

    Args:
        vocab_size: Number of rows in the embedding table (input_dim).
        num_labels: Number of output units; the final Dense layer has no
            activation, so the model returns raw logits.

    Returns:
        A `tf.keras.Sequential` of Embedding -> Conv1D -> GlobalMaxPooling1D
        -> Dense.
    """
    layers = tf.keras.layers
    stack = [
        # 64-dimensional token embeddings; id 0 is treated as padding.
        layers.Embedding(vocab_size, 64, mask_zero=True),
        # 64 filters over windows of 5 tokens, moving 2 tokens at a time.
        layers.Conv1D(64, 5, padding="valid", activation="relu", strides=2),
        # Keep the strongest response of each filter across the sequence.
        layers.GlobalMaxPooling1D(),
        layers.Dense(num_labels),
    ]
    return tf.keras.Sequential(stack)

In [None]:
# vocab_size is VOCAB_SIZE + 1 since 0 is used additionally for padding.
# NOTE(review): the vectorizer's max_tokens may already count the padding
# token; the extra embedding row is harmless either way - confirm against
# the TextVectorization documentation.
int_model = create_model(vocab_size=VOCAB_SIZE + 1, num_labels=2)
# The model outputs raw logits (final Dense has no activation), hence
# from_logits=True in the loss.
int_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'])
# One pass over the integer-vectorized training batches.
history = int_model.fit(int_train_ds, validation_data=int_valid_ds, epochs=1)

In [None]:
print("ConvNet model on int vectorized data:")
# summary() prints the layer table itself and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line after the table).
int_model.summary()

### Evaluate and Compare Models

In [None]:
# Score both models on their respective views of the held-out test batches.
binary_loss, binary_accuracy = binary_model.evaluate(binary_test_ds)
int_loss, int_accuracy = int_model.evaluate(int_test_ds)

print(f"Binary model accuracy: {binary_accuracy:2.2%}")
print(f"Int model accuracy: {int_accuracy:2.2%}")

In [None]:
# Gather the true labels of every test batch into one flat array. The binary
# and int test sets are both mapped from the same test_ds batches, so these
# labels are used to score either model's predictions.
y_true = np.concatenate(
    [labels.numpy() for _features, labels in binary_test_ds]
)

In [None]:
# Predicted class = index of the larger logit; report per-class metrics for
# the bag-of-words model.
y_pred = np.argmax(binary_model.predict(binary_test_ds), axis=1)
print(classification_report(y_true, y_pred))

In [None]:
# Predicted class = index of the larger logit; report per-class metrics for
# the ConvNet model.
y_pred = np.argmax(int_model.predict(int_test_ds), axis=1)
print(classification_report(y_true, y_pred))

In [None]:
# Bundle the integer vectorizer with the trained ConvNet so the exported
# model accepts raw question strings. int_model emits two logits per
# question, so softmax (rather than an element-wise sigmoid) is what turns
# them into class probabilities that sum to 1 and match the from_logits=True
# loss the model was trained with. The argmax, and therefore the accuracy
# and predicted labels, is the same under either activation.
export_model = tf.keras.Sequential(
    [int_vectorize_layer, int_model,
     tf.keras.layers.Activation('softmax')])

# The model now outputs probabilities, hence from_logits=False.
export_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer='adam',
    metrics=['accuracy'])

# Sanity check on raw (un-vectorized) test batches, reporting the accuracy
# measured for export_model itself.
loss, accuracy = export_model.evaluate(test_ds.take(100))
print("Accuracy: {:2.2%}".format(accuracy))

# Export Model and Score New Texts

In [None]:
def get_string_labels(predicted_scores_batch):
    """Map a batch of per-class scores to 'sincere' / 'insincere' labels.

    The highest-scoring column of each row (argmax over axis 1) selects the
    label: column 0 is 'sincere', column 1 is 'insincere'. Returns a string
    tensor with one label per row.
    """
    class_names = ['sincere', 'insincere']
    best_class_ids = tf.argmax(predicted_scores_batch, axis=1)
    return tf.gather(class_names, best_class_ids)

In [None]:
# Score the sample questions with the end-to-end model and print each
# prediction next to its true label.
predicted_scores = export_model.predict(test_examples.question_text)
predicted_labels = get_string_labels(predicted_scores)
true_labels = test_examples.target
# The loop variable is named `question` rather than `input`: loop variables
# persist in the notebook's global namespace, where `input` would shadow the
# builtin for every later cell.
for question, plabel, label in zip(test_examples.question_text, predicted_labels, true_labels):
    print("Question: ", question)
    print("Predicted label: ", plabel.numpy(), 'True Label: ', 'insincere' if label else 'sincere')
    print()