In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.colors import Normalize, rgb2hex
from IPython.display import HTML


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Enumerate every file that is mounted under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Show the installed TensorFlow version for reproducibility.
tf.__version__

# Load Data

In [None]:
# Load the data
# index_col=0 uses the first CSV column as the DataFrame index.
data = pd.read_csv("../input/womens-ecommerce-clothing-reviews/Womens Clothing E-Commerce Reviews.csv" , index_col=0)
# Preview the first rows.
data.head()

In [None]:
# (rows, columns) of the raw frame, before any filtering.
data.shape

In [None]:
# Keep only the rows that actually carry a review text
has_review_text = data['Review Text'].notnull()
data = data[has_review_text]
data.shape

In [None]:
# check if labels are missing
# (count of null entries in the 'Recommended IND' target column)
data['Recommended IND'].isnull().sum()

In [None]:
# Summary statistics of the number of whitespace-separated tokens per review
data['Review Text'].str.split().str.len().describe()

In [None]:
# check label distribution
# (number of rows per distinct value of 'Recommended IND')
data['Recommended IND'].value_counts()

In [None]:
# one hot encode y label (will be used in the interpretability section)
labels = tf.keras.utils.to_categorical(data['Recommended IND'])
# number of one-hot columns; reused as the width of the model's final Dense layer
output_shape = labels.shape[1]
# display both for a quick sanity check
labels, output_shape

In [None]:
# Model inputs: the raw review strings as an array.
X = data['Review Text'].values
# Peek at the first three reviews.
X[:3]

# Start Building

In [None]:
# split into train, test, val
# First hold out 10% of all rows as the test set ...
x, X_test, y, y_test = train_test_split(X, labels, test_size=0.1, random_state=53)
# ... then hold out 10% of the remaining rows as the validation set.
# random_state is fixed so both splits are repeatable.
X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.1, random_state=53)

In [None]:
# Report (inputs shape, labels shape) for the train, validation and test splits, in that order.
for features, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test)):
    print(features.shape, targets.shape)

In [None]:
# set hyper parameters
v_size = 2000  # vocabulary cap: TextVectorization max_tokens and Embedding input_dim
max_len = 100  # roughly 90th percentile; token sequences are produced at this length
e_dim = 64  # embedding output dimension
batch_size = 256  # batch size used for the tf.data pipelines

In [None]:
# create a tf TextVectorization layer
# max_tokens caps the vocabulary at v_size; output_sequence_length fixes
# every tokenized review to max_len ids.
pre_processing_layer = TextVectorization(max_tokens=v_size, 
                                         output_sequence_length=max_len, 
                                         name='Notes_preprocessing_layer')

In [None]:
# fit on training vocab
# (only the training reviews are used, so validation/test text does not shape the vocabulary)
pre_processing_layer.adapt(X_train)
# get the vocab
# (list position == token id; used later to map ids back to words)
vocab = pre_processing_layer.get_vocabulary()

In [None]:
# create a simple bi-directional rnn model
# reset Keras global state so re-running this cell starts from a clean graph
tf.keras.backend.clear_session()
# Embedding -> BiLSTM -> Dense(relu) -> softmax over the one-hot classes.
# The embedding layer is named so it can be looked up again with get_layer('embedding').
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=v_size,
                              output_dim=e_dim, 
                             name='embedding', 
                             mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(output_shape, activation='softmax')
])

# metric list handed to model.compile in the next cell
metrics = [tf.keras.metrics.CategoricalAccuracy()]

model.summary()

In [None]:
# compile the model
# from_logits=False because the last layer already applies softmax
model.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
              metrics=metrics)
print("Ready to Train")

In [None]:
# convert inputs to tf Datasets
# each element is a (raw review string, one-hot label) pair
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
valid_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))

# during distributed training, TF can shard on files or data, it defaults to
# files and throws a warning that it is switching to data.
# suppress that warning by adding options
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = \
    tf.data.experimental.AutoShardPolicy.DATA

# training data: shuffle with a buffer as large as the training set, then batch
raw_train_ds = \
    train_dataset.shuffle(X_train.shape[0]).batch(batch_size).with_options(options)
# validation / test data: batched in their original order (no shuffle)
raw_val_ds = valid_dataset.batch(batch_size).with_options(options)
raw_test_ds = test_dataset.batch(batch_size).with_options(options)

In [None]:
# vectorize the text inputs

@tf.autograph.experimental.do_not_convert
def vectorize_text(text, label):
    """Tokenize raw review text and pass the label through unchanged.

    A trailing axis is appended to ``text`` before it is handed to
    ``pre_processing_layer``.

    Returns:
        A ``(token_ids, label)`` tuple.
    """
    text_column = tf.expand_dims(text, axis=-1)
    token_ids = pre_processing_layer(text_column)
    return token_ids, label


# print an example
# draw one batch from the raw training pipeline (batches themselves are shuffled again here)
text_batch, label_batch = next(iter(raw_train_ds.shuffle(50)))
# first review of that batch with its one-hot label
first_review, first_label = text_batch[0], label_batch[0]
print("Review: ", first_review)
# argmax turns the one-hot label back into a class index
print("Label: ", tf.argmax(first_label))
print("Vectorized review", vectorize_text(first_review, first_label))


In [None]:
# Tokenize every split, then cache the tokenized batches and prefetch them.
AUTOTUNE = tf.data.AUTOTUNE

train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
test_ds = raw_test_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)


In [None]:
# train the model (3 epochs for quick implementation)
# validation metrics are computed on val_ds after every epoch
model.fit(train_ds,
          validation_data=val_ds,
          epochs=3, verbose=1)

In [None]:
# Loss and the compiled metric (categorical accuracy) on the held-out test set.
model.evaluate(test_ds)

In [None]:
## get predictions
# per-class softmax outputs for every test review
test_probs = model.predict(test_ds)
# predicted class index = column with the highest probability
test_preds = tf.argmax(test_probs, axis=1)
# true class index recovered from the one-hot test labels
y_test_flat = tf.argmax(y_test, axis=1)

In [None]:
# build confusion matrix
# (first argument = true labels -> rows, second = predictions -> columns)
cm = tf.math.confusion_matrix(y_test_flat, test_preds)

# plot confusion matrix, labelling the axes so true vs. predicted is unambiguous
ax = sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", xticklabels=[0, 1],
                 yticklabels=[0, 1], cbar=False)
ax.set(xlabel="Predicted label", ylabel="True label",
       title="Test-set confusion matrix")
plt.show()

# Interpretability

### INTEGRATED GRADIENTS for understanding feature importance
Refer to https://arxiv.org/pdf/1703.01365.pdf for all the details


##### VERY IMPORTANT: in TensorFlow, gradients don't pass through the Embedding layer (its inputs are integer token ids), so we take the embedding layer out and build the rest of the model as `new_model`

In [None]:
# extract embedding layer
embed_layer = model.get_layer('embedding')

# build new model with all layers after embedding layer
# The very same layer objects are added, so `new_model` shares the trained weights of `model`.
# NOTE(review): the embedding was created with mask_zero=True; feeding embeddings
# straight into `new_model` presumably bypasses that padding mask, so its outputs
# may differ slightly from `model` on padded sequences -- confirm.
new_model = tf.keras.Sequential()
for layer in model.layers[1:]:
    new_model.add(layer)

In [None]:
# take some test data
# Unpack the raw strings and their one-hot labels from the SAME first test
# batch, so texts and labels are aligned by construction rather than by
# relying on two separately built pipelines yielding the same order.
sample_texts, sample_labels = next(raw_test_ds.take(1).as_numpy_iterator())
# tokenize the raw strings with the fitted vectorization layer
sample_vectors = pre_processing_layer(sample_texts)

Re-run the cells from this point onwards to explain a different random example.

In [None]:
# select a random index
# Draw an integer uniformly from [0, number of rows in the sample batch).
# Integer sampling needs explicit minval/maxval; int32 avoids the overflow an
# 8-bit cast would cause for batches larger than 127 rows.
index = tf.random.uniform(shape=[],
                          minval=0,
                          maxval=sample_vectors.shape[0],
                          dtype=tf.int32).numpy()
# pick that review: raw text, token ids and true class index
sample_text = sample_texts[index]
sample_vector = sample_vectors[index]
sample_label = tf.argmax(sample_labels, axis=1)[index]
# get embeddings
sample_embed = embed_layer(sample_vector)
# Create a Baseline vector with zero embeddings
baseline_embed = tf.zeros(shape=tf.shape(sample_embed))
# get preds for sample
sample_preds = model(sample_vectors)[index]
# map token ids back to vocabulary words (used for the colour-coded display)
words = [vocab[i] for i in sample_vector]

In [None]:
def interpolate_texts(baseline, text, m_steps):
    """Build the straight-line path from ``baseline`` to ``text``.

    Both arguments are embedding-layer outputs. ``m_steps`` intervals are
    used, so ``m_steps + 1`` interpolated copies are returned, stacked along a
    new leading axis (first copy == baseline, last copy == text).
    """
    # m_steps intervals -> m_steps + 1 evenly spaced coefficients in [0, 1]
    step_weights = tf.linspace(start=0.0, stop=1.0, num=m_steps+1)
    # two trailing axes let every coefficient broadcast over a whole embedded sequence
    step_weights = step_weights[:, tf.newaxis, tf.newaxis]
    return baseline + step_weights * (text - baseline)

In [None]:
# number of intervals along the baseline -> sample path
n_steps = 50
# n_steps + 1 interpolated embeddings between the zero baseline and the sample
interpolated_texts = interpolate_texts(baseline_embed,
                                       sample_embed,
                                       n_steps)

In [None]:
# sanity check of the interpolation output
interpolated_texts.shape
# (num_interpolations, seq_len, embed_dim)

In [None]:
def compute_gradients(t, target_class_idx):
    """Gradient of the target-class output of ``new_model`` w.r.t. ``t``.

    ``t`` holds embedding-layer outputs (one per leading-axis entry); the
    returned gradients have the same shape as ``t``.
    """
    with tf.GradientTape() as grad_tape:
        # t is a plain tensor, not a variable, so it has to be watched explicitly
        grad_tape.watch(t)
        class_probs = new_model(t)[:, target_class_idx]
    return grad_tape.gradient(class_probs, t)

In [None]:
# explain the true class of the sample by default
target_label = sample_label
# To see attributions for another class, set target_label to that class index
# instead (e.g. `target_label = 0`); this is why the y label was one-hot encoded.
# gradients of the target-class probability at every interpolation step
path_gradients = compute_gradients(interpolated_texts, target_label)

In [None]:
# gradients have the same shape as the interpolated inputs
path_gradients.shape
# (num_interpolations, seq_len, embed_dim)

In [None]:
# average the gradients along the path with the trapezoidal rule:
# path_gradients holds n_steps + 1 points, i.e. n_steps intervals, and each
# interval contributes the mean of its two end-point gradients
all_grads = tf.reduce_mean((path_gradients[:-1] + path_gradients[1:]) / 2.0,
                           axis=0)
# multiply grads by (input - baseline)
x_grads = tf.math.multiply(all_grads, sample_embed - baseline_embed)
# sum all gradients across the embedding dimension -> one attribution per token
igs = tf.reduce_sum(x_grads, axis=-1).numpy()

In [None]:
# igs

In [None]:
# Helper functions to color the feature importances

def hlstr(string, color='white'):
    """
    Return HTML markup highlighting text with the desired color.

    Args:
        string: the word to display; it is HTML-escaped so characters such as
            ``<``, ``>`` and ``&`` are shown literally instead of being
            interpreted as markup.
        color: CSS color used as the background of the ``<mark>`` element.

    Returns:
        A ``<mark>`` element (as a str) containing the word followed by a space.
    """
    import html  # local import keeps this helper self-contained

    safe_text = html.escape(str(string), quote=False)
    return f"<mark style=background-color:{color}>{safe_text} </mark>"


def colorize(attrs, cmap='PiYG'):
    """
    Compute hex colors based on the attributions for a single instance.
    Uses a diverging colorscale by default and normalizes and scales
    the colormap so that colors are consistent with the attributions.

    Args:
        attrs: attribution scores for one instance (array-like or tensor);
            flattened to one score per token.
        cmap: name of the Matplotlib colormap to use.

    Returns:
        A list of hex color strings, one per attribution score.
    """
    values = np.asarray(attrs, dtype=float).reshape(-1)
    if values.size == 0:
        return []

    # symmetric range around zero so that 0 maps to the middle of the colormap
    bound = float(np.max(np.abs(values)))
    if bound == 0.0:
        # all attributions are zero: any positive bound keeps them at the neutral midpoint
        bound = 1.0
    norm = Normalize(vmin=-bound, vmax=bound)
    # plt.get_cmap is used because matplotlib.cm.get_cmap is deprecated/removed in newer releases
    colormap = plt.get_cmap(cmap)

    # now compute hex values of colors
    return [rgb2hex(colormap(norm(value))) for value in values]

In [None]:
# one hex colour per token attribution
colors = colorize(igs)
# display names for the classes, indexed by class id
_LABEL_NAMES = [0, 1]

In [None]:
# print the sample and predictions
# texts pulled through as_numpy_iterator() are bytes; decode them for a clean printout
display_text = (sample_text.decode("utf-8")
                if isinstance(sample_text, bytes) else sample_text)
print(f"Sample Text: {display_text}\n")
print("Predictions : ")
# classes from most to least probable; a dedicated loop variable is used so
# the sampled `index` is not overwritten
for class_idx in tf.argsort(sample_preds,
                            axis=-1, direction='DESCENDING').numpy():
    print(f"\t{_LABEL_NAMES[class_idx]} --> {sample_preds[class_idx]*100:0.2f}%")

print(f"\nTrue Label: {_LABEL_NAMES[sample_label]}")
print(f"\nAttributions for Label: {_LABEL_NAMES[target_label]}")
# the five tokens with the largest (most positive) attribution
print(f"\nTop 5 Important words: "
      f"{[words[i] for i in tf.argsort(igs, -1, 'DESCENDING')[:5]]}\n")
print("\nGreen is high importance/attribution whereas pink is negative importance/attribution\n")
# colour-coded review, rendered as the cell output
HTML("".join(list(map(hlstr, words, colors))))


In [None]:
# build confusion matrix
# (first argument = true labels -> rows, second = predictions -> columns)
cm = tf.math.confusion_matrix(y_test_flat, test_preds)

# plot confusion matrix, labelling the axes so true vs. predicted is unambiguous
ax = sns.heatmap(cm, annot=True, fmt='d', cmap="Blues", xticklabels=[0, 1],
                 yticklabels=[0, 1], cbar=False)
ax.set(xlabel="Predicted label", ylabel="True label",
       title="Test-set confusion matrix")
plt.show()

#### Let's look at some examples where predictions were wrong

In [None]:
# test_preds, y_test_flat

In [None]:
# positions in the test set where the model predicted 1 but the true label is 0
false_positive_indices = tf.where((test_preds == 1) & (y_test_flat == 0))
# positions in the test set where the model predicted 0 but the true label is 1
false_negative_indices = tf.where((test_preds == 0) & (y_test_flat == 1))

In [None]:
def get_igs(sample_embed, target_label, n_steps=50):
    """Integrated-gradients attributions for one embedded sample.

    Args:
        sample_embed: embedding-layer output of the sample to explain.
        target_label: index of the class whose probability is attributed.
        n_steps: number of intervals along the zero-baseline -> sample path
            (``n_steps + 1`` interpolation points are evaluated).

    Returns:
        A flat tensor with one attribution score per token position.

    Raises:
        ValueError: if ``n_steps`` is smaller than 1.
    """
    if n_steps < 1:
        raise ValueError("n_steps must be at least 1")

    # all-zero embedding used as the reference input
    baseline_embed = tf.zeros(shape=tf.shape(sample_embed))
    interpolated_texts = interpolate_texts(baseline_embed, sample_embed, n_steps)
    path_gradients = compute_gradients(interpolated_texts, target_label)
    # trapezoidal rule: each of the n_steps intervals contributes the mean of
    # the gradients at its two end points
    avg_grads = tf.reduce_mean((path_gradients[:-1] + path_gradients[1:]) / 2.0,
                               axis=0)
    # multiply grads by (input - baseline)
    x_grads = tf.math.multiply(avg_grads, sample_embed - baseline_embed)
    # sum all gradients across the embedding dimension, then flatten
    igs = tf.reduce_sum(x_grads, axis=-1)

    return tf.reshape(igs, -1)

In [None]:
# pick one of the false negatives at random
n_false_negatives = false_negative_indices.shape[0]
if n_false_negatives == 0:
    raise ValueError("No false negatives in the test predictions to sample from.")
# Integer draw from [0, n_false_negatives); int32 avoids the overflow an 8-bit
# cast would cause once there are more than 127 candidates.
index = tf.random.uniform(shape=[],
                          minval=0,
                          maxval=n_false_negatives,
                          dtype=tf.int32).numpy()
# position of that review inside the test set
sample_index = false_negative_indices[index].numpy()[0]
sample_index

In [None]:
# raw text and true class index of the selected misclassified review
sample_text = X_test[sample_index]
sample_label = y_test_flat[sample_index].numpy()
# tokenize as a batch of one, then embed
sample_vector = pre_processing_layer([sample_text])
sample_embed = embed_layer(sample_vector)
# class probabilities from the post-embedding model, flattened to 1-D
# NOTE(review): these come from `new_model`, while the sample was selected using
# predictions of `model`; presumably the two can disagree on padded inputs -- confirm.
sample_preds = tf.reshape(new_model(sample_embed), -1)
# map token ids back to vocabulary words for display
words = [vocab[i] for i in sample_vector[0]]
predicted_label = tf.argmax(sample_preds).numpy()
# explain the class the model actually predicted ...
target_label = predicted_label
# ... or switch to the true class instead:
# target_label = sample_label
igs = get_igs(sample_embed, target_label)
colors = colorize(igs)

In [None]:
# print the sample and predictions
print(f"Sample Text: {sample_text}\n")
print("Predictions : ")
# classes from most to least probable; a dedicated loop variable is used so
# the sampled `index` is not overwritten (sample_preds is already 1-D here)
for class_idx in tf.argsort(sample_preds,
                            axis=-1, direction='DESCENDING').numpy():
    print(f"\t{_LABEL_NAMES[class_idx]} --> {sample_preds[class_idx]*100:0.2f}%")

print(f"\nTrue Label: {_LABEL_NAMES[sample_label]}")
print(f"\nAttributions for Label: {_LABEL_NAMES[target_label]}")
# the five tokens with the largest (most positive) attribution
print(f"\nTop 5 Important words: "
      f"{[words[i] for i in tf.argsort(igs, -1, 'DESCENDING')[:5]]}\n")
print("\nGreen is high importance/attribution whereas pink is negative importance/attribution\n")
# colour-coded review, rendered as the cell output
HTML("".join(list(map(hlstr, words, colors))))

In [None]:
# Observations from individual test examples (numbers are test-set indices):
# 270 -> remove symbols
# 504: the words `unraveling`, `fraying`, `chic`, `downhill` and `scrumptious` are not in the vocab
# missed word: `pesky`
# the first word of the sentence is "this"; it is highlighted green for label 1

In [None]:
def tf_lower_and_split_punct(text):
    """Lower-case ``text`` and blank out every unsupported character.

    Any character other than a space, a-z, 0-9 or an apostrophe is replaced
    by a single space; leading and trailing whitespace is then stripped.
    """
    lowered = tf.strings.lower(text)
    cleaned = tf.strings.regex_replace(lowered, '[^ a-z0-9\']', ' ')
    return tf.strings.strip(cleaned)

In [None]:
# [vocab[i] for i in pre_processing_layer([tf_lower_and_split_punct('''If you know one.september and you know what's good for you, then size the heck down. maybe even two sizes if your particularly slim or want the dress to fit tighter. any way, this dress is simply scrumptious! the embroidery at the bodice is so detailed and pretty. the layers are light and soft like silk (but it's rayon/poly) and the sleeves are sheer and voluminous. i actually prefer the sleeves rolled up because 
# they are too long. neckline features a slit-v with two hooks to close if wanted. co''')])[0]]

In [None]:
# t = TextVectorization(output_sequence_length=max_len, 
#                                          name='t')

In [None]:
# t.adapt(X_train)

In [None]:
# tocab = t.get_vocabulary()

In [None]:
# tocab.index('unraveling')