In [None]:
import numpy as np 
import pandas as pd 
import os
from sklearn.utils import shuffle
import string
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Activation, Dense, Embedding, GlobalAveragePooling1D

# Print the full path of every file found under the Kaggle input directory
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
tf.__version__ # display the TensorFlow version available in this kernel

In [None]:
import matplotlib.pyplot as plt

def plot_graphs(history, metric):
    """Plot a training metric and its validation counterpart per epoch.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch value sequences (e.g. the return value of ``fit``).
        metric: name of the training metric; the validation series is read
            from the key ``'val_' + metric``.
    """
    curves = history.history
    plt.plot(curves[metric])
    validation_key = 'val_' + metric
    plt.plot(curves[validation_key], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, validation_key])

# Load the data, Womens Clothing E-Commerce Reviews.csv, into memory.

In [None]:
# Location of the reviews CSV, relative to the notebook's working directory
data_path = '../input/womens-ecommerce-clothing-reviews/Womens Clothing E-Commerce Reviews.csv'
# Read the whole CSV into a DataFrame
df = pd.read_csv(data_path)

In [None]:
# Preview the first rows of the freshly loaded data
df.head()

# Concatenate the Title, Review Text, Division Name, Department Name and Class Name as a new feature of Reviews.

In [None]:
# Keep only the text features and the label. Selecting columns by name makes a
# separate positional, in-place drop of the first column unnecessary.
df = df[['Title', 'Review Text', 'Division Name', 'Department Name', 'Class Name', 'Recommended IND']]

# The label must not contain missing values: the fillna('') below would
# otherwise silently turn a missing label into an empty string.
assert not df['Recommended IND'].isnull().values.any(), 'Recommended IND has missing values'

# fill the nan in features with ''(empty string)
df = df.fillna('')

# concatenate
df['Reviews'] = df['Title'] + ' ' + df['Review Text'] + ' ' + df['Division Name'] + ' ' + df['Department Name'] + ' ' + df['Class Name']

# remove the title review text, division name, department name and class name columns
df = df[['Reviews', 'Recommended IND']]

# shuffle the data frame
df = shuffle(df, random_state=2021)

# Remove punctuation and lower-case everything.
# str.translate deletes every character of string.punctuation literally, so the
# result does not depend on how Series.str.replace interprets its pattern (the
# default of its `regex` argument differs between pandas versions, and the
# unescaped character class used before never matched a backslash).
punctuation_table = str.maketrans('', '', string.punctuation)
df['Reviews'] = df['Reviews'].str.translate(punctuation_table).str.lower()

In [None]:
# Preview the combined, cleaned reviews next to their labels
df.head()

In [None]:
# Tokenize each review on whitespace; every row becomes a list of words
total_words = df['Reviews'].str.split()

total_words.head()

In [None]:
# Collect the distinct tokens across all reviews. A set comprehension avoids
# calling Series.apply purely for its side effect, which also echoed a Series
# of None values as the cell output.
total_word_set = {word for words in total_words for word in words}

# Number of distinct tokens in the corpus
len(total_word_set)

In [None]:
# total_word_set

In [None]:
# word frequency distribution
from collections import Counter

# Count every token occurrence across all reviews. Counting the de-duplicated
# word set instead would give each word a frequency of exactly 1.
count_dict = Counter(word for words in total_words for word in words)

# Vocabulary size = number of distinct tokens (one Counter key per unique word)
VOCAB_SIZE = len(count_dict)

In [None]:
# vectorize text
# sequence_length = 100

# Text-vectorization layer capped at VOCAB_SIZE tokens.
# NOTE(review): `encoder` and `VOCAB_SIZE` are both reassigned in the
# "Create the text encoder" cell further down, before any adapt() call, so this
# instance appears to go unused — confirm before relying on it.
encoder = TextVectorization(max_tokens = VOCAB_SIZE)

## Convert the df(data frame) to a tf dataset

In [None]:
# Build a tf.data pipeline of (review text, label) pairs from the DataFrame columns
review_tensor = tf.cast(df['Reviews'].values, tf.string)
label_tensor = tf.cast(df['Recommended IND'].values, tf.int32)
dataset = tf.data.Dataset.from_tensor_slices((review_tensor, label_tensor))

In [None]:
# Inspect the dtype/shape specification of one (text, label) element
dataset.element_spec

In [None]:
# Show the first (text, label) pair stored in the dataset
for sample_text, sample_label in dataset.take(1):
    print('text: ', sample_text.numpy())
    print('label: ', sample_label.numpy())

## Train Test Split

In [None]:
# The first 70% of the elements form the training split, the remainder the test split
TRAIN_SIZE = int(len(dataset)*0.7)

train_dataset, test_dataset = dataset.take(TRAIN_SIZE), dataset.skip(TRAIN_SIZE)

In [None]:
# print out a training example and a test example

def _print_first_example(header, rule_width, split):
    """Print a header line followed by the first (text, label) pair of `split`."""
    print(header, '=' * rule_width)
    for sentence, label in split.take(1):
        print('text: ', sentence.numpy())
        print('label: ', label.numpy())


_print_first_example('========== TRAINING EXAMPLE', 50, train_dataset)
print()
_print_first_example('========== TEST EXAMPLE', 54, test_dataset)
    

In [None]:
# Tune the training and test dataset pipelines: cache the elements, group them
# into batches of BATCH_SIZE and prefetch with an automatically chosen buffer.
# NOTE(review): no .shuffle() is applied here, so batches are served in the
# same order every epoch — confirm this is intended.

    # previous buffer_size hyperparameter BUFFER_SIZE = 10000

AUTOTUNE = tf.data.experimental.AUTOTUNE
BATCH_SIZE = 16

train_dataset = train_dataset.cache().batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)
test_dataset = test_dataset.cache().batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)

In [None]:
# Since BATCH_SIZE is 16, taking 1 element yields one batch (16 observations/rows).
# Show only the first three features and labels of that batch.
for example, label in train_dataset.take(1):
  print('texts: ', example.numpy()[:3])
  print()
  print('labels: ', label.numpy()[:3])

# Build a baseline model of Word embeddings to forecast the Recommended IND based on Reviews using deep learning.

## Create the text encoder

The raw text needs to be processed before it can be used in the model. I use the ***experimental.preprocessing.TextVectorization*** layer. 

In [None]:
import tensorflow as tf

# Cap the vocabulary at 1000 tokens; this replaces the VOCAB_SIZE value that was
# derived from the corpus word count in an earlier cell.
VOCAB_SIZE = 1000
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)
# Only the text features (not the labels) are fed to adapt(), and only from the
# training split, so the vocabulary is learned from training reviews alone.
encoder.adapt(train_dataset.map(lambda text, label: text))

In [None]:
# Show the first 20 entries of the vocabulary learned by the encoder
np.array(encoder.get_vocabulary())[:20]

## Create embedding layer

In [None]:
# Embedding table with one 64-dimensional vector per vocabulary entry.
# mask_zero=True marks index 0 as padding so downstream layers can ignore it.
embedding_layer = tf.keras.layers.Embedding(input_dim=len(encoder.get_vocabulary()),
                                            output_dim=64,
                                            mask_zero=True)

## Create, train and compile the base model

In [None]:
# NOTE(review): embedding_dim is not referenced anywhere in this cell; the
# embedding_layer used below was created with output_dim=64 — confirm which
# dimension is intended.
embedding_dim=16

# Baseline model: vectorize text -> embed tokens -> average over the sequence
# -> dense hidden layer -> single output unit.
# The final Dense(1) has no activation, so the model outputs a raw logit.
model1 = tf.keras.Sequential([
    encoder,
    embedding_layer,
    GlobalAveragePooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [None]:
# Callback that writes training logs for TensorBoard into the "logs" directory
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [None]:
# model1 ends in Dense(1) without an activation, i.e. it outputs raw logits.
# The loss therefore uses from_logits=True, and accuracy must split predictions
# at logit 0.0 (probability 0.5). The plain 'accuracy' string would apply a 0.5
# threshold to the logits themselves and under-count positive predictions.
# The metric keeps the name 'accuracy' so history/log keys stay unchanged.
model1.compile(optimizer='adam',
               loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
               metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')])

In [None]:
# Train the baseline for 15 epochs, logging to TensorBoard.
# NOTE(review): test_dataset doubles as the validation set here, so the later
# evaluation on test_dataset is not on unseen data — confirm this is acceptable.
model1.fit(
    train_dataset,
    validation_data=test_dataset, 
    epochs=15,
    callbacks=[tensorboard_callback])

In [None]:
# Score the baseline on the test split and report loss and accuracy
test_loss, test_acc = model1.evaluate(test_dataset)

print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_acc}')

# The accuracy of this model is only 18%, which is not great at all. 

# Build a second model of RNN using a bidirectional LSTM to forecast the Recommended IND based on Reviews

In [None]:
# Bidirectional-LSTM classifier: text encoder -> embedding -> BiLSTM -> dense
# hidden layer -> single output unit (no activation, so the output is a logit).
review_embedding = tf.keras.layers.Embedding(
    input_dim=len(encoder.get_vocabulary()),
    output_dim=64,
    # Use masking to handle the variable sequence lengths
    mask_zero=True)
bidirectional_lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))
hidden_dense = tf.keras.layers.Dense(64, activation='relu')
logit_dense = tf.keras.layers.Dense(1)

model2 = tf.keras.Sequential([
    encoder,
    review_embedding,
    bidirectional_lstm,
    hidden_dense,
    logit_dense,
])

In [None]:
# model2 ends in Dense(1) without an activation, i.e. it outputs raw logits.
# The loss therefore uses from_logits=True, and accuracy must split predictions
# at logit 0.0 (probability 0.5). The plain 'accuracy' string would apply a 0.5
# threshold to the logits themselves and under-count positive predictions.
# The metric keeps the name 'accuracy', so the 'accuracy' / 'val_accuracy'
# history keys used for early stopping and plotting are unchanged.
model2.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
               optimizer=tf.keras.optimizers.Adam(1e-4),
               metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0, name='accuracy')])

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop when *validation* accuracy has not improved for 3 consecutive epochs.
# Monitoring the training accuracy would keep training while the model
# overfits, because training accuracy tends to keep rising.
early_stop = EarlyStopping(monitor='val_accuracy', mode='max', patience=3)

In [None]:
# Train for up to 10 epochs; the early_stop callback may end training sooner.
# validation_steps=30 limits each validation pass to 30 batches of test_dataset.
# The returned History object is kept for plotting the learning curves.
history2 = model2.fit(train_dataset, epochs=10,
                    validation_data=test_dataset, 
                    validation_steps=30,
                    callbacks = [early_stop])

In [None]:
# Score model2 on the full test split
test_loss2, test_acc2 = model2.evaluate(test_dataset)

print(f'Test Loss: {round(test_loss2,2)}')
# Format as a percentage directly: multiplying a rounded float by 100 can print
# binary floating-point artefacts such as 85.97999999999999%.
print(f'Test Accuracy: {test_acc2:.2%}')

In [None]:
# Side-by-side learning curves for model2: accuracy (left) and loss (right)
plt.figure(figsize=(12,5))
plt.subplot(1,2,1)
plot_graphs(history2, 'accuracy')
plt.ylim(None,1)  # cap the accuracy axis at 1
plt.subplot(1,2,2)
plot_graphs(history2, 'loss')
plt.ylim(0,None)  # start the loss axis at 0

### Run a test with some new reviews on model2

- If the prediction is >= 0.0, it is a positive review; otherwise, it is a negative review.

In [None]:
def make_prediction(text, model=None):
    """Classify a single review as positive or negative.

    Args:
        text: the raw review text (one string).
        model: object with a ``predict`` method that accepts a batch of strings
            and returns one logit per input. Defaults to the notebook-level
            ``model2`` when omitted.

    Returns:
        ``'a positive review.'`` when the predicted logit is >= 0.0, otherwise
        ``'a negative review.'``.
    """
    classifier = model2 if model is None else model
    # predict() returns a batch of outputs; pull out the single scalar logit
    # explicitly instead of relying on the truth value of a (1, 1) array.
    logits = np.asarray(classifier.predict(np.array([text])))
    logit = float(logits.reshape(-1)[0])
    if logit >= 0.0:
        return 'a positive review.'
    return 'a negative review.'


# run a few validation predictions:
review_1 = ('The shirt is cool. The print on the shirt '
            'is so much fun. I would recommend this product.')  # true label = 1, positive review

review_2 = ('The pattern is hedious, and the fit is weird. '
            'I would not recommend this to anyone.')  # true label = 0, negative review

review_3 = ('So happy! I order size M because I have my belly and it works perfect, '
            'the waist is wide helping to control.')  # true label = 1, positive review

review_4 = ('True to size and comfy! The inner lining is soft and dry-fit while the outside is a bit more like a windbreaker material. '
            'I like that they are super lightweight and not so thin that you can see your underwear')  # true label = 1, positive review

text_list = [review_1, review_2, review_3, review_4]

# Number the reviews from 1 and report the predicted sentiment of each one
for counter, review in enumerate(text_list, start=1):
    print(f'Review {counter}: {make_prediction(review)}')
    print()
    
# model2 got every review correct. 