In [None]:
!pip install tokenizers

In [None]:
import pandas as pd
import tensorflow as tf
import numpy as np
from tokenizers import ByteLevelBPETokenizer
import pickle
import matplotlib.pyplot as plt

## Use a TPU / GPU if available

In [None]:
# Choose a distribution strategy: TPU first, then a single GPU, then the CPU
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    strategy = tf.distribute.TPUStrategy(tpu)
    master = tpu.master()
    print('Running on TPU:', master or 'local')
except ValueError:
    # The TPU setup raised ValueError, so fall back to a single local device
    if not tf.config.list_physical_devices('GPU'):
        strategy = tf.distribute.OneDeviceStrategy("CPU")
        print('Running on CPU')
    else:
        strategy = tf.distribute.OneDeviceStrategy("GPU")
        print('Running on GPU')

## Loading the data and data preprocessing

In [None]:
# Read the raw train/test CSV files; neither file has a header row,
# so the columns are addressed by position (0, 1, 2) further down.
train_csv_path = "/kaggle/input/amazon-reviews/train.csv"
test_csv_path = '/kaggle/input/amazon-reviews/test.csv'

data = pd.read_csv(train_csv_path, header=None)
test_data = pd.read_csv(test_csv_path, header=None)

In [None]:
# Preprocess the data
def _to_review_frame(raw):
    """Build a two-column frame from a raw, header-less reviews frame.

    "rating" is column 0 shifted down by one; "review" is column 1 and
    column 2 joined with a single space.
    """
    return pd.DataFrame({
        "rating": raw[0] - 1,
        "review": raw[1] + " " + raw[2],
    })


df = _to_review_frame(data)
test_df = _to_review_frame(test_data)

In [None]:
# Peek at the first five rows of the training frame
df.head()

In [None]:
# Drop rows with any missing value, then draw a fixed-size, seeded random subset
# of each split so the run is reproducible.
df = df.dropna().sample(1_200_000, random_state=42)
test_df = test_df.dropna().sample(100_000, random_state=42)

In [None]:
# First five rows of the sampled training frame
df.head()

## Tokenization

In [None]:
# Create an untrained byte-level BPE tokenizer and set its hyper-parameters
tokenizer = ByteLevelBPETokenizer()

vocab_size = 50000      # vocabulary size requested when the tokenizer is trained
sequence_length = 512   # length every encoded review is padded/truncated to

In [None]:
# Fit the BPE vocabulary on the training reviews only
training_corpus = df['review']
tokenizer.train_from_iterator(training_corpus, vocab_size=vocab_size)

In [None]:
# Tokenize and encode the sequences
def _encode_reviews(reviews):
    """Return the list of token-id lists produced by the tokenizer for each review."""
    return [tokenizer.encode(text).ids for text in reviews]


df['encoded_sequence'] = _encode_reviews(df['review'])
test_df['encoded_sequence'] = _encode_reviews(test_df['review'])

In [None]:
# Longest encoded training review, in tokens
max(len(token_ids) for token_ids in df['encoded_sequence'])

In [None]:
def _pad_to_sequence_length(encoded):
    """Pad (at the end) or truncate each id list to `sequence_length` and return nested lists."""
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        encoded, maxlen=sequence_length, padding='post')
    return padded.tolist()


df['padded_sequence'] = _pad_to_sequence_length(df['encoded_sequence'])
test_df['padded_sequence'] = _pad_to_sequence_length(test_df['encoded_sequence'])

In [None]:
# First five rows, now including the encoded and padded columns
df.head()

In [None]:
# Replace the requested vocabulary size with the size the trained tokenizer
# actually reports; this value is used as the embedding layer's input_dim.
vocab_size = tokenizer.get_vocab_size()

## Model architecture

In [None]:
def _bilstm_block(units, return_sequences):
    """Return [bidirectional LSTM, batch normalisation] as a list of new layers."""
    return [
        tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(units, return_sequences=return_sequences)),
        tf.keras.layers.BatchNormalization(),
    ]


def _dense_block(units):
    """Return [L2-regularised ReLU dense layer, batch normalisation] as a list of new layers."""
    return [
        tf.keras.layers.Dense(units, activation='relu',
                              kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        tf.keras.layers.BatchNormalization(),
    ]


with strategy.scope():
    # Embedding; id 0 is treated as padding and masked out (mask_zero=True)
    layer_stack = [
        tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=300, mask_zero=True),
    ]

    # Four stacked BiLSTM blocks; only the last one collapses the time dimension
    for keep_sequences in (True, True, True, False):
        layer_stack += _bilstm_block(128, keep_sequences)

    # Classification head: 128 -> dropout -> 64 -> single sigmoid unit
    layer_stack += _dense_block(128)
    # 50% dropout adds strong regularisation to push the head towards more robust features
    layer_stack.append(tf.keras.layers.Dropout(0.5))
    layer_stack += _dense_block(64)
    layer_stack.append(tf.keras.layers.Dense(1, activation='sigmoid'))

    model = tf.keras.Sequential(layer_stack)

    # The output is already a probability, hence from_logits=False
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
        optimizer=tf.keras.optimizers.Adam(1e-3),
        metrics=['accuracy'],
    )


## Model training

Here, we convert all of our data to NumPy arrays, make each of them contiguous, and set their writeable flag to False.
We do this because we are aiming to optimize memory usage, improve data access efficiency, and ensure the data is not accidentally modified during the execution of subsequent code.

In [None]:
def _as_readonly_array(column):
    """Convert a DataFrame column to a contiguous NumPy array and mark it read-only."""
    arr = np.ascontiguousarray(np.array(column.tolist()))
    arr.flags.writeable = False
    return arr


X_train = _as_readonly_array(df['padded_sequence'])
y_train = _as_readonly_array(df['rating'])

X_test = _as_readonly_array(test_df['padded_sequence'])
y_test = _as_readonly_array(test_df['rating'])

In [None]:
# Stop as soon as the validation loss fails to improve for one epoch
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=1)

history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=1,
    batch_size=64,
    callbacks=[early_stopping],
)

## Model evaluation

In [None]:
# Compute the compiled loss and accuracy on the held-out test arrays
model.evaluate(x=X_test, y=y_test)

In [None]:
!pip install seaborn

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Predicted probability of the positive class: one row per test example,
# one column because the model ends in a single sigmoid unit.
y_pred = model.predict(X_test)

# Threshold the probabilities at 0.5 to obtain 0/1 labels.
# (np.argmax over axis=1 would always return 0 here because there is only
# one output column, which would mark every example as class 0.)
y_pred_labels = (y_pred.reshape(-1) > 0.5).astype(int)

# Create the confusion matrix (rows: actual labels, columns: predicted labels)
cm = confusion_matrix(y_test, y_pred_labels)

# Plot the confusion matrix; fmt='d' prints the counts as plain integers
# instead of seaborn's default short float format.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

## Plotting the training and validation loss and accuracy

In [None]:
# Per-epoch training/validation curves recorded by model.fit
metrics = history.history
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(16, 6))

# Left panel: loss
loss_ax.plot(history.epoch, metrics['loss'], metrics['val_loss'])
loss_ax.legend(['loss', 'val_loss'])
loss_ax.set_ylim([0, max(loss_ax.get_ylim())])
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss [BinaryCrossEntropy]')

# Right panel: accuracy, shown as a percentage
train_acc_pct = 100 * np.array(metrics['accuracy'])
val_acc_pct = 100 * np.array(metrics['val_accuracy'])
acc_ax.plot(history.epoch, train_acc_pct, val_acc_pct)
acc_ax.legend(['accuracy', 'val_accuracy'])
acc_ax.set_ylim([0, 100])
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy [%]')

plt.show()

## Saving the model

In [None]:
# Persist both artefacts to the Kaggle working directory
model_path = '/kaggle/working/model.h5'
tokenizer_path = '/kaggle/working/tokenizer.bpe'

model.save(model_path)          # trained Keras model
tokenizer.save(tokenizer_path)  # trained BPE tokenizer

## Creating an end-to-end version
It's no fun running our model in the current state: having to tokenize the text, pad it, call the model and then work out labels. Let's create an easy-to-use end-to-end version with our model and tokenizer.

In [None]:
class ExportModel():
    """End-to-end sentiment predictor: raw text in, label and confidence out.

    Wraps a trained model together with its tokenizer so callers do not have
    to tokenize, pad and threshold by hand.
    """

    def __init__(self, model, tokenizer, max_length=None):
        """Store the model, the tokenizer and the padding length.

        Args:
            model: callable taking a padded id matrix and `training=False`,
                returning one probability per row in column 0.
            tokenizer: object whose `encode(text).ids` yields token ids.
            max_length: length each encoded text is padded/truncated to.
                Defaults to the notebook-level `sequence_length`.
        """
        self.model = model
        self.tokenizer = tokenizer
        # Capture the padding length at construction time so that the object
        # (e.g. after being pickled and reloaded) does not need a
        # `sequence_length` global to exist when it is called.
        self.max_length = sequence_length if max_length is None else max_length

    def __call__(self, x):
        """Classify one text or a list of texts.

        Args:
            x: a single string, or an iterable of strings.

        Returns:
            A 1-D NumPy object array with one dict per input text, each of
            the form {"label": "POSITIVE" | "NEGATIVE", "confidence": float}
            where confidence is |p - 0.5| * 2 for predicted probability p.
        """
        if isinstance(x, str):
            x = [x]

        token_ids = [self.tokenizer.encode(text).ids for text in x]
        padded = tf.keras.preprocessing.sequence.pad_sequences(
            token_ids, maxlen=self.max_length, padding='post')

        pred = self.model(
            padded,
            training=False
        )

        # Column 0 holds the positive-class probability of each input
        probabilities = pred.numpy()[:, 0]
        confidences = np.abs(probabilities - 0.5) * 2

        results = [
            {
                "label": "POSITIVE" if probability > 0.5 else "NEGATIVE",
                "confidence": confidence,
            }
            for probability, confidence in zip(probabilities, confidences)
        ]

        # Build the object array in one step instead of np.append in a loop,
        # which re-allocates the whole array for every element.
        return np.array(results, dtype=object)

# Bundle the trained model and tokenizer into one callable
export = ExportModel(model, tokenizer)

# Serialise the bundle with pickle into the Kaggle working directory
with open('/kaggle/working/model_end2end', 'wb') as f:
    f.write(pickle.dumps(export))

In [None]:
# Read the pickled end-to-end model back in.
# NOTE: unpickling executes code from the file; only load pickles you created yourself.
with open('/kaggle/working/model_end2end', 'rb') as f:
    exported_file = pickle.loads(f.read())

## Testing the end-to-end model

In [None]:
# A handful of short example reviews to sanity-check the labels
sample_reviews = [
    "high quality",
    "exellent",
    "charge was fired",
    "surprisingly not fine",
    "horrible product",
    "trash product",
]
export(sample_reviews)

In [None]:
# The in-memory wrapper and the reloaded pickle should produce the same predictions
roundtrip_inputs = ["This is a great product", "This is a horrible product"]
export(roundtrip_inputs) == exported_file(roundtrip_inputs)