In [None]:
import pandas as pd

# Load the training split (a JSON file laid out as a list of records).
df_train = pd.read_json(path_or_buf="../data/aclIMDB_train.json", orient="records")

# Quick inspection: column dtypes / non-null counts, histograms of the
# plottable columns, and finally the first rows as the cell's output.
df_train.info()
df_train.hist()
df_train.head()

In [None]:
# Load the validation split (a JSON file laid out as a list of records).
df_val = pd.read_json(path_or_buf="../data/aclIMDB_val.json", orient="records")

# Quick inspection: column dtypes / non-null counts, histograms of the
# plottable columns, and finally the first rows as the cell's output.
df_val.info()
df_val.hist()
df_val.head()

In [None]:
# Load the test split (a JSON file laid out as a list of records).
df_test = pd.read_json(path_or_buf="../data/aclIMDB_test.json", orient="records")

# Quick inspection: column dtypes / non-null counts, histograms of the
# plottable columns, and finally the first rows as the cell's output.
df_test.info()
df_test.hist()
df_test.head()

In [None]:
# Turn the "text" column of each split into a list of lower-cased strings.
# Every entry is passed through str() first, so non-string values are
# converted rather than failing on .lower().
train = [text.lower() for text in map(str, df_train["text"].values)]
val = [text.lower() for text in map(str, df_val["text"].values)]
test = [text.lower() for text in map(str, df_test["text"].values)]

In [None]:
import tensorflow as tf

# Passed to the Keras tokenizer as `num_words`.
VOCAB_SIZE = 10000

# Word-level tokenizer with "UNK" as the out-of-vocabulary placeholder.
# Its vocabulary is fitted on the training texts only.
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token="UNK", num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train)

# Encode each split as lists of integer word indices.
X_train_wordlevel, X_val_wordlevel, X_test_wordlevel = (
    tokenizer.texts_to_sequences(split) for split in (train, val, test)
)

# Full word -> index mapping learned from the training texts.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
# Peek at the first ten word indices of the first encoded training review.
print(X_train_wordlevel[0][:10])

In [None]:
import numpy as np
from tokenizers import BertWordPieceTokenizer

# WordPiece tokenizer built from a local vocabulary file, with lower-casing on.
# NOTE: this rebinds the module-level name `tokenizer`, which an earlier cell
# bound to the Keras word-level Tokenizer. `bert_tokenizer` below looks this
# name up at call time, so re-running the word-level tokenizer cell afterwards
# leaves `bert_tokenizer` pointing at the wrong object until this cell is
# re-run as well.
tokenizer = BertWordPieceTokenizer("../data/bert-base-uncased-vocab.txt", lowercase=True)

def bert_tokenizer(data, encoder=None):
    """Encode each text in ``data`` into its list of token ids.

    Args:
        data: Iterable of strings to encode.
        encoder: Object exposing ``encode(text)`` whose result has an ``ids``
            attribute. Defaults to the module-level ``tokenizer``.

    Returns:
        A 1-D ``numpy.ndarray`` of dtype ``object`` with one entry per input
        text; each entry is that text's list of token ids. The shape is always
        ``(len(data),)``, whether or not the id lists share a length.
    """
    if encoder is None:
        encoder = tokenizer

    encoded = [encoder.encode(item).ids for item in data]

    # The id lists normally differ in length. ``np.array(encoded)`` raises
    # ValueError for such ragged input on recent NumPy releases, and silently
    # yields a 2-D integer array when the lengths happen to match. Filling a
    # pre-allocated object array slot by slot gives one consistent layout.
    result = np.empty(len(encoded), dtype=object)
    for position, ids in enumerate(encoded):
        result[position] = ids

    return result

In [None]:
# Encode the three splits with the WordPiece tokenizer, in train/val/test order.
X_train_bert, X_val_bert, X_test_bert = map(bert_tokenizer, (train, val, test))

In [None]:
# Peek at the first ten WordPiece ids of the first encoded training review.
print(X_train_bert[0][:10])

In [None]:
# Common sequence length for both encodings.
maxlen = 128

# Bring every WordPiece sequence to exactly `maxlen` ids (pad_sequences is
# used with its default padding / truncating behaviour).
X_train_bert, X_val_bert, X_test_bert = (
    tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)
    for sequences in (X_train_bert, X_val_bert, X_test_bert)
)

# Same treatment for the word-level sequences.
X_train_wordlevel, X_val_wordlevel, X_test_wordlevel = (
    tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)
    for sequences in (X_train_wordlevel, X_val_wordlevel, X_test_wordlevel)
)

# Targets: the "label" column of each split as a NumPy array.
y_train, y_val, y_test = (
    np.array(frame["label"].values) for frame in (df_train, df_val, df_test)
)

In [None]:
import tensorflow as tf

# The word-level tokenizer was created with num_words=VOCAB_SIZE, so
# texts_to_sequences never emits an index >= VOCAB_SIZE (rarer words are
# replaced by the OOV index). `word_index`, however, lists every word seen
# during fitting, so sizing the embedding from it allocates rows that are
# never looked up. Cap the table at VOCAB_SIZE; the min() keeps it tight
# when the corpus has fewer distinct words than the cap.
wordlevel_input_dim = min(VOCAB_SIZE, len(word_index) + 1)

# Embedding -> flatten -> one hidden layer -> sigmoid output for the binary label.
model_wordlevel = tf.keras.Sequential([
    tf.keras.layers.Embedding(wordlevel_input_dim, 8, input_length=maxlen),
    tf.keras.layers.Flatten(),
    # Without a non-linearity this layer and the output layer would collapse
    # into a single affine map, making the hidden layer pointless.
    tf.keras.layers.Dense(units=64, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid')
])

model_wordlevel.compile(optimizer='adam',
                   loss="binary_crossentropy",
                   metrics=['binary_accuracy'])

model_wordlevel.summary()

In [None]:
# Train the word-level model on the CPU, passing the validation split so
# validation metrics are recorded in the returned history.
with tf.device('/CPU:0'):
    h_wordlevel = model_wordlevel.fit(
        x=X_train_wordlevel,
        y=y_train,
        batch_size=32,
        epochs=5,
        validation_data=(X_val_wordlevel, y_val),
    )

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation loss per epoch for the word-level model.
fig, ax = plt.subplots()
ax.plot(h_wordlevel.history['loss'], label='(training data)', color='blue')
ax.plot(h_wordlevel.history['val_loss'], label='(val data)', color='green')
ax.set_title('Neural Network training loss')
ax.set_ylabel('value')
ax.set_xlabel('No. epoch')
# The `label=` values are only drawn once a legend is requested; without
# this call the two curves cannot be told apart.
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation accuracy per epoch for the word-level model.
fig, ax = plt.subplots()
ax.plot(h_wordlevel.history['binary_accuracy'], label='(training data)', color='blue')
ax.plot(h_wordlevel.history['val_binary_accuracy'], label='(val data)', color='green')
ax.set_title('Neural Network training accuracy')
ax.set_ylabel('value')
ax.set_xlabel('No. epoch')
# The `label=` values are only drawn once a legend is requested; without
# this call the two curves cannot be told apart.
ax.legend()
plt.show()

In [None]:
import tensorflow as tf

# Number of rows in the embedding table for WordPiece ids.
# NOTE(review): presumably meant to cover the vocabulary of the WordPiece
# vocab file loaded earlier - confirm against that file's size.
max_features = 30523  # vocab size

# Embedding -> flatten -> sigmoid output for the binary label.
model_bert = tf.keras.Sequential()
model_bert.add(tf.keras.layers.Embedding(max_features, 8, input_length=maxlen))
model_bert.add(tf.keras.layers.Flatten())
model_bert.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

model_bert.compile(
    loss="binary_crossentropy",
    optimizer='adam',
    metrics=['binary_accuracy'],
)

model_bert.summary()

In [None]:
# Train the WordPiece model on the CPU, passing the validation split so
# validation metrics are recorded in the returned history.
with tf.device('/CPU:0'):
    h_bert = model_bert.fit(
        x=X_train_bert,
        y=y_train,
        batch_size=32,
        epochs=5,
        validation_data=(X_val_bert, y_val),
    )

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation loss per epoch for the WordPiece model.
fig, ax = plt.subplots()
ax.plot(h_bert.history['loss'], label='(training data)', color='blue')
ax.plot(h_bert.history['val_loss'], label='(val data)', color='green')
ax.set_title('Neural Network training loss')
ax.set_ylabel('value')
ax.set_xlabel('No. epoch')
# The `label=` values are only drawn once a legend is requested; without
# this call the two curves cannot be told apart.
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation accuracy per epoch for the WordPiece model.
fig, ax = plt.subplots()
ax.plot(h_bert.history['binary_accuracy'], label='(training data)', color='blue')
ax.plot(h_bert.history['val_binary_accuracy'], label='(val data)', color='green')
ax.set_title('Neural Network training accuracy')
ax.set_ylabel('value')
ax.set_xlabel('No. epoch')
# The `label=` values are only drawn once a legend is requested; without
# this call the two curves cannot be told apart.
ax.legend()
plt.show()

In [None]:
# Score the word-level model on the held-out test split (on the CPU) and
# print the values returned by evaluate().
with tf.device('/CPU:0'):
    results = model_wordlevel.evaluate(
        x=X_test_wordlevel,
        y=y_test,
        batch_size=32,
    )
    print("test loss, test acc:", results)

In [None]:
# Score the WordPiece model on the held-out test split (on the CPU) and
# print the values returned by evaluate().
with tf.device('/CPU:0'):
    results = model_bert.evaluate(
        x=X_test_bert,
        y=y_test,
        batch_size=32,
    )
    print("test loss, test acc:", results)