In [None]:
import pandas as pd

# Load the training split (JSON in "records" orientation) and take a first look:
# schema / null counts, histograms of the numeric columns, then the first rows.
df_train = pd.read_json(
    "../data/aclIMDB_train.json",
    orient="records",
)
df_train.info()
df_train.hist()
df_train.head()

In [None]:
# Load the validation split (JSON in "records" orientation) and inspect it the
# same way: schema / null counts, numeric-column histograms, first rows.
df_val = pd.read_json(
    "../data/aclIMDB_val.json",
    orient="records",
)
df_val.info()
df_val.hist()
df_val.head()

In [None]:
# Load the test split (JSON in "records" orientation) and inspect it:
# schema / null counts, numeric-column histograms, first rows.
df_test = pd.read_json(
    "../data/aclIMDB_test.json",
    orient="records",
)
df_test.info()
df_test.hist()
df_test.head()

In [None]:
# For each split, coerce every entry of the "text" column to str and lower-case it,
# producing one plain Python list of strings per split.
train, val, test = (
    [str(review).lower() for review in frame["text"].values]
    for frame in (df_train, df_val, df_test)
)

In [None]:
import numpy as np
import tensorflow as tf
from tokenizers import BertWordPieceTokenizer

# Sequence length handed to pad_sequences and to the CNN's Embedding layer.
maxlen = 128

# Tokenizer built from the local BERT uncased vocabulary file;
# lowercase=True asks the tokenizer to lower-case its input as well.
tokenizer = BertWordPieceTokenizer(
    "../data/bert-base-uncased-vocab.txt",
    lowercase=True,
)

def bert_tokenizer(data, tok=None):
    """Encode every text in ``data`` into its list of token ids.

    Parameters
    ----------
    data : iterable of str
        Texts to encode, processed in order.
    tok : optional
        Any object exposing ``encode(text).ids``. When omitted, the
        notebook-level ``tokenizer`` is used, so existing one-argument
        calls behave as before.

    Returns
    -------
    numpy.ndarray
        A 1-D ``dtype=object`` array holding one list of ids per input text
        (shape ``(0,)`` for empty input).
    """
    if tok is None:
        tok = tokenizer

    encoded = [tok.encode(text).ids for text in data]

    # np.array(encoded, dtype=object) silently turns into a 2-D array whenever
    # every sequence happens to have the same length. Filling slot by slot
    # keeps the result 1-D (one list per text) regardless of the data.
    result = np.empty(len(encoded), dtype=object)
    for idx, ids in enumerate(encoded):
        result[idx] = ids
    return result

# Tokenise each split and force every id sequence to exactly `maxlen` entries.
# No padding/truncating arguments are passed, so pad_sequences uses its defaults.
X_train_bert, X_val_bert, X_test_bert = (
    tf.keras.preprocessing.sequence.pad_sequences(bert_tokenizer(texts), maxlen=maxlen)
    for texts in (train, val, test)
)

# Targets: each frame's "label" column as a NumPy array.
y_train, y_val, y_test = (
    np.array(frame["label"].values)
    for frame in (df_train, df_val, df_test)
)

In [None]:
import tensorflow as tf

emb_size = 32
# Size the embedding table from the tokenizer itself instead of a hand-typed
# constant, so it always matches the vocabulary that produced the ids
# (the padding value 0 is a valid row as well).
max_features = tokenizer.get_vocab_size()

# Small 1-D CNN over token embeddings with a single sigmoid output unit.
model_cnn = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_features, emb_size, input_length=maxlen),
    tf.keras.layers.Conv1D(32, 7, activation='relu'),   # 32 filters, window of 7 tokens
    tf.keras.layers.MaxPooling1D(5),
    tf.keras.layers.Conv1D(32, 7, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),               # collapse the time axis
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

model_cnn.compile(
    optimizer='adam',
    loss="binary_crossentropy",
    metrics=['binary_accuracy'],
)

model_cnn.summary()

In [None]:
# Stop once validation accuracy has failed to improve for 2 epochs in a row.
# restore_best_weights=True rolls the model back to its best epoch when that
# happens, so the later test evaluation is not run on post-peak weights.
# mode='max' states explicitly that a higher monitored value is better.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_binary_accuracy',
    mode='max',
    patience=2,
    restore_best_weights=True,
)

h_cnn = model_cnn.fit(
    X_train_bert,
    y_train,
    epochs=5,
    batch_size=32,
    callbacks=[callback],
    validation_data=(X_val_bert, y_val),
)

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation loss per epoch for the CNN.
fig, ax = plt.subplots()
ax.plot(h_cnn.history['loss'], label='(training data)', color='blue')
ax.plot(h_cnn.history['val_loss'], label='(val data)', color='green')
ax.set_title('Neural Network loss')
ax.set_ylabel('value')
ax.set_xlabel('No. epoch')
ax.legend()  # without this the label= strings above are never drawn
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation binary accuracy per epoch for the CNN.
fig, ax = plt.subplots()
ax.plot(h_cnn.history['binary_accuracy'], label='(training data)', color='blue')
ax.plot(h_cnn.history['val_binary_accuracy'], label='(val data)', color='green')
ax.set_title('Neural Network accuracy')
ax.set_ylabel('value')
ax.set_xlabel('No. epoch')
ax.legend()  # without this the label= strings above are never drawn
plt.show()

In [None]:
# Score the CNN on the held-out test split. The model was compiled with one
# loss and one metric, so `results` holds the loss followed by binary accuracy.
results = model_cnn.evaluate(
    X_test_bert,
    y_test,
    batch_size=32,
)
print("Simple cnn test loss, test acc:", results)

In [None]:
# Re-tokenise all three splits to recover the unpadded (ragged) id sequences;
# the earlier padded arrays bound to these names are replaced.
X_train_bert, X_val_bert, X_test_bert = map(bert_tokenizer, (train, val, test))

In [None]:
# Mean number of token ids per training review, rounded to a whole number.
max_len_avg = int(round(sum(map(len, X_train_bert)) / len(X_train_bert)))
max_len_avg

In [None]:
# Share of training reviews whose token sequence has at most 512 ids.
sum(1 for ids in X_train_bert if len(ids) <= 512) / len(X_train_bert)

In [None]:
max_len_512 = 512

# Pad/truncate every split to the 512-id window. No padding/truncating
# arguments are passed, so pad_sequences uses its defaults.
X_train_bert, X_val_bert, X_test_bert = (
    tf.keras.preprocessing.sequence.pad_sequences(ids, maxlen=max_len_512)
    for ids in (X_train_bert, X_val_bert, X_test_bert)
)

In [None]:
import tensorflow as tf

emb_size = 32
# Size the embedding table from the tokenizer itself instead of a hand-typed
# constant, so it always matches the vocabulary that produced the ids
# (the padding value 0 is a valid row as well).
max_features = tokenizer.get_vocab_size()

# Convolution over token embeddings feeding a GRU, with a single sigmoid output unit.
model_cnn_gru = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_features, emb_size, input_length=max_len_512),
    tf.keras.layers.Conv1D(32, 7, activation='relu'),   # 32 filters, window of 7 tokens
    # GRU(32) returns only its final state, already shaped (batch, 32), so the
    # Flatten layer that used to follow it was a no-op and has been dropped.
    tf.keras.layers.GRU(32),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

model_cnn_gru.compile(
    optimizer='adam',
    loss="binary_crossentropy",
    metrics=['binary_accuracy'],
)

model_cnn_gru.summary()

In [None]:
# Stop once validation accuracy has failed to improve for 2 epochs in a row.
# restore_best_weights=True rolls the model back to its best epoch when that
# happens, so the later test evaluation is not run on post-peak weights.
# mode='max' states explicitly that a higher monitored value is better.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_binary_accuracy',
    mode='max',
    patience=2,
    restore_best_weights=True,
)

h_cnn_gru = model_cnn_gru.fit(
    X_train_bert,
    y_train,
    epochs=5,
    batch_size=32,
    callbacks=[callback],
    validation_data=(X_val_bert, y_val),
)

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation loss per epoch for the CNN+GRU model.
fig, ax = plt.subplots()
ax.plot(h_cnn_gru.history['loss'], label='(training data)', color='blue')
ax.plot(h_cnn_gru.history['val_loss'], label='(val data)', color='green')
ax.set_title('Neural Network loss')
ax.set_ylabel('value')
ax.set_xlabel('No. epoch')
ax.legend()  # without this the label= strings above are never drawn
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation binary accuracy per epoch for the CNN+GRU model.
fig, ax = plt.subplots()
ax.plot(h_cnn_gru.history['binary_accuracy'], label='(training data)', color='blue')
ax.plot(h_cnn_gru.history['val_binary_accuracy'], label='(val data)', color='green')
ax.set_title('Neural Network accuracy')
ax.set_ylabel('value')
ax.set_xlabel('No. epoch')
ax.legend()  # without this the label= strings above are never drawn
plt.show()

In [None]:
# Score the CNN+GRU model on the held-out test split. The model was compiled with
# one loss and one metric, so `results` holds the loss followed by binary accuracy.
results = model_cnn_gru.evaluate(
    X_test_bert,
    y_test,
    batch_size=32,
)
print("Simple cnn+gru test loss, test acc:", results)