In [None]:
import pandas as pd

# Training split of the IMDB reviews, stored as a JSON array of records.
df_train = pd.read_json(path_or_buf="../data/aclIMDB_train.json", orient="records")

# Quick sanity checks on the loaded frame.
df_train.info()  # prints column dtypes and non-null counts
df_train.hist()  # draws a histogram per numeric column
df_train.head()  # last expression: rendered as the cell output

In [None]:
# Validation split, stored in the same JSON-records layout as the training file.
df_val = pd.read_json(path_or_buf="../data/aclIMDB_val.json", orient='records')

# Quick sanity checks on the loaded frame.
df_val.info()  # prints column dtypes and non-null counts
df_val.hist()  # draws a histogram per numeric column
df_val.head()  # last expression: rendered as the cell output

In [None]:
# Held-out test split, stored in the same JSON-records layout as the training file.
df_test = pd.read_json(path_or_buf="../data/aclIMDB_test.json", orient="records")

# Quick sanity checks on the loaded frame.
df_test.info()  # prints column dtypes and non-null counts
df_test.hist()  # draws a histogram per numeric column
df_test.head()  # last expression: rendered as the cell output

In [None]:
def _lowercase_texts(frame):
    """Return the frame's "text" column as a list of lower-cased strings.

    Every entry is passed through ``str`` first, so non-string values are
    stringified rather than raising.
    """
    return [str(entry).lower() for entry in frame["text"].values]


train = _lowercase_texts(df_train)
val = _lowercase_texts(df_val)
test = _lowercase_texts(df_test)

In [None]:
import tensorflow as tf

VOCAB_SIZE = 10000

# The word index is fitted on the training texts only; the validation and test
# texts are merely encoded with it, so no vocabulary leaks from those splits.
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token="UNK", num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train)

# Encode each split as lists of integer word ids.
X_train, X_val, X_test = (
    tokenizer.texts_to_sequences(texts) for texts in (train, val, test)
)

# word_index holds every word seen during fitting, not just the VOCAB_SIZE kept.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
import numpy as np

maxlen = 128

# Bring every sequence to exactly `maxlen` ids. With the Keras defaults both
# padding and truncation happen at the front of the sequence.
X_train, X_val, X_test = (
    tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)
    for sequences in (X_train, X_val, X_test)
)

# Targets: the "label" column of each split as a NumPy array.
y_train, y_val, y_test = (
    np.array(frame["label"].values) for frame in (df_train, df_val, df_test)
)

In [None]:
pip install wget

In [None]:
import os

import wget

url = "http://nlp.stanford.edu/data/glove.6B.zip"
path = "../data/"

# The archive is large (>800 MB). Download it only when it is not already on
# disk, so that "Restart & Run All" neither repeats the download nor leaves
# renamed duplicate copies of the archive in the data directory.
filename = os.path.join(path, os.path.basename(url))
if not os.path.exists(filename):
    filename = wget.download(url, out=path)

In [None]:
import zipfile

# Unpack every member of the downloaded archive into the data directory.
# ZipFile opens in read mode by default.
with zipfile.ZipFile(filename) as archive:
    archive.extractall(path=path)

In [None]:
# Map each GloVe word to its vector. Every line of the file is
# "<word> <coef_1> <coef_2> ...", separated by single spaces.
embeddings_index = {}
with open("../data/glove.6B.50d.txt", encoding='utf8') as glove_file:
    for raw_line in glove_file:
        # Split once: the word in front, the remaining coefficients as one string.
        token, coefficients = raw_line.split(maxsplit=1)
        embeddings_index[token] = np.fromstring(coefficients, "f", sep=" ")

print("Found %s word vectors." % len(embeddings_index))

In [None]:
embedding_dim = 50

# The tokenizer was built with num_words=VOCAB_SIZE, so the encoded sequences
# only ever contain ids in [0, VOCAB_SIZE): 0 is the padding value and words
# ranked at or beyond VOCAB_SIZE are replaced by the OOV id. Rows for higher ids
# would never be looked up, so the matrix is capped at VOCAB_SIZE rows instead
# of allocating one row per word in the full word_index.
num_tokens = min(VOCAB_SIZE, len(word_index) + 1)
hits = 0
misses = 0

# Row i holds the GloVe vector of the word with id i. Rows stay all-zero for
# the padding id and for words that have no pretrained vector.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    if i >= num_tokens:
        # Outside the kept vocabulary: this id never appears in the inputs.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1

print("Converted %d words (%d misses)" % (hits, misses))

In [None]:
# Coverage = share of the looked-up vocabulary words that have a pretrained
# vector. The denominator is the number of words actually looked up
# (hits + misses), not num_tokens, which also counts non-word slots such as the
# padding id. The guard avoids a ZeroDivisionError on an empty vocabulary.
looked_up = hits + misses
coverage = round(hits / looked_up * 100) if looked_up else 0
print("dictionary coverage: ", coverage, "%")

In [None]:
import tensorflow as tf

# Take the embedding width from the matrix itself, so the layer can never
# disagree with the pretrained weights it is initialised from.
embedding_dim = embedding_matrix.shape[1]

model = tf.keras.Sequential([
    # Frozen lookup table initialised with the pretrained GloVe vectors.
    # input_length is needed so that Flatten -> Dense can infer their shapes.
    tf.keras.layers.Embedding(num_tokens,
                              embedding_dim,
                              embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                              trainable=False,
                              input_length=maxlen),
    tf.keras.layers.Flatten(),
    # A Dense layer without an activation is purely linear; stacked directly
    # under the output layer it would collapse into a single linear map, so the
    # hidden layer gets a ReLU non-linearity.
    tf.keras.layers.Dense(units=64, activation='relu'),
    # Single sigmoid unit: probability of the positive class.
    tf.keras.layers.Dense(units=1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss="binary_crossentropy",
              metrics=['binary_accuracy'])

model.summary()

In [None]:
# Train on the training split and score the validation split after each epoch;
# the per-epoch metrics are kept in `history.history`.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_data=(X_val, y_val),
)

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation loss per epoch.
plt.plot(history.history['loss'], label='(training data)', color='blue')
plt.plot(history.history['val_loss'], label='(val data)', color='green')
plt.title('Neural Network training loss')
plt.ylabel('value')
plt.xlabel('No. epoch')
# The labels passed to plot() are only drawn once a legend is requested.
plt.legend()
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
plt.plot(history.history['binary_accuracy'], label='(training data)', color='blue')
plt.plot(history.history['val_binary_accuracy'], label='(val data)', color='green')
plt.title('Neural Network training accuracy')
plt.ylabel('value')
plt.xlabel('No. epoch')
# The labels passed to plot() are only drawn once a legend is requested.
plt.legend()
plt.show()

In [None]:
# Final score on the held-out test split. `results` holds the loss followed by
# the metrics configured in model.compile().
results = model.evaluate(
    x=X_test,
    y=y_test,
    batch_size=32,
)
print("test loss, test acc:", results)