# Sentiment analysis with a Neural Network

In [None]:
import pandas as pd

# Load the training split from JSON; its "text" and "label" columns are
# consumed by later cells.
df_train = pd.read_json("../data/aclIMDB_train.json", orient='records')
# Quick sanity checks: column summary, histograms, then a preview of the first
# rows (kept as the last expression so the notebook displays it).
df_train.info()
df_train.hist()
df_train.head()

In [None]:
# Load the held-out test split from JSON; its "text" and "label" columns are
# consumed by later cells.
df_test = pd.read_json("../data/aclIMDB_test.json", orient='records')
# Quick sanity checks: column summary, histograms, then a preview of the first
# rows (kept as the last expression so the notebook displays it).
df_test.info()
df_test.hist()
df_test.head()

In [None]:
# Load the validation split from JSON; its "text" and "label" columns are
# consumed by later cells.
df_val = pd.read_json("../data/aclIMDB_val.json", orient="records")
# Quick sanity checks: column summary, histograms, then a preview of the first
# rows (kept as the last expression so the notebook displays it).
df_val.info()
df_val.hist()
df_val.head()

In [None]:
# Coerce every review to a lower-cased string, one list per split. The
# generator is unpacked in order, so the splits are processed train -> val -> test.
train, val, test = (
    [str(review).lower() for review in frame["text"].values]
    for frame in (df_train, df_val, df_test)
)

In [None]:
import tensorflow as tf

# Size of the vocabulary handed to the tokenizer as `num_words`.
VOCAB_SIZE = 10000

# The tokenizer is fitted on the training texts only; words it does not keep
# are mapped to the "UNK" out-of-vocabulary token.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=VOCAB_SIZE,
    oov_token="UNK",
)
tokenizer.fit_on_texts(train)

# Turn each split into lists of integer token ids (train, then val, then test).
X_train, X_val, X_test = (
    tokenizer.texts_to_sequences(corpus) for corpus in (train, val, test)
)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

In [None]:
import numpy as np

def token_to_onehot(t, vocab_size=None):
    """Encode a sequence of token ids as a multi-hot vector.

    Despite the name, the result is a bag-of-words style *multi-hot* vector:
    every position whose index occurs in ``t`` is set to 1, regardless of how
    many times it occurs.

    Args:
        t: Iterable of integer token ids, each expected to be smaller than
            ``vocab_size``.
        vocab_size: Length of the returned vector. Defaults to the notebook's
            global ``VOCAB_SIZE`` when omitted.

    Returns:
        A 1-D float array of length ``vocab_size`` containing only 0s and 1s.

    Raises:
        IndexError: If an id in ``t`` is out of bounds for ``vocab_size``.
    """
    if vocab_size is None:
        # Resolved at call time so the function follows the notebook constant.
        vocab_size = VOCAB_SIZE

    onehot = np.zeros(vocab_size)
    # One vectorised assignment instead of a Python-level loop; fromiter
    # accepts any iterable of ids, including an empty one.
    indices = np.fromiter(t, dtype=np.intp)
    onehot[indices] = 1
    return onehot
        

In [None]:
# Smoke check: positions 0, 1, 2 and the last valid index should be set.
token_to_onehot([0, 1, 2, VOCAB_SIZE-1])

In [None]:
import numpy as np
# maxlen = 100

# Pull the "label" column of each split into its own NumPy array
# (train, then val, then test).
y_train, y_val, y_test = (
    np.array(frame["label"].values)
    for frame in (df_train, df_val, df_test)
)

In [None]:
def _multi_hot_matrix(sequences):
    """Stack the multi-hot encoding of every token sequence into one matrix.

    The output is preallocated and filled row by row, so the per-row vectors
    are never all held in memory next to the final matrix. It is stored as
    float32, which is exact for 0/1 values and half the size of float64.

    Args:
        sequences: Sized collection of token-id sequences accepted by
            ``token_to_onehot``.

    Returns:
        Array of shape ``(len(sequences), VOCAB_SIZE)``; an empty input yields
        shape ``(0, VOCAB_SIZE)``.
    """
    matrix = np.zeros((len(sequences), VOCAB_SIZE), dtype=np.float32)
    for row_index, sequence in enumerate(sequences):
        matrix[row_index] = token_to_onehot(sequence)
    return matrix


# NOTE: this rebinds X_* from token-id lists to encoded matrices, so the cell
# must not be re-run without re-running the tokenisation cell first.
X_train = _multi_hot_matrix(X_train)
X_val = _multi_hot_matrix(X_val)
X_test = _multi_hot_matrix(X_test)

In [None]:
import tensorflow as tf

# Feed-forward classifier over the VOCAB_SIZE-dimensional bag-of-words vector.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(VOCAB_SIZE,)),
    # The hidden layer needs a non-linearity: with the default (linear)
    # activation, two stacked Dense layers collapse into a single linear map,
    # i.e. plain logistic regression.
    tf.keras.layers.Dense(64, activation='relu'),
    # Single sigmoid unit -> probability of the positive class.
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# https://keras.io/api/optimizers/
# https://keras.io/api/optimizers/adam/
# https://keras.io/api/losses/
# https://keras.io/api/losses/probabilistic_losses/#binarycrossentropy-class
# https://keras.io/api/metrics/accuracy_metrics/
# https://keras.io/api/metrics/
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),
              loss="binary_crossentropy",
              metrics=['binary_accuracy'])

model.summary()

In [None]:
# Train for 10 epochs in mini-batches of 32, scoring the validation split
# after every epoch; `h` keeps the per-epoch history used by the plots.
h = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, y_val),
)

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation loss per epoch.
plt.plot(h.history['loss'], label='(training data)',color='blue')
plt.plot(h.history['val_loss'], label='(val data)',color='green')
plt.title('Neural Network training loss')
plt.ylabel('value')
plt.xlabel('No. epoch')
# The curve labels are only rendered once a legend is requested.
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation binary accuracy per epoch.
plt.plot(h.history['binary_accuracy'], label='(training data)',color='blue')
plt.plot(h.history['val_binary_accuracy'], label='(val data)',color='green')
plt.title('Neural Network training accuracy')
plt.ylabel('value')
plt.xlabel('No. epoch')
# The curve labels are only rendered once a legend is requested.
plt.legend()
plt.show()

In [None]:
# Score the trained model on the held-out test split; `results` holds the
# loss followed by the metrics configured in `compile`.
results = model.evaluate(
    x=X_test,
    y=y_test,
    batch_size=32,
)
print("test loss, test acc:", results)

In [None]:
# Re-read the test split to get the original (non-lower-cased) review text for
# display. NOTE(review): this is the same file already loaded into `df_test`.
df_test_raw = pd.read_json("../data/aclIMDB_test.json", orient="records")

In [None]:
# Round the sigmoid outputs for the first five test reviews to hard 0/1
# predictions and print each next to the review's opening text and true label.
predictions = np.round(model.predict(X_test[:5]), 0)
for predicted_row, raw_text, true_label in zip(
    predictions, df_test_raw["text"].values, y_test
):
    print(raw_text[:100], "...")
    print("Pred: ", predicted_row[0], "Real: ", true_label)