In [None]:
import pandas as pd

# Load the training split; the file is read as a JSON list of records.
df_train = pd.read_json(path_or_buf="../data/news_train.json", orient="records")

# Quick inspection: schema summary, histograms, then the first rows
# (only the final expression is rendered as the cell output).
df_train.info()
df_train.hist()
df_train.head()

In [None]:
# Load the test split; the file is read as a JSON list of records.
df_test = pd.read_json(path_or_buf="../data/news_test.json", orient="records")

# Quick inspection: schema summary, histograms, then the first rows
# (only the final expression is rendered as the cell output).
df_test.info()
df_test.hist()
df_test.head()

In [None]:
# Load the validation split; the file is read as a JSON list of records.
df_val = pd.read_json(path_or_buf="../data/news_val.json", orient="records")

# Quick inspection: schema summary, histograms, then the first rows
# (only the final expression is rendered as the cell output).
df_val.info()
df_val.hist()
df_val.head()

In [None]:
import numpy as np
from tokenizers import BertWordPieceTokenizer

# WordPiece tokenizer built from a local vocabulary file; text is lowercased
# before tokenization (lowercase=True).
vocab_path = "../data/bert-base-uncased-vocab.txt"
tokenizer = BertWordPieceTokenizer(vocab_path, lowercase=True)

def bert_tokenizer(data):
    """Encode every text in ``data`` into its list of token ids.

    Relies on the module-level ``tokenizer`` object.

    Args:
        data: Iterable of texts to encode.

    Returns:
        A NumPy array with ``dtype=object`` built from the per-text id lists,
        in the same order as ``data``.
    """
    token_ids = [tokenizer.encode(text).ids for text in data]
    # dtype=object lets id lists of different lengths live in a single array.
    return np.array(token_ids, dtype=object)

In [None]:
import tensorflow as tf

maxlen = 128


def _encode_and_pad(frame):
    """Turn one split's DataFrame into ``(padded_token_ids, labels)``.

    The ``"text"`` column is tokenized with ``bert_tokenizer`` and every
    sequence is padded/truncated to ``maxlen`` using the ``pad_sequences``
    defaults (which pad and truncate at the front of the sequence). The
    ``"label"`` column is returned unchanged as a NumPy array.
    """
    texts = list(frame["text"].values)
    token_ids = bert_tokenizer(texts)
    padded = tf.keras.preprocessing.sequence.pad_sequences(token_ids, maxlen=maxlen)
    return padded, frame["label"].values


# The same preprocessing is applied to each of the three splits.
X_train, y_train = _encode_and_pad(df_train)
X_val, y_val = _encode_and_pad(df_val)
X_test, y_test = _encode_and_pad(df_test)

In [None]:
import tensorflow as tf

emb_size = 32
max_features = 30523  # vocab size (used as the Embedding input_dim)

# Token ids -> learned embeddings -> one SimpleRNN layer -> 4-way softmax.
model_simple_rnn = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(
            input_dim=max_features,
            output_dim=emb_size,
            input_length=maxlen,
        ),
        tf.keras.layers.SimpleRNN(emb_size, activation='tanh'),
        tf.keras.layers.Dense(4, activation='softmax'),
    ]
)

# Sparse categorical cross-entropy: labels are passed as integer class ids
# rather than one-hot vectors.
model_simple_rnn.compile(
    loss="sparse_categorical_crossentropy",
    optimizer='adam',
    metrics=['accuracy'],
)

model_simple_rnn.summary()

In [None]:
# Stop training once validation accuracy has gone 2 epochs without improving.
callback = tf.keras.callbacks.EarlyStopping(patience=2, monitor='val_accuracy')

# Train for at most 5 epochs, tracking metrics on the validation split.
h_simple_rnn = model_simple_rnn.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    callbacks=[callback],
    batch_size=32,
    epochs=5,
)

In [None]:
import matplotlib.pyplot as plt

# Per-epoch loss recorded by fit() for the simple RNN: training curve vs. the
# curve computed on the data passed as validation_data.
plt.plot(h_simple_rnn.history['loss'], label='(training data)', color='blue')
plt.plot(h_simple_rnn.history['val_loss'], label='(val data)', color='green')
plt.title('Neural Network training loss')
plt.ylabel('value')
plt.xlabel('No. epoch')
# The label= strings are only drawn when a legend is requested explicitly.
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Per-epoch accuracy recorded by fit() for the simple RNN: training curve vs.
# the curve computed on the data passed as validation_data.
plt.plot(h_simple_rnn.history['accuracy'], label='(training data)', color='blue')
plt.plot(h_simple_rnn.history['val_accuracy'], label='(val data)', color='green')
plt.title('Neural Network training accuracy')
plt.ylabel('value')
plt.xlabel('No. epoch')
# The label= strings are only drawn when a legend is requested explicitly.
plt.legend()
plt.show()

In [None]:
# Score the trained simple RNN on the test split; evaluate() returns the loss
# followed by the metrics given to compile() (accuracy).
results = model_simple_rnn.evaluate(x=X_test, y=y_test, batch_size=32)
print("Simple rnn test loss, test acc:", results)

In [None]:
import tensorflow as tf

emb_size = 32
max_features = 30523  # vocab size (used as the Embedding input_dim)

# Token ids -> learned embeddings -> one LSTM layer -> 4-way softmax.
model_lstm = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(
            input_dim=max_features,
            output_dim=emb_size,
            input_length=maxlen,
        ),
        tf.keras.layers.LSTM(emb_size, activation='tanh'),
        tf.keras.layers.Dense(4, activation='softmax'),
    ]
)

# Sparse categorical cross-entropy: labels are passed as integer class ids
# rather than one-hot vectors.
model_lstm.compile(
    loss="sparse_categorical_crossentropy",
    optimizer='adam',
    metrics=['accuracy'],
)

model_lstm.summary()

In [None]:
# Stop training once validation accuracy has gone 2 epochs without improving.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=2)

# Validate on the dedicated validation split, NOT the test split: early
# stopping selects the stopping epoch from these metrics, so using test data
# here would leak it into model selection and bias the final test evaluation.
h_lstm = model_lstm.fit(X_train,
                        y_train,
                        epochs=5,
                        batch_size=32,
                        callbacks=[callback],
                        validation_data=(X_val, y_val))

In [None]:
import matplotlib.pyplot as plt

# Per-epoch loss recorded by fit() for the LSTM: training curve vs. the curve
# computed on the data passed as validation_data.
plt.plot(h_lstm.history['loss'], label='(training data)', color='blue')
plt.plot(h_lstm.history['val_loss'], label='(val data)', color='green')
plt.title('Neural Network training loss')
plt.ylabel('value')
plt.xlabel('No. epoch')
# The label= strings are only drawn when a legend is requested explicitly.
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Per-epoch accuracy recorded by fit() for the LSTM: training curve vs. the
# curve computed on the data passed as validation_data.
plt.plot(h_lstm.history['accuracy'], label='(training data)', color='blue')
plt.plot(h_lstm.history['val_accuracy'], label='(val data)', color='green')
plt.title('Neural Network training accuracy')
plt.ylabel('value')
plt.xlabel('No. epoch')
# The label= strings are only drawn when a legend is requested explicitly.
plt.legend()
plt.show()

In [None]:
# Score the trained LSTM on the test split; evaluate() returns the loss
# followed by the metrics given to compile() (accuracy).
results = model_lstm.evaluate(x=X_test, y=y_test, batch_size=32)
print("LSTM test loss, test acc:", results)

In [None]:
import tensorflow as tf

emb_size = 32
max_features = 30523  # vocab size (used as the Embedding input_dim)

# Token ids -> learned embeddings -> one GRU layer -> 4-way softmax.
model_gru = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(
            input_dim=max_features,
            output_dim=emb_size,
            input_length=maxlen,
        ),
        tf.keras.layers.GRU(emb_size, activation='tanh'),
        tf.keras.layers.Dense(4, activation='softmax'),
    ]
)

# Sparse categorical cross-entropy: labels are passed as integer class ids
# rather than one-hot vectors.
model_gru.compile(
    loss="sparse_categorical_crossentropy",
    optimizer='adam',
    metrics=['accuracy'],
)

model_gru.summary()

In [None]:
# Stop training once validation accuracy has gone 2 epochs without improving.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=2)

# Validate on the dedicated validation split, NOT the test split: early
# stopping selects the stopping epoch from these metrics, so using test data
# here would leak it into model selection and bias the final test evaluation.
h_gru = model_gru.fit(X_train,
                      y_train,
                      epochs=5,
                      batch_size=32,
                      callbacks=[callback],
                      validation_data=(X_val, y_val))

In [None]:
import matplotlib.pyplot as plt

# Per-epoch loss recorded by fit() for the GRU: training curve vs. the curve
# computed on the data passed as validation_data.
plt.plot(h_gru.history['loss'], label='(training data)', color='blue')
plt.plot(h_gru.history['val_loss'], label='(val data)', color='green')
plt.title('Neural Network training loss')
plt.ylabel('value')
plt.xlabel('No. epoch')
# The label= strings are only drawn when a legend is requested explicitly.
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Per-epoch accuracy recorded by fit() for the GRU: training curve vs. the
# curve computed on the data passed as validation_data.
plt.plot(h_gru.history['accuracy'], label='(training data)', color='blue')
plt.plot(h_gru.history['val_accuracy'], label='(val data)', color='green')
plt.title('Neural Network training accuracy')
plt.ylabel('value')
plt.xlabel('No. epoch')
# The label= strings are only drawn when a legend is requested explicitly.
plt.legend()
plt.show()

In [None]:
# Score the trained GRU on the test split; evaluate() returns the loss
# followed by the metrics given to compile() (accuracy).
results = model_gru.evaluate(x=X_test, y=y_test, batch_size=32)
print("GRU test loss, test acc:", results)