In [None]:
import pandas as pd

# Training split: load the records, then inspect schema, distributions and a
# few sample rows.
df_train = pd.read_json(
    "../data/news_train.json",
    orient="records",
)
df_train.info()
df_train.hist()
df_train.head()

In [None]:
# Test split: load the records, then inspect schema, distributions and a few
# sample rows.
df_test = pd.read_json(
    "../data/news_test.json",
    orient="records",
)
df_test.info()
df_test.hist()
df_test.head()

In [None]:
# Validation split: load the records, then inspect schema, distributions and
# a few sample rows.
df_val = pd.read_json(
    "../data/news_val.json",
    orient="records",
)
df_val.info()
df_val.hist()
df_val.head()

In [None]:
import numpy as np
from tokenizers import BertWordPieceTokenizer

# WordPiece tokenizer built from the local vocabulary file, with lower-casing
# enabled. Shared by `bert_tokenizer` below.
tokenizer = BertWordPieceTokenizer(
    "../data/bert-base-uncased-vocab.txt",
    lowercase=True,
)

def bert_tokenizer(data):
    """Encode each text in ``data`` to its list of token ids.

    Uses the module-level ``tokenizer``; each item is passed to
    ``tokenizer.encode`` and the ``.ids`` of the result is kept.

    Args:
        data: Iterable of texts to encode.

    Returns:
        A NumPy array with one entry per input text. When every text encodes
        to the same number of ids the result is a regular 2-D array; otherwise
        it is a 1-D ``object`` array whose elements are the per-text id lists.
    """
    encoded = [tokenizer.encode(item).ids for item in data]

    # Uniform lengths (or no input at all): a plain array is well-defined.
    if len({len(ids) for ids in encoded}) <= 1:
        return np.array(encoded)

    # Ragged lengths: np.array() on a ragged nested list raises ValueError on
    # NumPy >= 1.24, so fill an object array element by element instead.
    ragged = np.empty(len(encoded), dtype=object)
    for position, ids in enumerate(encoded):
        ragged[position] = ids
    return ragged

In [None]:
import tensorflow as tf

maxlen = 128  # length passed to pad_sequences for every split


def _encode_split(frame):
    """Return ``(padded_token_ids, labels)`` for one dataset split.

    The "text" column is tokenized with ``bert_tokenizer`` and passed through
    ``pad_sequences(..., maxlen=maxlen)``; the "label" column is returned
    unchanged as an array.
    """
    token_ids = bert_tokenizer(list(frame["text"].values))
    padded = tf.keras.preprocessing.sequence.pad_sequences(token_ids, maxlen=maxlen)
    return padded, frame["label"].values


X_train, y_train = _encode_split(df_train)
X_val, y_val = _encode_split(df_val)
X_test, y_test = _encode_split(df_test)

In [None]:
import tensorflow as tf

max_features = 30523  # vocab size (size of the embedding table)

# Small baseline classifier: embed each token id, flatten the sequence, then
# one hidden layer feeding a 4-way softmax.
model = tf.keras.Sequential([
    # Explicit input spec instead of Embedding(input_length=...): that argument
    # is deprecated in Keras 3, and declaring the input here lets
    # model.summary() report concrete shapes and parameter counts.
    tf.keras.Input(shape=(maxlen,), dtype="int32"),
    tf.keras.layers.Embedding(max_features, 8),
    tf.keras.layers.Flatten(),
    # A non-linearity is required here: without it this layer and the output
    # layer collapse into a single linear map.
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(units=4, activation='softmax')
])

# Sparse loss: the labels are passed as integer class ids, not one-hot vectors.
model.compile(optimizer='adam',
              loss="sparse_categorical_crossentropy",
              metrics=['accuracy'])

model.summary()

In [None]:
# Train on the CPU; the validation split is passed to fit() so the returned
# history `h` also carries the validation metrics.
with tf.device('/CPU:0'):
    h = model.fit(
        x=X_train,
        y=y_train,
        batch_size=32,
        epochs=10,
        validation_data=(X_val, y_val),
    )

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation loss per epoch.
plt.plot(h.history['loss'], label='(training data)',color='blue')
plt.plot(h.history['val_loss'], label='(val data)',color='green')
plt.title('Neural Network loss')
plt.ylabel('value')
plt.xlabel('No. epoch')
plt.legend()  # without this the label= texts above are never drawn
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation accuracy per epoch.
plt.plot(h.history['accuracy'], label='(training data)',color='blue')
plt.plot(h.history['val_accuracy'], label='(val data)',color='green')
plt.title('Neural Network accuracy')
plt.ylabel('value')
plt.xlabel('No. epoch')
plt.legend()  # without this the label= texts above are never drawn
plt.show()

In [None]:
# Score the trained model on the test split (run on the CPU).
with tf.device('/CPU:0'):
    results = model.evaluate(x=X_test, y=y_test, batch_size=32)

print("test loss, test acc:", results)

In [None]:
# Fresh copy of the test records, used to show the original text next to each
# prediction.
df_test_raw = pd.read_json(
    "../data/news_test.json",
    orient="records",
)

In [None]:
# Show the first 100 test articles with predicted vs. true label.
with tf.device('/CPU:0'):
    class_probs = model.predict(X_test[:100])

# The model outputs one softmax probability per class. The predicted label is
# the index of the largest one; rounding the vector would print a 0/1 vector
# (possibly all zeros) that cannot be compared with the integer label.
predictions = np.argmax(class_probs, axis=-1)

sample_texts = df_test_raw["text"].values
for i, predicted_label in enumerate(predictions):
    print(sample_texts[i][:100],"...")
    print("Pred: ", predicted_label, "Real: ", y_test[i])