In [None]:
import pandas as pd

# Training split: read the JSON records, then print the schema, draw the
# column histograms and show the first rows as the cell output.
df_train = pd.read_json(
    "../data/news_train.json",
    orient="records",
)
df_train.info()
df_train.hist()
df_train.head()

In [None]:
# Test split: read the JSON records, then print the schema, draw the
# column histograms and show the first rows as the cell output.
df_test = pd.read_json(
    "../data/news_test.json",
    orient="records",
)
df_test.info()
df_test.hist()
df_test.head()

In [None]:
# Validation split: read the JSON records, then print the schema, draw the
# column histograms and show the first rows as the cell output.
df_val = pd.read_json(
    "../data/news_val.json",
    orient="records",
)
df_val.info()
df_val.hist()
df_val.head()

In [None]:
# Pull the "text" column of every split out as a plain Python list; these
# lists are what gets handed to the tokenizer later on.
train, val, test = (
    list(frame["text"].values) for frame in (df_train, df_val, df_test)
)

In [None]:
import numpy as np
from tokenizers import BertWordPieceTokenizer

# Shared tokenizer instance used by bert_tokenizer() below. It is built from
# the local bert-base-uncased vocab file with lowercasing enabled.
tokenizer = BertWordPieceTokenizer("../data/bert-base-uncased-vocab.txt",
                                   lowercase=True)

def bert_tokenizer(data):
    """Encode every text in ``data`` with the module-level ``tokenizer``.

    Args:
        data: Iterable of texts; each one is passed to ``tokenizer.encode``.

    Returns:
        ``np.ndarray`` with ``dtype=object`` built from the per-text lists of
        token ids (the ``ids`` attribute of each encoding), in input order.
    """
    token_ids = [tokenizer.encode(text).ids for text in data]
    # dtype=object lets NumPy hold id lists of different lengths.
    return np.array(token_ids, dtype=object)

In [None]:
import tensorflow as tf

# Fixed sequence length that every tokenized text is padded/truncated to.
maxlen = 128

# Tokenize each split and bring every row to exactly `maxlen` ids
# (pad_sequences is called with its default padding/truncating behaviour).
X_train, X_val, X_test = (
    tf.keras.preprocessing.sequence.pad_sequences(bert_tokenizer(texts), maxlen=maxlen)
    for texts in (train, val, test)
)

# Targets come straight from the "label" column of the matching frame.
y_train, y_val, y_test = (
    frame["label"].values for frame in (df_train, df_val, df_test)
)

In [None]:
import tensorflow as tf

emb_size = 32
# Take the vocabulary size from the tokenizer rather than a hand-typed literal,
# so the embedding table always has a row for every id the tokenizer can emit.
max_features = tokenizer.get_vocab_size()

# Embedding -> Conv1D -> max-pool -> Conv1D -> global max-pool -> 4-way softmax.
model_cnn = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_features, emb_size, input_length=maxlen),
    tf.keras.layers.Conv1D(32, 7, activation='relu'),
    tf.keras.layers.MaxPooling1D(5),
    tf.keras.layers.Conv1D(32, 7, activation='relu'),
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(units=4, activation='softmax')
])

# The sparse categorical loss takes class ids as targets rather than one-hot
# rows, which matches feeding the label column directly.
model_cnn.compile(optimizer='adam',
                  loss="sparse_categorical_crossentropy",
                  metrics=['accuracy'])

model_cnn.summary()

In [None]:
# Stop once validation accuracy has failed to improve for 2 epochs in a row,
# and roll the model back to its best epoch so that later evaluation does not
# run on the weights from the non-improving epochs that triggered the stop.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy',
                                            patience=2,
                                            restore_best_weights=True)

# Train the CNN; the returned History object feeds the learning-curve plots.
h_cnn = model_cnn.fit(X_train,
                      y_train,
                      epochs=5,
                      batch_size=32,
                      callbacks=[callback],
                      validation_data=(X_val, y_val))

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation loss per epoch for the CNN run.
plt.plot(h_cnn.history['loss'], label='(training data)',color='blue')
plt.plot(h_cnn.history['val_loss'], label='(val data)',color='green')
plt.title('Neural Network loss')
plt.ylabel('value')
plt.xlabel('No. epoch')
# The label= strings above are only drawn when a legend is requested.
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation accuracy per epoch for the CNN run.
plt.plot(h_cnn.history['accuracy'], label='(training data)',color='blue')
plt.plot(h_cnn.history['val_accuracy'], label='(val data)',color='green')
plt.title('Neural Network accuracy')
plt.ylabel('value')
plt.xlabel('No. epoch')
# The label= strings above are only drawn when a legend is requested.
plt.legend()
plt.show()

In [None]:
# Score the trained CNN on the test split; `results` is whatever evaluate()
# returns for the compiled loss and metric, printed as-is.
results = model_cnn.evaluate(
    x=X_test,
    y=y_test,
    batch_size=32,
)
print("Simple cnn test loss, test acc:", results)

In [None]:
# Longest training example measured in *tokens*. Calling len() on the raw
# strings would count characters, which is not the unit pad_sequences and the
# Embedding layer work in and would pad every row far beyond what is needed.
# Texts from other splits that tokenize to more than this many ids will be
# truncated by pad_sequences when this value is used as maxlen.
total_len = max(len(ids) for ids in bert_tokenizer(train))
total_len

In [None]:
# Re-encode all three splits, this time padded to `total_len` instead of the
# 128-id cap used earlier. Note that this rebinds X_train / X_val / X_test.
X_train, X_val, X_test = (
    tf.keras.preprocessing.sequence.pad_sequences(bert_tokenizer(texts), maxlen=total_len)
    for texts in (train, val, test)
)

In [None]:
import tensorflow as tf

emb_size = 32
# Take the vocabulary size from the tokenizer rather than a hand-typed literal,
# so the embedding table always has a row for every id the tokenizer can emit.
max_features = tokenizer.get_vocab_size()

# Embedding -> Conv1D -> GRU -> 4-way softmax.
model_cnn_gru = tf.keras.Sequential([
    tf.keras.layers.Embedding(max_features, emb_size, input_length=total_len),
    tf.keras.layers.Conv1D(32, 7, activation='relu'),
    # GRU is used without return_sequences, so it already emits one flat
    # 32-unit vector per example; no Flatten layer is needed before Dense.
    tf.keras.layers.GRU(32),
    tf.keras.layers.Dense(units=4, activation='softmax')
])

# The sparse categorical loss takes class ids as targets rather than one-hot
# rows, which matches feeding the label column directly.
model_cnn_gru.compile(optimizer='adam',
                      loss="sparse_categorical_crossentropy",
                      metrics=['accuracy'])

model_cnn_gru.summary()

In [None]:
# Stop once validation accuracy has failed to improve for 2 epochs in a row,
# and roll the model back to its best epoch so that later evaluation does not
# run on the weights from the non-improving epochs that triggered the stop.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy',
                                            patience=2,
                                            restore_best_weights=True)

# Train the CNN+GRU model with the same schedule as the plain CNN.
h_cnn_gru = model_cnn_gru.fit(X_train,
                              y_train,
                              epochs=5,
                              batch_size=32,
                              callbacks=[callback],
                              validation_data=(X_val, y_val))

In [None]:
# Score the trained CNN+GRU model on the test split; `results` is whatever
# evaluate() returns for the compiled loss and metric, printed as-is.
results = model_cnn_gru.evaluate(
    x=X_test,
    y=y_test,
    batch_size=32,
)
print("Simple cnn+gru test loss, test acc:", results)