In [None]:
import os

import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import regex as re
import seaborn as sns
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from tqdm.notebook import tqdm

In [None]:
# Load the five processed training files, one DataFrame per file.
# '@@' is a multi-character separator, which pandas treats as a regular
# expression and can only parse with the python engine; request that engine
# explicitly instead of relying on the implicit fallback and its ParserWarning.
df1, df2, df3, df4, df5 = (
    pd.read_csv(f'data/clean/processed/train_{size}.txt', sep='@@', engine='python')
    for size in (1000, 2000, 3000, 4000, 5500)
)

In [3]:
# Stack the five frames into a single frame.
# ignore_index=True gives every row a unique label; without it the index of
# each input frame is carried over and labels repeat across the result.
# NOTE(review): if the train_* files are cumulative subsets of one another,
# the concatenation contains duplicated questions that can end up on both
# sides of the later train/test split -- confirm against the data.
frames = [df1, df2, df3, df4, df5]
final_df = pd.concat(frames, ignore_index=True)
final_df.shape

(15452, 4)

In [4]:
def normalize_text(s):
    """Lower-case a sentence and strip punctuation that touches whitespace.

    A non-word character is dropped when it directly follows or directly
    precedes a whitespace character, so word-internal marks such as hyphens
    and apostrophes are kept. A non-word character with no whitespace
    neighbour (for example a '?' glued to the last word) is left in place.
    Runs of whitespace are collapsed to a single space; leading and trailing
    spaces are not trimmed.

    Parameters
    ----------
    s : str
        The text to normalise.

    Returns
    -------
    str
        The normalised text.
    """
    s = s.lower()

    # Raw strings: '\s' and '\W' are not valid Python string escapes, so the
    # non-raw form triggers an invalid-escape warning on modern interpreters.
    s = re.sub(r'\s\W', ' ', s)
    s = re.sub(r'\W\s', ' ', s)

    # The substitutions above can leave doubled spaces behind.
    s = re.sub(r'\s+', ' ', s)

    return s

# Run every question through normalize_text and overwrite the column with the result.
final_df['question'] = list(map(normalize_text, final_df['question']))

In [5]:
# Collect the question texts into a NumPy array of sentences.
headlines = np.array(final_df['question'].tolist())

In [6]:
# One-hot encode the 'entity' label column into a float32 matrix:
# one row per question, one column per distinct label.
entity_dummies = pd.get_dummies(final_df['entity'])
onehot_encoded = entity_dummies.astype('float32').values

In [7]:
# Features are the sentences; targets are the matching one-hot label rows.
X, y = headlines, onehot_encoded

In [8]:
# 80/20 train/test split on a seeded shuffle of the row positions.
# X and y are built from files concatenated in order, so a purely positional
# split would put only the tail of that ordering in the test set (and Keras'
# validation_split later takes the tail of the training arrays as well).
# The fixed seed keeps the split reproducible across runs.
split_rng = np.random.default_rng(42)
shuffled_idx = split_rng.permutation(len(X))

train_pct_index = int(0.8 * len(X))
train_idx = shuffled_idx[:train_pct_index]
test_idx = shuffled_idx[train_pct_index:]

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

In [9]:
# --- model / tokenizer hyper-parameters ---
vocab_size = 1200        # Tokenizer num_words and Embedding input size
embedding_dim = 16       # width of each embedding vector
max_length = 120         # padded sequence length
trunc_type = 'post'      # side to cut over-long sequences from
padding_type = 'post'    # side to pad short sequences on
oov_tok = "<OOV>"        # token substituted for out-of-vocabulary words

In [10]:
# Fit the tokenizer on the training sentences only, so the vocabulary is not
# influenced by the held-out sentences.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

# Convert the training sentences to integer sequences and pad/truncate them
# to max_length. truncating=trunc_type applies the configured truncation
# side; it was previously defined but never passed, so pad_sequences fell
# back to its own default.
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, maxlen=max_length,
                             padding=padding_type, truncating=trunc_type)

# Same conversion for the held-out sentences, using the training vocabulary.
validation_sequences = tokenizer.texts_to_sequences(X_test)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length,
                                  padding=padding_type, truncating=trunc_type)

In [None]:
# model initialization
# One output unit per class, taken from the width of the one-hot label matrix
# so the network cannot drift out of sync with the encoding.
num_classes = y_train.shape[1]

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    # Each sample carries exactly one one-hot label, so the output must be a
    # probability distribution over classes: softmax, not independent sigmoids.
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# compile model
# categorical_crossentropy expects one-hot labels; the sparse variant expects
# integer class ids; the binary variant is for true/false targets.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# model summary -- summary() prints by itself and returns None, so it is not
# wrapped in print() (which would emit a stray "None").
model.summary()

In [None]:
# Train the model; 30% of the training arrays are held out by Keras as the
# per-epoch validation set.
num_epochs = 20
fit_options = dict(epochs=num_epochs, verbose=1, validation_split=0.3)
history = model.fit(train_padded, y_train, **fit_options)

# Score the held-out sentences with the trained model.
pred = model.predict(validation_padded)

In [13]:
import matplotlib.pyplot as plt

def plot_history_and_save(history, save_path):
    """Plot accuracy and loss curves side by side, save the figure, and show it.

    Parameters
    ----------
    history : object
        Must expose a ``history`` mapping with the keys 'accuracy',
        'val_accuracy', 'loss' and 'val_loss', each a per-epoch sequence.
    save_path : str or path-like
        File the figure is written to. The parent directory is created when
        it does not exist yet.
    """
    # (history key, panel label) for the left and right panel respectively.
    panels = [('accuracy', 'Accuracy'), ('loss', 'Loss')]

    fig, axes = plt.subplots(1, 2, figsize=(18, 6))
    for ax, (key, label) in zip(axes, panels):
        ax.plot(history.history[key], label=f'Training {label}', c='dodgerblue', lw=2)
        ax.plot(history.history[f'val_{key}'], label=f'Validation {label}', c='orange', lw=2)
        ax.set_title(label, loc='left', fontsize=16)
        ax.set_xlabel("Epochs")
        ax.set_ylabel(label)
        ax.legend()

    # savefig does not create missing directories, so make sure the target
    # folder exists first (a bare file name has no directory part to create).
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    fig.savefig(save_path, bbox_inches='tight')
    plt.show()

In [16]:
# Identify this training run; both values are embedded in the output path below.
model_name = 'model_5500'
run_no = '1'

# Destination of the training-history figure for this model/run.
plot_dir = f'images/plot_history/{model_name}'
save_path = f'{plot_dir}/run_{run_no}.png'

In [None]:
plot_history_and_save(history, save_path)

In [69]:
from sklearn.metrics import multilabel_confusion_matrix, confusion_matrix

def plot_cm(pred, y_true=None, class_names=None):
    """Draw a single-label confusion matrix for a set of predictions.

    The predicted and true class of each sample is the arg-max over its row.
    The earlier implementation rounded the scores and fed the 3-D output of
    multilabel_confusion_matrix to a heatmap, which cannot be drawn.

    Parameters
    ----------
    pred : array-like, shape (n_samples, n_classes)
        Per-class scores, e.g. the output of ``model.predict``.
    y_true : array-like, shape (n_samples, n_classes), optional
        One-hot true labels. Defaults to the notebook-level ``y_test``.
    class_names : sequence of str, optional
        Tick labels, in class-index order. Defaults to the column order that
        ``pd.get_dummies`` produces for ``final_df['entity']``, i.e. the same
        order the one-hot label matrix was built in.
    """
    if y_true is None:
        y_true = y_test
    y_true = np.asarray(y_true)
    pred = np.asarray(pred)
    n_classes = y_true.shape[1]

    if class_names is None:
        class_names = pd.get_dummies(final_df['entity']).columns.astype(str).tolist()

    # Pin the label set so the matrix is n_classes x n_classes even when a
    # class never occurs in y_true or pred.
    cm = confusion_matrix(y_true.argmax(axis=1), pred.argmax(axis=1),
                          labels=np.arange(n_classes))

    fig, ax = plt.subplots(1, 1, figsize=(6, 6))
    sns.heatmap(cm, annot=True, cbar=False, fmt='d', cmap='Blues', ax=ax,
                xticklabels=class_names, yticklabels=class_names)

    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

    plt.show()

In [70]:
# Confusion matrix over arg-max classes (rows: true class, columns: predicted).
# The label set is pinned to every class index so the matrix keeps one row and
# column per class even if a class is missing from both y_test and pred.
mcm = confusion_matrix(
    y_test.argmax(axis=1),
    pred.argmax(axis=1),
    labels=np.arange(y_test.shape[1]),
)


In [71]:
# Label the confusion matrix with the class names in class-index order.
# The indices come from the arg-max over the one-hot matrix built by
# pd.get_dummies(final_df['entity']), so the names are read from that same
# encoding rather than hard-coded; a hand-written list can silently disagree
# with the column order get_dummies actually produced.
class_names = pd.get_dummies(final_df['entity']).columns.astype(str).tolist()
mcm_df = pd.DataFrame(mcm, index=class_names, columns=class_names)

In [75]:
# Destination of the confusion-matrix figure for this model/run.
cm_plot_dir = f'images/cm/{model_name}'
save_path_cm = f'{cm_plot_dir}/run_{run_no}.png'

In [None]:
# Plot the labelled confusion matrix as an annotated heatmap and save it.
fig, ax = plt.subplots(figsize=(5, 4))
# fmt='d' prints the cell counts as plain integers instead of the default
# general format, which switches to scientific notation for larger counts.
sns.heatmap(mcm_df, annot=True, fmt='d', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual Values')
ax.set_xlabel('Predicted Values')

# savefig does not create missing directories; make sure the folder exists.
cm_out_dir = os.path.dirname(save_path_cm)
if cm_out_dir:
    os.makedirs(cm_out_dir, exist_ok=True)

fig.savefig(save_path_cm, bbox_inches='tight')
plt.show()

In [None]:
# Persist the trained model to disk.
# NOTE(review): the path has no file extension; the original note mentioned
# both the TensorFlow SavedModel format and '.keras' -- confirm which format
# the installed Keras version writes for an extension-less path.
model_file = 'models/5500'
model.save(filepath=model_file)

In [None]:
# Load the saved model back from disk.
reconstructed_model = keras.models.load_model(model_file)

# Sanity check: the reloaded model must reproduce the in-memory model's
# predictions on the held-out sequences.
expected_pred = model.predict(validation_padded)
reloaded_pred = reconstructed_model.predict(validation_padded)
np.testing.assert_allclose(expected_pred, reloaded_pred)