In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


plt.style.use('ggplot')

In [None]:
def plot_history(history):
    """Plot per-epoch accuracy and loss curves side by side.

    Parameters
    ----------
    history : object
        Any object exposing a ``history`` dict (e.g. the return value of
        ``model.fit``). The dict must contain ``'accuracy'`` and ``'loss'``.
        ``'val_accuracy'`` and ``'val_loss'`` are plotted only when present,
        so a run without validation data does not raise ``KeyError``.
    """
    metrics = history.history
    epochs = range(1, len(metrics['accuracy']) + 1)

    fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(16, 5))

    # One panel per metric: (axes, key in the history dict, panel title).
    panels = ((ax_acc, 'accuracy', 'Accuracy'), (ax_loss, 'loss', 'Loss'))
    for ax, key, title in panels:
        ax.plot(epochs, metrics[key], label=f'training {key}')

        # Validation curves exist only if validation data was given to fit().
        val_values = metrics.get(f'val_{key}')
        if val_values is not None:
            ax.plot(epochs, val_values, label=f'validation {key}')

        ax.set_title(title)
        ax.set_xlabel('epoch')
        ax.legend()

## Read dataset

In [None]:
# Load only the label column and the cleaned text, exposing the latter as 'text'.
df_newsgroup = pd.read_csv(
    '/kaggle/input/20-newsgroup-preprocessed/20newsgroup_preprocessed.csv',
    sep=';',
    usecols=['target', 'text_cleaned'],
).rename(columns={'text_cleaned': 'text'})

## Encode classes

In [None]:
# Learn the label -> integer id mapping from the distinct values of 'target'.
target_labels = df_newsgroup['target'].unique()
le = LabelEncoder().fit(target_labels)
le

In [None]:
# Replace the raw labels with their integer ids. The 'target' column is
# overwritten, so re-running this cell feeds already-encoded values back into
# `le.transform`; re-run from the data-loading cell instead.
df_newsgroup = df_newsgroup.assign(target=le.transform(df_newsgroup['target']))

## Split dataset into train and test sets

In [None]:
X = df_newsgroup['text'].astype(str)

# One-hot encode the integer class ids; the width is the number of distinct ids.
encoded_targets = df_newsgroup['target']
y = tf.keras.utils.to_categorical(encoded_targets, num_classes=encoded_targets.nunique())

# Stratify on the class ids (not on the one-hot matrix).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=encoded_targets,
)

## Tokenize words

In [None]:
# Build the word index from the training texts only.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X_train)

# One more than the number of indexed words, so the padding value 0 also has
# an embedding row.
word_count = len(tokenizer.word_index)
vocab_size = word_count + 1

## Text to sequences

In [None]:
# Convert each text into its list of word indices using the fitted tokenizer.
train_seq, test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

## Padding

In [None]:
# Every sequence is padded (or cut) at the end to the longest training sequence.
max_length = max(len(sequence) for sequence in train_seq)

pad_options = dict(maxlen=max_length, padding='post', truncating='post')
train_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seq, **pad_options)
test_vector = tf.keras.preprocessing.sequence.pad_sequences(test_seq, **pad_options)

## Model

In [None]:
class StopTrainOnHighAccuracy(tf.keras.callbacks.Callback):
    """Stop training once the training accuracy exceeds a threshold.

    Parameters
    ----------
    acc_threshold : float, default 0.9
        Training stops at the end of the first epoch whose ``'accuracy'``
        log value is strictly greater than this threshold.
    """

    def __init__(self, acc_threshold=0.9):
        super().__init__()
        self.acc_threshold = acc_threshold

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's training accuracy and request a stop if it is high enough.

        ``logs`` may be ``None`` or lack the ``'accuracy'`` key (for example
        when the model was compiled without that metric); in both cases the
        callback does nothing instead of raising ``TypeError``.
        """
        accuracy = (logs or {}).get('accuracy')
        if accuracy is not None and accuracy > self.acc_threshold:
            print(f"\nReached {self.acc_threshold} accuracy, cancelling training")
            self.model.stop_training = True

def build_model(vocab_size, max_length, num_classes=20):
    """Build the (uncompiled) bidirectional-LSTM text classifier.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the embedding table.
    max_length : int
        Length of the padded input sequences.
    num_classes : int, default 20
        Width of the softmax output layer; must equal the width of the
        one-hot label matrix used for training.

    Returns
    -------
    tf.keras.models.Sequential
    """
    return tf.keras.models.Sequential([
        tf.keras.layers.Embedding(vocab_size, 64, input_length=max_length),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])


# The factory has its own name so that `model` always refers to the network
# instance. The output width follows the one-hot labels rather than a
# hard-coded class count.
model = build_model(vocab_size, max_length, num_classes=y_train.shape[1])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

### Train

In [None]:
# Note: the test split is also used as validation data during training.
history = model.fit(
    train_vector,
    y_train,
    epochs=10,
    validation_data=(test_vector, y_test),
    callbacks=[StopTrainOnHighAccuracy()],
)

### Validation

In [None]:
# (report template, features, one-hot labels) for each split, train first.
evaluation_sets = (
    ("Training Accuracy: {:.4f}", train_vector, y_train),
    ("Testing Accuracy:  {:.4f}", test_vector, y_test),
)

for report_template, features, labels in evaluation_sets:
    loss, accuracy = model.evaluate(features, labels, verbose=False)
    print(report_template.format(accuracy))

In [None]:
# Draw the accuracy and loss curves stored in `history` (returned by `model.fit`).
plot_history(history)

## Evaluate

In [None]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6. Taking the argmax
# of the softmax output gives the same class ids and works on every version.
class_probabilities = model.predict(test_vector)
predictions = np.argmax(class_probabilities, axis=-1)

# Recover the integer class ids from the one-hot test labels.
ground_truth = np.argmax(y_test, axis=1)

In [None]:
# Per-class scores, reported in the order of the encoder's classes.
class_ids = le.transform(le.classes_)

per_class_precision = precision_score(ground_truth, predictions, labels=class_ids, average=None)
per_class_recall = recall_score(ground_truth, predictions, labels=class_ids, average=None)
per_class_f1 = f1_score(ground_truth, predictions, labels=class_ids, average=None)

list_precision = [
    {'target': class_name, 'precision': score}
    for class_name, score in zip(le.classes_, per_class_precision)
]
list_recall = list(per_class_recall)
list_f1 = list(per_class_f1)

# One row per class: original label, then precision / recall / F1.
df_metrics = pd.DataFrame(list_precision)
df_metrics['recall'] = list_recall
df_metrics['f1_score'] = list_f1

In [None]:
# Round the scores to two decimals and show the classes from best to worst F1.
df_metrics = df_metrics.round(2)
df_metrics.sort_values(by='f1_score', ascending=False)