In [1]:
%load_ext tensorboard

In [2]:
from pandas import read_parquet
from data import file

data_train = read_parquet(file.news_articles_cleaned_train)
data_test = read_parquet(file.news_articles_cleaned_test)

data_train

In [3]:
from keras.layers import TextVectorization
import fasttext.util
import tensorflow as tf
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, InputLayer
from tensorflow.keras.layers import Embedding
from tensorflow.keras import Sequential
import numpy as np
import os
import datetime
from tensorflow import keras
from keras.callbacks import TensorBoard
from tensorflow.keras.optimizers import Adam
from preprocessing.text import extract_vocabulary


def get_embedder_fasttext(embedding_dim, model_name="cc.de.300.bin"):
    split = model_name.split(".")
    model_lang = split[1]
    model_dim = int(split[2])

    try:
        ft = fasttext.load_model(model_name)
    except ValueError:
        fasttext.util.download_model(model_lang, if_exists='ignore')
        ft = fasttext.load_model(model_name)

    if embedding_dim < model_dim:
        fasttext.util.reduce_model(ft, embedding_dim)

    def fasttext_embedder(word):
        return ft.get_word_vector(word)

    return fasttext_embedder


def calculate_embedding_matrix(vocabulary, embedding_dim=300, verbose=False):
    """Creates the embedding matrix
    """
    voc_size = len(vocabulary)
    words_not_found = set()
    embedding_matrix = np.zeros((voc_size, embedding_dim))

    embedder = get_embedder_fasttext(embedding_dim)
    
    
    for idx, word in enumerate(vocabulary):
        embedding_vector = embedder(word)
        if (embedding_vector is not None) and len(embedding_vector) > 0 and not np.all(embedding_vector == 0):
            # words not found in embedding index will be all-zeros.
            embedding_matrix[idx] = embedding_vector
        else:
            words_not_found.add(word)

    if verbose:
        print("Embedding type: fasttext")
        print("Number of null word embeddings:", np.sum(np.sum(embedding_matrix, axis=1) == 0))
        nr_words_not_found = len(words_not_found)
        print("Words not found in total:", len(words_not_found))
        if nr_words_not_found > 0:
            import random

            nr_sample = min(20, len(words_not_found))
            print("Words without embedding (", nr_sample, "/", nr_words_not_found, "): ",
                  random.sample(words_not_found, nr_sample), sep='')

    return embedding_matrix


def compile_model(model, loss_function="categorical_crossentropy", learning_rate=0.01, model_metric = ["accuracy"]):
    adam = Adam(learning_rate=learning_rate)
    model.compile(loss=loss_function, optimizer=adam, metrics=model_metric)
    
    
def build_model_cnn_simple(X_train, y_train, conv_num_filters=128, conv_kernel_size=7):

    vocabulary, embedding_length = extract_vocabulary(X_train, verbose=True)
    print(f"embedding length: {embedding_length}")
    
    embedding_matrix = calculate_embedding_matrix(vocabulary)
    embedding_input_dim, embedding_output_dim = embedding_matrix.shape[0], embedding_matrix.shape[1]
    
    output_classes = len(y_train.unique())

    vectorize_layer = TextVectorization(
        output_mode='int',
        output_sequence_length=None,
        vocabulary=list(vocabulary),
        name="text_vectorization"
    )

    embedding_layer = Embedding(
        embedding_input_dim,
        embedding_output_dim,
        weights=[embedding_matrix],
        input_length=embedding_length,
        trainable=False,
        mask_zero=True,
        name="embedding"
    )


    model = Sequential(name="cnn")
    model.add(InputLayer(input_shape=(1,), dtype=tf.string, name="text_input"))
    model.add(vectorize_layer)
    model.add(embedding_layer)
    model.add(Conv1D(conv_num_filters, conv_kernel_size, activation="relu", strides=1, padding="valid", name="conv_1"))
    model.add(GlobalMaxPooling1D(name="global_max_pool_1"))
    model.add(Dense(output_classes, activation=tf.nn.softmax, name="prediction"))

    return model

ImportError: cannot import name 'extract_vocabulary' from 'preprocessing.text' (/home/jupyter/code/nlp-topic-classification-german/preprocessing/text.py)

In [5]:
import preprocessing.text as foo
foo.extract_focabulary


AttributeError: module 'preprocessing.text' has no attribute 'extract_focabulary'

In [None]:
X_train = data_train.text_tokenized_stemmed
y_train = data_train.label

X_test = data_test.text_tokenized_stemmed
y_test = data_test.label

In [None]:
model = build_model_cnn_simple(X_train, y_train)
model.summary()

In [None]:
compile_model(model)

In [None]:
from sklearn.preprocessing import LabelBinarizer

y_train = data_train.label
label_binarizer = LabelBinarizer()
label_binarizer.fit(y_train)

x_train = data_train['text_stem']
y_train_bin = label_binarizer.transform(y_train)

x_test = data_test['text_stem']
y_test_bin = label_binarizer.transform(y_test)


In [None]:
batch_size = 32
train_input = tf.data.Dataset.from_tensor_slices((x_train, y_train_bin)).batch(batch_size).prefetch(tf.data.AUTOTUNE)
test_input = tf.data.Dataset.from_tensor_slices((x_test, y_test_bin)).batch(batch_size).prefetch(tf.data.AUTOTUNE)
train_input


In [None]:
#%tensorboard --logdir logs/fit

#callbacks = [TensorBoard("logs/fit", histogram_freq=1)]
callbacks = []
history = model.fit(train_input, validation_data=test_input, callbacks=callbacks, epochs=5)

In [None]:
from reporting.training import plot_history
    
plot_history(history)

In [None]:
y_predict = label_binarizer.inverse_transform(model.predict(x_test[0:100]))


from reporting.evaluation import plot_confusion_matrix
plot_confusion_matrix(y_test[0:100], y_predict)