In [None]:
from tensorflow import keras
from keras.layers import Conv1D, GlobalMaxPooling1D, InputLayer, Embedding
import tensorflow as tf
import fasttext.util
from keras.layers import TextVectorization
import numpy as np

from data import file
from fhnw.nlp.utils.storage import load_dataframe

In [None]:
def build_model_cnn_simple(vectorize_layer, embedding_layer, classes=1, conv_num_filters=128, conv_kernel_size=7):
    """Assemble a small 1D-CNN text classifier as a Keras ``Sequential`` model.

    The stack is: string input -> ``vectorize_layer`` -> ``embedding_layer``
    -> one ``Conv1D`` (ReLU, stride 1) -> ``GlobalMaxPooling1D`` -> ``Dense``
    with sigmoid activation.

    Args:
        vectorize_layer: layer placed directly after the string input.
        embedding_layer: layer placed after ``vectorize_layer``.
        classes: number of units of the final sigmoid ``Dense`` layer.
        conv_num_filters: number of filters of the convolution.
        conv_kernel_size: kernel size of the convolution.

    Returns:
        The (uncompiled) ``keras.Sequential`` model named ``"cnn"``.
    """
    # All layers carry explicit names, so building them in one list does not
    # affect the resulting layer names.
    stack = [
        InputLayer(input_shape=(1,), dtype=tf.string, name="text_input"),
        vectorize_layer,
        embedding_layer,
        Conv1D(conv_num_filters, conv_kernel_size, activation="relu", strides=1, name="conv_1"),
        GlobalMaxPooling1D(name="global_max_pool_1"),
        keras.layers.Dense(classes, activation="sigmoid", name="prediction"),
    ]
    return keras.Sequential(stack, name="cnn")


def get_embedder_fasttext(embedding_dim=50, model_name="cc.de.300.bin"):
    """Load a fastText model and return a ``word -> vector`` function.

    The model is loaded from ``model_name``. If that fails with a
    ``ValueError`` (which is what ``fasttext.load_model`` raises when the file
    cannot be opened), the official model for the language encoded in the file
    name is downloaded and loaded instead. When ``embedding_dim`` is smaller
    than the dimension of the loaded model, the model is reduced in place to
    ``embedding_dim``.

    Args:
        embedding_dim: requested vector size. If it is not smaller than the
            model's own dimension the model is left untouched, so the returned
            vectors keep the model's dimension.
        model_name: file name or path following the ``cc.<lang>.<dim>.bin``
            naming scheme.

    Returns:
        A function taking a single word and returning
        ``ft.get_word_vector(word)``.

    Raises:
        ValueError: if ``model_name`` does not follow the expected naming
            scheme, or if the model cannot be loaded even after the download.
    """
    # Parse from the end so that path-like names ("./cc.de.300.bin",
    # "/data/.cache/cc.de.300.bin") still yield the right language code.
    parts = model_name.split(".")
    if len(parts) < 4 or not parts[-2].isdigit():
        raise ValueError(
            "model_name must look like 'cc.<lang>.<dim>.bin', got: %r" % (model_name,)
        )
    model_lang = parts[-3]

    try:
        ft = fasttext.load_model(model_name)
    except ValueError:
        # download_model returns the name of the file it wrote (or found);
        # load that file rather than assuming it equals ``model_name``.
        downloaded_model = fasttext.util.download_model(model_lang, if_exists='ignore')
        ft = fasttext.load_model(downloaded_model)

    # Compare against the dimension of the model that was actually loaded,
    # not the number embedded in the file name.
    if embedding_dim < ft.get_dimension():
        fasttext.util.reduce_model(ft, embedding_dim)

    def fasttext_embedder(word):
        return ft.get_word_vector(word)

    return fasttext_embedder


def calculate_embedding_matrix(vocabulary, embedder, embedding_dim=300, verbose=False):
    """Create the embedding matrix for a vocabulary.

    Row ``i`` of the result holds the vector returned by ``embedder`` for the
    ``i``-th word yielded when iterating ``vocabulary``. Words for which the
    embedder returns ``None``, an empty vector or an all-zero vector keep an
    all-zero row and are counted as "not found".

    Args:
        vocabulary: sized iterable of words. The iteration order defines the
            row order, so pass an ordered sequence whose positions match the
            token ids used by the consumer of the matrix.
        embedder: callable mapping a word to its vector (or ``None``).
        embedding_dim: number of columns of the matrix; every non-empty vector
            returned by ``embedder`` must have exactly this many elements.
        verbose: if true, print statistics about words without an embedding.

    Returns:
        ``numpy.ndarray`` of shape ``(len(vocabulary), embedding_dim)``.

    Raises:
        ValueError: if ``embedder`` returns a vector whose size differs from
            ``embedding_dim``.
    """
    voc_size = len(vocabulary)
    words_not_found = set()
    embedding_matrix = np.zeros((voc_size, embedding_dim))

    for idx, word in enumerate(vocabulary):
        embedding_vector = embedder(word)
        vector = None if embedding_vector is None else np.asarray(embedding_vector)

        if vector is None or vector.size == 0 or not vector.any():
            # The row stays all-zero for words without a usable embedding.
            words_not_found.add(word)
            continue

        if vector.size != embedding_dim:
            # Fail loudly: numpy would otherwise raise a cryptic broadcast
            # error, or silently broadcast a length-1 vector over the row.
            raise ValueError(
                "Embedding for %r has %d dimensions, expected %d"
                % (word, vector.size, embedding_dim)
            )

        embedding_matrix[idx] = vector.reshape(-1)

    if verbose:
        print("Embedding type: fasttext")
        # Count rows that are entirely zero (a row *sum* of zero is not enough).
        print("Number of null word embeddings:", int(np.sum(~embedding_matrix.any(axis=1))))
        nr_words_not_found = len(words_not_found)
        print("Words not found in total:", nr_words_not_found)
        if nr_words_not_found > 0:
            import random

            nr_sample = min(20, nr_words_not_found)
            # random.sample() no longer accepts sets (deprecated in 3.9,
            # TypeError since 3.11), so sample from a list.
            print("Words without embedding (", nr_sample, "/", nr_words_not_found, "): ",
                  random.sample(list(words_not_found), nr_sample), sep='')

    return embedding_matrix


def extract_vocabulary_and_set(data, verbose=False, sequence_length_percentil_cutoff=0.98, sequence_length_max=768):
    """Collect the vocabulary of a tokenized corpus and pick a sequence length.

    Args:
        data: pandas Series whose elements are sized iterables of tokens
            (each element is fed to ``set.update`` and ``len``).
        verbose: if true, print the median, cutoff, maximum and chosen
            sequence lengths.
        sequence_length_percentil_cutoff: quantile (between 0 and 1) of the
            per-row lengths used as the cutoff sequence length.
        sequence_length_max: upper bound for the returned sequence length.

    Returns:
        Tuple ``(vocabulary, embedding_sequence_length)`` where ``vocabulary``
        is the set of all tokens and ``embedding_sequence_length`` is the
        smaller of ``sequence_length_max`` and the (truncated) cutoff
        quantile of the row lengths. For empty ``data`` the result is
        ``(set(), 0)``.
    """
    vocabulary = set()
    for tokens in data:
        vocabulary.update(tokens)

    # Quantiles of an empty series are undefined; return a neutral result
    # instead of failing in the int() conversions below.
    if len(data) == 0:
        return (vocabulary, 0)

    lengths = data.apply(len)
    max_sequence_length = int(lengths.max())
    # Use the configured cutoff so the reported and the applied value agree.
    percentil_sequence_length = int(lengths.quantile(sequence_length_percentil_cutoff))
    median_sequence_length = int(lengths.quantile(0.5))
    embedding_sequence_length = min(sequence_length_max, percentil_sequence_length)

    if verbose:
        print("Median sequence length:", median_sequence_length)
        print("Percentil (", sequence_length_percentil_cutoff, ") cutoff sequence length: ", percentil_sequence_length,
              sep='')
        print("Max sequence length:", max_sequence_length)
        print("Used embedding sequence length:", embedding_sequence_length)

    return (vocabulary, embedding_sequence_length)

In [None]:
# Load the training dataframe referenced by `file.news_articles_cleaned_train`.
data_train = load_dataframe(file.news_articles_cleaned_train)
# Column used as model input; presumably one list of stemmed tokens per
# article — TODO confirm against the preprocessing notebook.
X_train = data_train.text_tokenized_stemmed

# Bare last expression: shown as the cell's output.
X_train

In [None]:
# Width of the word vectors. The same value must be used for the fastText
# embedder and for the embedding matrix, otherwise their shapes do not match.
EMBEDDING_DIM = 50

vocabulary, embedding_length = extract_vocabulary_and_set(X_train, verbose=True)

vectorize_layer = TextVectorization(
    output_mode='int',
    # Pad/truncate to the length the Embedding layer is told to expect.
    output_sequence_length=embedding_length,
    # Sorted so that token ids are reproducible across kernel restarts
    # (iteration order of a set of strings is not).
    vocabulary=sorted(vocabulary),
    name="text_vectorization"
)

# In 'int' mode the layer reserves index 0 for padding ('') and index 1 for
# out-of-vocabulary tokens ('[UNK]'). Build the matrix from the layer's own
# vocabulary so that row i really belongs to token id i and the matrix has a
# row for every id the layer can emit.
layer_vocabulary = vectorize_layer.get_vocabulary()

embedder = get_embedder_fasttext(embedding_dim=EMBEDDING_DIM)
embedding_matrix = calculate_embedding_matrix(layer_vocabulary, embedder, embedding_dim=EMBEDDING_DIM)
# Keep the padding row all-zero regardless of what the embedder returns for ''.
embedding_matrix[0] = 0.0

embedding_layer = Embedding(
    embedding_matrix.shape[0],
    embedding_matrix.shape[1],
    weights=[embedding_matrix],
    input_length=embedding_length,
    trainable=False,
    mask_zero=True,
    name="embedding"
)

model = build_model_cnn_simple(vectorize_layer, embedding_layer)
model.summary()



IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/var/folders/mq/bw836hy56s75dj_x3m9g2tsr0000gn/T/ipykernel_88744/2663914218.py", line 54, in get_embedder_fasttext
    ft = fasttext.load_model(model_name)
  File "/Users/raffael/learn/fhnw/nlp/nlp-projektarbeit/nlp-topic-classification-german/.venv/lib/python3.9/site-packages/fasttext/FastText.py", line 441, in load_model
    return _FastText(model_path=path)
  File "/Users/raffael/learn/fhnw/nlp/nlp-projektarbeit/nlp-topic-classification-german/.venv/lib/python3.9/site-packages/fasttext/FastText.py", line 98, in __init__
    self.f.loadModel(model_path)
ValueError: cc.de.300.bin cannot be opened for loading!

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/raffael/learn/fhnw/nlp/nlp-projektarbeit/nlp-topic-classification-german/.venv/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3444, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)


TypeError: object of type 'NoneType' has no len()