In [23]:
from keras.layers import TextVectorization
import fasttext.util
import tensorflow as tf
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, InputLayer
from tensorflow.keras.layers import Embedding
from tensorflow.keras import Sequential
import numpy as np


def build_model_cnn_simple(X_train, y_train, conv_num_filters=128, conv_kernel_size=7):

    vocabulary, embedding_length = extract_vocabulary_and_set(X_train, verbose=True)
    
    embedding_matrix = calculate_embedding_matrix(vocabulary)
    embedding_input_dim, embedding_output_dim = embedding_matrix.shape[0], embedding_matrix.shape[1]
    
    output_classes = len(y_train.unique())

    vectorize_layer = TextVectorization(
        output_mode='int',
        output_sequence_length=None,
        vocabulary=list(vocabulary),
        name="text_vectorization"
    )

    embedding_layer = Embedding(
        embedding_input_dim,
        embedding_output_dim,
        weights=[embedding_matrix],
        input_length=embedding_length,
        trainable=False,
        mask_zero=True,
        name="embedding"
    )

    input_layer = InputLayer(input_shape=(1,), dtype=tf.string, name="text_input")
    convolution_layer = Conv1D(conv_num_filters, conv_kernel_size, activation="relu", strides=1, name="conv_1")
    global_max_pooling_layer = GlobalMaxPooling1D(name="global_max_pool_1")

    model = Sequential(name="cnn")
    model.add(input_layer)
    model.add(vectorize_layer)
    model.add(embedding_layer)
    model.add(convolution_layer)
    model.add(global_max_pooling_layer)
    model.add(Dense(output_classes, activation="softmax", name="prediction"))

    return model


def get_embedder_fasttext(embedding_dim, model_name="cc.de.300.bin"):
    split = model_name.split(".")
    model_lang = split[1]
    model_dim = int(split[2])

    try:
        ft = fasttext.load_model(model_name)
    except ValueError:
        fasttext.util.download_model(model_lang, if_exists='ignore')
        ft = fasttext.load_model(model_name)

    if embedding_dim < model_dim:
        fasttext.util.reduce_model(ft, embedding_dim)

    def fasttext_embedder(word):
        return ft.get_word_vector(word)

    return fasttext_embedder


def calculate_embedding_matrix(vocabulary, embedding_dim=300, verbose=False):
    """Creates the embedding matrix
    """
    voc_size = len(vocabulary)
    words_not_found = set()
    embedding_matrix = np.zeros((voc_size, embedding_dim))

    embedder = get_embedder_fasttext(embedding_dim)
    
    
    for idx, word in enumerate(vocabulary):
        embedding_vector = embedder(word)
        if (embedding_vector is not None) and len(embedding_vector) > 0 and not np.all(embedding_vector == 0):
            # words not found in embedding index will be all-zeros.
            embedding_matrix[idx] = embedding_vector
        else:
            words_not_found.add(word)

    if verbose:
        print("Embedding type: fasttext")
        print("Number of null word embeddings:", np.sum(np.sum(embedding_matrix, axis=1) == 0))
        nr_words_not_found = len(words_not_found)
        print("Words not found in total:", len(words_not_found))
        if nr_words_not_found > 0:
            import random

            nr_sample = min(20, len(words_not_found))
            print("Words without embedding (", nr_sample, "/", nr_words_not_found, "): ",
                  random.sample(words_not_found, nr_sample), sep='')

    return embedding_matrix


def extract_vocabulary_and_set(data, verbose=False):
    sequence_length_percentil_cutoff = 0.98
    sequence_length_max = 768

    vocabulary = set()
    _ = data.apply(lambda x: vocabulary.update(x))

    lengths = data.apply(len)
    max_sequence_length = int(lengths.quantile(1.0))
    percentil_sequence_length = int(lengths.quantile(0.98))
    median_sequence_length = int(lengths.quantile(0.5))
    embedding_sequence_length = min(sequence_length_max, percentil_sequence_length)

    if verbose:
        print(f"Median sequence length:       : {median_sequence_length}")
        print(f"Percentil                     : {sequence_length_percentil_cutoff})")
        print(f"Cutoff sequence length        : {percentil_sequence_length}")
        print(f"Max sequence length           : {max_sequence_length}")
        print(f"Used embedding sequence length: {embedding_sequence_length}")
        print(f"Vocabulary length             : {len(vocabulary)}")

    return (vocabulary, embedding_sequence_length)

In [24]:
from pandas import read_parquet
from data import file

data_train = read_parquet(file.news_articles_cleaned_train)
data_train

Unnamed: 0,text_original,label,text_tokenized,text_tokenized_keywords,text_tokenized_lemmas,text_tokenized_stemmed
0,21-Jähriger fällt wohl bis Saisonende aus. Wie...,Sport,"[21-jähriger, fällt, wohl, bis, saisonende, au...","[21-jähriger, fällt, wohl, saisonende, ., wien...","[21-jähriger, fällen, wohl, saisonende, wien, ...","[21-jahrig, fallt, wohl, saison, wien, rapid, ..."
1,"Erfundene Bilder zu Filmen, die als verloren g...",Kultur,"[erfundene, bilder, zu, filmen, ,, die, als, v...","[erfundene, bilder, filmen, ,, verloren, gelte...","[erfunden, bilder, filmen, verlieren, gelten, ...","[erfund, bild, film, verlor, gelt, ``, the, fo..."
2,Der frischgekürte CEO Sundar Pichai setzt auf ...,Web,"[der, frischgekürte, ceo, sundar, pichai, setz...","[frischgekürte, ceo, sundar, pichai, setzt, um...","[frischgekürte, ceo, sundar, pichai, setzen, u...","[frischgekurt, ceo, sundar, pichai, setzt, umg..."
3,"Putin: ""Einigung, dass wir Menge auf Niveau vo...",Wirtschaft,"[putin, :, ``, einigung, ,, dass, wir, menge, ...","[putin, :, ``, einigung, ,, menge, niveau, jän...","[putin, ``, einigung, menge, niveau, jänner, h...","[putin, ``, einig, meng, niveau, jann, halt, '..."
4,Estland sieht den künftigen österreichischen P...,Inland,"[estland, sieht, den, künftigen, österreichisc...","[estland, sieht, künftigen, österreichischen, ...","[estland, sehen, künftig, österreichisch, präs...","[estland, sieht, kunftig, osterreich, prasiden..."
...,...,...,...,...,...,...
9240,Bernd Saurer war Bridge-Juniorenweltmeister un...,Inland,"[bernd, saurer, war, bridge-juniorenweltmeiste...","[bernd, saurer, bridge-juniorenweltmeister, ,,...","[bernd, sauer, bridge-juniorenweltmeister, kra...","[bernd, saur, bridge-juniorenweltmeist, krauss..."
9241,Sandhere soll in vergangener Woche bei Luftang...,International,"[sandhere, soll, in, vergangener, woche, bei, ...","[sandhere, vergangener, woche, luftangriff, ge...","[sandhere, vergangen, woche, luftangriff, töte...","[sandh, vergang, woch, luftangriff, getotet, w..."
9242,Derzeit Konzeptgruppe in Berlin – Kein Komment...,Wirtschaft,"[derzeit, konzeptgruppe, in, berlin, –, kein, ...","[derzeit, konzeptgruppe, berlin, –, kommentar,...","[derzeit, konzeptgruppe, berlin, kommentar, ap...","[derzeit, konzeptgrupp, berlin, kommentar, app..."
9243,Landeshauptmann will den vierten Regierungssit...,Inland,"[landeshauptmann, will, den, vierten, regierun...","[landeshauptmann, vierten, regierungssitz, erh...","[landeshauptmann, viert, regierungssitz, erhal...","[landeshauptmann, viert, regierungssitz, erhal..."


In [25]:
X_train = data_train.text_tokenized_stemmed
y_train = data_train.label


In [26]:
model = build_model_cnn_simple(X_train, y_train)
model.summary()

Median sequence length:       : 173
Percentil                     : 0.98)
Cutoff sequence length        : 589
Max sequence length           : 1699
Used embedding sequence length: 589
Vocabulary length             : 162207


2021-10-31 06:38:15.637805: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-31 06:38:15.645043: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-31 06:38:15.645644: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-31 06:38:15.646626: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

Model: "cnn"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, None)              0         
_________________________________________________________________
embedding (Embedding)        (None, None, 300)         48662100  
_________________________________________________________________
conv_1 (Conv1D)              (None, None, 128)         268928    
_________________________________________________________________
global_max_pool_1 (GlobalMax (None, 128)               0         
_________________________________________________________________
prediction (Dense)           (None, 9)                 1161      
Total params: 48,932,189
Trainable params: 270,089
Non-trainable params: 48,662,100
_________________________________________________________________


In [74]:
data_train

Unnamed: 0,text_original,label,text_tokenized,text_tokenized_keywords,text_tokenized_lemmas,text_tokenized_stemmed
0,21-Jähriger fällt wohl bis Saisonende aus. Wie...,Sport,"[21-jähriger, fällt, wohl, bis, saisonende, au...","[21-jähriger, fällt, wohl, saisonende, ., wien...","[21-jähriger, fällen, wohl, saisonende, wien, ...","[21-jahrig, fallt, wohl, saison, wien, rapid, ..."
1,"Erfundene Bilder zu Filmen, die als verloren g...",Kultur,"[erfundene, bilder, zu, filmen, ,, die, als, v...","[erfundene, bilder, filmen, ,, verloren, gelte...","[erfunden, bilder, filmen, verlieren, gelten, ...","[erfund, bild, film, verlor, gelt, ``, the, fo..."
2,Der frischgekürte CEO Sundar Pichai setzt auf ...,Web,"[der, frischgekürte, ceo, sundar, pichai, setz...","[frischgekürte, ceo, sundar, pichai, setzt, um...","[frischgekürte, ceo, sundar, pichai, setzen, u...","[frischgekurt, ceo, sundar, pichai, setzt, umg..."
3,"Putin: ""Einigung, dass wir Menge auf Niveau vo...",Wirtschaft,"[putin, :, ``, einigung, ,, dass, wir, menge, ...","[putin, :, ``, einigung, ,, menge, niveau, jän...","[putin, ``, einigung, menge, niveau, jänner, h...","[putin, ``, einig, meng, niveau, jann, halt, '..."
4,Estland sieht den künftigen österreichischen P...,Inland,"[estland, sieht, den, künftigen, österreichisc...","[estland, sieht, künftigen, österreichischen, ...","[estland, sehen, künftig, österreichisch, präs...","[estland, sieht, kunftig, osterreich, prasiden..."
...,...,...,...,...,...,...
9240,Bernd Saurer war Bridge-Juniorenweltmeister un...,Inland,"[bernd, saurer, war, bridge-juniorenweltmeiste...","[bernd, saurer, bridge-juniorenweltmeister, ,,...","[bernd, sauer, bridge-juniorenweltmeister, kra...","[bernd, saur, bridge-juniorenweltmeist, krauss..."
9241,Sandhere soll in vergangener Woche bei Luftang...,International,"[sandhere, soll, in, vergangener, woche, bei, ...","[sandhere, vergangener, woche, luftangriff, ge...","[sandhere, vergangen, woche, luftangriff, töte...","[sandh, vergang, woch, luftangriff, getotet, w..."
9242,Derzeit Konzeptgruppe in Berlin – Kein Komment...,Wirtschaft,"[derzeit, konzeptgruppe, in, berlin, –, kein, ...","[derzeit, konzeptgruppe, berlin, –, kommentar,...","[derzeit, konzeptgruppe, berlin, kommentar, ap...","[derzeit, konzeptgrupp, berlin, kommentar, app..."
9243,Landeshauptmann will den vierten Regierungssit...,Inland,"[landeshauptmann, will, den, vierten, regierun...","[landeshauptmann, vierten, regierungssitz, erh...","[landeshauptmann, viert, regierungssitz, erhal...","[landeshauptmann, viert, regierungssitz, erhal..."


In [100]:
from sklearn.preprocessing import LabelBinarizer
    
X_column_name = "text_tokenized_stemmed"
#y_column_name = "label"

#label_binarizer = LabelBinarizer()
#label_binarizer.fit(y)

#data = data_train.drop(data_train.columns.difference([X_column_name, y_column_name]), axis=1, inplace=False)
#y = data.pop(y_column_name)
#y = label_binarizer.transform(data_train.label)

#data = data_train

#data = data.drop(data.columns.difference([X_column_name, y_column_name]), 1, inplace=False)
#y = data.pop(y_column_name)

#y = label_binarizer.transform(y)
#ds = tf.data.Dataset.from_tensor_slices((dict(data)))






0       21-jahrig fallt wohl saison wien rapid wohl sa...
1       erfund bild film verlor gelt `` the forbidd ro...
2       frischgekurt ceo sundar pichai setzt umgang fu...
3       putin `` einig meng niveau jann halt '' moskau...
4       estland sieht kunftig osterreich prasident est...
                              ...                        
9240    bernd saur bridge-juniorenweltmeist krauss sch...
9241    sandh vergang woch luftangriff getotet word wa...
9242    derzeit konzeptgrupp berlin kommentar appl mag...
9243    landeshauptmann viert regierungssitz erhalt fp...
9244    million syrisch fluchtling kamerafrau rechtsge...
Name: text_tokenized_stemmed, Length: 9245, dtype: object