In [23]:
from keras.layers import TextVectorization
import fasttext.util
import tensorflow as tf
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, InputLayer
from tensorflow.keras.layers import Embedding
from tensorflow.keras import Sequential
import numpy as np


def build_model_cnn_simple(X_train, y_train, conv_num_filters=128, conv_kernel_size=7):

    vocabulary, embedding_length = extract_vocabulary_and_set(X_train, verbose=True)
    
    embedding_matrix = calculate_embedding_matrix(vocabulary)
    embedding_input_dim, embedding_output_dim = embedding_matrix.shape[0], embedding_matrix.shape[1]
    
    output_classes = len(y_train.unique())

    vectorize_layer = TextVectorization(
        output_mode='int',
        output_sequence_length=None,
        vocabulary=list(vocabulary),
        name="text_vectorization"
    )

    embedding_layer = Embedding(
        embedding_input_dim,
        embedding_output_dim,
        weights=[embedding_matrix],
        input_length=embedding_length,
        trainable=False,
        mask_zero=True,
        name="embedding"
    )

    input_layer = InputLayer(input_shape=(1,), dtype=tf.string, name="text_input")
    convolution_layer = Conv1D(conv_num_filters, conv_kernel_size, activation="relu", strides=1, name="conv_1")
    global_max_pooling_layer = GlobalMaxPooling1D(name="global_max_pool_1")

    model = Sequential(name="cnn")
    model.add(input_layer)
    model.add(vectorize_layer)
    model.add(embedding_layer)
    model.add(convolution_layer)
    model.add(global_max_pooling_layer)
    model.add(Dense(output_classes, activation="softmax", name="prediction"))

    return model


def get_embedder_fasttext(embedding_dim, model_name="cc.de.300.bin"):
    split = model_name.split(".")
    model_lang = split[1]
    model_dim = int(split[2])

    try:
        ft = fasttext.load_model(model_name)
    except ValueError:
        fasttext.util.download_model(model_lang, if_exists='ignore')
        ft = fasttext.load_model(model_name)

    if embedding_dim < model_dim:
        fasttext.util.reduce_model(ft, embedding_dim)

    def fasttext_embedder(word):
        return ft.get_word_vector(word)

    return fasttext_embedder


def calculate_embedding_matrix(vocabulary, embedding_dim=300, verbose=False):
    """Creates the embedding matrix
    """
    voc_size = len(vocabulary)
    words_not_found = set()
    embedding_matrix = np.zeros((voc_size, embedding_dim))

    embedder = get_embedder_fasttext(embedding_dim)
    
    
    for idx, word in enumerate(vocabulary):
        embedding_vector = embedder(word)
        if (embedding_vector is not None) and len(embedding_vector) > 0 and not np.all(embedding_vector == 0):
            # words not found in embedding index will be all-zeros.
            embedding_matrix[idx] = embedding_vector
        else:
            words_not_found.add(word)

    if verbose:
        print("Embedding type: fasttext")
        print("Number of null word embeddings:", np.sum(np.sum(embedding_matrix, axis=1) == 0))
        nr_words_not_found = len(words_not_found)
        print("Words not found in total:", len(words_not_found))
        if nr_words_not_found > 0:
            import random

            nr_sample = min(20, len(words_not_found))
            print("Words without embedding (", nr_sample, "/", nr_words_not_found, "): ",
                  random.sample(words_not_found, nr_sample), sep='')

    return embedding_matrix


def extract_vocabulary_and_set(data, verbose=False):
    sequence_length_percentil_cutoff = 0.98
    sequence_length_max = 768

    vocabulary = set()
    _ = data.apply(lambda x: vocabulary.update(x))

    lengths = data.apply(len)
    max_sequence_length = int(lengths.quantile(1.0))
    percentil_sequence_length = int(lengths.quantile(0.98))
    median_sequence_length = int(lengths.quantile(0.5))
    embedding_sequence_length = min(sequence_length_max, percentil_sequence_length)

    if verbose:
        print(f"Median sequence length:       : {median_sequence_length}")
        print(f"Percentil                     : {sequence_length_percentil_cutoff})")
        print(f"Cutoff sequence length        : {percentil_sequence_length}")
        print(f"Max sequence length           : {max_sequence_length}")
        print(f"Used embedding sequence length: {embedding_sequence_length}")
        print(f"Vocabulary length             : {len(vocabulary)}")

    return (vocabulary, embedding_sequence_length)

In [None]:
from pandas import read_parquet
from data import file

data_train = read_parquet(file.news_articles_cleaned_train)
data_train

In [21]:
X_train = data_train.text_tokenized_stemmed
y_train = data_train.label


In [22]:
model = build_model_cnn_simple(X_train, y_train)
model.summary()

Median sequence length:       : 173
Percentil                     : 0.98)
Cutoff sequence length        : 589
Max sequence length           : 1699
Used embedding sequence length: 589
Vocabulary length             : 162207




ValueError: could not broadcast input array from shape (50) into shape (300)