In [None]:
import os
import string
import re
import nltk
import polars as pl
import tensorflow as tf
from nltk.corpus import stopwords
nltk.download('stopwords')
from functools import partial

_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word set, building it on first use only."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def clean_text_preprocess(text_row):
    """Lower-case a text row and strip tags, non-letters and stop words.

    Parameters
    ----------
    text_row : str
        Raw text of a single row.

    Returns
    -------
    clean_text : str
        The surviving words joined by single spaces.
    """
    text_row = text_row.lower()
    # Remove anything shaped like a <tag> first; otherwise the tag names
    # would survive the symbol stripping below as ordinary words.
    text_row = re.sub('<[^>]*>', '', text_row)
    # Keep only ASCII letters and whitespace (digits and punctuation vanish).
    text_row = re.sub(r'[^a-zA-Z\s]', '', text_row)
    # The stop-word set is cached because this function runs once per row.
    stop_words = _english_stop_words()
    # No separate punctuation filter is needed: the substitution above has
    # already removed every punctuation character from the words.
    clean_text = ' '.join(
        word for word in text_row.split() if word not in stop_words)
    return clean_text

def label_encoder(target_df):
    """Map a target label to its binary class id.

    Parameters
    ----------
    target_df : str
        Label value; it is compared against the literal ``'normal'``.

    Returns
    -------
    label : int
        0 when the value equals ``'normal'``, 1 for anything else.
    """
    return 0 if target_df == 'normal' else 1

def get_dataset(file_path, batch_size, shuffle_size= 10, shuffle = True):
    """Build a batched, prefetched ``tf.data.Dataset`` from a parquet file.

    The ``Target`` column is encoded with ``label_encoder`` and the ``Log``
    column is cleaned with ``clean_text_preprocess``; the two columns are then
    paired into ``(Log, Target)`` elements.

    Parameters
    ----------
    file_path : str
        Path of the parquet file.
    batch_size : int
        Number of elements per batch.
    shuffle_size : int
        Buffer size handed to ``Dataset.shuffle``.
    shuffle : bool, Default = True
        Shuffle the elements before batching; skipped when false.

    Returns
    -------
    dataset : Dataset
        A tensorflow Dataset yielding batches of features and labels.
    """
    frame = pl.read_parquet(file_path)

    # NOTE(review): newer polars releases rename ``Expr.apply`` to
    # ``map_elements`` -- confirm against the installed polars version.
    encoded_target = pl.col('Target').apply(
        label_encoder, return_dtype=pl.Int32)
    cleaned_log = pl.col('Log').apply(clean_text_preprocess)
    frame = frame.with_columns(encoded_target)
    frame = frame.with_columns(cleaned_log)

    pairs = tf.data.Dataset.from_tensor_slices((frame['Log'], frame['Target']))
    if shuffle:
        pairs = pairs.shuffle(shuffle_size)
    batched = pairs.batch(batch_size)
    return batched.prefetch(buffer_size=tf.data.AUTOTUNE)

In [None]:
# Parquet file to inspect; despite the name it is opened with read_parquet.
file = 'dev.gzip'
dev_df = pl.read_parquet(file)
# Preview the first rows of the frame.
dev_df.head()

In [None]:
# Build the dataset from the same file: one example per batch, no shuffling.
dev_dataset = get_dataset(file_path= file, batch_size=1,shuffle= False)

In [None]:
# Print the first two dataset elements. Each element is a batched
# (Log, Target) pair as produced by get_dataset, not a single word.
for word in dev_dataset.take(2):
    print(word)

In [None]:
# Keep only the text component of each (text, label) element.
log_ds = dev_dataset.map(lambda text,label: text)
# Length passed as output_sequence_length to the vectorization layer.
sequence_length = 10
# Whitespace tokenizer that maps each token to an integer id.
tokenizer_layer = tf.keras.layers.TextVectorization(split= 'whitespace', output_mode= 'int',
                                              output_sequence_length= sequence_length)
# Learn the vocabulary from the text-only dataset.
tokenizer_layer.adapt(log_ds)
# build_model() reads this module-level value to size its embedding layer.
vocab_size = tokenizer_layer.vocabulary_size()

In [None]:
# Display the vocabulary size computed from the adapted tokenizer.
vocab_size

In [None]:
# Module-level hyper-parameters for the 1D-CNN.
# NOTE(review): `filter` shadows the Python builtin of the same name, and
# `filter`, `kernel` and `stride` are not referenced by the visible
# build_model(), which hard-codes its own filter count, kernel size and
# stride -- confirm whether they are meant to be wired in.
filter = 10
kernel = 5
stride = 1
# Padding mode handed to the Conv1D layer in build_model().
pad= "same"
# Output dimension of the embedding layer in build_model().
embed_dim = 100

def build_model():
    """Build and compile the 1D-CNN binary text classifier.

    The network stacks an embedding layer, one ``Conv1D`` with 30 filters,
    max pooling, global max pooling and a small dense head with dropout that
    ends in a single sigmoid unit. It reads the module-level names
    ``vocab_size``, ``embed_dim`` and ``pad``, so those must be defined
    before this function is called.

    Returns
    -------
    model : tf.keras.Sequential
        The compiled model (binary cross-entropy loss, Adam optimizer with
        learning rate 0.01, ``'f1_score'`` metric).
    """
    embedding_layer = tf.keras.layers.Embedding(
        input_dim=vocab_size + 1, output_dim=embed_dim, mask_zero=True)
    # Factories that pin the shared convolution / pooling settings.
    DefaultConv1D = partial(tf.keras.layers.Conv1D, kernel_size=3, strides=1,
                            padding=pad, activation='relu')
    DefaultMaxPool1D = partial(tf.keras.layers.MaxPool1D, pool_size=2)
    optim = tf.keras.optimizers.Adam(learning_rate=0.01)

    model = tf.keras.Sequential([
        embedding_layer,
        DefaultConv1D(30),
        DefaultMaxPool1D(),
        tf.keras.layers.GlobalMaxPool1D(),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(units=20, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(units=1, activation='sigmoid'),
    ])
    # NOTE(review): whether the string 'f1_score' resolves to a metric
    # depends on the installed Keras version -- confirm.
    model.compile(loss="binary_crossentropy", optimizer=optim,
                  metrics=['f1_score'])
    return model

In [None]:
# Instantiate the compiled model and print its layer summary.
model = build_model()
model.summary()