In [1]:
import os
from pathlib import Path
from time import strftime

import mlflow
import pandas as pd
import tensorflow as tf

from utils.utils import (get_dataset, get_tokenizer, set_seed, tensorboard,
                         tracking)

2023-08-10 16:49:30.276494: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-10 16:49:30.310442: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-10 16:49:30.311089: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
"""doc
"""

import re
import string

import hydra
import nltk
import polars as pl
from nltk.corpus import stopwords
nltk.download("stopwords")


def clean_text(text_row):
    """performs preprocessing steps on each text row removing numbers,
    stopwords, punctuation and any symbols


    Returns
    -------
    clean_text : row
        A cleaned and preprocessed text
    """

    text_row = text_row.lower()
    text_row = re.sub("<[^>]*>", "", text_row)
    text_row = re.sub(r"[^a-zA-Z\s]", "", text_row)
    stop_words = set(stopwords.words("english"))
    text_row = [
        word
        for word in text_row.split()
        if word not in stop_words and word not in string.punctuation
    ]
    text_cleaned = " ".join(word for word in text_row)
    return text_cleaned


def label_encoder(target_df):
    """performs label encoding for target label


    Returns
    -------
    label : int
        return either 0 for normal or 1 for abnormal
    """

    if target_df == "normal":
        label = 0
    else:
        label = 1
    return label


def preprocess_and_encode(file_path, save_path):
    """_summary_

    Parameters
    ----------
    file_path : _type_
        _description_
    save_path : _type_
        _description_
    """
    dataframe = pl.read_parquet(file_path)
    dataframe = dataframe.with_columns(
        pl.col("Target").apply(label_encoder, return_dtype=pl.Int32)
    )
    dataframe = dataframe.with_columns(pl.col("Log").apply(clean_text))
    dataframe.write_parquet(file=save_path, compression="gzip")

[nltk_data] Downloading package stopwords to /home/gitpod/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
os.chdir('../')
%pwd

'/workspace/log_anomaly'

In [4]:
set_seed()

Random seed set as 42


In [5]:
loss = tf.keras.losses.BinaryCrossentropy()
optim = tf.keras.optimizers.Adadelta(learning_rate=0.001)

In [6]:
from models import GRU

model_name ='New_Testing'

In [7]:
data_path = 'development/dev.gzip'

preprocess_and_encode(data_path, 'development/clean_test.gzip')

In [8]:
clean_data = 'development/clean_test.gzip'
df = pl.read_parquet(clean_data)


In [9]:
df.head()

Log,Target
str,i32
"""rmnccju rmnccj…",0
"""rmnecju rmnecj…",0
"""rmncju rmncju …",0
"""rmncju rmncju …",0
"""rmncju rmncju …",0


In [10]:
clean_data = 'development/clean_test.gzip'
dataset = get_dataset(file_path=clean_data,shuffle= True)
for sample in dataset.take(1):
    print(sample)

(<tf.Tensor: shape=(2,), dtype=string, numpy=
array([b'rmncju rmncju ras kernel info ce sym xef mask x',
       b'rmnccju rmnccju ras kernel info generating core'], dtype=object)>, <tf.Tensor: shape=(2,), dtype=float32, numpy=array([0., 0.], dtype=float32)>)


2023-08-10 16:49:32.793735: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [11]:
tokenizer, vocab_size = get_tokenizer(dataset)
vocab_size

420

In [12]:
def build_model(vocab_size, embed_dim, Sequnce_length):
    """1DCNN doc

    Parameters
    ----------
    file_path : str

    Returns
    -------
    model : object
        model
    """

    #input_ = tf.keras.layers.Input(shape=(Sequnce_length))
    embedding_layer = tf.keras.layers.Embedding(input_dim=vocab_size+ 1, 
                                                output_dim=embed_dim, mask_zero= True)
    conv1D = tf.keras.layers.Conv1D(filters=10, kernel_size= 2)
    pool = tf.keras.layers.MaxPool1D()
    flatten = tf.keras.layers.GlobalAveragePooling1D()
    drop1 = tf.keras.layers.Dropout(0.5)
    dense_layer = tf.keras.layers.Dense(units =100, activation='relu')
    drop2 = tf.keras.layers.Dropout(0.5)
    output_layer = tf.keras.layers.Dense(1,activation='sigmoid')

    model = tf.keras.Sequential([embedding_layer,conv1D,pool,flatten,drop1,dense_layer,drop2,output_layer])
    return model

tensprflow_model = tf.keras.Sequential([ tf.keras.layers.Embedding(420 + 1, 100),
                                        tf.keras.layers.Dropout(0.2),
                                        tf.keras.layers.GlobalAveragePooling1D(),
                                        tf.keras.layers.Dropout(0.2),
                                        tf.keras.layers.Dense(1)])

In [13]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return tokenizer(text), label
final_dataset = dataset.map(vectorize_text)
for sample in final_dataset.take(1):
    print(sample)

(<tf.Tensor: shape=(2, 20), dtype=int64, numpy=
array([[  2,   2,   3,   4,   5,  29,  30, 271,  28,  23,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0],
       [  2,   2,   3,   4,   5,   7,   6,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0]])>, <tf.Tensor: shape=(2,), dtype=float32, numpy=array([0., 0.], dtype=float32)>)


In [14]:
model = build_model(vocab_size=vocab_size, embed_dim=10, Sequnce_length=20)

In [15]:
dir = tensorboard(model_name)
f1_score = tf.keras.metrics.F1Score()
tensprflow_model.compile(loss=loss, optimizer='adam', metrics= f1_score)

In [16]:
tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=dir)

In [17]:
tensprflow_model.fit(final_dataset, callbacks=[tensorboard_cb], batch_size=2)

ValueError: Expected scalar shape, saw shape: (1,).