In [1]:
import os 
os.chdir('../')
%pwd

'/workspaces/log_anomaly'

In [2]:
import re
import string

import hydra
import nltk
import polars as pl
from nltk.corpus import stopwords
from omegaconf import DictConfig

# Fetch the NLTK "stopwords" corpus; clean_text below reads it through
# stopwords.words("english").
nltk.download("stopwords")


_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOP_WORDS


def clean_text(text_row, stop_words=None):
    """Normalise one log line: lowercase it, strip markup, digits, punctuation
    and symbols, then drop stop words.

    Parameters
    ----------
    text_row : str
        The raw text of a single row.
    stop_words : collection of str, optional
        Lowercase words to drop. When omitted, NLTK's English stop-word list
        is used; it is loaded once and reused for every subsequent call
        instead of being rebuilt per row.

    Returns
    -------
    text_cleaned : str
        The remaining words joined by single spaces. Non-letter characters
        are deleted rather than replaced by a space, so a token such as
        ``"r02-m1"`` collapses to ``"rm"``.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    text_row = text_row.lower()
    # Drop anything that looks like an <...> tag before stripping symbols,
    # otherwise the tag name would survive as a word.
    text_row = re.sub("<[^>]*>", "", text_row)
    # Keep only ASCII letters and whitespace.
    text_row = re.sub(r"[^a-zA-Z\s]", "", text_row)
    # After the step above every token consists solely of letters, so no
    # separate punctuation filter is needed.
    return " ".join(word for word in text_row.split() if word not in stop_words)


def label_encoder(target_df):
    """Turn a single target value into its binary class id.

    Parameters
    ----------
    target_df : str
        One value taken from the target column.

    Returns
    -------
    label : int
        0 when the value equals ``"normal"``, 1 for any other value.
    """
    return 0 if target_df == "normal" else 1


def _map_rows(expr, func, return_dtype):
    """Apply ``func`` to every element of a polars expression.

    Uses ``Expr.map_elements`` when the installed polars provides it and
    falls back to the older ``Expr.apply`` name otherwise.
    """
    if hasattr(expr, "map_elements"):
        return expr.map_elements(func, return_dtype=return_dtype)
    return expr.apply(func, return_dtype=return_dtype)


def preprocess_and_encode(file_path, save_path):
    """Read a Parquet file, encode its target, clean its text and save it.

    The ``Target`` column is passed through ``label_encoder`` (stored as
    Int32) and the ``Log`` column through ``clean_text``. The resulting frame
    is written to ``save_path`` as gzip-compressed Parquet.

    Parameters
    ----------
    file_path : str or path-like
        Parquet file to read; it must contain ``Log`` and ``Target`` columns.
    save_path : str or path-like
        Destination of the processed Parquet file. Missing parent
        directories are created.

    Returns
    -------
    dataframe : polars.DataFrame
        The processed frame that was written to ``save_path``.
    """
    import os
    from pathlib import Path

    dataframe = pl.read_parquet(file_path)
    dataframe = dataframe.with_columns(
        _map_rows(pl.col("Target"), label_encoder, pl.Int32),
        _map_rows(pl.col("Log"), clean_text, pl.Utf8),
    )

    # write_parquet does not create directories, so make sure the target
    # folder exists (only meaningful when save_path is an actual path).
    if isinstance(save_path, (str, os.PathLike)):
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)

    dataframe.write_parquet(file=save_path, compression="gzip")
    return dataframe


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Input Parquet file and the location the processed copy will be written to;
# both names are reused by the cells further down.
file = 'dev.gzip'
save_path = 'test/test.gzip'
# Load the input as-is and preview its first rows.
dev_data = pl.read_parquet(file)
dev_data.head()

Log,Target
str,str
""" 1119803499 20…","""normal"""
""" 1119803105 20…","""normal"""
""" 1121496169 20…","""normal"""
""" 1120968564 20…","""normal"""
""" 1120953205 20…","""normal"""


In [13]:
# Encode Target, clean Log and write the result to save_path as Parquet.
preprocess_and_encode(file_path= file, save_path=save_path)

In [11]:
# Read the processed file back from disk and preview it.
# NOTE(review): the recorded execution count of this cell is lower than that
# of the preprocessing cell above, i.e. it last ran before that cell did;
# confirm the intended order.
clean_df = pl.read_parquet(save_path)
clean_df.head()

Log,Target
str,i32
"""rmnccju rmnccj…",0
"""rmnecju rmnecj…",0
"""rmncju rmncju …",0
"""rmncju rmncju …",0
"""rmncju rmncju …",0


In [14]:
# Run the pipeline again before reading, so that comparing this frame with
# clean_df checks that preprocessing is reproducible. Without the re-run, a
# top-to-bottom execution would just read the same unchanged file twice.
preprocess_and_encode(file_path=file, save_path=save_path)
new_clean_df = pl.read_parquet(save_path)
new_clean_df.head()

Log,Target
str,i32
"""rmnccju rmnccj…",0
"""rmnecju rmnecj…",0
"""rmncju rmncju …",0
"""rmncju rmncju …",0
"""rmncju rmncju …",0


In [15]:
# Element-wise comparison of the two frames; the result is a frame of
# booleans with the same columns, true wherever the values match.
new_clean_df == clean_df

Log,Target
bool,bool
true,true
true,true
true,true
true,true
true,true
true,true
true,true
true,true
true,true
true,true
