In [63]:
from loglead.loaders.raw import RawLoader
from loglead.enhancers import EventLogEnhancer, SequenceEnhancer
import polars as pl
import glob, os


In [83]:
#log_dir = "../data/lo2-token/light-oauth2-data-1719592986/"
#log_files = glob.glob(os.path.join(log_dir, "*.log"))

log_files = ["../data/lo2-token/light-oauth2-data-1719592986/TOKEN_access_token_auth_header_error_401.log"]

all_dfs = []

for log_file in log_files:
    loader = RawLoader(
        filename=log_file,
        timestamp_pattern=r"^(\d{1,2}:\d{2}:\d{2}\.\d{3})",
        timestamp_format="%H:%M:%S.%f",
        missing_timestamp_action="merge"
    )
    loader.load()

    # Custom timestamp extraction and padding
    df = loader.df.with_columns([
        pl.col("m_message")
        .str.extract(loader.timestamp_pattern, group_index=1)
        .str.pad_end(12, "0")
        .str.strptime(pl.Datetime, "%H:%M:%S.%f", strict=False)
        .alias("m_timestamp"),

        pl.col("m_message")
        .str.replace(loader.timestamp_pattern, "")
        .alias("m_message"),

        pl.lit(os.path.basename(log_file)).alias("source_file")  # keep file origin
    ])

    # Reorder columns
    df = df.select(["m_timestamp", "m_message", "source_file"])

    all_dfs.append(df)

# Combine all logs
df = pl.concat(all_dfs)

# Enrich with parsed fields
df = df.with_columns([
    pl.col("m_message").str.extract(r"\[([^\]]+)\]", group_index=1).alias("thread"),
    pl.col("m_message").str.extract(r"\] +(\S+)", group_index=1).alias("request_id"),
    pl.col("m_message").str.extract(r"\] +\S+ +(\w+)", group_index=1).alias("level"),
    pl.col("m_message").str.extract(r"\w+ +(\S+ +<init>)", group_index=1).alias("class_method"),
    pl.col("m_message").str.extract(r"<init> - (.*)", group_index=1).alias("log_text")
])

df = df.fill_null("")


  .str.strptime(pl.Datetime, "%H:%M:%S.%f", strict=False)


In [84]:
enhancer = EventLogEnhancer(df)
df = enhancer.normalize()
df = enhancer.parse_spell()
df = enhancer.words()
df = enhancer.trigrams()
df = enhancer.parse_drain()
df = enhancer.parse_tip()
df = enhancer.parse_pliplom()
df = enhancer.parse_iplom()
df = enhancer.parse_brain()
df = enhancer.length()


  df_part = df_part.with_columns(pl.col("events").list.to_struct()).unnest("events")


In [85]:
from loglead.anomaly_detection import AnomalyDetector
anomaly_detector = AnomalyDetector(
    item_list_col="e_words",
    numeric_cols=["e_words_len", "e_trigrams_len", "e_chars_len", "e_lines_len", "e_event_id_len"],
    store_scores=True,
    print_scores=False,  # ← disable printing evaluation
    auc_roc=False        # ← avoid computing ROC/AUC without labels
)




In [86]:
anomaly_detector.test_train_split(df, test_frac=0.85, shuffle=True)
anomaly_detector.train_LOF(n_neighbors=5)
#anomaly_detector.train_IsolationForest(n_estimators=100, contamination="auto")

df = anomaly_detector.predict()
df.filter(pl.col("pred_ano") > 0)




row_nr,m_timestamp,m_message,source_file,thread,request_id,level,class_method,log_text,e_message_normalized,e_event_spell_id,e_template_spell,e_words,e_words_len,e_trigrams,e_trigrams_len,e_event_drain_id,e_event_tip_id,e_event_pliplom_id,e_event_iplom_id,e_event_brain_id,e_chars_len,e_lines_len,e_event_id_len,pred_ano
u32,datetime[μs],str,str,str,str,str,str,str,str,str,str,list[str],u32,list[str],u32,str,str,str,str,str,u32,u32,i32,i64
2205,,"""	at com.mysql.cj.jdbc.exceptio…","""TOKEN_access_token_auth_header…","""""","""""","""""","""""","""""","""	at com.mysql.cj.jdbc.exceptio…","""ab972341""","""at *""","[""	at"", ""com.mysql.cj.jdbc.exceptions.SQLError.createSQLException(SQLError.java:97)""]",2,"[""	at"", "" co"", … "":97""]",76,"""e11""","""e15""","""e11""","""d712aeda""","""E12""",78,0,1,1
1092,,""" [XNIO-1 task-4] T3jyu-cXR-CN…","""TOKEN_access_token_auth_header…","""XNIO-1 task-4""","""T3jyu-cXR-CNNX5s1zvLXg""","""DEBUG""","""""","""""",""" [XNIO-<NUM> task-<NUM>] T3jy…","""03f9324a""","""[XNIO-<NUM> task-<NUM>] * DEBU…","["""", ""[XNIO-1"", … ""http://localhost:8080/authorization""]",15,"["" [X"", ""NIO"", … ""ion""]",183,"""e9""","""e13""","""e10""","""987e1ed3""","""E2""",185,0,1,1
504,,""" [XNIO-1 task-2] I_-ih0zXRBuk…","""TOKEN_access_token_auth_header…","""XNIO-1 task-2""","""I_-ih0zXRBuk1d7-c_TGMQ""","""DEBUG""","""""","""""",""" [XNIO-<NUM> task-<NUM>] I_-i…","""4c99f89d""","""[XNIO-<NUM> task-<NUM>] * DEBU…","["""", ""[XNIO-1"", … ""http://localhost:8080/authorization""]",15,"["" [X"", ""NIO"", … ""ion""]",183,"""e30""","""e13""","""e10""","""5fea72f6""","""E2""",185,0,1,1
2606,,""" [hz._hzInstance_1_dev.partiti…","""TOKEN_access_token_auth_header…","""hz._hzInstance_1_dev.partition…","""DEBUG""","""c""","""""","""""",""" [hz._hzInstance_<NUM>_dev.par…","""9bf2a03b""","""[hz._hzInstance_<NUM>_dev.part…","["""", ""[hz._hzInstance_1_dev.partition-operation.thread-10]"", … ""Load:c32af8be""]",9,"["" [h"", ""z._"", … ""f8b""]",112,"""e3""","""e4""","""e7""","""e54efadb""","""E6""",114,0,1,1
587,,"""	at com.networknt.audit.AuditH…","""TOKEN_access_token_auth_header…","""""","""""","""""","""""","""""","""	at com.networknt.audit.AuditH…","""ab972341""","""at *""","[""	at"", ""com.networknt.audit.AuditHandler.next(AuditHandler.java:192)""]",2,"[""	at"", "" co"", … "":19""]",62,"""e11""","""e15""","""e11""","""987e1ed3""","""E12""",64,0,1,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2278,,""" [XNIO-1 task-4] diK46mjOQg21…","""TOKEN_access_token_auth_header…","""XNIO-1 task-4""","""diK46mjOQg21SM7ZOqaveg""","""DEBUG""","""""","""""",""" [XNIO-<NUM> task-<NUM>] diK4…","""03f9324a""","""[XNIO-<NUM> task-<NUM>] * DEBU…","["""", ""[XNIO-1"", … ""http://localhost:8080/authorization""]",15,"["" [X"", ""NIO"", … ""ion""]",183,"""e9""","""e13""","""e10""","""987e1ed3""","""E2""",185,0,1,1
111,,""" [hz._hzInstance_1_dev.partiti…","""TOKEN_access_token_auth_header…","""hz._hzInstance_1_dev.partition…","""DEBUG""","""c""","""""","""""",""" [hz._hzInstance_<NUM>_dev.par…","""9bf2a03b""","""[hz._hzInstance_<NUM>_dev.part…","["""", ""[hz._hzInstance_1_dev.partition-operation.thread-3]"", … ""Load:e1a67eac""]",9,"["" [h"", ""z._"", … ""eac""]",111,"""e3""","""e4""","""e7""","""e54efadb""","""E6""",113,0,1,1
2346,,"""	at com.networknt.audit.AuditH…","""TOKEN_access_token_auth_header…","""""","""""","""""","""""","""""","""	at com.networknt.audit.AuditH…","""ab972341""","""at *""","[""	at"", ""com.networknt.audit.AuditHandler.next(AuditHandler.java:192)""]",2,"[""	at"", "" co"", … "":19""]",62,"""e11""","""e15""","""e11""","""987e1ed3""","""E12""",64,0,1,1
572,,""" [hz._hzInstance_1_dev.partiti…","""TOKEN_access_token_auth_header…","""hz._hzInstance_1_dev.partition…","""DEBUG""","""c""","""""","""""",""" [hz._hzInstance_<NUM>_dev.par…","""9bf2a03b""","""[hz._hzInstance_<NUM>_dev.part…","["""", ""[hz._hzInstance_1_dev.partition-operation.thread-5]"", … ""Load:3a0c09fb-21ab-400a-a1d5-73f20734af82""]",9,"["" [h"", ""z._"", … ""f82""]",138,"""e3""","""e4""","""e7""","""e54efadb""","""E6""",140,0,1,1


row_nr,m_timestamp,m_message,source_file,thread,request_id,level,class_method,log_text,e_message_normalized,e_event_spell_id,e_template_spell,e_words,e_words_len,e_trigrams,e_trigrams_len,e_event_drain_id,e_event_tip_id,e_event_pliplom_id,e_event_iplom_id,e_event_brain_id,e_chars_len,e_lines_len,e_event_id_len,pred_ano
u32,datetime[μs],str,str,str,str,str,str,str,str,str,str,list[str],u32,list[str],u32,str,str,str,str,str,u32,u32,i32,i64
75400,,"""	at com.hazelcast.spi.Operatio…","""TOKEN_code_challenge_invalid_f…","""""","""""","""""","""""","""""","""	at com.hazelcast.spi.Operatio…","""ab972341""","""at *""","[""	at"", ""com.hazelcast.spi.Operation.call(Operation.java:170)""]",2,"[""	at"", "" co"", … ""70)""]",54,"""e11""","""e215""","""e22e1""","""1adbe184""","""E15""",56,0,1,1
110098,,"""	at com.hazelcast.map.impl.map…","""TOKEN_register_user_400_email_…","""""","""""","""""","""""","""""","""	at com.hazelcast.map.impl.map…","""ab972341""","""at *""","[""	at"", ""com.hazelcast.map.impl.mapstore.writethrough.WriteThroughStore.remove(WriteThroughStore.java:56)""]",2,"[""	at"", "" co"", … ""a:5""]",98,"""e11""","""e215""","""e22e1""","""e54efadb""","""E15""",100,0,1,1
110093,,"""Jun 28, 2024 4:40:42 PM com.ha…","""TOKEN_register_user_400_email_…","""""","""""","""""","""""","""""","""Jun <NUM>, <NUM> <NUM>:<NUM>:<…","""57f3ae84""","""Jun <NUM>, <NUM> <NUM>:<NUM>:<…","[""Jun"", ""28,"", … ""com.hazelcast.map.impl.operation.DeleteOperation""]",6,"[""Jun"", "" 28"", … ""tio""]",70,"""e39""","""e150""","""e2e2""","""e54efadb""","""E24""",72,0,1,1
40150,,"""	at com.networknt.oauth.token.…","""TOKEN_access_token_form_urlenc…","""""","""""","""""","""""","""""","""	at com.networknt.oauth.token.…","""ab972341""","""at *""","[""	at"", ""com.networknt.oauth.token.handler.Oauth2TokenPostHandler.handleRequest(Oauth2TokenPostHandler.java:98)""]",2,"[""	at"", "" co"", … ""a:9""]",104,"""e11""","""e215""","""e22e1""","""987e1ed3""","""E15""",106,0,1,1
92917,,"""	at com.hazelcast.spi.impl.ope…","""TOKEN_access_token_client_secr…","""""","""""","""""","""""","""""","""	at com.hazelcast.spi.impl.ope…","""ab972341""","""at *""","[""	at"", ""com.hazelcast.spi.impl.operationservice.impl.OperationRunnerImpl.run(OperationRunnerImpl.java:416)""]",2,"[""	at"", "" co"", … ""416""]",100,"""e11""","""e215""","""e22e1""","""987e1ed3""","""E15""",102,0,1,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
93274,,"""	at com.networknt.oauth.token.…","""TOKEN_access_token_client_secr…","""""","""""","""""","""""","""""","""	at com.networknt.oauth.token.…","""ab972341""","""at *""","[""	at"", ""com.networknt.oauth.token.handler.Oauth2TokenPostHandler.validateClientSecret(Oauth2TokenPostHandler.java:577)""]",2,"[""	at"", "" co"", … ""577""]",112,"""e11""","""e215""","""e22e1""","""987e1ed3""","""E15""",114,0,1,1
30325,,"""	at com.mysql.cj.jdbc.ClientPr…","""TOKEN_register_service_400_ser…","""""","""""","""""","""""","""""","""	at com.mysql.cj.jdbc.ClientPr…","""ab972341""","""at *""","[""	at"", ""com.mysql.cj.jdbc.ClientPreparedStatement.executeUpdateInternal(ClientPreparedStatement.java:1094)""]",2,"[""	at"", "" co"", … ""094""]",100,"""e11""","""e215""","""e22e1""","""e54efadb""","""E15""",102,0,1,1
29740,,"""	at com.hazelcast.map.impl.rec…","""TOKEN_register_service_400_ser…","""""","""""","""""","""""","""""","""	at com.hazelcast.map.impl.rec…","""ab972341""","""at *""","[""	at"", ""com.hazelcast.map.impl.recordstore.DefaultRecordStore.delete(DefaultRecordStore.java:455)""]",2,"[""	at"", "" co"", … ""455""]",91,"""e11""","""e215""","""e22e1""","""e54efadb""","""E15""",93,0,1,1
60763,,"""	at com.hazelcast.map.impl.ope…","""TOKEN_register_user_400_passwo…","""""","""""","""""","""""","""""","""	at com.hazelcast.map.impl.ope…","""ab972341""","""at *""","[""	at"", ""com.hazelcast.map.impl.operation.DeleteOperation.run(DeleteOperation.java:44)""]",2,"[""	at"", "" co"", … "":44""]",79,"""e11""","""e215""","""e22e1""","""987e1ed3""","""E15""",81,0,1,1


In [61]:
df.select([
    pl.col("e_words_len").is_null().sum().alias("e_words_len_nulls"),
    pl.col("e_trigrams_len").is_null().sum().alias("e_trigrams_len_nulls"),
    pl.col("e_chars_len").is_null().sum().alias("e_chars_len_nulls"),
    pl.col("e_lines_len").is_null().sum().alias("e_lines_len_nulls"),
    pl.col("e_event_id_len").is_null().sum().alias("e_event_id_len_nulls")
])


e_words_len_nulls,e_trigrams_len_nulls,e_chars_len_nulls,e_lines_len_nulls,e_event_id_len_nulls
u32,u32,u32,u32,u32
0,0,0,0,0


In [62]:
df.filter(
    pl.col("e_words").list.lengths() > 0
).select(pl.count())


AttributeError: 'ExprListNameSpace' object has no attribute 'lengths'

In [None]:
anomalies = 

In [11]:
df.columns

['row_nr',
 'm_timestamp',
 'm_message',
 'source_file',
 'thread',
 'request_id',
 'level',
 'class_method',
 'log_text',
 'e_message_normalized',
 'e_event_spell_id',
 'e_template_spell',
 'e_words',
 'e_words_len',
 'e_trigrams',
 'e_trigrams_len',
 'e_event_drain_id',
 'e_event_tip_id',
 'e_event_pliplom_id',
 'e_event_iplom_id',
 'e_event_brain_id',
 'e_chars_len',
 'e_lines_len',
 'e_event_id_len',
 'pred_ano']

In [13]:
df.head(100).to_pandas().to_excel("LO2output.xlsx", index=False, engine="openpyxl")

In [11]:
df.write_parquet(f"../data/light-oauth2-data-1719592986.parquet")