In [1]:
from pathlib import Path
import random
from src.extract_logs import stream_lines
import pandas as pd
import numpy as np
import config.config as conf
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
import re
from collections import Counter,defaultdict

In [2]:
max_lines_per_label_train = conf.MAX_LINES_PER_LABEL_TRAIN
max_lines_per_label_test = conf.MAX_LINES_PER_LABEL_TEST

max_lines_per_app_train = conf.MAX_LINES_PER_APP_TRAIN
max_lines_per_app_test = conf.MAX_LINES_PER_APP_TEST

max_info_ratio = conf.MAX_INFO_RATIO

error_pattern = re.compile(conf.ERROR_PATTERN,re.I)
warn_pattern = re.compile(conf.WARN_PATTERN,re.I)

processed_data_path = Path(conf.PROCESSED_DATA_PATH)

In [None]:
def stratified_app_split(app_to_label, train_size=0.7, val_size=0.15, test_size=0.15, seed=42):
    apps = np.array(sorted(app_to_label.keys()))
    y = np.array([app_to_label[a] for a in apps])

    sss1 = StratifiedShuffleSplit(n_splits=1, train_size=train_size, random_state=seed)
    train_idx, vt_idx = next(sss1.split(apps, y))

    apps_train = apps[train_idx]
    y_vt = y[vt_idx]

    vt_apps = apps[vt_idx]
    test_ratio_in_vt = test_size / (val_size + test_size)
    sss2 = StratifiedShuffleSplit(n_splits=1, test_size=test_ratio_in_vt, random_state=seed+1)
    val_idx_rel, test_idx_rel = next(sss2.split(vt_apps, y_vt))

    apps_val  = vt_apps[val_idx_rel]
    apps_test = vt_apps[test_idx_rel]
    return set(apps_train), set(apps_val), set(apps_test)


def line_priority(line:str)->float:
    if error_pattern.search(line):
        return 3.0
    if(error_pattern).search(line):
        return 1.5
    return 1.0

def collect_lines_for_split(apps_subset, app_to_label, per_label_cap=max_lines_per_label_train, per_app_cap=max_lines_per_app_train, max_info_ratio=max_info_ratio):
    per_label_counts = Counter()
    texts,labels = [],[]
    app_to_indices = defaultdict(list)

    apps_list = list(apps_subset)
    random.shuffle(apps_list)

    inv_df = pd.read_csv(processed_data_path/"inventory.csv")


    for app in apps_list:
        label = app_to_label[app]
        
        log_files_paths = list(inv_df[inv_df["application"]==app]["file_path"])

        scored = []

        for lf_path in log_files_paths:
            lines = stream_lines(Path(lf_path))
            for line in lines:
                scored.append((line_priority(line),line))

        if not scored:continue

        scored.sort(key=lambda x:x[0],reverse=True)

        chosen, info_count = [],0
        info_limit = int(max_info_ratio*per_app_cap) if per_app_cap>0 else 0
        for score,line in scored:
            if(len(chosen)>per_app_cap):
                break
            is_info = (score==1.0)
            if is_info and info_count>=info_limit:
                continue
            chosen.append(line)
            if is_info:
                info_count+=1

        for line in chosen:
            if(per_label_counts[app]> per_label_cap):
                break
            idx = len(texts)
            texts.append(line)
            labels.append(label)
            app_to_indices[app].append(idx)
            per_label_counts[app]+=1

    return texts,labels,app_to_indices

In [None]:
app_stats_df = pd.read_csv(processed_data_path/"app_status.csv",index_col="application")
app_to_label = dict()
for row in app_stats_df.iterrows():
    app = row[0]
    app_to_label[app] = app_stats_df.loc[app,"label"]

labels_set = sorted(set(app_to_label.values()))

apps = np.array(sorted(app_to_label.keys()))
y= np.array([app_to_label[a] for a in apps])

apps_train, apps_val, apps_test = stratified_app_split(app_to_label)

X_train_lines, y_train_labels, appidx_train = collect_lines_for_split(apps_train, app_to_label)
X_val_lines, y_val_labels, appidx_val = collect_lines_for_split(apps_val, app_to_label)
X_test_lines, y_test_labels, appidx_test = collect_lines_for_split(apps_test, app_to_label)

le = LabelEncoder()
le.fit(list(labels_set)) 
y_train = le.transform(y_train_labels)
y_val   = le.transform(y_val_labels)
y_test  = le.transform(y_test_labels)

num_classes = len(le.classes_)


In [None]:
import tensorflow as tf
from keras import layers,optimizers,callbacks,regularizers
from sklearn.metrics import classification_report,confusion_matrix,f1_score,accuracy_score
# from tensorflow.keras import layers,optimizers,callbacks,regularizers

SEQ_LEN = conf.SEQ_LEN     
VOCAB = conf.VOCAB       
EMBED_DIM = conf.EMBED_DIM
DROPOUT = conf.DROPOUT
BATCH_SIZE = conf.BATCH_SIZE
EPOCHS = conf.EPOCHS
BASE_LR = conf.BASE_LR     
CONF_FLOOR = conf.BASE_LR

In [None]:
vec = layers.TextVectorization(
    standardize=None,
    split="character",
    output_mode="int",
    output_sequence_length=SEQ_LEN,
    vocabulary=VOCAB
)
vec.adapt(tf.data.Dataset.from_tensor_slices(np.array(X_train_lines, dtype=object)).batch(2048))
vocab = vec.get_vocabulary()
with open(Path(conf.PROCESSED_DATA_PATH) / "char_vocab.txt", "w", encoding="utf-8") as f:
    for tok in vocab:
        f.write(tok + "\n")
print("Vocab size:", len(vocab))


def make_ds(texts, labels, batch_size, shuffle=False):
    ds = tf.data.Dataset.from_tensor_slices((np.array(texts, dtype=object), np.array(labels, dtype=np.int64)))
    if shuffle:
        ds = ds.shuffle(buffer_size=min(len(texts), 100000), seed=42, reshuffle_each_iteration=True)
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds

def vectorize_batch(text, label):
    return vec(text), label

train_ds = make_ds(X_train_lines, y_train, BATCH_SIZE, shuffle=True).map(vectorize_batch, num_parallel_calls=tf.data.AUTOTUNE)
val_ds   = make_ds(X_val_lines,   y_val,   BATCH_SIZE, shuffle=False).map(vectorize_batch, num_parallel_calls=tf.data.AUTOTUNE)
test_ds  = make_ds(X_test_lines,  y_test,  BATCH_SIZE, shuffle=False).map(vectorize_batch, num_parallel_calls=tf.data.AUTOTUNE)

Vocab size: 90


In [None]:
def build_model(vocab_size, num_classes):
    inputs = layers.Input(shape=(SEQ_LEN,), dtype=tf.int64)
    x = layers.Embedding(vocab_size, EMBED_DIM, mask_zero=True,
                         embeddings_regularizer=regularizers.l2(1e-6))(inputs)

    b1 = layers.Conv1D(96, 3, padding="same", activation="relu",
                       kernel_regularizer=regularizers.l2(1e-6))(x)
    b2 = layers.Conv1D(96, 5, padding="same", activation="relu",
                       kernel_regularizer=regularizers.l2(1e-6))(x)
    b3 = layers.Conv1D(96, 7, padding="same", activation="relu",
                       kernel_regularizer=regularizers.l2(1e-6))(x)
    x = layers.Concatenate()([b1, b2, b3])
    x = layers.BatchNormalization()(x)
    x = layers.MaxPooling1D(pool_size=2)(x)
    x = layers.SpatialDropout1D(0.25)(x)

    x = layers.Bidirectional(layers.LSTM(80, return_sequences=True,
                                         dropout=0.2,
                                         kernel_regularizer=regularizers.l2(1e-6)))(x)
    x = layers.GlobalMaxPooling1D()(x)

    x = layers.Dense(128, activation="relu", kernel_regularizer=regularizers.l2(1e-6))(x)
    x = layers.Dropout(DROPOUT)(x)
    outputs = layers.Dense(num_classes, activation="softmax")(x)

    model = tf.keras.Model(inputs, outputs)
    return model

model = build_model(vocab_size=len(vocab), num_classes=num_classes)
model.summary()



In [None]:

opt = optimizers.Adam(learning_rate=BASE_LR)
model.compile(optimizer=opt, loss="sparse_categorical_crossentropy", metrics=["accuracy"])

cb = [
    callbacks.EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True),
    callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=1, min_lr=1e-5, verbose=1),
    callbacks.ModelCheckpoint(Path(conf.EXTRACTED_DATA_PATH) / "line_charcnn_lstm.keras", save_best_only=True, monitor="val_loss")
]

# Training

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=cb,
    verbose=1
)

# Evaluation: line-level
y_pred_prob = model.predict(test_ds, verbose=0)
y_pred = y_pred_prob.argmax(axis=1)

print("\n[Line-level] Accuracy: %.4f  Macro-F1: %.4f" %(accuracy_score(y_test, y_pred), f1_score(y_test, y_pred, average="macro")))
print("\n[Line-level] Classification Report\n",classification_report(y_test, y_pred, target_names=list(le.classes_)))


Epoch 1/12


[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m350s[0m 3s/step - accuracy: 0.7593 - loss: 0.6229 - val_accuracy: 0.2032 - val_loss: 1.3940 - learning_rate: 0.0010
Epoch 2/12
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m370s[0m 3s/step - accuracy: 0.9351 - loss: 0.1862 - val_accuracy: 0.2404 - val_loss: 1.3865 - learning_rate: 0.0010
Epoch 3/12
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m388s[0m 3s/step - accuracy: 0.9540 - loss: 0.1357 - val_accuracy: 0.6021 - val_loss: 1.0793 - learning_rate: 0.0010
Epoch 4/12
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m394s[0m 4s/step - accuracy: 0.9577 - loss: 0.1234 - val_accuracy: 0.5767 - val_loss: 0.7807 - learning_rate: 0.0010
Epoch 5/12
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - accuracy: 0.9623 - loss: 0.1137
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m378s[0m 

In [None]:

idx_to_app = [None] * len(X_test_lines)
for app, indices in appidx_test.items():
    for idx in indices:
        if 0 <= idx < len(idx_to_app):
            idx_to_app[idx] = app

app_true = {}
app_probs = defaultdict(list)

for i, app in enumerate(idx_to_app):
    if app is None:
        continue
    app_true.setdefault(app, y_test[i])
    # keep only reasonably confident lines (tune CONF_FLOOR 0.4–0.6)
    if float(np.max(y_pred_prob[i])) >= CONF_FLOOR:
        app_probs[app].append(y_pred_prob[i])

app_level_true, app_level_pred = [], []
for app in app_true.keys():
    probs = app_probs.get(app, None)
    if not probs:  # if all lines filtered, fallback to all lines for that app
        probs = [y_pred_prob[i] for i, a in enumerate(idx_to_app) if a == app]
    mean_prob = np.mean(probs, axis=0)
    app_level_pred.append(int(np.argmax(mean_prob)))
    app_level_true.append(int(app_true[app]))

print("\n[App-level from mean prob] Accuracy: %.4f  Macro-F1: %.4f" %
      (accuracy_score(app_level_true, app_level_pred),
       f1_score(app_level_true, app_level_pred, average="macro")))
print("\n[App-level] Classification Report\n",classification_report(app_level_true, app_level_pred, target_names=list(le.classes_)))
print("\n[App-level] Confusion Matrix\n",
                pd.DataFrame(confusion_matrix(app_level_true, app_level_pred),
                   index=[f"true_{c}" for c in le.classes_],
                   columns=[f"pred_{c}" for c in le.classes_]))


[App-level from mean prob] Accuracy: 1.0000  Macro-F1: 1.0000

[App-level] Classification Report
                        precision    recall  f1-score   support

            Disk full       1.00      1.00      1.00         2
         Machine down       1.00      1.00      1.00         5
Network disconnection       1.00      1.00      1.00         1
               Normal       1.00      1.00      1.00         1

             accuracy                           1.00         9
            macro avg       1.00      1.00      1.00         9
         weighted avg       1.00      1.00      1.00         9


[App-level] Confusion Matrix
                             pred_Disk full  pred_Machine down  \
true_Disk full                           2                  0   
true_Machine down                        0                  5   
true_Network disconnection               0                  0   
true_Normal                              0                  0   

                            pred_Netw

In [17]:
import joblib

model.save(Path(conf.PROCESSED_DATA_PATH) / "line_charcnn_lstm_final.keras")

vec_config = vec.get_config()
vec_weights = vec.get_weights()
vec_vocabulary = vec.get_vocabulary()
joblib.dump({"config": vec_config, "weights": vec_weights,"vocabulary":vec_vocabulary}, Path(conf.PROCESSED_DATA_PATH) / "textvectorization_char.pkl")


joblib.dump(le, Path(conf.PROCESSED_DATA_PATH) / "label_encoder_line.joblib")

['data\\processed\\label_encoder_line.joblib']

In [18]:
print(le.classes_)

['Disk full' 'Machine down' 'Network disconnection' 'Normal']
