# Data Preprocess

In [1]:
import pandas as pd
import re
import os

# Merge 
folder = "data/sourcedata"
files = [os.path.join(folder, f) for f in os.listdir(folder) if f.startswith("messages-")]
records = []
for f in files:
    with open(f, 'r', encoding='utf-8') as fh:
        for line in fh:
            ts, host, text = line.strip().split(' ', 2)
            records.append({'timestamp': ts, 'host': host, 'text': text})

df = pd.DataFrame(records)
print(f"Data merge successful, total {len(df)} records")

# Label (normal/abnormal/warning)
def label_fn(txt):
    if re.search(r'(?i)error', txt):
        return 'abnormal'
    elif re.search(r'(?i)(alert|fail|warning)', txt):
        return 'warning'
    else:
        return 'normal'

df['label'] = df['text'].apply(label_fn)

# Statistics for normal, abnormal, and warning
total = len(df)
n_abnormal = (df['label'] == 'abnormal').sum()
n_warning = (df['label'] == 'warning').sum()
n_normal = (df['label'] == 'normal').sum()
abnormal_ratio = n_abnormal / total if total > 0 else 0
warning_ratio = n_warning / total if total > 0 else 0

print(f"Abnormal ratio: {abnormal_ratio:.2%}, Warning ratio: {warning_ratio:.2%}, Normal: {n_normal}, Warning: {n_warning}, Abnormal: {n_abnormal}")

# 按时间排序
df = df.sort_values(by='timestamp').reset_index(drop=True)

# 下采样参数
n_per_class = 500

# 下采样各类别
abnormal = df[df['label'] == 'abnormal']
warning = df[df['label'] == 'warning']
normal = df[df['label'] == 'normal']

abnormal_sample = abnormal.sample(n=min(n_per_class, len(abnormal)), random_state=42)
warning_sample = warning.sample(n=min(n_per_class, len(warning)), random_state=42)
normal_sample = normal.sample(n=min(n_per_class, len(normal)), random_state=42)

# 合并训练集并打乱
train_df = pd.concat([normal_sample, abnormal_sample, warning_sample]).sample(frac=1, random_state=42).reset_index(drop=True)

# 剩余数据作为测试集候选
remaining_df = df.drop(train_df.index).reset_index(drop=True)
n_test = min(1000, len(remaining_df))
test_df = remaining_df.sample(n=n_test, replace=False, random_state=42).reset_index(drop=True)

print(f"Number of training samples: {len(train_df)}")
print(f"Number of test samples: {len(test_df)}")

os.makedirs('data/sampledatasets', exist_ok=True)
train_df.to_json('data/sampledatasets/messages-train.jsonl', orient='records', lines=True, force_ascii=False)
test_df.to_json('data/sampledatasets/messages-test.jsonl', orient='records', lines=True, force_ascii=False)

print("Train set label counts:")
print(train_df['label'].value_counts())
print("Test set label counts:")
print(test_df['label'].value_counts())

Data merge successful, total 15921814 records
Number of training samples: 1500
Number of test samples: 1000
Train set label counts:
label
normal      500
abnormal    500
Name: count, dtype: int64
Test set label counts:
label
normal      932
abnormal      3
Name: count, dtype: int64


In [2]:
import pandas as pd
import re

# Try to read as plain text and print a few lines for inspection
with open("data/sourcedata/messages-20250602", "r", encoding="utf-8") as f:
	lines = f.readlines()

# 打印normal, warning, abnormal内容各2条（过滤掉包含网址的行）

def label_fn(txt):
    if re.search(r'(?i)error', txt):
        return 'abnormal'
    elif re.search(r'(?i)(alert|fail|warning)', txt):
        return 'warning'
    else:
        return 'normal'

def has_url(s):
    return bool(re.search(r'https?://|www\.', s))

normal_lines = []
warning_lines = []
abnormal_lines = []

for line in lines:
    if has_url(line):
        continue
    label = label_fn(line)
    if label == 'normal' and len(normal_lines) < 2:
        normal_lines.append(line.strip())
    elif label == 'warning' and len(warning_lines) < 2:
        warning_lines.append(line.strip())
    elif label == 'abnormal' and len(abnormal_lines) < 2:
        abnormal_lines.append(line.strip())
    if len(normal_lines) == 2 and len(warning_lines) == 2 and len(abnormal_lines) == 2:
        break

print("Normal:")
for l in normal_lines:
    print(l)
print("\nWarning:")
for l in warning_lines:
    print(l)
print("\nAbnormal:")
for l in abnormal_lines:
    print(l)

Normal:
2025-05-25T03:49:02.962309+08:00 hivenode02 infinity[2781294]: monitor net list is null
2025-05-25T03:49:03.441086+08:00 hivenode02 xinetd[4580]: START: mysql_status pid=1221371 from=::ffff:10.108.196.201

2025-05-25T03:49:02.829892+08:00 hivenode02 logrotate: ALERT exited abnormally with [1]

Abnormal:
2025-05-25T03:49:03.607982+08:00 hivenode02 named[7310]: error (network unreachable) resolving 'clients1.google.com/A/IN': 2001:7fd::1#53
2025-05-25T03:49:04.303444+08:00 hivenode02 named[7310]: error (network unreachable) resolving './NS/IN': 2001:500:2f::f#53


In [3]:
import pandas as pd

df = pd.read_json("data/sampledatasets/messages-train.jsonl", lines=True)
print(df[df['label'] == 'normal'])
print(df[df['label'] == 'warning'])
print(df[df['label'] == 'abnormal'])

df = pd.read_json("data/sampledatasets/messages-test.jsonl", lines=True)
print(df[df['label'] == 'normal'])
print(df[df['label'] == 'warning'])
print(df[df['label'] == 'abnormal'])

                            timestamp        host  \
2    2025-05-28 22:11:48.963670+08:00  hivenode02   
3    2025-05-29 08:18:00.546461+08:00  hivenode02   
4    2025-05-16 09:21:47.780379+08:00  hivenode02   
9    2025-05-13 15:42:09.094117+08:00  hivenode02   
14   2025-05-15 11:15:04.157428+08:00  hivenode02   
...                               ...         ...   
1483 2025-05-17 01:50:43.113434+08:00  hivenode02   
1487 2025-05-18 22:53:32.581086+08:00  hivenode02   
1489 2025-05-17 14:37:06.078631+08:00  hivenode02   
1491 2025-05-13 01:09:26.753736+08:00  hivenode02   
1492 2025-05-24 19:12:44.584002+08:00  hivenode02   

                                                   text   label  
2     python: .> infinity_queue   exchange=infinity_...  normal  
3     xinetd[4580]: START: mysql_status pid=2044170 ...  normal  
4     xinetd[4580]: START: mysql_status pid=2847661 ...  normal  
9     xinetd[4580]: START: mysql_status pid=3604329 ...  normal  
14      systemd: Starting Session

# LLM & Fine-Tunning & RAG

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import (
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
# from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset
from sentence_transformers import SentenceTransformer
import faiss

# 1. Data Loading
train_df = pd.read_json('data/sampledatasets/messages-train.jsonl', lines=True)
test_df = pd.read_json('data/sampledatasets/messages-test.jsonl', lines=True)

# Label mapping
# label_to_id = {'normal': 0, 'warning': 1, 'abnormal': 2}
# id_to_label = {v: k for k, v in label_to_id.items()}
# for df in (train_df, test_df):
#    if df['label'].dtype == 'object':
#        df['label'] = df['label'].map(label_to_id)
# num_labels = len(label_to_id)

# Class weights
# y = train_df['label'].values
# classes = np.unique(y)
# weights = compute_class_weight('balanced', classes=classes, y=y)
# weights = weights * np.array([1.0, 5.0, 10.0])
# class_weights = torch.tensor(weights, dtype=torch.float).to('cuda')

# 2. Label Embeddings
embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

unique_labels = train_df['label'].unique().tolist()
print(f"Found labels: {unique_labels}")

# labels --> embeddings
label_embeddings = embedder.encode(unique_labels)
embedding_dim = label_embeddings.shape[1]

# Mapping labels --> embeddings
label_to_embedding = {label: label_embeddings[i] for i, label in enumerate(unique_labels)}
# Mapping embeddings --> labels
embedding_to_label = {tuple(emb): label for label, emb in label_to_embedding.items()}

# 3. Build FAISS index for RAG KB
kb = [
    {"text": item['text'], "solution": item['solution']} 
    for item in [
        {
        "text": "systemd: infi-celery.service: main process exited, code=exited, status=1/FAILURE",
        "solution": "Check the infi-celery service log, verify configuration and dependencies, and try restarting the service."
    },
    {
        "text": "systemd: infi-celery.service holdoff time over, scheduling restart.",
        "solution": "The service will automatically restart after an exception. Investigate the root cause of the abnormal exit."
    },
    {
        "text": "systemd: infi-celery.service: control process exited, code=exited status=1",
        "solution": "Check the control process log and verify service configuration and permissions."
    },
    {
        "text": "systemd: Started Infinity Celery Worker Service.",
        "solution": "The service started successfully. No action required."
    },
    {
        "text": "systemd-logind: Removed session",
        "solution": "User session was removed. This is usually a normal operation."
    },
    {
        "text": "xinetd[4580]: EXIT: mysql_status status=0",
        "solution": "MySQL status check is normal. No action required."
    },
    {
        "text": "xinetd[4580]: EXIT: zk_status status=0",
        "solution": "Zookeeper status check is normal. No action required."
    },
    {
        "text": "python: /usr/lib/python2.7/site-packages/celery/platforms.py:796: RuntimeWarning: You're running the worker with superuser privileges: this is",
        "solution": "It is not recommended to run celery worker as root. Please use a regular user."
    },
    {
        "text": "infinity[4139103]: an error occurred while requesting bindings <urlopen error [Errno 111] Connection refused>",
        "solution": "Check network connectivity and ensure the target service port is open."
    },
    {
        "text": "kill: kill: cannot find process",
        "solution": "The target process does not exist. Please verify the process ID."
    }
    ]
]
kb_texts = [entry['text'] for entry in kb]
kb_embs = embedder.encode(kb_texts)
dim = kb_embs.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(kb_embs)

def retrieve_solutions(text, k=3):
    emb = embedder.encode([text])
    D, I = index.search(np.array(emb), k)
    return " ".join(kb[i]['solution'] for i in I[0])

# 4. Text --> Vector Model
class VectorOutputModel(nn.Module):
    def __init__(self, embedding_dim):
        super(VectorOutputModel, self).__init__()
        self.base_model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
        self.vector_head = nn.Linear(self.base_model.config.hidden_size, embedding_dim)
        self.embedding_dim = embedding_dim
        
    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # Use first token
        vector_output = self.vector_head(pooled_output)
        
        batch_size = len(labels)
        target_embeddings = torch.zeros(batch_size, self.embedding_dim).to(vector_output.device)
            
        for i, label_text in enumerate(labels):
            if isinstance(label_text, torch.Tensor):
                label_text = label_text.item() if label_text.numel() == 1 else str(label_text)
            target_embeddings[i] = torch.tensor(label_to_embedding[str(label_text)]).to(vector_output.device)
            
            # Cos similarity loss (1 - cosine_similarity)
        cos_sim = nn.functional.cosine_similarity(vector_output, target_embeddings, dim=1)
        loss = 1 - cos_sim.mean()
            
        return {"loss": loss, "logits": vector_output}

# 5. Prepare datasets
d_train = Dataset.from_pandas(train_df)
d_test = Dataset.from_pandas(test_df)

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-0.6B')
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Tokenize function
def tokenize_fn(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

# Rename & format
d_train = d_train.rename_column('label', 'labels')
d_test = d_test.rename_column('label', 'labels')
for ds in (d_train, d_test):
    ds.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# 4. Model & LoRA setup
model = AutoModelForSequenceClassification.from_pretrained(
    'Qwen/Qwen3-0.6B', num_labels=num_labels
)
model.config.pad_token_id = tokenizer.pad_token_id

lora_cfg = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=['q_proj', 'v_proj'],
    lora_dropout=0.1,
    bias='none'
)
model = get_peft_model(model, lora_cfg)
model.to('cuda')

# 5. 修复的Metrics函数
def compute_metrics(eval_pred):
    """计算评估指标"""
    try:
        predictions, labels = eval_pred
        print(f"Debug: predictions shape: {predictions.shape if hasattr(predictions, 'shape') else type(predictions)}")
        print(f"Debug: labels shape: {labels.shape if hasattr(labels, 'shape') else type(labels)}")
        
        # 确保predictions是numpy数组
        if isinstance(predictions, tuple):
            predictions = predictions[0]
        
        # 转换为numpy数组
        if not isinstance(predictions, np.ndarray):
            predictions = np.array(predictions)
        if not isinstance(labels, np.ndarray):
            labels = np.array(labels)
        
        # 获取预测结果
        predictions = np.argmax(predictions, axis=-1)
        
        # 计算指标
        accuracy = accuracy_score(labels, predictions)
        f1 = f1_score(labels, predictions, average='weighted', zero_division=0)
        precision = precision_score(labels, predictions, average='weighted', zero_division=0)
        recall = recall_score(labels, predictions, average='weighted', zero_division=0)
        
        result = {
            'accuracy': float(accuracy),
            'f1': float(f1),
            'precision': float(precision),
            'recall': float(recall)
        }
        
        print(f"Debug: computed metrics: {result}")
        return result
        
    except Exception as e:
        print(f"Error in compute_metrics: {e}")
        # 返回默认指标以避免训练中断
        return {
            'accuracy': 0.0,
            'f1': 0.0,
            'precision': 0.0,
            'recall': 0.0
        }

# 6. Custom Trainer
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs['labels']
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

# 7. 修改后的Training Arguments - 使用更保守的设置
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    weight_decay=0.01,
    warmup_steps=100,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy='steps',
    eval_steps=100,
    save_steps=100,
    # 临时禁用最佳模型加载以避免指标问题
    load_best_model_at_end=False,
    # metric_for_best_model='eval_f1',  # 暂时注释掉
    # greater_is_better=True,  # 暂时注释掉
    fp16=True,
    dataloader_drop_last=False,
    # 确保评估数据集被使用
    do_eval=True,
    # 添加更多调试信息
    report_to=None,  # 禁用wandb等报告
)

# 8. Initialize Trainer - 添加调试信息
print("初始化Trainer...")
print(f"训练数据集大小: {len(d_train)}")
print(f"测试数据集大小: {len(d_test)}")
print(f"数据集列: {d_train.column_names}")

# 检查数据集格式
sample_batch = d_train[:2]
print(f"样本批次键: {sample_batch.keys()}")
for key, value in sample_batch.items():
    if hasattr(value, 'shape'):
        print(f"{key} shape: {value.shape}")
    else:
        print(f"{key} type: {type(value)}")

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=d_train,
    eval_dataset=d_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# 验证compute_metrics函数
print("测试compute_metrics函数...")
try:
    # 创建一个测试样本
    test_logits = np.random.rand(4, num_labels)  # 4个样本，3个类别
    test_labels = np.array([0, 1, 2, 1])
    test_result = compute_metrics((test_logits, test_labels))
    print(f"compute_metrics测试成功: {test_result}")
except Exception as e:
    print(f"compute_metrics测试失败: {e}")

# 9. Train
print("开始训练...")
trainer.train()

# 10. RAG Inference
def rag_predict(texts, model, tokenizer, k=3):
    model.eval()
    preds = []
    for txt in texts:
        context = retrieve_solutions(txt, k=k)
        inp = tokenizer(f"{txt} [CONTEXT] {context}",
                        return_tensors='pt',
                        padding=True,
                        truncation=True,
                        max_length=128)
        inp = {k: v.to('cuda') for k, v in inp.items()}
        with torch.no_grad():
            out = model(**inp)
        pred = torch.argmax(out.logits, dim=-1).cpu().item()
        preds.append(pred)
    return preds

# 11. 修复的评估部分
print("开始评估...")
test_texts = test_df['text'].tolist()
y_true = test_df['label'].tolist()

# 使用RAG预测
y_pred_rag = rag_predict(test_texts, model, tokenizer)

# 计算指标
accuracy = accuracy_score(y_true, y_pred_rag)
f1 = f1_score(y_true, y_pred_rag, average='weighted')
precision = precision_score(y_true, y_pred_rag, average='weighted')
recall = recall_score(y_true, y_pred_rag, average='weighted')

print("RAG预测结果:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

# 混淆矩阵
cm = confusion_matrix(y_true, y_pred_rag)
print("\n混淆矩阵:")
print(cm)

# 按类别显示结果
print(f"\n标签映射: {id_to_label}")
for i, label in id_to_label.items():
    mask = np.array(y_true) == i
    if mask.sum() > 0:
        class_acc = accuracy_score(np.array(y_true)[mask], np.array(y_pred_rag)[mask])
        print(f"{label} 类准确率: {class_acc:.4f}")

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Some weights of Qwen3ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen3-0.6B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = WeightedTrainer(
No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


初始化Trainer...
训练数据集大小: 1500
测试数据集大小: 1000
数据集列: ['timestamp', 'host', 'text', 'labels', 'input_ids', 'attention_mask']
样本批次键: dict_keys(['labels', 'input_ids', 'attention_mask'])
labels shape: torch.Size([2])
input_ids shape: torch.Size([2, 128])
attention_mask shape: torch.Size([2, 128])
测试compute_metrics函数...
Debug: predictions shape: (4, 3)
Debug: labels shape: (4,)
Debug: computed metrics: {'accuracy': 0.0, 'f1': 0.0, 'precision': 0.0, 'recall': 0.0}
compute_metrics测试成功: {'accuracy': 0.0, 'f1': 0.0, 'precision': 0.0, 'recall': 0.0}
开始训练...


Step,Training Loss,Validation Loss
100,1.5312,No log
200,0.2863,No log
300,0.0954,No log
400,0.0288,No log
500,0.0074,No log


开始评估...
RAG预测结果:
Accuracy: 0.9330
F1 Score: 0.9016
Precision: 0.9345
Recall: 0.9330

混淆矩阵:
[[932   0   0]
 [ 64   1   0]
 [  3   0   0]]

normal 类准确率: 1.0000
abnormal 类准确率: 0.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# SVM

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
import plotly.offline as pyo
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import logging
import os
from datetime import datetime

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Initialize Plotly for offline rendering in VSCode
pyo.init_notebook_mode(connected=True)

# Ensure NLTK data is downloaded
nltk.download('punkt', quiet=True)

# 1. Load preprocessed data
try:
    train_df = pd.read_json('data/sampledatasets/messages-train.jsonl', lines=True)
    test_df = pd.read_json('data/sampledatasets/messages-test.jsonl', lines=True)
    logger.info(f"Loaded {len(train_df)} training samples and {len(test_df)} test samples")
except FileNotFoundError as e:
    logger.error(f"Data file not found: {e}")
    raise
except Exception as e:
    logger.error(f"Error loading data: {e}")
    raise

# Verify data
assert 'text' in train_df.columns and 'label' in train_df.columns, "Train data must contain 'text' and 'label'"
assert 'text' in test_df.columns and 'label' in test_df.columns, "Test data must contain 'text' and 'label'"

# 2. Label encoding
label_encoder = LabelEncoder()
train_df['label_id'] = label_encoder.fit_transform(train_df['label'])
test_df['label_id'] = label_encoder.transform(test_df['label'])
label_mapping = dict(zip(label_encoder.classes_, range(len(label_encoder.classes_))))
logger.info(f"Label mapping: {label_mapping}")

# 3. Text preprocessing for Word2Vec
def preprocess_text(text):
    try:
        tokens = word_tokenize(text.lower())
        tokens = [t for t in tokens if t.isalpha()]
        return tokens
    except Exception as e:
        logger.warning(f"Error preprocessing text: {e}")
        return []

train_df['tokens'] = train_df['text'].apply(preprocess_text)
test_df['tokens'] = test_df['text'].apply(preprocess_text)

# 4. Train Word2Vec model
try:
    all_tokens = train_df['tokens'].tolist() + test_df['tokens'].tolist()
    word2vec = Word2Vec(sentences=all_tokens, vector_size=100, window=5, min_count=1, workers=4, sg=1)
    logger.info("Word2Vec model trained successfully")
except Exception as e:
    logger.error(f"Error training Word2Vec: {e}")
    raise

# Function to get sentence embedding
def get_sentence_embedding(tokens, model, vector_size=100):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(vector_size)

train_df['embedding'] = train_df['tokens'].apply(lambda x: get_sentence_embedding(x, word2vec))
test_df['embedding'] = test_df['tokens'].apply(lambda x: get_sentence_embedding(x, word2vec))

# Convert embeddings to feature matrix
try:
    X_train = np.stack(train_df['embedding'].values)
    y_train = train_df['label_id'].values
    X_test = np.stack(test_df['embedding'].values)
    y_test = test_df['label_id'].values
    logger.info(f"Training features shape: {X_train.shape}, Test features shape: {X_test.shape}")
except Exception as e:
    logger.error(f"Error creating feature matrices: {e}")
    raise

# 5. Define models
models = {
    'SVM': SVC(kernel='linear', class_weight='balanced', random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'RandomForest': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    'ANN': MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42),
    'DecisionTree': DecisionTreeClassifier(class_weight='balanced', random_state=42),
    'LogisticRegression': LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
}

# 6. Train and evaluate models
results = {}
per_class_accuracies = {}
for name, model in models.items():
    logger.info(f"Training {name}...")
    try:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        # Compute metrics
        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        cm = confusion_matrix(y_test, y_pred)
        
        # Compute per-class accuracy
        class_acc = {}
        for i, label in enumerate(label_encoder.classes_):
            mask = y_test == i
            if mask.sum() > 0:
                class_acc[label] = accuracy_score(y_test[mask], y_pred[mask])
            else:
                class_acc[label] = 0.0
        
        # Store results
        results[name] = {
            'accuracy': accuracy,
            'f1': f1,
            'precision': precision,
            'recall': recall,
            'confusion_matrix': cm
        }
        per_class_accuracies[name] = class_acc
        
        # Print results
        print(f"\n{name} Results:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print("Confusion Matrix:")
        print(cm)
        print(f"Per-class accuracy for {name}:")
        for label, acc in class_acc.items():
            print(f"{label} accuracy: {acc:.4f}")
    except Exception as e:
        logger.error(f"Error training/evaluating {name}: {e}")
        continue

# 7. Plotly Visualization Functions
def plot_model_metrics(results):
    model_names = list(results.keys())
    metrics = ['accuracy', 'f1', 'precision', 'recall']
    metric_names = ['Accuracy', 'F1 Score', 'Precision', 'Recall']
    colors = ['royalblue', 'firebrick', 'forestgreen', 'goldenrod']
    
    fig = go.Figure()
    for i, metric in enumerate(metrics):
        values = [results[model][metric] for model in model_names]
        fig.add_trace(go.Bar(
            x=model_names,
            y=values,
            name=metric_names[i],
            marker_color=colors[i],
            opacity=0.8,
            width=0.2,
            offset=i * 0.2 - 0.3,
            hovertemplate=f"{metric_names[i]}: %{{y:.4f}}<extra></extra>"
        ))
    
    fig.update_layout(
        title=dict(text="<b>Model Performance Comparison</b>", x=0.5, font=dict(size=20)),
        xaxis=dict(title="Model", gridcolor='lightgray', tickangle=45),
        yaxis=dict(title="Score", range=[0, 1.05], gridcolor='lightgray', tickfont=dict(color='black')),
        barmode='group',
        plot_bgcolor='white',
        hovermode="x unified",
        height=600,
        legend=dict(x=1.05, y=1.0, bgcolor='rgba(255,255,255,0.5)'),
        margin=dict(b=150)
    )
    fig.add_hline(y=0.8, line_dash="dash", line_color="orange", 
                  annotation_text="Threshold (0.8)", annotation_position="bottom right")
    fig.show()

def plot_confusion_matrices(results, label_classes):
    fig = go.Figure()
    for i, (name, result) in enumerate(results.items()):
        cm = result['confusion_matrix']
        fig.add_trace(go.Heatmap(
            z=cm,
            x=label_classes,
            y=label_classes,
            text=cm,
            texttemplate="%{text}",
            colorscale='Blues',
            hovertemplate='True: %{y}<br>Predicted: %{x}<br>Count: %{z}<extra></extra>',
            visible=(i == 0),  # Only first model visible initially
            name=name
        ))
    
    buttons = [
        dict(label=name, method="update", args=[{"visible": [i == j for j in range(len(results))]}], args2=[{"title": f"<b>Confusion Matrix - {name}</b>"}])
        for i, name in enumerate(results.keys())
    ]
    
    fig.update_layout(
        title=dict(text=f"<b>Confusion Matrix - {list(results.keys())[0]}</b>", x=0.5, font=dict(size=20)),
        xaxis=dict(title="Predicted Label", gridcolor='lightgray'),
        yaxis=dict(title="True Label", gridcolor='lightgray'),
        updatemenus=[dict(
            type="dropdown",
            buttons=buttons,
            x=1.05,
            y=1.1,
            showactive=True
        )],
        plot_bgcolor='white',
        height=500,
        width=500
    )
    fig.show()

def plot_per_class_accuracy(per_class_accuracies, label_classes):
    model_names = list(per_class_accuracies.keys())
    fig = go.Figure()
    colors = ['royalblue', 'firebrick', 'forestgreen']
    
    for i, label in enumerate(label_classes):
        values = [per_class_accuracies[model][label] for model in model_names]
        fig.add_trace(go.Bar(
            x=model_names,
            y=values,
            name=label,
            marker_color=colors[i % len(colors)],
            opacity=0.8,
            width=0.15,
            offset=i * 0.15 - 0.225,
            hovertemplate=f"{label} Accuracy: %{{y:.4f}}<extra></extra>"
        ))
    
    fig.update_layout(
        title=dict(text="<b>Per-Class Accuracy Comparison</b>", x=0.5, font=dict(size=20)),
        xaxis=dict(title="Model", gridcolor='lightgray', tickangle=45),
        yaxis=dict(title="Accuracy", range=[0, 1.05], gridcolor='lightgray', tickfont=dict(color='black')),
        barmode='group',
        plot_bgcolor='white',
        hovermode="x unified",
        height=600,
        legend=dict(x=1.05, y=1.0, bgcolor='rgba(255,255,255,0.5)'),
        margin=dict(b=150)
    )
    fig.add_hline(y=0.8, line_dash="dash", line_color="orange", 
                  annotation_text="Threshold (0.8)", annotation_position="bottom right")
    fig.show()

# 8. Generate Plots
logger.info("Generating Plotly Visualizations...")
try:
    plot_model_metrics(results)
    plot_confusion_matrices(results, label_encoder.classes_)
    plot_per_class_accuracy(per_class_accuracies, label_encoder.classes_)
except Exception as e:
    logger.error(f"Error generating visualizations: {e}")

# 9. Summary of Results
print("\nSummary of Results:")
summary_df = pd.DataFrame({
    'Model': results.keys(),
    'Accuracy': [results[m]['accuracy'] for m in results],
    'F1 Score': [results[m]['f1'] for m in results],
    'Precision': [results[m]['precision'] for m in results],
    'Recall': [results[m]['recall'] for m in results]
})
print(summary_df.to_markdown(index=False))


torch.clear_autocast_cache()

2025-06-17 11:21:33,839 - INFO - Loaded 1500 training samples and 1000 test samples
2025-06-17 11:21:34,021 - INFO - collecting all words and their counts
2025-06-17 11:21:34,021 - INFO - PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-06-17 11:21:34,024 - INFO - collected 138 word types from a corpus of 12032 raw words and 2500 sentences
2025-06-17 11:21:34,025 - INFO - Creating a fresh vocabulary
2025-06-17 11:21:34,026 - INFO - Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 138 unique words (100.00% of original 138, drops 0)', 'datetime': '2025-06-17T11:21:34.026571', 'gensim': '4.3.3', 'python': '3.12.10 | packaged by conda-forge | (main, Apr 10 2025, 22:08:16) [MSC v.1943 64 bit (AMD64)]', 'platform': 'Windows-11-10.0.26100-SP0', 'event': 'prepare_vocab'}
2025-06-17 11:21:34,027 - INFO - Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 12032 word corpus (100.00% of original 12032, drops 0)', 'datetime': '2025-06-17T11:21:34.027


SVM Results:
Accuracy: 0.9860
F1 Score: 0.9865
Precision: 0.9880
Recall: 0.9860
Confusion Matrix:
[[  3   0   0]
 [  0 919  13]
 [  0   1  64]]
Per-class accuracy for SVM:
abnormal accuracy: 1.0000
normal accuracy: 0.9861


2025-06-17 11:21:38,162 - INFO - Training RandomForest...



KNN Results:
Accuracy: 0.9850
F1 Score: 0.9858
Precision: 0.9880
Recall: 0.9850
Confusion Matrix:
[[  3   0   0]
 [  2 917  13]
 [  0   0  65]]
Per-class accuracy for KNN:
abnormal accuracy: 1.0000
normal accuracy: 0.9839


2025-06-17 11:21:38,387 - INFO - Training ANN...



RandomForest Results:
Accuracy: 0.9980
F1 Score: 0.9980
Precision: 0.9981
Recall: 0.9980
Confusion Matrix:
[[  3   0   0]
 [  0 930   2]
 [  0   0  65]]
Per-class accuracy for RandomForest:
abnormal accuracy: 1.0000
normal accuracy: 0.9979


2025-06-17 11:21:41,204 - INFO - Training DecisionTree...
2025-06-17 11:21:41,214 - INFO - Training LogisticRegression...
2025-06-17 11:21:41,251 - INFO - Generating Plotly Visualizations...



ANN Results:
Accuracy: 0.9840
F1 Score: 0.9848
Precision: 0.9868
Recall: 0.9840
Confusion Matrix:
[[  3   0   0]
 [  2 917  13]
 [  0   1  64]]
Per-class accuracy for ANN:
abnormal accuracy: 1.0000
normal accuracy: 0.9839

DecisionTree Results:
Accuracy: 0.9950
F1 Score: 0.9951
Precision: 0.9954
Recall: 0.9950
Confusion Matrix:
[[  3   0   0]
 [  0 927   5]
 [  0   0  65]]
Per-class accuracy for DecisionTree:
abnormal accuracy: 1.0000
normal accuracy: 0.9946

LogisticRegression Results:
Accuracy: 0.8890
F1 Score: 0.9104
Precision: 0.9569
Recall: 0.8890
Confusion Matrix:
[[  3   0   0]
 [  2 822 108]
 [  0   1  64]]
Per-class accuracy for LogisticRegression:
abnormal accuracy: 1.0000
normal accuracy: 0.8820



Summary of Results:
| Model              |   Accuracy |   F1 Score |   Precision |   Recall |
|:-------------------|-----------:|-----------:|------------:|---------:|
| SVM                |      0.986 |   0.986546 |    0.988013 |    0.986 |
| KNN                |      0.985 |   0.98578  |    0.987967 |    0.985 |
| RandomForest       |      0.998 |   0.998014 |    0.99806  |    0.998 |
| ANN                |      0.984 |   0.984781 |    0.986811 |    0.984 |
| DecisionTree       |      0.995 |   0.995086 |    0.995357 |    0.995 |
| LogisticRegression |      0.889 |   0.910408 |    0.956854 |    0.889 |
