# Data Generation

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, TextGenerationPipeline
import torch
from tqdm import tqdm
import json

model_name = 'Qwen/Qwen3-0.6B'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to('cuda')
generator = TextGenerationPipeline(model=model, tokenizer=tokenizer, device=0)

# Example data
template_data = {
    "date": [f"2024-05-{str(day).zfill(2)}" for day in range(1, 32)],
    "hivenode01_memory_usage": [0.383553, 0.418137, 0.492768, 0.565933, 0.563794, 0.530966, 0.542266, 0.49038, 0.495031, 0.563613, 
                                0.495269, 0.618042, 0.536223, 0.613919, 0.508449, 0.576679, 0.521551, 0.622268, 0.611154, 0.518589, 
                                0.654723, 0.529004, 0.634645, 0.537238, 0.534131, 0.536508, 0.543266, 0.575944, 0.53557, 0.611579, 0.551645],
    "hivenode01_cpu_load_5min": [8.53, 12.07, 9.09, 8.55, 12.5, 7.98, 8.93, 7.85, 7.22, 8.98, 
                                 9.34, 9.6, 10.06, 8.68, 9.03, 8.08, 9.02, 7.89, 7.6, 11.22, 
                                 9.58, 9.53, 8.96, 10.89, 6.7, 9.73, 9.5, 10.5, 9.11, 8.75, 6.51],
    "hivenode01_cpu_load_10min": [9.12, 9.94, 9.3, 8.8, 11.66, 9.5, 8.63, 8.67, 7.77, 8.18, 
                                  8.99, 8.14, 9.02, 8.62, 8.54, 8.71, 8.91, 8.47, 7.66, 9.91, 
                                  8.67, 8.72, 8.77, 9.19, 7.58, 9.44, 8.89, 9.83, 8.32, 8.91, 8.17],
    "hivenode01_cpu_load_15min": [9.49, 9.07, 9.6, 7.92, 9.79, 9.62, 8.36, 9.32, 8.07, 7.91, 
                                  8.99, 8.12, 8.4, 8.46, 8.25, 8.73, 9.04, 8.54, 8.02, 9.58, 
                                  8.62, 8.84, 8.31, 8.68, 8.11, 9.01, 8.3, 8.95, 8.64, 8.83, 8.73],
    "hivenode02_memory_usage": [0.342719, 0.374206, 0.404097, 0.427319, 0.435087, 0.43736, 0.437884, 0.438484, 0.447014, 0.450261, 
                                0.456589, 0.457304, 0.461043, 0.459707, 0.456962, 0.460743, 0.470204, 0.466149, 0.474047, 0.476227, 
                                0.477294, 0.479754, 0.482177, 0.471509, 0.475243, 0.47675, 0.487594, 0.490805, 0.492136, 0.500023, 0.502623],
    "hivenode02_cpu_load_5min": [6.51, 8.39, 10.08, 12.8, 12.48, 10.14, 12.26, 8.85, 5.73, 9.06, 
                                 7.78, 7.96, 7.47, 6.82, 5.79, 7.28, 6.23, 7.19, 6.14, 7.79, 
                                 6.3, 6.57, 7.69, 6.68, 6, 6.59, 7.59, 9.85, 7.96, 9.24, 7.76],
    "hivenode02_cpu_load_10min": [6.27, 8.1, 10.61, 11.74, 12.44, 10.47, 11.24, 9.63, 7.08, 8.45, 
                                  7.22, 6.18, 6.94, 7, 6.41, 7.03, 6.72, 6.58, 6.04, 8.39, 
                                  6.34, 6.72, 6.91, 6.78, 6.35, 6.98, 6.84, 9.35, 8.14, 9.07, 7.54],
    "hivenode02_cpu_load_15min": [6.74, 7.63, 10.64, 10.35, 11.11, 10.6, 10.01, 9.69, 7.04, 8.02, 
                                  6.8, 6.12, 6.77, 7.13, 6.7, 7.07, 7.05, 6.9, 6.66, 8.08, 
                                  7, 7.39, 6.79, 7.14, 6.82, 7.05, 7.04, 8.52, 8, 8.46, 7.58],
    "hivenode03_memory_usage": [0.348359, 0.375609, 0.40562, 0.428764, 0.439686, 0.441799, 0.445848, 0.442726, 0.455341, 0.451364, 
                                0.458707, 0.458583, 0.463021, 0.474306, 0.468443, 0.46892, 0.47674, 0.473441, 0.477693, 0.477164, 
                                0.478697, 0.480722, 0.4821, 0.486745, 0.489345, 0.488128, 0.493571, 0.493586, 0.494228, 0.502349, 0.503208],
    "hivenode03_cpu_load_5min": [8.1, 9.61, 7.42, 8.05, 10.37, 7.72, 11.72, 7.67, 6.14, 9.78, 
                                 7.17, 7.31, 10.6, 6.07, 5.34, 8.05, 6.25, 6.13, 6.1, 8.68, 
                                 10.09, 7.21, 7.5, 7.04, 7.07, 8.59, 10.04, 7.41, 6.68, 12.27, 6.57],
    "hivenode03_cpu_load_10min": [7.6, 8.99, 8.52, 8.7, 10.56, 8.82, 9.46, 7.53, 7.36, 8.75, 
                                  7.09, 7.13, 9.56, 6.75, 6.73, 7.91, 6.71, 6.66, 7.11, 8.29, 
                                  8.12, 7, 6.84, 7.52, 7.84, 8.83, 8.43, 8.22, 7.42, 8.92, 7.05],
    "hivenode03_cpu_load_15min": [7.77, 8.52, 8.85, 8.12, 9.56, 9.05, 8.9, 8.19, 7.57, 8.34, 
                                  7.3, 7.36, 8.66, 7.17, 7.57, 7.68, 7.33, 7.05, 7.46, 8.48, 
                                  7.77, 7.61, 7.16, 7.84, 8.22, 8.38, 8.05, 8.11, 7.66, 8.29, 7.49],
    "hivenode04_memory_usage": [0.406619, 0.419903, 0.433678, 0.441757, 0.452172, 0.455543, 0.4598, 0.460577, 0.459536, 0.461846, 
                                0.465693, 0.474606, 0.478739, 0.470002, 0.465657, 0.467247, 0.473648, 0.473736, 0.478283, 0.482861, 
                                0.483788, 0.488319, 0.489811, 0.485238, 0.493503, 0.487962, 0.496605, 0.49354, 0.490805, 0.485683, 0.487579],
    "hivenode04_cpu_load_5min": [8, 10.23, 11.02, 7.19, 14.19, 9.73, 8, 7.59, 5.16, 7.4, 
                                 6.75, 11.49, 7.46, 6.14, 5.92, 8.56, 8.31, 5.56, 5.3, 8.13, 
                                 9.03, 7.09, 6.19, 9.83, 7.2, 6.55, 8.41, 8.59, 8.73, 7.54, 8.48],
    "hivenode04_cpu_load_10min": [6.98, 9.55, 9.47, 6.98, 10.99, 9.18, 7.47, 8.12, 6.7, 7.94, 
                                  6.55, 8.76, 7.3, 6.44, 7.34, 8.53, 8.74, 5.8, 6.15, 7.7, 
                                  8.45, 7, 6.08, 8.91, 7.49, 6.32, 8.62, 8.64, 9.39, 7.11, 8.12],
    "hivenode04_cpu_load_15min": [7.1, 8.69, 8.87, 7.09, 9.12, 8.57, 7.35, 8.13, 7.05, 7.85, 
                                  6.75, 7.59, 7.14, 6.73, 7.68, 8.18, 8.23, 6.5, 6.73, 7.81, 
                                  7.99, 7.58, 6.5, 8.22, 7.47, 6.78, 7.95, 8.17, 9.28, 7.25, 7.87],
    "hivenode05_memory_usage": [0.351445, 0.383579, 0.411094, 0.438981, 0.445999, 0.448852, 0.449878, 0.451535, 0.458583, 0.462726, 
                                0.466429, 0.469966, 0.471406, 0.469826, 0.481328, 0.483001, 0.476077, 0.489852, 0.49036, 0.485502, 
                                0.491012, 0.492307, 0.493674, 0.49574, 0.50239, 0.502286, 0.507123, 0.507206, 0.507957, 0.515078, 0.519858],
    "hivenode05_cpu_load_5min": [6.35, 10.45, 7.17, 7.47, 6.62, 7.21, 9.36, 7.46, 6.07, 7.37, 
                                 7.32, 7.54, 7.96, 7.12, 6.64, 7.61, 9.68, 7.25, 8.29, 9.44, 
                                 11.2, 7.9, 9.29, 8.73, 10.4, 6.47, 7.19, 11.26, 9.63, 7.66, 9.04],
    "hivenode05_cpu_load_10min": [6.28, 8.91, 7.31, 7.82, 7.7, 7.96, 7.92, 7.33, 6.2, 8.48, 
                                  6.69, 7.32, 7.58, 7.44, 6.99, 7.93, 7.84, 7.75, 7.74, 8.82, 
                                  10.24, 7.76, 7.84, 8.24, 8.31, 7.4, 7.27, 9.23, 8.78, 7.81, 8.21],
    "hivenode05_cpu_load_15min": [6.8, 8.32, 7.67, 7.49, 7.61, 8.17, 7.36, 7.52, 6.51, 7.85, 
                                  7.04, 7.15, 7.65, 7.48, 7.28, 7.67, 7.78, 8.08, 7.52, 8.79, 
                                  9.57, 7.87, 7.51, 8.07, 7.82, 7.67, 7.49, 8.74, 8.16, 8.06, 7.79],
    "hivenode06_memory_usage": [0.36604, 0.269375, 0.307352, 0.364142, 0.173979, 0.139898, 0.192349, 0.218628, 0.207623, 0.156441, 
                                0.164164, 0.205858, 0.18286, 0.156721, 0.154286, 0.194161, 0.153307, 0.17226, 0.154831, 0.158268, 
                                0.206682, 0.174523, 0.213316, 0.193095, 0.21925, 0.20669, 0.219678, 0.257834, 0.30988, 0.199916, 0.160641],
    "hivenode06_cpu_load_5min": [3.42, 3.33, 2.71, 2.62, 3.36, 3.65, 2.88, 3, 3.22, 3.28, 
                                 2.28, 3.02, 3.02, 2.89, 3.22, 3.55, 3.16, 3.36, 3.44, 2.8, 
                                 2.95, 2.28, 3.14, 2.43, 3.47, 2.78, 2.93, 3.51, 2.69, 2.55, 3.2],
    "hivenode06_cpu_load_10min": [3.09, 3.3, 3.01, 2.83, 3.27, 3.3, 3.22, 2.9, 3.19, 3.34, 
                                  2.94, 3.04, 3.19, 3.11, 3.57, 3.37, 3.31, 3.19, 3.12, 3.04, 
                                  3.05, 2.67, 3.22, 2.84, 3.73, 3.18, 3.1, 3.21, 3.08, 3.09, 3.39],
    "hivenode06_cpu_load_15min": [3.02, 3.19, 3.03, 2.87, 3.27, 3.17, 3.19, 2.94, 3.12, 3.12, 
                                  3.06, 3.01, 3.18, 3.38, 3.54, 3.31, 3.35, 3.09, 3.03, 3.12, 
                                  3.01, 2.76, 3.19, 2.87, 3.45, 3.13, 3.14, 3.13, 3.04, 3.22, 3.33],
    "hivenode07_memory_usage": [0.381104, 0.141959, 0.15592, 0.144891, 0.183825, 0.221047, 0.255345, 0.270316, 0.25585, 0.255843, 
                                0.186189, 0.215183, 0.259902, 0.310836, 0.185046, 0.190949, 0.17576, 0.179073, 0.207616, 0.205337, 
                                0.248384, 0.240304, 0.201106, 0.180418, 0.189253, 0.195343, 0.205485, 0.193422, 0.213659, 0.200243, 0.198259],
    "hivenode07_cpu_load_5min": [2.87, 2.82, 2.36, 2.04, 2.55, 2.15, 2.05, 2.06, 2.34, 2.16, 
                                 2.96, 2.44, 2.46, 2.43, 2.34, 2.76, 2.37, 2.58, 2.23, 2.6, 
                                 2.36, 2.35, 2.13, 2.58, 2.24, 2.66, 2.89, 2.58, 2.12, 3.02, 2.59],
    "hivenode07_cpu_load_10min": [2.41, 2.74, 2.3, 2.23, 2.55, 2.46, 2.54, 2.23, 2.37, 2.36, 
                                  2.7, 2.6, 2.57, 2.32, 2.67, 2.5, 2.62, 2.64, 2.25, 2.46, 
                                  2.78, 2.58, 2.36, 2.64, 2.39, 2.79, 2.56, 2.81, 2.43, 2.74, 2.75],
    "hivenode07_cpu_load_15min": [2.32, 2.75, 2.42, 2.36, 2.45, 2.48, 2.51, 2.27, 2.39, 2.43, 
                                  2.58, 2.56, 2.54, 2.48, 2.74, 2.52, 2.62, 2.59, 2.43, 2.47, 
                                  2.71, 2.67, 2.38, 2.56, 2.53, 2.64, 2.42, 2.84, 2.59, 2.66, 2.72]
}
template_df = pd.DataFrame(template_data)

# Date range
start_date = datetime(2024, 12, 1)
end_date = datetime(2025, 5, 31)

# List of nodes
nodes = ['hivenode01', 'hivenode02', 'hivenode03', 'hivenode04', 'hivenode05', 'hivenode06', 'hivenode07']

# Days in a month
def days_in_month(year, month):
    if month == 2:
        # Check for leap year
        if (year % 4 == 0 and year % 100 != 0) or (year % 400 == 0):
            return 29
        return 28
    return 31 if month in [1, 3, 5, 7, 8, 10, 12] else 30

current_date = start_date
num_data = []
text_data = []
total_num_samples = 0
total_text_samples = 0
total_num_abnormalities = 0
total_days = (end_date - start_date).days + 1

for _ in tqdm(range(total_days), desc="Processing dates"):
    days = days_in_month(current_date.year, current_date.month)

    # clear num_data at the start of each month
    if not num_data or current_date.day == 1:
        num_data, text_data = [], []

    template_day_idx = (current_date.day - 1) % 31
    template_row = template_df.iloc[template_day_idx]
    row = {"date": current_date.strftime("%Y-%m-%d")}
    
    for node in tqdm(nodes, desc="Processing nodes", leave=False):
        # Generate num data
        memory_usage = template_row[f"{node}_memory_usage"] + np.random.normal(0, 0.05) #normal distribution random noise
        memory_usage = max(0, min(1, memory_usage))  #[0,1]
        
        cpu_load_5min = template_row[f"{node}_cpu_load_5min"] + np.random.normal(0, 1)
        cpu_load_10min = template_row[f"{node}_cpu_load_10min"] + np.random.normal(0, 1)
        cpu_load_15min = template_row[f"{node}_cpu_load_15min"] + np.random.normal(0, 1)

        cpu_load_5min = max(0, cpu_load_5min) # Ensure non-negative
        cpu_load_10min = max(0, cpu_load_10min)
        cpu_load_15min = max(0, cpu_load_15min)

        # Add node data to row
        row[node] = node 
        row[f"{node}_memory_usage"] = memory_usage
        row[f"{node}_cpu_load_5min"] = cpu_load_5min
        row[f"{node}_cpu_load_10min"] = cpu_load_10min
        row[f"{node}_cpu_load_15min"] = cpu_load_15min
        
        # Check for abnormalities
        if memory_usage > 0.7 or cpu_load_5min > 10 or cpu_load_10min > 10 or cpu_load_15min > 10:
            status = "abnormal"
            label = 1
            total_num_abnormalities += 1
        else:
            status = "normal"
            label = 0

        total_num_samples += 1

        # Generate text data
        prompt = (
            f"You are Qwen3-0.6B, a language model for generating operation logs. "
            f"Generate a single log entry with these requirements:\n"
            f"1. Retain all numerical values unchanged.\n"
            f"2. Use ISO 8601 timestamp (to the second).\n"
            f"3. Include fields: log level (INFO/WARN/ERROR), event type, node ID, memory_usage, cpu_load_5min, cpu_load_10min, cpu_load_15min, status.\n"
            f"4. For status=normal: emphasize stability and optimal performance. "
            f"For status=abnormal: include alert severity, likely root cause, impact scope, and remediation steps.\n"
            f"5. Vary syntax (active/passive voice, questions, imperatives), use domain terms (OOM, jitter, throughput), "
            f"and keep entry between 50-150 characters.\n"
            f"6. Output only the log entry, without repeating this prompt or adding commentary.\n\n"
            f"Input Data:\n"
            f"  timestamp: {current_date.strftime('%Y-%m-%dT%H:%M:%SZ')}\n"
            f"  node: {node}\n"
            f"  memory_usage: {memory_usage:.4f}\n"
            f"  cpu_load_5min: {cpu_load_5min:.2f}\n"
            f"  cpu_load_10min: {cpu_load_10min:.2f}\n"
            f"  cpu_load_15min: {cpu_load_15min:.2f}\n"
            f"  status: {status}\n"
        )

        generated = generator(prompt, max_new_tokens=150, num_return_sequences=1, do_sample=True, temperature=0.9)
        text = generated[0]['generated_text'].strip()
        text_data.append({
            "date": current_date.strftime("%Y-%m-%d"),
            "node": node,
            "text": text,
        })
        # Track total samples
        total_text_samples += 1

    num_data.append(row)
        
    if current_date.day == days or current_date == end_date:
        # Save num data in month
        month_df = pd.DataFrame(num_data)
        columns = ['date']
        for node in nodes:
            columns.extend([
                f"{node}_memory_usage",
                f"{node}_cpu_load_5min",
                f"{node}_cpu_load_10min",
                f"{node}_cpu_load_15min"
            ])
        month_df = month_df[columns]
        year_month = current_date.strftime("%Y-%m")
        os.makedirs('data/pre_train_num', exist_ok=True)
        month_df.to_csv(os.path.join('data/pre_train_num', f'{year_month}.csv'), index=False)

        # Save text data in month
        os.makedirs('data/pre_train_text', exist_ok=True)
        with open(f'data/pre_train_text/{year_month}.jsonl', 'w', encoding='utf-8') as f:
            for record in text_data:
                json.dump(record, f, ensure_ascii=False)
                f.write('\n')
        num_data, text_data = [], []
    
    current_date += timedelta(days=1)

num_abnormalities_rate = total_num_abnormalities / total_num_samples

print(f'Total num samples generated: {total_num_samples}, Total num abnormalities: {total_num_abnormalities} , Num abnormality rate: {num_abnormalities_rate:.2%}')
print(f'Total text samples generated: {total_text_samples}')

torch.cuda.empty_cache()

Device set to use cuda:0
Processing dates:   1%|          | 1/182 [00:44<2:14:17, 44.52s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Processing dates: 100%|██████████| 182/182 [2:04:02<00:00, 40.89s/it] 

Total num samples generated: 1274, Total num abnormalities: 244 , Num abnormality rate: 19.15%
Total text samples generated: 1274





# Data preprocessing

In [3]:
import pandas as pd
import os
from tqdm import tqdm

# Merge num data
num_files = sorted([f for f in os.listdir('data/pre_train_num') if f.endswith('.csv')])
all_num_data = []
for file in tqdm(num_files, desc="Merging num data"):
    df = pd.read_csv(os.path.join('data/pre_train_num', file))
    all_num_data.append(df)
num_df = pd.concat(all_num_data, ignore_index=True)
num_df['date'] = pd.to_datetime(num_df['date'], format='%Y-%m-%d')
num_df = num_df.sort_values('date')
num_df.to_csv('data/pre_train_num/num_202412-202505.csv', index=False)

# Merge text data
text_files = sorted([f for f in os.listdir('data/pre_train_text') if f.endswith('.jsonl')])
all_text_data = []
for file in tqdm(text_files, desc="Merging text data"):
    with open(os.path.join('data/pre_train_text', file), 'r', encoding='utf-8') as f:
        records = [json.loads(line) for line in f]
        all_text_data.extend(records)
text_df = pd.DataFrame(all_text_data)
text_df['date'] = pd.to_datetime(text_df['date'], format='%Y-%m-%d')
text_df = text_df.sort_values('date')
# Save text data as JSONL
with open('data/pre_train_text/text_202412-202505.jsonl', 'w', encoding='utf-8') as f:
    for _, row in text_df.iterrows():
        record = row.to_dict()
        if isinstance(record['date'], pd.Timestamp):
            record['date'] = record['date'].strftime('%Y-%m-%d')
        json.dump(record, f, ensure_ascii=False)
        f.write('\n')

# Text train-test split (80/20)
split_idx = int(len(num_df) * 0.8)
train_num_df = num_df.iloc[:split_idx]
test_num_df = num_df.iloc[split_idx:]

split_idx_text = int(len(text_df) * 0.8)
train_text_df = text_df.iloc[:split_idx_text]
test_text_df = text_df.iloc[split_idx_text:]


# Save datasets as Parquet to reduce storage space
os.makedirs('data/traindata', exist_ok=True)
os.makedirs('data/testdata', exist_ok=True)
train_num_df.to_parquet('data/traindata/train_num.parquet')
test_num_df.to_parquet('data/testdata/test_num.parquet')
train_text_df.to_parquet('data/traindata/train_text.parquet')
test_text_df.to_parquet('data/testdata/test_text.parquet')

# Calculate abnormality for numerical data
nodes = ['hivenode01', 'hivenode02', 'hivenode03', 'hivenode04', 'hivenode05', 'hivenode06', 'hivenode07']
def count_abnormalities(df):
    total_samples = len(df) * len(nodes)
    total_abnormal = 0
    for node in nodes:
        abnormal = ((df[f"{node}_memory_usage"] > 0.7) | 
                    (df[f"{node}_cpu_load_5min"] > 10) | 
                    (df[f"{node}_cpu_load_10min"] > 10) | 
                    (df[f"{node}_cpu_load_15min"] > 10)).sum()
        total_abnormal += abnormal
    return total_samples, total_abnormal

# Numerical data statistics
train_num_samples, train_num_abnormal = count_abnormalities(train_num_df)
test_num_samples, test_num_abnormal = count_abnormalities(test_num_df)
train_abn_rate = train_num_abnormal / train_num_samples 
test_abn_rate = test_num_abnormal / test_num_samples 

print(f"Numerical data: train set {train_num_samples} samples, abnormalities {train_num_abnormal}, abnormality rate {train_abn_rate:.2%}")
print(f"Numerical data: test set {test_num_samples} samples, abnormalities {test_num_abnormal}, abnormality rate {test_abn_rate:.2%}")
print(f"Text data: train set {train_text_df.shape[0]} samples, test set {test_text_df.shape[0]} samples")

Merging num data: 100%|██████████| 6/6 [00:00<00:00, 752.03it/s]
Merging text data: 100%|██████████| 6/6 [00:00<00:00, 260.87it/s]


Numerical data: train set 1015 samples, abnormalities 192, abnormality rate 18.92%
Numerical data: test set 259 samples, abnormalities 52, abnormality rate 20.08%
Text data: train set 1019 samples, test set 255 samples


# LLM & Fine-Tunning -- Qwen3-1.7B

In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from tqdm import tqdm
import re
import os

# Clear GPU memory
torch.cuda.empty_cache()

# Load text datasets
train_text_df = pd.read_parquet('data/traindata/train_text.parquet')
test_text_df = pd.read_parquet('data/testdata/test_text.parquet')

# Extract status labels
def extract_status(text):
    match = re.search(r"(normal|abnormal)", text, re.IGNORECASE)
    return 0 if match and match.group(0).lower() == "normal" else 1

train_text_df['label'] = train_text_df['text'].apply(extract_status)
test_text_df['label'] = test_text_df['text'].apply(extract_status)

# Initialize Qwen2-1.5B
model_name = 'Qwen/Qwen2-1.5B'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to('cuda')

# LoRA configuration
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)

# Prepare dataset for classification
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        # Create sequence labels: same label for all tokens, -100 for padding
        labels = torch.full_like(input_ids, label)
        labels[attention_mask == 0] = -100  # Ignore padding tokens
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

def custom_collate_fn(batch):
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    labels = torch.stack([item['labels'] for item in batch])
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

train_dataset = TextDataset(train_text_df['text'].tolist(), train_text_df['label'].tolist(), tokenizer)
test_dataset = TextDataset(test_text_df['text'].tolist(), test_text_df['label'].tolist(), tokenizer)

# Training configuration for classification
training_args = TrainingArguments(
    output_dir="models/qwen2-1.5B-finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=50,
    warmup_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    gradient_accumulation_steps=10,
    report_to="none",
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=custom_collate_fn
)

# Fine-tune classification model
trainer.train()
trainer.save_model("models/qwen2-1.5B-finetuned")
tokenizer.save_pretrained("models/qwen2-1.5B-finetuned")

# Prepare context for future prediction (time-series)
def prepare_history_context(df, date, node, window_days=7):
    start_date = pd.to_datetime(date) - pd.Timedelta(days=window_days)
    history = df[(df['date'] >= start_date) & (df['date'] < date) & (df['node'] == node)]
    context = "\n".join(history['text'].tolist())
    return context

# Predict future 3 days
def predict_future_status(model, tokenizer, context, days_ahead=3, max_length=512):
    model.eval()
    predictions = []
    probabilities = []
    for day in range(1, days_ahead + 1):
        prompt = (
            f"Based on the operation logs from the past 7 days:\n{context}\n"
            f"Predict the system status (0=normal, 1=abnormal) and abnormal probability for day {day} ahead."
        )
        inputs = tokenizer(prompt, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt').to('cuda')
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits[:, -1, :2]
            probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
            pred = torch.argmax(logits, dim=-1).cpu().numpy()[0]
        predictions.append(pred)
        probabilities.append(probs[1])
    return predictions, probabilities

# Test set prediction for time-series
# Assume test_text_df contains 'date' and 'node' columns
test_dates = test_text_df['date'].unique()
nodes = ['hivenode01', 'hivenode02', 'hivenode03', 'hivenode04', 'hivenode05', 'hivenode06', 'hivenode07']
predictions = []
true_labels = []
probabilities = []

for date in tqdm(test_dates[7:], desc="Predicting future status"):
    date = pd.to_datetime(date)
    for node in nodes:
        context = prepare_history_context(test_text_df, date, node)
        if context:
            pred, prob = predict_future_status(model, tokenizer, context)
            predictions.extend(pred)
            probabilities.extend(prob)
            future_days = [date + pd.Timedelta(days=i) for i in range(1, 4)]
            for future_date in future_days:
                future_data = test_text_df[(test_text_df['date'] == future_date) & (test_text_df['node'] == node)]
                true_labels.append(future_data['label'].iloc[0] if not future_data.empty else 0)

# Truncate to match lengths
min_len = min(len(predictions), len(true_labels))
predictions = predictions[:min_len]
true_labels = true_labels[:min_len]

# Evaluate metrics
qwen_metrics = {
    'accuracy': accuracy_score(true_labels, predictions),
    'f1': f1_score(true_labels, predictions),
    'precision': precision_score(true_labels, predictions),
    'recall': recall_score(true_labels, predictions),
    'confusion_matrix': confusion_matrix(true_labels, predictions).tolist()
}

# Print results
print("Qwen2-1.5B Future Prediction Metrics:")
print(f"Accuracy: {qwen_metrics['accuracy']:.2%}")
print(f"F1 Score: {qwen_metrics['f1']:.4f}")
print(f"Precision: {qwen_metrics['precision']:.4f}")
print(f"Recall: {qwen_metrics['recall']:.4f}")
print(f"Confusion Matrix: {qwen_metrics['confusion_matrix']}")

# Clear GPU memory
torch.cuda.empty_cache()


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,No log,13.891824
2,13.749900,13.465537
3,13.749900,12.396853
4,12.300800,9.575779
5,12.300800,7.043048


Predicting future status: 100%|██████████| 30/30 [01:16<00:00,  2.55s/it]

Qwen2-1.5B Future Prediction Metrics:
Accuracy: 88.10%
F1 Score: 0.0000
Precision: 0.0000
Recall: 0.0000
Confusion Matrix: [[555, 75], [0, 0]]



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Qwen3-1.7B + Fine-tunning + RAG

In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from tqdm import tqdm
import re
import os

# Clear GPU memory
torch.cuda.empty_cache()

# -------------------------------
# 1. Load and preprocess data
# -------------------------------
train_text_df = pd.read_parquet('data/traindata/train_text.parquet')
test_text_df = pd.read_parquet('data/testdata/test_text.parquet')

def extract_status(text):
    match = re.search(r"(normal|abnormal)", text, re.IGNORECASE)
    return 0 if match and match.group(0).lower() == "normal" else 1

train_text_df['label'] = train_text_df['text'].apply(extract_status)
test_text_df['label'] = test_text_df['text'].apply(extract_status)

# -------------------------------
# 2. Fine-tune Qwen2-1.5B with LoRA for classification
# -------------------------------
model_name = 'Qwen/Qwen2-1.5B'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16).to('cuda')

peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)

class TextDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze()
        attention_mask = encoding['attention_mask'].squeeze()
        labels = torch.full_like(input_ids, label)
        labels[attention_mask == 0] = -100
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

def custom_collate_fn(batch):
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    labels = torch.stack([item['labels'] for item in batch])
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

train_dataset = TextDataset(train_text_df['text'].tolist(), train_text_df['label'].tolist(), tokenizer)
test_dataset = TextDataset(test_text_df['text'].tolist(), test_text_df['label'].tolist(), tokenizer)

training_args = TrainingArguments(
    output_dir="models/qwen2-1.5B-finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=50,
    warmup_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    gradient_accumulation_steps=10,
    report_to="none",
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=custom_collate_fn
)

trainer.train()
trainer.save_model("models/qwen2-1.5B-finetuned")
tokenizer.save_pretrained("models/qwen2-1.5B-finetuned")

# -------------------------------
# 3. Build FAISS index on train texts for RAG
# -------------------------------
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
texts = train_text_df['text'].tolist()
vectorstore = FAISS.from_texts(texts, embeddings)

# -------------------------------
# 4. RAG-based classification prediction on test set
# -------------------------------
def rag_predict(model, tokenizer, texts, vectorstore, max_length=128, k=3):
    model.eval()
    predictions = []
    for text in tqdm(texts, desc="RAG Predicting"):
        docs = vectorstore.similarity_search(text, k=k)
        context = " ".join([doc.page_content for doc in docs])
        prompt = f"Context: {context}\nLog: {text}\nPredict status (0=normal, 1=abnormal):"
        inputs = tokenizer(prompt, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt').to('cuda')
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits[:, -1, :2]
            pred = torch.argmax(logits, dim=-1).cpu().numpy()[0]
        predictions.append(pred)
    return predictions

rag_predictions = rag_predict(model, tokenizer, test_text_df['text'].tolist(), vectorstore)
rag_metrics = {
    'accuracy': accuracy_score(test_text_df['label'], rag_predictions),
    'f1': f1_score(test_text_df['label'], rag_predictions),
    'precision': precision_score(test_text_df['label'], rag_predictions),
    'recall': recall_score(test_text_df['label'], rag_predictions),
    'confusion_matrix': confusion_matrix(test_text_df['label'], rag_predictions).tolist()
}

print("Qwen2-1.5B with RAG Metrics:")
print(f"Accuracy: {rag_metrics['accuracy']:.2%}")
print(f"F1 Score: {rag_metrics['f1']:.4f}")
print(f"Precision: {rag_metrics['precision']:.4f}")
print(f"Recall: {rag_metrics['recall']:.4f}")
print(f"Confusion Matrix: {rag_metrics['confusion_matrix']}")

# -------------------------------
# 5. Prepare context for future time-series prediction
# -------------------------------
def prepare_history_context(df, date, node, window_days=7):
    start_date = pd.to_datetime(date) - pd.Timedelta(days=window_days)
    history = df[(df['date'] >= start_date) & (df['date'] < date) & (df['node'] == node)]
    context = "\n".join(history['text'].tolist())
    return context

# -------------------------------
# 6. Predict future 3 days with RAG-augmented prompts
# -------------------------------
def predict_future_status_rag(model, tokenizer, context, vectorstore, days_ahead=3, max_length=512, k=3):
    model.eval()
    predictions = []
    probabilities = []
    for day in range(1, days_ahead + 1):
        # Retrieve top-k similar past logs
        docs = vectorstore.similarity_search(context, k=k)
        retrieved = " ".join([doc.page_content for doc in docs])
        prompt = (
            f"Retrieved: {retrieved}\nPast 7-day logs:\n{context}\n"
            f"Predict status (0=normal, 1=abnormal) and abnormal probability for day {day} ahead."
        )
        inputs = tokenizer(prompt, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt').to('cuda')
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits[:, -1, :2]
            probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
            pred = torch.argmax(logits, dim=-1).cpu().numpy()[0]
        predictions.append(pred)
        probabilities.append(probs[1])
        # Append predicted day text for next iteration context if needed
    return predictions, probabilities

# -------------------------------
# 7. Time-series RAG-based evaluation on test set
# -------------------------------
test_dates = test_text_df['date'].unique()
nodes = ['hivenode01', 'hivenode02', 'hivenode03', 'hivenode04', 'hivenode05', 'hivenode06', 'hivenode07']
predictions_ts = []
true_labels_ts = []
probabilities_ts = []

for date in tqdm(test_dates[7:], desc="Predicting future status RAG"):
    date = pd.to_datetime(date)
    for node in nodes:
        context = prepare_history_context(test_text_df, date, node)
        if context:
            pred, prob = predict_future_status_rag(model, tokenizer, context, vectorstore)
            predictions_ts.extend(pred)
            probabilities_ts.extend(prob)
            future_days = [date + pd.Timedelta(days=i) for i in range(1, 4)]
            for future_date in future_days:
                future_data = test_text_df[(test_text_df['date'] == future_date) & (test_text_df['node'] == node)]
                true_labels_ts.append(future_data['label'].iloc[0] if not future_data.empty else 0)

min_len_ts = min(len(predictions_ts), len(true_labels_ts))
predictions_ts = predictions_ts[:min_len_ts]
true_labels_ts = true_labels_ts[:min_len_ts]

rag_ts_metrics = {
    'accuracy': accuracy_score(true_labels_ts, predictions_ts),
    'f1': f1_score(true_labels_ts, predictions_ts),
    'precision': precision_score(true_labels_ts, predictions_ts),
    'recall': recall_score(true_labels_ts, predictions_ts),
    'confusion_matrix': confusion_matrix(true_labels_ts, predictions_ts).tolist()
}

print("Qwen2-1.5B Future Prediction with RAG Metrics:")
print(f"Accuracy: {rag_ts_metrics['accuracy']:.2%}")
print(f"F1 Score: {rag_ts_metrics['f1']:.4f}")
print(f"Precision: {rag_ts_metrics['precision']:.4f}")
print(f"Recall: {rag_ts_metrics['recall']:.4f}")
print(f"Confusion Matrix: {rag_ts_metrics['confusion_matrix']}")

# Clear GPU memory
torch.cuda.empty_cache()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,No log,13.888096
2,13.747700,13.460326
3,13.747700,12.378171
4,12.278500,9.428842
5,12.278500,6.882834


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
RAG Predicting: 100%|██████████| 255/255 [00:13<00:00, 18.70it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Qwen2-1.5B with RAG Metrics:
Accuracy: 100.00%
F1 Score: 0.0000
Precision: 0.0000
Recall: 0.0000
Confusion Matrix: [[255]]


Predicting future status RAG: 100%|██████████| 30/30 [01:26<00:00,  2.88s/it]

Qwen2-1.5B Future Prediction with RAG Metrics:
Accuracy: 99.05%
F1 Score: 0.0000
Precision: 0.0000
Recall: 0.0000
Confusion Matrix: [[624, 6], [0, 0]]



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# SVM

In [2]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from tqdm import tqdm

train_num_df = pd.read_parquet('data/traindata/train_num.parquet')
test_num_df = pd.read_parquet('data/testdata/test_num.parquet')
nodes = ['hivenode01', 'hivenode02', 'hivenode03', 'hivenode04', 'hivenode05', 'hivenode06', 'hivenode07']

def prepare_num_features(df):
    features = []
    for _, row in df.iterrows():
        for node in nodes:
            features.append([
                row[f"{node}_memory_usage"],
                row[f"{node}_cpu_load_5min"],
                row[f"{node}_cpu_load_10min"],
                row[f"{node}_cpu_load_15min"]
            ])
    return np.array(features)

def generate_num_labels(df):
    labels = []
    for _, row in df.iterrows():
        for node in nodes:
            is_abnormal = (row[f"{node}_memory_usage"] > 0.7 or
                           row[f"{node}_cpu_load_5min"] > 10 or
                           row[f"{node}_cpu_load_10min"] > 10 or
                           row[f"{node}_cpu_load_15min"] > 10)
            labels.append(1 if is_abnormal else 0)
    return labels

train_num_features = prepare_num_features(train_num_df)
test_num_features = prepare_num_features(test_num_df)
train_num_labels = generate_num_labels(train_num_df)
test_num_labels = generate_num_labels(test_num_df)

svm = SVC(kernel='rbf', class_weight='balanced')
svm.fit(train_num_features, train_num_labels)
predictions = svm.predict(test_num_features)
svm_metrics = {
    'accuracy': accuracy_score(test_num_labels, predictions),
    'f1': f1_score(test_num_labels, predictions),
    'precision': precision_score(test_num_labels, predictions),
    'recall': recall_score(test_num_labels, predictions),
    'confusion_matrix': confusion_matrix(test_num_labels, predictions).tolist()
}

print("SVM Metrics:")
print(f"Accuracy: {svm_metrics['accuracy']:.2%}")
print(f"F1 Score: {svm_metrics['f1']:.4f}")
print(f"Precision: {svm_metrics['precision']:.4f}")
print(f"Recall: {svm_metrics['recall']:.4f}")
print(f"Confusion Matrix: {svm_metrics['confusion_matrix']}")

SVM Metrics:
Accuracy: 93.44%
F1 Score: 0.8571
Precision: 0.7612
Recall: 0.9808
Confusion Matrix: [[191, 16], [1, 51]]


# KNN

In [3]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from tqdm import tqdm

train_num_df = pd.read_parquet('data/traindata/train_num.parquet')
test_num_df = pd.read_parquet('data/testdata/test_num.parquet')
nodes = ['hivenode01', 'hivenode02', 'hivenode03', 'hivenode04', 'hivenode05', 'hivenode06', 'hivenode07']

def prepare_num_features(df):
    features = []
    for _, row in df.iterrows():
        for node in nodes:
            features.append([
                row[f"{node}_memory_usage"],
                row[f"{node}_cpu_load_5min"],
                row[f"{node}_cpu_load_10min"],
                row[f"{node}_cpu_load_15min"]
            ])
    return np.array(features)

def generate_num_labels(df):
    labels = []
    for _, row in df.iterrows():
        for node in nodes:
            is_abnormal = (row[f"{node}_memory_usage"] > 0.7 or
                           row[f"{node}_cpu_load_5min"] > 10 or
                           row[f"{node}_cpu_load_10min"] > 10 or
                           row[f"{node}_cpu_load_15min"] > 10)
            labels.append(1 if is_abnormal else 0)
    return labels

train_num_features = prepare_num_features(train_num_df)
test_num_features = prepare_num_features(test_num_df)
train_num_labels = generate_num_labels(train_num_df)
test_num_labels = generate_num_labels(test_num_df)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(train_num_features, train_num_labels)
predictions = knn.predict(test_num_features)
knn_metrics = {
    'accuracy': accuracy_score(test_num_labels, predictions),
    'f1': f1_score(test_num_labels, predictions),
    'precision': precision_score(test_num_labels, predictions),
    'recall': recall_score(test_num_labels, predictions),
    'confusion_matrix': confusion_matrix(test_num_labels, predictions).tolist()
}

print("KNN Metrics:")
print(f"Accuracy: {knn_metrics['accuracy']:.2%}")
print(f"F1 Score: {knn_metrics['f1']:.4f}")
print(f"Precision: {knn_metrics['precision']:.4f}")
print(f"Recall: {knn_metrics['recall']:.4f}")
print(f"Confusion Matrix: {knn_metrics['confusion_matrix']}")

KNN Metrics:
Accuracy: 96.91%
F1 Score: 0.9200
Precision: 0.9583
Recall: 0.8846
Confusion Matrix: [[205, 2], [6, 46]]


# RF

In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from tqdm import tqdm

train_num_df = pd.read_parquet('data/traindata/train_num.parquet')
test_num_df = pd.read_parquet('data/testdata/test_num.parquet')
nodes = ['hivenode01', 'hivenode02', 'hivenode03', 'hivenode04', 'hivenode05', 'hivenode06', 'hivenode07']

def prepare_num_features(df):
    features = []
    for _, row in df.iterrows():
        for node in nodes:
            features.append([
                row[f"{node}_memory_usage"],
                row[f"{node}_cpu_load_5min"],
                row[f"{node}_cpu_load_10min"],
                row[f"{node}_cpu_load_15min"]
            ])
    return np.array(features)

def generate_num_labels(df):
    labels = []
    for _, row in df.iterrows():
        for node in nodes:
            is_abnormal = (row[f"{node}_memory_usage"] > 0.7 or
                           row[f"{node}_cpu_load_5min"] > 10 or
                           row[f"{node}_cpu_load_10min"] > 10 or
                           row[f"{node}_cpu_load_15min"] > 10)
            labels.append(1 if is_abnormal else 0)
    return labels

train_num_features = prepare_num_features(train_num_df)
test_num_features = prepare_num_features(test_num_df)
train_num_labels = generate_num_labels(train_num_df)
test_num_labels = generate_num_labels(test_num_df)

rf = RandomForestClassifier(n_estimators=100, class_weight='balanced')
rf.fit(train_num_features, train_num_labels)
predictions = rf.predict(test_num_features)
rf_metrics = {
    'accuracy': accuracy_score(test_num_labels, predictions),
    'f1': f1_score(test_num_labels, predictions),
    'precision': precision_score(test_num_labels, predictions),
    'recall': recall_score(test_num_labels, predictions),
    'confusion_matrix': confusion_matrix(test_num_labels, predictions).tolist()
}

print("RF Metrics:")
print(f"Accuracy: {rf_metrics['accuracy']:.2%}")
print(f"F1 Score: {rf_metrics['f1']:.4f}")
print(f"Precision: {rf_metrics['precision']:.4f}")
print(f"Recall: {rf_metrics['recall']:.4f}")
print(f"Confusion Matrix: {rf_metrics['confusion_matrix']}")

RF Metrics:
Accuracy: 99.23%
F1 Score: 0.9808
Precision: 0.9808
Recall: 0.9808
Confusion Matrix: [[206, 1], [1, 51]]


# ANN

In [5]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from tqdm import tqdm

train_num_df = pd.read_parquet('data/traindata/train_num.parquet')
test_num_df = pd.read_parquet('data/testdata/test_num.parquet')
nodes = ['hivenode01', 'hivenode02', 'hivenode03', 'hivenode04', 'hivenode05', 'hivenode06', 'hivenode07']

def prepare_num_features(df):
    features = []
    for _, row in df.iterrows():
        for node in nodes:
            features.append([
                row[f"{node}_memory_usage"],
                row[f"{node}_cpu_load_5min"],
                row[f"{node}_cpu_load_10min"],
                row[f"{node}_cpu_load_15min"]
            ])
    return np.array(features)

def generate_num_labels(df):
    labels = []
    for _, row in df.iterrows():
        for node in nodes:
            is_abnormal = (row[f"{node}_memory_usage"] > 0.7 or
                           row[f"{node}_cpu_load_5min"] > 10 or
                           row[f"{node}_cpu_load_10min"] > 10 or
                           row[f"{node}_cpu_load_15min"] > 10)
            labels.append(1 if is_abnormal else 0)
    return labels

train_num_features = prepare_num_features(train_num_df)
test_num_features = prepare_num_features(test_num_df)
train_num_labels = generate_num_labels(train_num_df)
test_num_labels = generate_num_labels(test_num_df)

ann = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500)
ann.fit(train_num_features, train_num_labels)
predictions = ann.predict(test_num_features)
ann_metrics = {
    'accuracy': accuracy_score(test_num_labels, predictions),
    'f1': f1_score(test_num_labels, predictions),
    'precision': precision_score(test_num_labels, predictions),
    'recall': recall_score(test_num_labels, predictions),
    'confusion_matrix': confusion_matrix(test_num_labels, predictions).tolist()
}

print("ANN Metrics:")
print(f"Accuracy: {ann_metrics['accuracy']:.2%}")
print(f"F1 Score: {ann_metrics['f1']:.4f}")
print(f"Precision: {ann_metrics['precision']:.4f}")
print(f"Recall: {ann_metrics['recall']:.4f}")
print(f"Confusion Matrix: {ann_metrics['confusion_matrix']}")

ANN Metrics:
Accuracy: 94.21%
F1 Score: 0.8485
Precision: 0.8936
Recall: 0.8077
Confusion Matrix: [[202, 5], [10, 42]]


# DT

In [6]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from tqdm import tqdm

train_num_df = pd.read_parquet('data/traindata/train_num.parquet')
test_num_df = pd.read_parquet('data/testdata/test_num.parquet')
nodes = ['hivenode01', 'hivenode02', 'hivenode03', 'hivenode04', 'hivenode05', 'hivenode06', 'hivenode07']

def prepare_num_features(df):
    features = []
    for _, row in df.iterrows():
        for node in nodes:
            features.append([
                row[f"{node}_memory_usage"],
                row[f"{node}_cpu_load_5min"],
                row[f"{node}_cpu_load_10min"],
                row[f"{node}_cpu_load_15min"]
            ])
    return np.array(features)

def generate_num_labels(df):
    labels = []
    for _, row in df.iterrows():
        for node in nodes:
            is_abnormal = (row[f"{node}_memory_usage"] > 0.7 or
                           row[f"{node}_cpu_load_5min"] > 10 or
                           row[f"{node}_cpu_load_10min"] > 10 or
                           row[f"{node}_cpu_load_15min"] > 10)
            labels.append(1 if is_abnormal else 0)
    return labels

train_num_features = prepare_num_features(train_num_df)
test_num_features = prepare_num_features(test_num_df)
train_num_labels = generate_num_labels(train_num_df)
test_num_labels = generate_num_labels(test_num_df)

dt = DecisionTreeClassifier(class_weight='balanced')
dt.fit(train_num_features, train_num_labels)
predictions = dt.predict(test_num_features)
dt_metrics = {
    'accuracy': accuracy_score(test_num_labels, predictions),
    'f1': f1_score(test_num_labels, predictions),
    'precision': precision_score(test_num_labels, predictions),
    'recall': recall_score(test_num_labels, predictions),
    'confusion_matrix': confusion_matrix(test_num_labels, predictions).tolist()
}

print("DT Metrics:")
print(f"Accuracy: {dt_metrics['accuracy']:.2%}")
print(f"F1 Score: {dt_metrics['f1']:.4f}")
print(f"Precision: {dt_metrics['precision']:.4f}")
print(f"Recall: {dt_metrics['recall']:.4f}")
print(f"Confusion Matrix: {dt_metrics['confusion_matrix']}")

DT Metrics:
Accuracy: 99.23%
F1 Score: 0.9808
Precision: 0.9808
Recall: 0.9808
Confusion Matrix: [[206, 1], [1, 51]]


# LR

In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from tqdm import tqdm

train_num_df = pd.read_parquet('data/traindata/train_num.parquet')
test_num_df = pd.read_parquet('data/testdata/test_num.parquet')
nodes = ['hivenode01', 'hivenode02', 'hivenode03', 'hivenode04', 'hivenode05', 'hivenode06', 'hivenode07']

def prepare_num_features(df):
    features = []
    for _, row in df.iterrows():
        for node in nodes:
            features.append([
                row[f"{node}_memory_usage"],
                row[f"{node}_cpu_load_5min"],
                row[f"{node}_cpu_load_10min"],
                row[f"{node}_cpu_load_15min"]
            ])
    return np.array(features)

def generate_num_labels(df):
    labels = []
    for _, row in df.iterrows():
        for node in nodes:
            is_abnormal = (row[f"{node}_memory_usage"] > 0.7 or
                           row[f"{node}_cpu_load_5min"] > 10 or
                           row[f"{node}_cpu_load_10min"] > 10 or
                           row[f"{node}_cpu_load_15min"] > 10)
            labels.append(1 if is_abnormal else 0)
    return labels

train_num_features = prepare_num_features(train_num_df)
test_num_features = prepare_num_features(test_num_df)
train_num_labels = generate_num_labels(train_num_df)
test_num_labels = generate_num_labels(test_num_df)

lr = LogisticRegression(class_weight='balanced')
lr.fit(train_num_features, train_num_labels)
predictions = lr.predict(test_num_features)
lr_metrics = {
    'accuracy': accuracy_score(test_num_labels, predictions),
    'f1': f1_score(test_num_labels, predictions),
    'precision': precision_score(test_num_labels, predictions),
    'recall': recall_score(test_num_labels, predictions),
    'confusion_matrix': confusion_matrix(test_num_labels, predictions).tolist()
}

print("LR Metrics:")
print(f"Accuracy: {lr_metrics['accuracy']:.2%}")
print(f"F1 Score: {lr_metrics['f1']:.4f}")
print(f"Precision: {lr_metrics['precision']:.4f}")
print(f"Recall: {lr_metrics['recall']:.4f}")
print(f"Confusion Matrix: {lr_metrics['confusion_matrix']}")

LR Metrics:
Accuracy: 92.28%
F1 Score: 0.8361
Precision: 0.7286
Recall: 0.9808
Confusion Matrix: [[188, 19], [1, 51]]
