In [None]:
!pip install drain3

Collecting drain3
  Downloading drain3-0.9.11.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jsonpickle==1.5.1 (from drain3)
  Downloading jsonpickle-1.5.1-py2.py3-none-any.whl.metadata (6.6 kB)
Collecting cachetools==4.2.1 (from drain3)
  Downloading cachetools-4.2.1-py3-none-any.whl.metadata (4.6 kB)
Downloading cachetools-4.2.1-py3-none-any.whl (12 kB)
Downloading jsonpickle-1.5.1-py2.py3-none-any.whl (37 kB)
Building wheels for collected packages: drain3
  Building wheel for drain3 (setup.py) ... [?25l[?25hdone
  Created wheel for drain3: filename=drain3-0.9.11-py3-none-any.whl size=23998 sha256=95023dafad10fed74e77546ee68b0d8c70cc93d17f6036ac5de511932ebc260e
  Stored in directory: /root/.cache/pip/wheels/96/3f/bb/c2df80298168b46a45654266ac0c139220540689a17463e3cf
Successfully built drain3
Installing collected packages: jsonpickle, cachetools, drain3
  Attempting uninstall: jsonpickle
    Found existing installation: jsonpickle 4.0.5
    Uninstalli

In [None]:
# Transformer-based Log Anomaly Detection (LogBERT-style)
import os
import torch
import pandas as pd
from drain3 import TemplateMiner
from drain3.template_miner_config import TemplateMinerConfig
from transformers import BertTokenizerFast, BertForMaskedLM, Trainer, TrainingArguments
from torch.utils.data import Dataset
import numpy as np
import random

In [None]:
# Step 1: Load Raw Log File
def load_logs(file_path, encoding="utf-8"):
    """Read a log file and return its non-blank lines, stripped of surrounding whitespace.

    Args:
        file_path: Path of the text log file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
            result does not depend on the platform's locale settings.

    Returns:
        list[str]: One stripped string per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # errors="replace" keeps one corrupt byte (not unusual in raw logs) from
    # aborting the whole load; such bytes are decoded as U+FFFD instead.
    with open(file_path, 'r', encoding=encoding, errors="replace") as f:
        # Iterate the handle directly so lines are streamed instead of being
        # fully materialised by readlines() first.
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line]

In [None]:
# Step 2: Drain3 Log Parsing
def parse_templates(logs):
    """Feed each log line to a fresh Drain3 miner and collect the reported templates.

    Args:
        logs: Iterable of raw log message strings, processed in order.

    Returns:
        list[str]: For every input line, the ``'template_mined'`` value returned by
        ``add_log_message`` for that line, or ``""`` when the miner returns a
        falsy result. The list has the same length and order as ``logs``.
    """
    # A default-constructed config is passed straight to the miner; no
    # configuration file is loaded.
    miner = TemplateMiner(config=TemplateMinerConfig())

    def _template_of(message):
        # The template is whatever the miner reports at the moment this line is added.
        outcome = miner.add_log_message(message)
        return outcome['template_mined'] if outcome else ""

    return [_template_of(message) for message in logs]

In [None]:
# Step 3: Sequence Construction (sliding window)
def create_sequences(tokens, window_size=10):
    """Build overlapping sliding-window sequences (stride 1) from a token list.

    Args:
        tokens: Sequence of strings (here: one log template per log line).
        window_size: Number of consecutive tokens per window. Must be positive.

    Returns:
        list[str]: Every full window of ``window_size`` consecutive tokens, each
        joined with single spaces, in order. There are
        ``len(tokens) - window_size + 1`` windows, or none when ``tokens`` is
        shorter than ``window_size``.

    Raises:
        ValueError: If ``window_size`` is not positive.
    """
    if window_size <= 0:
        raise ValueError(f"window_size must be a positive integer, got {window_size}")

    # "+ 1" so the final full window (ending on the last token) is included;
    # a negative count simply yields an empty range.
    window_count = len(tokens) - window_size + 1
    return [" ".join(tokens[start:start + window_size]) for start in range(window_count)]

In [None]:
# Step 4: Custom Dataset for BERT MLM
class LogDataset(Dataset):
    """Masked-language-model dataset over pre-built log-template sequences.

    All sequences are tokenized once in ``__init__`` (padded/truncated to
    ``max_length``). Each ``__getitem__`` call draws a fresh random mask, so the
    same index yields different masked positions on different calls.

    Args:
        sequences: List of text sequences to tokenize.
        tokenizer: Tokenizer callable exposing ``pad_token_id``, ``cls_token_id``,
            ``mask_token_id`` and (optionally) ``sep_token_id``.
        max_length: Fixed token length every sequence is padded/truncated to.
        mask_prob: Probability of masking each maskable token.
    """

    # Label value for positions that must not contribute to the MLM loss
    # (the default ignore_index of torch's cross-entropy loss).
    IGNORE_INDEX = -100

    def __init__(self, sequences, tokenizer, max_length=64, mask_prob=0.15):
        self.tokenizer = tokenizer
        self.mask_prob = mask_prob
        self.inputs = tokenizer(sequences, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')

    def __len__(self):
        """Return the number of tokenized sequences."""
        return len(self.inputs['input_ids'])

    def __getitem__(self, idx):
        """Return one example with randomly masked ``input_ids`` and matching ``labels``.

        ``labels`` holds the original token id at masked positions and
        ``IGNORE_INDEX`` everywhere else, so only masked tokens are scored.
        """
        item = {key: val[idx] for key, val in self.inputs.items()}
        original_ids = item['input_ids']

        # Only real content tokens may be masked: never padding, [CLS] or [SEP].
        maskable = original_ids != self.tokenizer.pad_token_id
        for special_id in (self.tokenizer.cls_token_id, getattr(self.tokenizer, 'sep_token_id', None)):
            if special_id is not None:
                maskable &= original_ids != special_id

        mask_arr = (torch.rand(original_ids.shape) < self.mask_prob) & maskable

        # Guarantee at least one masked token when possible; an example whose
        # labels are all IGNORE_INDEX contributes nothing to the loss.
        if maskable.any() and not mask_arr.any():
            choices = maskable.flatten().nonzero().flatten()
            pick = choices[torch.randint(choices.numel(), (1,))]
            mask_arr.view(-1)[pick] = True

        # Clone before writing so the cached tokenization in self.inputs is untouched.
        masked_ids = original_ids.clone()
        masked_ids[mask_arr] = self.tokenizer.mask_token_id

        labels = original_ids.clone()
        labels[~mask_arr] = self.IGNORE_INDEX

        item['input_ids'] = masked_ids
        item['labels'] = labels
        return item


In [None]:
# Step 5: Training the BERT Model on Logs
def train_logbert_model(sequences, model_name="bert-base-uncased", output_dir="./logbert_model",
                        num_train_epochs=3, batch_size=16, logging_steps=50, report_to="none"):
    """Fine-tune a pretrained BERT masked-LM on log-template sequences.

    Args:
        sequences: Text sequences to train on (e.g. the output of
            ``create_sequences``). Must contain at least one item.
        model_name: Hugging Face checkpoint used for both tokenizer and model.
        output_dir: Directory for Trainer checkpoints; the final model and
            tokenizer are also saved here.
        num_train_epochs: Number of passes over the dataset.
        batch_size: Per-device training batch size.
        logging_steps: How often (in optimizer steps) the training loss is
            logged. Kept small so short runs still produce a loss curve.
        report_to: Experiment-tracking integrations passed to
            ``TrainingArguments``. Defaults to ``"none"`` so training never
            stops at an interactive login prompt (e.g. wandb); pass ``"wandb"``
            or ``"all"`` to re-enable reporting.

    Returns:
        tuple: ``(model, tokenizer)`` after training.

    Raises:
        ValueError: If ``sequences`` is empty.
    """
    sequences = list(sequences)
    if not sequences:
        # Fail early with a clear message instead of deep inside the tokenizer/Trainer.
        raise ValueError("train_logbert_model needs at least one sequence to train on")

    tokenizer = BertTokenizerFast.from_pretrained(model_name)
    dataset = LogDataset(sequences, tokenizer)

    model = BertForMaskedLM.from_pretrained(model_name)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=batch_size,
        save_steps=1000,
        save_total_limit=2,
        prediction_loss_only=True,
        logging_dir="./logs",
        logging_steps=logging_steps,
        report_to=report_to,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
    )

    trainer.train()

    # Save model and tokenizer side by side so the directory can be reloaded
    # with from_pretrained() on its own.
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    return model, tokenizer


In [None]:
# Step 6: Inference - Compute Anomaly Scores
def compute_anomaly_scores(sequences, model, tokenizer, mask_prob=0.15, max_length=64, seed=None):
    """Score each sequence by the model's masked-LM loss on randomly masked tokens.

    For every sequence a random subset of its content tokens is replaced by the
    mask token, and the model's loss on exactly those positions is recorded.
    A higher loss means the model found the sequence harder to reconstruct.

    Args:
        sequences: Iterable of text sequences to score.
        model: Masked-LM model; called as ``model(**inputs)`` with ``labels`` and
            expected to return an object with a ``loss`` tensor.
        tokenizer: Tokenizer callable exposing ``pad_token_id``, ``cls_token_id``,
            ``mask_token_id`` and (optionally) ``sep_token_id``.
        mask_prob: Probability of masking each maskable token.
        max_length: Token length every sequence is padded/truncated to.
        seed: Optional integer. When given, masking uses a private generator
            seeded with it, so repeated calls mask the same positions. When
            ``None`` the global torch RNG is used.

    Returns:
        list[float]: One score per input sequence, in order. A sequence with no
        maskable tokens (only special/padding tokens) gets ``0.0``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    generator = None
    if seed is not None:
        generator = torch.Generator()
        generator.manual_seed(seed)

    # Tokens that must never be masked or scored: padding, [CLS] and [SEP].
    special_ids = [
        special_id
        for special_id in (tokenizer.pad_token_id, tokenizer.cls_token_id, getattr(tokenizer, "sep_token_id", None))
        if special_id is not None
    ]

    scores = []

    for seq in sequences:
        encoded = tokenizer(seq, return_tensors="pt", padding="max_length", truncation=True, max_length=max_length)
        input_ids = encoded["input_ids"]

        maskable = torch.ones_like(input_ids, dtype=torch.bool)
        for special_id in special_ids:
            maskable &= input_ids != special_id

        if not maskable.any():
            # Nothing to predict; a loss over zero positions would be NaN and
            # would poison percentile-based thresholding downstream.
            scores.append(0.0)
            continue

        # Prepare masked input for MLM
        mask_arr = (torch.rand(input_ids.shape, generator=generator) < mask_prob) & maskable
        if not mask_arr.any():
            # Force one masked token so the loss is always defined.
            choices = maskable.flatten().nonzero().flatten()
            pick = choices[torch.randint(choices.numel(), (1,), generator=generator)]
            mask_arr.view(-1)[pick] = True

        # -100 marks positions excluded from the loss, so only masked tokens are scored.
        labels = input_ids.clone()
        labels[~mask_arr] = -100

        masked_ids = input_ids.clone()
        masked_ids[mask_arr] = tokenizer.mask_token_id

        inputs = {key: val.to(device) for key, val in encoded.items()}
        inputs["input_ids"] = masked_ids.to(device)
        inputs["labels"] = labels.to(device)

        with torch.no_grad():
            outputs = model(**inputs)
        scores.append(outputs.loss.item())

    return scores


In [None]:
# Run Full Pipeline
LOG_PATH = "/content/hdfs_logs.log"
SCORES_CSV_PATH = "/content/anomaly_scores.csv"  # picked up by the Drive export cell
WINDOW_SIZE = 10
ANOMALY_PERCENTILE = 95  # sequences scoring above this percentile are flagged
SEED = 42

# Token masking is random during both training and scoring; seed every RNG in
# play so a fresh "Restart & Run All" gives comparable results.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

log_lines = load_logs(LOG_PATH)
templates = parse_templates(log_lines)
token_sequences = create_sequences(templates, window_size=WINDOW_SIZE)
if not token_sequences:
    # np.percentile on an empty list fails with an unhelpful error; stop early instead.
    raise ValueError(
        f"{LOG_PATH} produced no sequences: it has {len(log_lines)} non-blank lines, "
        f"window size is {WINDOW_SIZE}"
    )

model, tokenizer = train_logbert_model(token_sequences)

# Re-seed so the scoring masks do not depend on how much randomness training consumed.
torch.manual_seed(SEED)
scores = compute_anomaly_scores(token_sequences, model, tokenizer)

# Thresholding
threshold = np.percentile(scores, ANOMALY_PERCENTILE)
anomalies = [seq for seq, score in zip(token_sequences, scores) if score > threshold]

# Persist per-sequence scores so they can be inspected or exported later.
pd.DataFrame({
    "sequence": token_sequences,
    "anomaly_score": scores,
    "is_anomaly": [score > threshold for score in scores],
}).to_csv(SCORES_CSV_PATH, index=False)

print(f"Total Sequences: {len(scores)}, Anomalies Detected: {len(anomalies)}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msaurabh57raut[0m ([33msaurabh57raut-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


Total Sequences: 1990, Anomalies Detected: 100


In [None]:
import os
import shutil
import matplotlib.pyplot as plt
from google.colab import drive

# 1. Mount Google Drive
drive.mount('/content/drive')

# 2. Define paths
drive_base = '/content/drive/MyDrive/logbert_anomaly_detection'
os.makedirs(drive_base, exist_ok=True)

# 3. Save final model and tokenizer
final_model_path = os.path.join(drive_base, 'final_model')
model.save_pretrained(final_model_path)
tokenizer.save_pretrained(final_model_path)

# 4. Save the most recent training checkpoint
# The step number in "checkpoint-<step>" depends on dataset size, epochs and
# batch size, so look it up instead of hard-coding it.
trainer_output_dir = '/content/logbert_model'
checkpoint_steps = []
if os.path.isdir(trainer_output_dir):
    for entry in os.listdir(trainer_output_dir):
        prefix, _, step = entry.partition('-')
        if prefix == 'checkpoint' and step.isdigit() and os.path.isdir(os.path.join(trainer_output_dir, entry)):
            checkpoint_steps.append(int(step))

if checkpoint_steps:
    checkpoint_name = f"checkpoint-{max(checkpoint_steps)}"
    checkpoint_src = os.path.join(trainer_output_dir, checkpoint_name)
    checkpoint_dst = os.path.join(drive_base, checkpoint_name)
    shutil.copytree(checkpoint_src, checkpoint_dst, dirs_exist_ok=True)
else:
    checkpoint_src = None
    checkpoint_dst = None
    print(f"⚠ No checkpoint-* directory found in {trainer_output_dir}, skipping checkpoint copy.")

# 5. Save anomaly scores (if available)
anomaly_scores_path = '/content/anomaly_scores.csv'
if os.path.exists(anomaly_scores_path):
    shutil.copy(anomaly_scores_path, os.path.join(drive_base, 'anomaly_scores.csv'))
    print(f"✔ Anomaly scores saved to: {os.path.join(drive_base, 'anomaly_scores.csv')}")
else:
    print("⚠ No anomaly_scores.csv found, skipping that.")

# 6. Save loss plot if you have training loss data
if 'training_loss_list' in globals():
    plt.plot(training_loss_list)
    plt.xlabel('Training Step')
    plt.ylabel('Loss')
    plt.title('Training Loss Curve')
    loss_plot_path = os.path.join(drive_base, 'loss_curve.png')
    plt.savefig(loss_plot_path)
    plt.close()
    print(f"✔ Training loss curve saved to: {loss_plot_path}")
else:
    print("⚠ No training_loss_list available, skipping loss plot.")

# 7. Summary
print("✅ All components saved to your Google Drive:")
print(f"📁 Final Model: {final_model_path}")
if checkpoint_dst is not None:
    print(f"📁 Full Checkpoint: {checkpoint_dst}")
