In [2]:
import pandas as pd
df = pd.read_csv("Dataset/synthetic_logs.csv")
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert


In [3]:
df.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [4]:
df.target_label.unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [6]:
!pip install sentence-transformers

Defaulting to user installation because normal site-packages is not writeable




Collecting sentence-transformers
  Downloading sentence_transformers-5.0.0-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.53.3-py3-none-any.whl.metadata (40 kB)
Collecting tqdm (from sentence-transformers)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.7.1-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.33.4-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Using cached regex-2024.11.6-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.21.2-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers<5.0.0,>=4.41.0

In [8]:
!pip install ipywidgets

Defaulting to user installation because normal site-packages is not writeable
Collecting ipywidgets
  Downloading ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl.metadata (20 kB)
Downloading ipywidgets-8.1.7-py3-none-any.whl (139 kB)
Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl (216 kB)
Downloading widgetsnbextension-4.0.14-py3-none-any.whl (2.2 MB)
   ---------------------------------------- 0.0/2.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/2.2 MB ? eta -:--:--
   ---- ----------------------------------- 0.3/2.2 MB ? eta -:--:--
   --------- ------------------------------ 0.5/2.2 MB 1.1 MB/s eta 0:00:02
   -------------- ------------------------- 0.8/2.2 MB 1.1 MB/s eta 0:00:02
   -------------- -------------

In [11]:
! pip install notebook


Defaulting to user installation because normal site-packages is not writeable


In [16]:
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer

In [19]:
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight embedding model
embeddings = model.encode(df['log_message'].tolist())

In [20]:
embeddings[:5]

array([[-0.10293962,  0.03354594, -0.02202607, ...,  0.00457793,
        -0.04259717,  0.00322621],
       [ 0.00804572, -0.03573923,  0.04938739, ...,  0.01538319,
        -0.06230947, -0.02774666],
       [-0.00908224,  0.13003924, -0.05275568, ...,  0.02014104,
        -0.05117098, -0.02930294],
       [-0.09751046,  0.04911299, -0.03977424, ...,  0.02477502,
        -0.03546079, -0.00018598],
       [-0.10468338,  0.05926038, -0.02488499, ...,  0.02502055,
        -0.037193  , -0.0256891 ]], dtype=float32)

In [22]:
clustering = DBSCAN(eps=0.2, min_samples=1, metric='cosine').fit(embeddings)
df['cluster'] = clustering.labels_

In [23]:
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0


In [24]:
# Group by cluster to inspect patterns
clusters = df.groupby('cluster')['log_message'].apply(list)
sorted_clusters = clusters.sort_values(key=lambda x: x.map(len), ascending=False)

In [25]:
print("Clustered Patterns:")
for cluster_id, messages in sorted_clusters.items():
    if len(messages) > 10:
        print(f"Cluster {cluster_id}:")
        for msg in messages[:5]:
            print(f"  {msg}")

Clustered Patterns:
Cluster 0:
  nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" status: 200 len: 1893 time: 0.2675118
  nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" HTTP status code -  200 len: 211 time: 0.0968180
  nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" RCODE  200 len: 1874 time: 0.2280791
  nova.osapi_compute.wsgi.server [req-f0bffbc3-5ab0-4916-91c1-0a61dd7d4ec2 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2

In [53]:
# regex
import re

def classify_with_regex(log_message):
    regex_patterns = {
        r"nova\.osapi_compute\.wsgi\.server \[req-[\w-]+ [\w]+ [\w]+ - - -\] \d+\.\d+\.\d+\.\d+ \"GET /v2/[\w-]+/servers/detail HTTP/1\.1\" (status:|HTTP status code -|RCODE|Return code:) ?200 len: \d+ time: [\d.]+": "Nova GET Request",
        r"nova\.compute\.claims \[req-[\w-]+ [\w]+ [\w]+ - - -\] \[instance: [\w-]+\] (Total memory: \d+ MB, used: [\d.]+ MB|memory limit: [\d.]+ MB, free: [\d.]+ MB|(?:vcpu|disk) limit not specified, defaulting to unlimited)": "Nova Compute Claim",
        r"User User\d+ logged (in|out)\.": "User Action",
        r"Backup (started|ended) at \d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.": "Backup Event",
        r"File data_\d+\.csv uploaded successfully by user User\d+\.": "File Upload",
        r"(Multiple .*login (attempts|failures).*user \d+|User \d+ made multiple incorrect login attempts)": "Failed Login Attempts",
        r"Account with ID \d+ created by User\d+\.": "Account Creation",
        r"(Denied access attempt|Unauthorized login attempt|Invalid login attempt.*|Account Account\d+ (blocked|access denied)) (on|to) .*account (Account\d+|Account\d+)": "Access Violation",
        r"System updated to version \d+\.\d+\.\d+\.": "System Update",
        r"System reboot initiated by user User\d+\.": "System Reboot"
    }

    for pattern, label in regex_patterns.items():
        if re.search(pattern, log_message, re.IGNORECASE):
            return label
    return None


In [54]:
classify_with_regex("User User123 logged OUT.")

'User Action'

In [63]:
df['regex_label'] = df['log_message'].apply(classify_with_regex)
df[df['regex_label'].notnull()]


Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0,Nova GET Request
5,2025-10-09 10:30:31,ModernHR,nova.osapi_compute.wsgi.server [req-f0bffbc3-5...,HTTP Status,bert,0,Nova GET Request
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,regex,4,File Upload
8,2025-02-12 10:42:29,ThirdPartyAPI,nova.compute.claims [req-a07ac654-8e81-416d-bf...,Resource Usage,bert,5,Nova Compute Claim
12,2025-07-12 03:03:18,ModernHR,nova.osapi_compute.wsgi.server [req-d4f8d0c2-4...,HTTP Status,bert,0,Nova GET Request
...,...,...,...,...,...,...,...
2391,2025-08-30 19:35:46,ModernCRM,nova.osapi_compute.wsgi.server [req-bb798c56-0...,HTTP Status,bert,0,Nova GET Request
2395,5/2/2025 14:29,ThirdPartyAPI,Backup ended at 2025-05-06 11:23:16.,System Notification,regex,13,Backup Event
2401,2025-12-05 15:51:51,ModernCRM,nova.osapi_compute.wsgi.server [req-4bdf00b0-3...,HTTP Status,bert,0,Nova GET Request
2404,2025-09-18 02:18:30,ThirdPartyAPI,nova.osapi_compute.wsgi.server [req-2c9c783f-3...,HTTP Status,bert,0,Nova GET Request


In [62]:
df_non_regex = df[df['regex_label'].isnull()]

Classification Stage 2: Classification Using Embeddings

In [65]:
df_non_regex = df[df['regex_label'].isnull()].copy()
df_non_regex.shape

(1609, 7)

In [69]:
df_legacy = df_non_regex[df_non_regex.source=="LegacyCRM"]
df_legacy.shape

(7, 7)

In [67]:
df_non_legacy = df_non_regex[df_non_regex.source!="LegacyCRM"]
df_non_legacy

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0,
6,3/1/2025 19:14,ModernHR,Shard 6 replication task ended in failure,Error,bert,3,
...,...,...,...,...,...,...,...
2403,10/1/2025 1:31,ModernCRM,Backup completed successfully.,System Notification,regex,8,
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,bert,7,
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,bert,0,
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,1,


In [68]:
df_non_legacy.shape

(1602, 7)

In [70]:
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight embedding model
embeddings_filtered = model.encode(df_non_legacy['log_message'].tolist())

In [73]:
len(embeddings_filtered)

1602

In [72]:
X = embeddings_filtered
y = df_non_legacy['target_label'].values

In [74]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

                     precision    recall  f1-score   support

     Critical Error       0.98      1.00      0.99        42
              Error       0.98      0.98      0.98        49
        HTTP Status       1.00      1.00      1.00       217
     Resource Usage       1.00      1.00      1.00        30
     Security Alert       1.00      0.99      1.00       113
System Notification       1.00      1.00      1.00        30

           accuracy                           1.00       481
          macro avg       0.99      1.00      0.99       481
       weighted avg       1.00      1.00      1.00       481



In [77]:
import os
import joblib

# Ensure the target directory exists
os.makedirs('../models', exist_ok=True)

# Save the model
joblib.dump(clf, '../models/log_classifier.joblib')


['../models/log_classifier.joblib']