In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('dataset/synthetic_logs.csv')

In [25]:
df.shape

(2410, 7)

In [4]:
df.target_label.unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [11]:
df.target_label.value_counts()

target_label
HTTP Status            1017
Security Alert          371
System Notification     356
Error                   177
Resource Usage          177
Critical Error          161
User Action             144
Workflow Error            4
Name: count, dtype: int64

### observation 
- here we can see that ``` Workflow Error ``` and ```  Deprecation Warning``` are very less number of record which can be handled with LLM models

In [14]:
df[df.target_label == 'HTTP Status']

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0
5,2025-10-09 10:30:31,ModernHR,nova.osapi_compute.wsgi.server [req-f0bffbc3-5...,HTTP Status,bert,0
9,2025-03-30 04:01:45,ModernHR,nova.osapi_compute.wsgi.server [req-2bf7cfee-a...,HTTP Status,bert,0
...,...,...,...,...,...,...
2399,2025-03-08 06:23:00,ThirdPartyAPI,nova.metadata.wsgi.server [req-ba29717b-249a-4...,HTTP Status,bert,0
2401,2025-12-05 15:51:51,ModernCRM,nova.osapi_compute.wsgi.server [req-4bdf00b0-3...,HTTP Status,bert,0
2404,2025-09-18 02:18:30,ThirdPartyAPI,nova.osapi_compute.wsgi.server [req-2c9c783f-3...,HTTP Status,bert,0
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,bert,0


In [15]:
df[df.target_label == 'User Action']

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
18,2/22/2025 17:49,ModernCRM,Account with ID 5351 created by User634.,User Action,regex,9
27,9/24/2025 19:57,ThirdPartyAPI,User User685 logged out.,User Action,regex,11
57,9/14/2025 3:03,AnalyticsEngine,User User395 logged in.,User Action,regex,11
85,3/13/2025 2:11,ModernHR,User User225 logged in.,User Action,regex,11
88,3/8/2025 19:04,AnalyticsEngine,User User494 logged out.,User Action,regex,11
...,...,...,...,...,...,...
2207,10/4/2025 8:06,ModernCRM,User User495 logged in.,User Action,regex,11
2263,2/27/2025 14:40,AnalyticsEngine,User User429 logged out.,User Action,regex,11
2275,3/13/2025 17:17,AnalyticsEngine,User User755 logged out.,User Action,regex,11
2323,12/1/2025 18:17,ThirdPartyAPI,User User882 logged out.,User Action,regex,11


### finding patterns in target labels
1. ` User actions `  has some patterns like user, loggged, in and out
2. ` Http status ` has pattern like server, wsgi

In [5]:
df.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

Unnamed: 0,timestamp,source,log_message,target_label,complexity
18,2/22/2025 17:49,ModernCRM,Account with ID 5351 created by User634.,User Action,regex
27,9/24/2025 19:57,ThirdPartyAPI,User User685 logged out.,User Action,regex
57,9/14/2025 3:03,AnalyticsEngine,User User395 logged in.,User Action,regex
85,3/13/2025 2:11,ModernHR,User User225 logged in.,User Action,regex
88,3/8/2025 19:04,AnalyticsEngine,User User494 logged out.,User Action,regex
...,...,...,...,...,...
2207,10/4/2025 8:06,ModernCRM,User User495 logged in.,User Action,regex
2263,2/27/2025 14:40,AnalyticsEngine,User User429 logged out.,User Action,regex
2275,3/13/2025 17:17,AnalyticsEngine,User User755 logged out.,User Action,regex
2323,12/1/2025 18:17,ThirdPartyAPI,User User882 logged out.,User Action,regex


In [7]:
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight embedding model
embeddings = model.encode(df['log_message'].tolist())

In [12]:
clustering = DBSCAN(eps=0.2, min_samples=1, metric='cosine').fit(embeddings)
df['cluster'] = clustering.labels_

In [18]:
df.sort_values('cluster')

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
41,2025-04-01 19:44:04,AnalyticsEngine,nova.osapi_compute.wsgi.server [req-3f4867ee-5...,HTTP Status,bert,0
2379,2025-01-11 19:55:02,ThirdPartyAPI,nova.osapi_compute.wsgi.server [req-405a1c42-a...,HTTP Status,bert,0
2380,2025-07-17 02:22:35,ModernHR,nova.metadata.wsgi.server [req-1f64aad7-bba8-4...,HTTP Status,bert,0
2382,2025-09-22 23:34:35,BillingSystem,nova.osapi_compute.wsgi.server [req-7504342b-b...,HTTP Status,bert,0
2384,2025-06-15 11:55:18,ThirdPartyAPI,nova.osapi_compute.wsgi.server [req-31a940b9-3...,HTTP Status,bert,0
...,...,...,...,...,...,...
2191,2/18/2025 2:16,BillingSystem,Global settings have been compromised,Critical Error,bert,131
2205,10/21/2025 22:14,BillingSystem,Admin rights elevated for user 1776,Security Alert,bert,132
2217,2025-05-12 09:46:54,LegacyCRM,Task assignment for TeamID 3425 could not comp...,Workflow Error,llm,133
2221,12/19/2025 2:47,ModernHR,System configuration inconsistencies were found,Critical Error,bert,134


In [16]:
clusters = df.groupby('cluster')['log_message'].apply(list)
sorted_clusters = clusters.sort_values(key=lambda x: x.map(len), ascending=False)


In [17]:
sorted_clusters

cluster
0      [nova.osapi_compute.wsgi.server [req-b9718cd8-...
5      [nova.compute.claims [req-a07ac654-8e81-416d-b...
11     [User User685 logged out., User User395 logged...
13     [Backup started at 2025-05-14 07:06:55., Backu...
7      [Multiple bad login attempts detected on user ...
                             ...                        
131              [Global settings have been compromised]
132                [Admin rights elevated for user 1776]
133    [Task assignment for TeamID 3425 could not com...
134    [System configuration inconsistencies were found]
135                [Admin rights elevated for user 3310]
Name: log_message, Length: 136, dtype: object

In [19]:
print("Clustered Patterns:")
for cluster_id, messages in sorted_clusters.items():
    if len(messages) > 10:
        print(f"Cluster {cluster_id}:")
        for msg in messages[:5]:
            print(f"  {msg}")

Clustered Patterns:
Cluster 0:
  nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" status: 200 len: 1893 time: 0.2675118
  nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" HTTP status code -  200 len: 211 time: 0.0968180
  nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" RCODE  200 len: 1874 time: 0.2280791
  nova.osapi_compute.wsgi.server [req-f0bffbc3-5ab0-4916-91c1-0a61dd7d4ec2 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2

## Classification Stage 1: Regex


In [20]:
import re
def classify_with_regex(log_message):
    regex_patterns = {
        r"User User\d+ logged (in|out).": "User Action",
        r"Backup (started|ended) at .*": "System Notification",
        r"Backup completed successfully.": "System Notification",
        r"System updated to version .*": "System Notification",
        r"File .* uploaded successfully by user .*": "System Notification",
        r"Disk cleanup completed successfully.": "System Notification",
        r"System reboot initiated by user .*": "System Notification",
        r"Account with ID .* created by .*": "User Action"
    }
    for pattern, label in regex_patterns.items():
        if re.search(pattern, log_message):
            return label
    return None

In [21]:
classify_with_regex("User User123 logged in.")


'User Action'

In [22]:
classify_with_regex("System reboot initiated by user User179.")


'System Notification'

In [23]:
classify_with_regex("Hey you, chill bro")


In [24]:
# Apply regex classification
df['regex_label'] = df['log_message'].apply(lambda x: classify_with_regex(x))
df[df['regex_label'].notnull()]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,regex,4,System Notification
14,1/4/2025 1:43,ThirdPartyAPI,File data_3847.csv uploaded successfully by us...,System Notification,regex,4,System Notification
15,5/1/2025 9:41,ModernCRM,Backup completed successfully.,System Notification,regex,8,System Notification
18,2/22/2025 17:49,ModernCRM,Account with ID 5351 created by User634.,User Action,regex,9,User Action
27,9/24/2025 19:57,ThirdPartyAPI,User User685 logged out.,User Action,regex,11,User Action
...,...,...,...,...,...,...,...
2376,6/27/2025 8:47,ModernCRM,System updated to version 2.0.5.,System Notification,regex,21,System Notification
2381,9/5/2025 6:39,ThirdPartyAPI,Disk cleanup completed successfully.,System Notification,regex,32,System Notification
2394,4/3/2025 13:13,ModernHR,Disk cleanup completed successfully.,System Notification,regex,32,System Notification
2395,5/2/2025 14:29,ThirdPartyAPI,Backup ended at 2025-05-06 11:23:16.,System Notification,regex,13,System Notification


## Classification Stage 2: Classification Using Embeddings


In [27]:
df_non_regex = df[df['regex_label'].isnull()].copy()
df_non_regex.shape

(1910, 7)

In [28]:
df_legacy = df_non_regex[df_non_regex.source=="LegacyCRM"]
df_legacy

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
60,2025-10-06 16:55:23,LegacyCRM,Lead conversion failed for prospect ID 7842 du...,Workflow Error,llm,24,
255,2025-05-03 16:55:35,LegacyCRM,API endpoint 'getCustomerDetails' is deprecate...,Deprecation Warning,llm,48,
377,2025-06-24 12:16:29,LegacyCRM,Customer follow-up process for lead ID 5621 fa...,Workflow Error,llm,62,
1325,2025-04-17 07:33:44,LegacyCRM,Escalation rule execution failed for ticket ID...,Workflow Error,llm,105,
1734,2025-04-30 07:47:30,LegacyCRM,The 'ExportToCSV' feature is outdated. Please ...,Deprecation Warning,llm,118,
1826,2025-01-23 10:33:36,LegacyCRM,Support for legacy authentication methods will...,Deprecation Warning,llm,122,
2217,2025-05-12 09:46:54,LegacyCRM,Task assignment for TeamID 3425 could not comp...,Workflow Error,llm,133,


- here we see that legacy crm has very low record with target labesl `workflow` and  `Deprecation Warning`
- So we can use LLm

In [29]:
df_non_legacy = df_non_regex[df_non_regex.source!="LegacyCRM"]
df_non_legacy

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0,
...,...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,bert,0,
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,bert,7,
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,bert,0,
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,1,


- here we see that we have enough data to understand the context using bert

In [30]:

model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight embedding model
embeddings_filtered = model.encode(df_non_legacy['log_message'].tolist())

In [31]:
X = embeddings_filtered
y = df_non_legacy['target_label'].values

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

                precision    recall  f1-score   support

Critical Error       0.91      1.00      0.95        48
         Error       0.98      0.89      0.93        47
   HTTP Status       1.00      1.00      1.00       304
Resource Usage       1.00      1.00      1.00        49
Security Alert       1.00      0.99      1.00       123

      accuracy                           0.99       571
     macro avg       0.98      0.98      0.98       571
  weighted avg       0.99      0.99      0.99       571



In [34]:
import joblib
joblib.dump(clf, '../models/log_classifier.joblib')

['../models/log_classifier.joblib']