In [1]:
import pandas as pd

# Load the synthetic log dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("synthetic_logs.csv")
# Preview the first rows to check the columns and the raw (unparsed) timestamp values.
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(2410, 5)

In [2]:
# Number of log lines contributed by each source system.
df["source"].value_counts()

Unnamed: 0_level_0,count
source,Unnamed: 1_level_1
ThirdPartyAPI,496
ModernHR,492
BillingSystem,479
AnalyticsEngine,471
ModernCRM,465
LegacyCRM,7


In [8]:
# Class distribution of the labels we want to predict.
df["target_label"].value_counts()

Unnamed: 0_level_0,count
target_label,Unnamed: 1_level_1
HTTP Status,1017
Security Alert,371
System Notification,356
Error,177
Resource Usage,177
Critical Error,161
User Action,144
Workflow Error,4
Deprecation Warning,3


In [4]:
# Row counts per value of the `complexity` column.
df["complexity"].value_counts()

Unnamed: 0_level_0,count
complexity,Unnamed: 1_level_1
bert,1903
regex,500
llm,7


In [5]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0,0
timestamp,0
source,0
log_message,0
target_label,0
complexity,0


In [6]:
# Look at a few 'System Notification' rows to spot recurring message templates.
# random_state pins the sample so the preview is the same on every run.
df[df.target_label=='System Notification'].sample(10, random_state=42)

Unnamed: 0,timestamp,source,log_message,target_label,complexity
1948,5/14/2025 18:04,ModernCRM,Backup started at 2025-11-23 01:05:16.,System Notification,regex
2043,9/12/2025 20:20,ThirdPartyAPI,System reboot initiated by user User262.,System Notification,regex
2078,4/9/2025 17:00,AnalyticsEngine,File data_4759.csv uploaded successfully by us...,System Notification,regex
1834,6/13/2025 5:43,ModernCRM,Backup ended at 2025-02-21 02:50:57.,System Notification,regex
1033,7/26/2025 13:27,ModernHR,Backup started at 2025-02-12 06:47:30.,System Notification,regex
2376,6/27/2025 8:47,ModernCRM,System updated to version 2.0.5.,System Notification,regex
191,5/6/2025 23:58,ThirdPartyAPI,Backup started at 2025-12-09 10:19:11.,System Notification,regex
728,5/18/2025 18:19,BillingSystem,Backup ended at 2025-08-06 22:28:29.,System Notification,regex
1207,1/9/2025 22:57,ModernCRM,File data_3277.csv uploaded successfully by us...,System Notification,regex
762,9/5/2025 3:51,AnalyticsEngine,System updated to version 5.0.6.,System Notification,regex


In [9]:
# Check whether every message with this fixed prefix carries the same label.
# random_state pins the sample so the preview is the same on every run.
df[df.log_message.str.startswith("System reboot initiated by user")].sample(10, random_state=42)

Unnamed: 0,timestamp,source,log_message,target_label,complexity
307,4/12/2025 0:41,BillingSystem,System reboot initiated by user User929.,System Notification,regex
1865,5/11/2025 10:58,AnalyticsEngine,System reboot initiated by user User932.,System Notification,regex
2253,10/7/2025 2:20,ModernHR,System reboot initiated by user User644.,System Notification,regex
1524,11/30/2025 2:39,ThirdPartyAPI,System reboot initiated by user User278.,System Notification,regex
800,8/15/2025 12:14,BillingSystem,System reboot initiated by user User901.,System Notification,regex
2360,5/1/2025 4:21,ThirdPartyAPI,System reboot initiated by user User876.,System Notification,regex
1776,2/21/2025 11:56,ModernHR,System reboot initiated by user User155.,System Notification,regex
852,3/31/2025 5:20,ModernCRM,System reboot initiated by user User811.,System Notification,regex
1304,8/10/2025 6:18,ThirdPartyAPI,System reboot initiated by user User758.,System Notification,regex
140,9/11/2025 8:49,AnalyticsEngine,System reboot initiated by user User639.,System Notification,regex


# Clustering

In [None]:
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding model by name.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Encode every log message; the result is in the same row order as df.
embeddings = model.encode(df['log_message'].tolist())
# Peek at the first two embeddings.
embeddings[:2]

In [11]:
# Cluster the embeddings by cosine distance. eps=0.2 is the neighbourhood radius;
# with min_samples=1 every point is its own core sample, so no row is labelled as noise (-1).
clustering = DBSCAN(eps=0.2, min_samples=1, metric='cosine').fit(embeddings)
# Attach the cluster id of each row as a new column on df (mutates df in place).
df['cluster'] = clustering.labels_
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0


In [12]:
# Collect the messages of each cluster, then rank clusters from largest to smallest.
clusters = df.groupby('cluster')['log_message'].apply(list)
sorted_clusters = clusters.sort_values(key=lambda x: x.map(len), ascending=False)

print("Clustered Patterns:")
for cluster_id, messages in sorted_clusters.items():
    # Skip clusters that do not have more than 10 messages.
    if len(messages) <= 10:
        continue
    print(f"Cluster {cluster_id}:")
    # Show at most five example messages per cluster, one per line.
    print("\n".join(f"  {msg}" for msg in messages[:5]))

Clustered Patterns:
Cluster 0:
  nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" status: 200 len: 1893 time: 0.2675118
  nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" HTTP status code -  200 len: 211 time: 0.0968180
  nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" RCODE  200 len: 1874 time: 0.2280791
  nova.osapi_compute.wsgi.server [req-f0bffbc3-5ab0-4916-91c1-0a61dd7d4ec2 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2

In [15]:
# Inspect a single cluster in detail. 53 is a label taken from the `cluster` column;
# NOTE(review): the id presumably changes if eps or the embeddings change, so re-check it after re-clustering.
df[df["cluster"]==53]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
283,1/25/2025 13:45,ModernCRM,Module X experienced an invalid data format issue,Error,bert,53
344,11/13/2025 13:20,BillingSystem,Input format mismatch occurred in module X,Error,bert,53
378,12/24/2025 19:10,AnalyticsEngine,Module X reported an error in input format val...,Error,bert,53
454,5/20/2025 11:01,AnalyticsEngine,Module X failed to process input due to format...,Error,bert,53
530,12/12/2025 13:41,ModernCRM,Input data format in module X was invalid or c...,Error,bert,53
620,1/19/2025 2:53,BillingSystem,Input to module X did not conform to expected ...,Error,bert,53
645,8/8/2025 0:27,ModernHR,Format of input data in module X was not recog...,Error,bert,53
744,11/6/2025 15:30,ThirdPartyAPI,Error in input format occurred during module X...,Error,bert,53
1110,6/1/2025 15:58,BillingSystem,Invalid input format caused module X to fail,Error,bert,53
1239,4/9/2025 8:41,ModernHR,Input to module X was not in the expected format,Error,bert,53


# Classification Using Regex

In [16]:
import re

# Ordered (compiled pattern, label) rules; the first pattern that matches wins.
# Literal full stops are escaped so that "." no longer matches an arbitrary character,
# and the patterns are compiled once instead of being rebuilt on every call.
_REGEX_RULES = tuple(
    (re.compile(pattern), label)
    for pattern, label in (
        (r"User User\d+ logged (in|out)\.", "User Action"),
        (r"Backup (started|ended) at .*", "System Notification"),
        (r"Backup completed successfully\.", "System Notification"),
        (r"System updated to version .*", "System Notification"),
        (r"File .* uploaded successfully by user .*", "System Notification"),
        (r"Disk cleanup completed successfully\.", "System Notification"),
        (r"System reboot initiated by user .*", "System Notification"),
        (r"Account with ID .* created by .*", "User Action"),
    )
)


def classify_with_regex(log_message):
    """Label a log message using fixed regex templates.

    Parameters
    ----------
    log_message : str
        Raw log message text.

    Returns
    -------
    str or None
        The label of the first rule whose pattern is found anywhere in
        ``log_message``; ``None`` when no rule matches or when the input is
        not a string (e.g. ``None`` or ``NaN`` from a DataFrame column).
    """
    # Non-string values cannot be searched; treat them as "no regex match".
    if not isinstance(log_message, str):
        return None
    for pattern, label in _REGEX_RULES:
        if pattern.search(log_message):
            return label
    return None

## Examples

In [17]:
# A login message should be labelled 'User Action'.
classify_with_regex("User User123 logged in.")

'User Action'

In [18]:
# A reboot message should be labelled 'System Notification'.
classify_with_regex("System reboot initiated by user User179.")

'System Notification'

In [19]:
# Text that matches no rule returns None, so the cell displays no output.
classify_with_regex("Hey you, chill bro")

In [20]:
# Run the regex classifier over every message; rows no rule matches get None.
df['regex_label'] = df['log_message'].apply(classify_with_regex)
# Display only the rows that received a regex label.
df.loc[df['regex_label'].notna()]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,regex,4,System Notification
14,1/4/2025 1:43,ThirdPartyAPI,File data_3847.csv uploaded successfully by us...,System Notification,regex,4,System Notification
15,5/1/2025 9:41,ModernCRM,Backup completed successfully.,System Notification,regex,8,System Notification
18,2/22/2025 17:49,ModernCRM,Account with ID 5351 created by User634.,User Action,regex,9,User Action
27,9/24/2025 19:57,ThirdPartyAPI,User User685 logged out.,User Action,regex,11,User Action
...,...,...,...,...,...,...,...
2376,6/27/2025 8:47,ModernCRM,System updated to version 2.0.5.,System Notification,regex,21,System Notification
2381,9/5/2025 6:39,ThirdPartyAPI,Disk cleanup completed successfully.,System Notification,regex,32,System Notification
2394,4/3/2025 13:13,ModernHR,Disk cleanup completed successfully.,System Notification,regex,32,System Notification
2395,5/2/2025 14:29,ThirdPartyAPI,Backup ended at 2025-05-06 11:23:16.,System Notification,regex,13,System Notification


In [21]:
# Shape of the subset that no regex rule could label.
df[df['regex_label'].isnull()].shape

(1910, 7)

In [23]:
# Shape of the full frame, now including the added `cluster` and `regex_label` columns.
df.shape

(2410, 7)

In [22]:
# Sanity check: rows without a regex label (1910) plus rows with one (500) should equal the total row count.
1910+500

2410

### Out of 2410 rows
1. 500 rows are classified by the regex function.
2. 1910 rows couldn't be classified by regex. Next, we will classify these rows using BERT and an LLM.

# Classification Using BERT

In [24]:
# Keep only the rows the regex rules could not label; .copy() gives an independent frame
# so later changes to df_non_regex do not touch df.
df_non_regex = df[df['regex_label'].isnull()].copy()
df_non_regex.shape

(1910, 7)

In [25]:
# Preview the rows that still need a label.
df_non_regex.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0,


In [26]:
# Per-source row counts among the rows that still need a label.
df_non_regex['source'].value_counts()

Unnamed: 0_level_0,count
source,Unnamed: 1_level_1
ModernHR,402
ThirdPartyAPI,386
ModernCRM,373
AnalyticsEngine,371
BillingSystem,371
LegacyCRM,7


## LegacyCRM has only 7 rows, which is too few to train a model on. For this source we will use an LLM to predict the classification instead.

In [27]:
# Rows coming from the LegacyCRM source, kept aside from the other sources.
df_legacy = df_non_regex.loc[df_non_regex["source"] == "LegacyCRM"]
df_legacy

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
60,2025-10-06 16:55:23,LegacyCRM,Lead conversion failed for prospect ID 7842 du...,Workflow Error,llm,24,
255,2025-05-03 16:55:35,LegacyCRM,API endpoint 'getCustomerDetails' is deprecate...,Deprecation Warning,llm,48,
377,2025-06-24 12:16:29,LegacyCRM,Customer follow-up process for lead ID 5621 fa...,Workflow Error,llm,62,
1325,2025-04-17 07:33:44,LegacyCRM,Escalation rule execution failed for ticket ID...,Workflow Error,llm,105,
1734,2025-04-30 07:47:30,LegacyCRM,The 'ExportToCSV' feature is outdated. Please ...,Deprecation Warning,llm,118,
1826,2025-01-23 10:33:36,LegacyCRM,Support for legacy authentication methods will...,Deprecation Warning,llm,122,
2217,2025-05-12 09:46:54,LegacyCRM,Task assignment for TeamID 3425 could not comp...,Workflow Error,llm,133,


In [28]:
# Every remaining row whose source is not LegacyCRM.
df_non_legacy = df_non_regex.loc[df_non_regex["source"] != "LegacyCRM"]
df_non_legacy

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0,
...,...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,bert,0,
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,bert,7,
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,bert,0,
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,1,


In [29]:
# NOTE(review): this re-creates the same 'all-MiniLM-L6-v2' model that the clustering
# section already assigned to `model`; the reload keeps this cell independent of that section.
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight embedding model
# Encode only the non-regex, non-LegacyCRM messages; row order follows df_non_legacy.
embeddings_filtered = model.encode(df_non_legacy['log_message'].tolist())

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Features are the sentence embeddings; targets are the labels from the dataset.
X = embeddings_filtered
y = df_non_legacy['target_label'].values

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# fit() returns the estimator itself, so construction and training are chained.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out rows.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

                precision    recall  f1-score   support

Critical Error       0.91      1.00      0.95        48
         Error       0.98      0.89      0.93        47
   HTTP Status       1.00      1.00      1.00       304
Resource Usage       1.00      1.00      1.00        49
Security Alert       1.00      0.99      1.00       123

      accuracy                           0.99       571
     macro avg       0.98      0.98      0.98       571
  weighted avg       0.99      0.99      0.99       571



In [33]:
import joblib
# Serialise the trained classifier to a file in the working directory so it can be
# loaded later with joblib.load instead of being retrained.
joblib.dump(clf,"log_classifier.joblib")

['log_classifier.joblib']