In [22]:
import pandas as pd

from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN

Data Load

In [23]:
# Read the synthetic log dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer="dataset/synthetic_logs.csv")

In [24]:
# Display the loaded DataFrame (df.head() would show only the first five rows).
df

Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert
...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,bert
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,bert
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,bert
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert


In [25]:
# Distinct systems that produced the log records.
df["source"].unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [26]:
# Distinct labels present in the target column.
df["target_label"].unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

Examining the log messages to determine where regex can be used

Clustering

Using DBSCAN to cluster the log messages and a sentence transformer to generate the embeddings

In [27]:
# Pull the raw log text column out of the DataFrame.
log_messages = df["log_message"]

An embedding is a numeric (vector) representation of text.

In [28]:
# Load the pre-trained SentenceTransformer model.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate one embedding per log message, then preview the first two.
embeddings = model.encode(df.loc[:, 'log_message'].tolist())
embeddings[0:2]


array([[-1.02939598e-01,  3.35459486e-02, -2.20260844e-02,
         1.55103172e-03, -9.86921880e-03, -1.78956211e-01,
        -6.34409934e-02, -6.01761453e-02,  2.81108953e-02,
         5.99620081e-02, -1.72618236e-02,  1.43364200e-03,
        -1.49560079e-01,  3.15288268e-03, -5.66030741e-02,
         2.71685328e-02, -1.49890278e-02, -3.54037210e-02,
        -3.62936370e-02, -1.45410486e-02, -5.61492983e-03,
         8.75538811e-02,  4.55120727e-02,  2.50963680e-02,
         1.00187613e-02,  1.24267004e-02, -1.39923558e-01,
         7.68696666e-02,  3.14095393e-02, -4.15247958e-03,
         4.36902344e-02,  1.71249956e-02, -8.00950900e-02,
         5.74006140e-02,  1.89092122e-02,  8.55262056e-02,
         3.96399088e-02, -1.34371832e-01, -1.44367013e-03,
         3.06707830e-03,  1.76854059e-01,  4.44890792e-03,
        -1.69275142e-02,  2.24266183e-02, -4.35049757e-02,
         6.09031972e-03, -9.98171885e-03, -6.23973012e-02,
         1.07372692e-02, -6.04895223e-03, -7.14661255e-0

In [29]:
# Cluster the embeddings with DBSCAN using cosine distance.
dbscan = DBSCAN(metric='cosine', eps=0.2, min_samples=1)
dbscan.fit(embeddings)
clusters = dbscan.labels_  # same labels fit_predict() would return

# Attach each row's cluster id to the DataFrame and preview the result.
df['cluster_label'] = clusters
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0


In [30]:
# Inspect the rows assigned to cluster 5.
df.loc[df["cluster_label"] == 5]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster_label
8,2025-02-12 10:42:29,ThirdPartyAPI,nova.compute.claims [req-a07ac654-8e81-416d-bf...,Resource Usage,bert,5
26,2025-03-03 17:11:11,ModernCRM,nova.compute.claims [req-d6986b54-3735-4a42-90...,Resource Usage,bert,5
40,2025-06-19 21:42:34,ThirdPartyAPI,nova.compute.claims [req-72b4858f-049e-49e1-b3...,Resource Usage,bert,5
58,2025-09-13 14:45:14,AnalyticsEngine,nova.compute.claims [req-5c8f52bd-8e3c-41f0-95...,Resource Usage,bert,5
61,2025-04-27 11:18:18,ThirdPartyAPI,nova.compute.claims [req-d38f479d-9bb9-4276-96...,Resource Usage,bert,5
...,...,...,...,...,...,...
2336,2025-12-10 11:53:33,AnalyticsEngine,nova.compute.claims [req-97fcea79-42f7-4241-9b...,Resource Usage,bert,5
2345,2025-12-22 01:38:48,BillingSystem,nova.compute.claims [req-caeb3818-dab6-4e8d-9e...,Resource Usage,bert,5
2352,2025-02-18 00:16:44,ModernCRM,nova.compute.claims [req-98474cd9-61e1-4afe-bd...,Resource Usage,bert,5
2355,2025-11-28 18:03:55,BillingSystem,nova.compute.claims [req-6f9ecdfe-481c-4535-9b...,Resource Usage,bert,5


Sort the clusters by the number of records in each, and print 5 log messages from every cluster that has more than 10 records.

In [31]:
# value_counts() orders clusters from most to fewest records.
cluster_counts = df['cluster_label'].value_counts()
# Keep only the ids of clusters with more than 10 records.
large_clusters = cluster_counts[cluster_counts > 10].index

# Show up to five example messages for each large cluster.
for cluster in large_clusters:
    print(f"Cluster {cluster}:")
    print(df.loc[df['cluster_label'] == cluster, 'log_message'].head(5).to_string(index=False))

Cluster 0:
nova.osapi_compute.wsgi.server [req-b9718cd8-f6...
nova.osapi_compute.wsgi.server [req-4895c258-b2...
nova.osapi_compute.wsgi.server [req-ee8bc8ba-92...
nova.osapi_compute.wsgi.server [req-f0bffbc3-5a...
nova.osapi_compute.wsgi.server [req-2bf7cfee-a2...
Cluster 5:
nova.compute.claims [req-a07ac654-8e81-416d-bfb...
nova.compute.claims [req-d6986b54-3735-4a42-907...
nova.compute.claims [req-72b4858f-049e-49e1-b31...
nova.compute.claims [req-5c8f52bd-8e3c-41f0-95a...
nova.compute.claims [req-d38f479d-9bb9-4276-968...
Cluster 11:
User User685 logged out.
 User User395 logged in.
 User User225 logged in.
User User494 logged out.
 User User900 logged in.
Cluster 13:
Backup started at 2025-05-14 07:06:55.
Backup started at 2025-02-15 20:00:19.
  Backup ended at 2025-08-08 13:06:23.
Backup started at 2025-11-14 08:27:43.
Backup started at 2025-12-09 10:19:11.
Cluster 7:
Multiple bad login attempts detected on user 85...
Multiple login failures occurred on user 9052 a...
  User 7153

Classification Stage 1 :  Regex

In [32]:
import re

# Ordered (compiled pattern, label) rules; the first rule that matches wins.
# Compiled once with re.IGNORECASE so matching is case-insensitive and the
# table is not rebuilt on every call.
# NOTE(review): the trailing "." in several patterns is an unescaped wildcard
# (it matches any character, not only a period); kept as-is so the set of
# matching messages is unchanged.
_REGEX_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), label)
    for pattern, label in (
        (r"User User\d+ logged (in|out).", "User Action"),
        (r"Backup (started|ended) at .*", "System Notification"),
        (r"Backup completed successfully.", "System Notification"),
        (r"System updated to version .*", "System Notification"),
        (r"File .* uploaded successfully by user .*", "System Notification"),
        (r"Disk cleanup completed successfully.", "System Notification"),
        (r"System reboot initiated by user .*", "System Notification"),
        (r"Account with ID .* created by .*", "User Action"),
    )
)


def classify_with_regex(log_message):
    """Classify a log message using a fixed set of regex rules.

    Args:
        log_message: The log text to classify. Non-string values (for
            example None or NaN from a missing cell) are treated as
            unclassifiable instead of raising TypeError.

    Returns:
        The label of the first rule whose pattern is found anywhere in
        ``log_message`` (case-insensitive), or None when no rule matches.
    """
    # Missing values in a DataFrame column arrive as None/float NaN; searching
    # them would raise TypeError and abort a whole .apply() pass.
    if not isinstance(log_message, str):
        return None

    for compiled_pattern, label in _REGEX_RULES:
        if compiled_pattern.search(log_message):
            return label
    return None

In [33]:
# Sanity check: "IN" is upper-case here, so this exercises the case-insensitive matching.
classify_with_regex("User User225 logged IN.")

'User Action'

In [34]:
# Label every message with the regex rules; unmatched rows get None.
df['regex_label'] = df['log_message'].apply(classify_with_regex)

# Rows the regex stage managed to classify.
df.loc[df['regex_label'].notnull()]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster_label,regex_label
7,10/11/2025 8:44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,regex,4,System Notification
14,1/4/2025 1:43,ThirdPartyAPI,File data_3847.csv uploaded successfully by us...,System Notification,regex,4,System Notification
15,5/1/2025 9:41,ModernCRM,Backup completed successfully.,System Notification,regex,8,System Notification
18,2/22/2025 17:49,ModernCRM,Account with ID 5351 created by User634.,User Action,regex,9,User Action
27,9/24/2025 19:57,ThirdPartyAPI,User User685 logged out.,User Action,regex,11,User Action
...,...,...,...,...,...,...,...
2376,6/27/2025 8:47,ModernCRM,System updated to version 2.0.5.,System Notification,regex,21,System Notification
2381,9/5/2025 6:39,ThirdPartyAPI,Disk cleanup completed successfully.,System Notification,regex,32,System Notification
2394,4/3/2025 13:13,ModernHR,Disk cleanup completed successfully.,System Notification,regex,32,System Notification
2395,5/2/2025 14:29,ThirdPartyAPI,Backup ended at 2025-05-06 11:23:16.,System Notification,regex,13,System Notification


In [35]:
# Rows the regex stage could not classify.
df.loc[df["regex_label"].isna()]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster_label,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0,
...,...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,bert,0,
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,bert,7,
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,bert,0,
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,1,


In [36]:
# Independent copy of the rows without a regex label, for the next stage.
df_non_regex = df.loc[df['regex_label'].isnull()].copy()
df_non_regex

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster_label,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0,
...,...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,bert,0,
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,bert,7,
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,bert,0,
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,1,


Classification Stage 2 : Classification using Embedding

Checking whether there are enough training samples to use BERT-based classification; if not, we'll use LLM classification


In [37]:
# Print the target labels that have 5 or fewer rows.
label_counts = df_non_regex['target_label'].value_counts()
rare_labels = label_counts[label_counts <= 5].index.tolist()
print(rare_labels)




New DataFrame for the non-LegacyCRM data

In [38]:
# Drop the LegacyCRM rows, then confirm which sources remain.
df_non_legacy = df_non_regex.loc[df_non_regex['source'] != 'LegacyCRM']
df_non_legacy['source'].unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI'], dtype=object)

In [39]:
# Embed only the messages that remain after the regex and LegacyCRM filters.
filtered_embeddings = model.encode(df_non_legacy.loc[:, 'log_message'].tolist())
filtered_embeddings[0:2]

array([[-1.02939598e-01,  3.35459486e-02, -2.20260844e-02,
         1.55103172e-03, -9.86921880e-03, -1.78956211e-01,
        -6.34409934e-02, -6.01761453e-02,  2.81108953e-02,
         5.99620081e-02, -1.72618236e-02,  1.43364200e-03,
        -1.49560079e-01,  3.15288268e-03, -5.66030741e-02,
         2.71685328e-02, -1.49890278e-02, -3.54037210e-02,
        -3.62936370e-02, -1.45410486e-02, -5.61492983e-03,
         8.75538811e-02,  4.55120727e-02,  2.50963680e-02,
         1.00187613e-02,  1.24267004e-02, -1.39923558e-01,
         7.68696666e-02,  3.14095393e-02, -4.15247958e-03,
         4.36902344e-02,  1.71249956e-02, -8.00950900e-02,
         5.74006140e-02,  1.89092122e-02,  8.55262056e-02,
         3.96399088e-02, -1.34371832e-01, -1.44367013e-03,
         3.06707830e-03,  1.76854059e-01,  4.44890792e-03,
        -1.69275142e-02,  2.24266183e-02, -4.35049757e-02,
         6.09031972e-03, -9.98171885e-03, -6.23973012e-02,
         1.07372692e-02, -6.04895223e-03, -7.14661255e-0

Training a logistic regression model using filtered_embeddings as X and target_label as y

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Features are the sentence embeddings; targets are the labels of the same rows.
X = filtered_embeddings
y = df_non_legacy["target_label"]

# Hold out 30% of the rows for evaluation, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=4
)

# fit() returns the estimator itself, so clf is the trained model.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out rows and print per-class precision/recall/F1.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

                precision    recall  f1-score   support

Critical Error       0.97      1.00      0.98        57
         Error       0.98      0.96      0.97        54
   HTTP Status       1.00      1.00      1.00       305
Resource Usage       1.00      1.00      1.00        55
Security Alert       1.00      0.99      0.99       100

      accuracy                           0.99       571
     macro avg       0.99      0.99      0.99       571
  weighted avg       0.99      0.99      0.99       571



Exporting the model

In [None]:
import os

import joblib

# Destination for the serialized classifier, relative to the working directory.
model_path = '../model/log_classifier.joblib'

# joblib.dump does not create missing directories; make sure the target
# folder exists so the export does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Persist the trained classifier to disk.
joblib.dump(clf, model_path)