In [3]:
import pandas as pd

df = pd.read_csv("synthetic_logs.csv")
df.head()


Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert


In [4]:
df.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [5]:
df.target_label.unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [6]:
df[df.target_label=='System Notification'].sample(10)

Unnamed: 0,timestamp,source,log_message,target_label,complexity
2003,6/23/2025 17:54,ModernCRM,System reboot initiated by user User517.,System Notification,regex
1033,7/26/2025 13:27,ModernHR,Backup started at 2025-02-12 06:47:30.,System Notification,regex
2014,12/25/2025 4:33,AnalyticsEngine,System reboot initiated by user User293.,System Notification,regex
1194,7/20/2025 9:10,AnalyticsEngine,System reboot initiated by user User747.,System Notification,regex
2242,10/14/2025 19:09,AnalyticsEngine,Backup completed successfully.,System Notification,regex
2289,1/23/2025 9:59,ThirdPartyAPI,System updated to version 2.3.5.,System Notification,regex
1688,8/4/2025 12:08,ModernHR,Disk cleanup completed successfully.,System Notification,regex
209,2/14/2025 13:28,ModernCRM,Backup started at 2025-01-15 04:36:39.,System Notification,regex
1500,1/5/2025 16:24,ThirdPartyAPI,System updated to version 2.8.8.,System Notification,regex
1577,1/25/2025 1:32,BillingSystem,System updated to version 2.8.3.,System Notification,regex


In [7]:
df[df.log_message.str.startswith("System reboot initiated by user")]

Unnamed: 0,timestamp,source,log_message,target_label,complexity
36,11/19/2025 13:14,BillingSystem,System reboot initiated by user User243.,System Notification,regex
92,12/4/2025 21:20,BillingSystem,System reboot initiated by user User471.,System Notification,regex
139,5/8/2025 16:34,ModernHR,System reboot initiated by user User216.,System Notification,regex
140,9/11/2025 8:49,AnalyticsEngine,System reboot initiated by user User639.,System Notification,regex
161,3/31/2025 19:40,BillingSystem,System reboot initiated by user User819.,System Notification,regex
163,6/6/2025 15:29,BillingSystem,System reboot initiated by user User938.,System Notification,regex
307,4/12/2025 0:41,BillingSystem,System reboot initiated by user User929.,System Notification,regex
365,10/20/2025 22:32,ModernHR,System reboot initiated by user User533.,System Notification,regex
508,4/15/2025 2:04,ThirdPartyAPI,System reboot initiated by user User591.,System Notification,regex
552,9/22/2025 20:54,ModernHR,System reboot initiated by user User421.,System Notification,regex


In [8]:
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer




In [9]:
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight embedding model
embeddings = model.encode(df['log_message'].tolist())



In [10]:
embeddings[:5]

array([[-0.10293962,  0.03354594, -0.02202607, ...,  0.00457793,
        -0.04259717,  0.00322621],
       [ 0.00804572, -0.03573923,  0.04938739, ...,  0.01538319,
        -0.06230947, -0.02774666],
       [-0.00908224,  0.13003924, -0.05275568, ...,  0.02014104,
        -0.05117098, -0.02930294],
       [-0.09751046,  0.04911299, -0.03977424, ...,  0.02477502,
        -0.03546079, -0.00018598],
       [-0.10468338,  0.05926038, -0.02488499, ...,  0.02502055,
        -0.037193  , -0.0256891 ]], dtype=float32)

In [11]:
clustering = DBSCAN(eps=0.2, min_samples=1, metric='cosine').fit(embeddings)
df['cluster'] = clustering.labels_

In [12]:
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0


In [13]:
# Group by cluster to inspect patterns
clusters = df.groupby('cluster')['log_message'].apply(list)
sorted_clusters = clusters.sort_values(key=lambda x: x.map(len), ascending=False

SyntaxError: incomplete input (655400040.py, line 3)

In [None]:
print("Clustered Patterns:")
for cluster_id, messages in sorted_clusters.items():
    if len(messages) > 10:
        print(f"Cluster {cluster_id}:")
        for msg in messages[:5]:
            print(f"  {msg}")

#Classification Stage 1: Regex

In [None]:
import re
def classify_with_regex(log_message):
    regex_patterns = {
        r"User User\d+ logged (in|out).": "User Action",
        r"Backup (started|ended) at .*": "System Notification",
        r"Backup completed successfully.": "System Notification",
        r"System updated to version .*": "System Notification",
        r"File .* uploaded successfully by user .*": "System Notification",
        r"Disk cleanup completed successfully.": "System Notification",
        r"System reboot initiated by user .*": "System Notification",
        r"Account with ID .* created by .*": "User Action"
    }
    for pattern, label in regex_patterns.items():
        if re.search(pattern, log_message):
            return label
    return None

In [None]:
classify_with_regex("User User123 logged in.")

In [None]:
classify_with_regex("System reboot initiated by user User179.")

In [None]:
classify_with_regex("Hey you, chill bro")

In [None]:
# Apply regex classification
df['regex_label'] = df['log_message'].apply(lambda x: classify_with_regex(x))
df[df['regex_label'].notnull()]

In [None]:
df[df['regex_label'].isnull()].head(5)

#Classification Stage 2: Classification Using Embeddings

In [None]:
df_non_regex = df[df['regex_label'].isnull()].copy()
df_non_regex.shape

In [None]:
df_legacy = df_non_regex[df_non_regex.source=="LegacyCRM"]
df_legacy

In [None]:
df_non_legacy = df_non_regex[df_non_regex.source!="LegacyCRM"]
df_non_legacy

In [None]:
df_non_legacy.shape

In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight embedding model
embeddings_filtered = model.encode(df_non_legacy['log_message'].tolist())

In [None]:
len(embeddings_filtered)

In [None]:
X = embeddings_filtered
y = df_non_legacy['target_label'].values

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
import joblib
joblib.dump(clf, 'log_classifier.joblib')