In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE

# Load the raw log dataset from CSV into a DataFrame.
# NOTE(review): absolute "/content/..." path, presumably a Colab runtime;
# confirm the location before running elsewhere.
df = pd.read_csv("/content/logs.csv")

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# List the distinct values found in the `source` column.
df.source.unique()

In [None]:
# List the distinct values found in the `target_label` column.
df.target_label.unique()

In [None]:
# Sentence-embedding model: turn every log message into a dense vector.
model = SentenceTransformer('all-MiniLM-L6-v2')
log_messages = df['log_message'].tolist()
embeddings = model.encode(log_messages)

In [None]:
# Peek at the first five embedding vectors.
embeddings[:5]

In [None]:
# Cluster the message embeddings with DBSCAN using cosine distance.
dbs = DBSCAN(
    eps=0.2,
    min_samples=1,
    metric='cosine',
)
clusters = dbs.fit_predict(embeddings)

# Store each row's cluster id next to its message.
df['cluster'] = clusters

In [None]:
# Preview the frame, which now carries the `cluster` column.
# Only a cell's final expression is rendered, so the two expressions that
# used to precede this line were evaluated and thrown away; they are removed.
df.head()

In [None]:
#Reduce dimensions to 2D for visualization

tsne = TSNE(n_components=2, random_state=42)
points_2d = tsne.fit_transform(embeddings)

# The label -1 is treated as noise, not as a cluster. Exclude it from the
# count only when it is actually present: blindly subtracting 1 under-reports
# the number of clusters whenever no point was labelled as noise.
cluster_ids = set(clusters)
n_clusters = len(cluster_ids - {-1})

plt.figure(figsize=(10, 6))

for cluster_num in cluster_ids:
    # Get points for this cluster
    mask = (clusters == cluster_num)

    if cluster_num == -1:  #Noise
        plt.scatter(points_2d[mask, 0], points_2d[mask, 1],
                    c='gray', label='Noise', alpha=0.5)
    else:  #Regular clusters
        plt.scatter(points_2d[mask, 0], points_2d[mask, 1],
                    label=f'Cluster {cluster_num}')

plt.title(f"DBSCAN Clustering (Found {n_clusters} clusters)")
plt.xlabel("Dimension 1")
plt.ylabel("Dimension 2")
plt.show()

In [None]:
# Inspect a sample of the rows assigned to cluster 1.
# Only a cell's final expression is rendered, so the filter has to be the
# last expression for its result to be shown.
df[df.cluster == 1].head()

In [None]:
# Count members per cluster and keep the ids of clusters with more than 10 rows.
cluster_counts = df['cluster'].value_counts()
large_clusters = cluster_counts.loc[cluster_counts > 10].index

# Print up to five example messages for each of those clusters.
for cluster in large_clusters:
    print(f"Cluster{cluster}:")
    sample_messages = df.loc[df['cluster'] == cluster, 'log_message'].head(5)
    print(sample_messages.to_string(index=False))

In [None]:
import re
# (pattern, label) pairs, compiled once instead of on every call.
# Order matters: the first pattern that matches decides the label.
# NOTE(review): the trailing "." in several patterns is an unescaped regex
# wildcard, so it matches any single character, not only a literal period.
_REGEX_PATTERNS = [
    (re.compile(r"User User\d+ logged (in|out).", re.IGNORECASE), "User Action"),
    (re.compile(r"Backup (started|ended) at .*", re.IGNORECASE), "System Notification"),
    (re.compile(r"Backup completed successfully.", re.IGNORECASE), "System Notification"),
    (re.compile(r"System updated to version .*", re.IGNORECASE), "System Notification"),
    (re.compile(r"File .* uploaded successfully by user .*", re.IGNORECASE), "System Notification"),
    (re.compile(r"Disk cleanup completed successfully.", re.IGNORECASE), "System Notification"),
    (re.compile(r"System reboot initiated by user .*", re.IGNORECASE), "System Notification"),
    (re.compile(r"Account with ID .* created by .*", re.IGNORECASE), "User Action"),
]


def classify_with_regex(log_message):
    """Label a log message using a fixed list of regular expressions.

    Patterns are tried in order with a case-insensitive search, so a match
    anywhere in the message counts.

    Args:
        log_message: The log text to classify. Values that are not strings
            (for example ``None`` or a NaN from a missing cell) are treated
            as unmatched instead of raising ``TypeError``.

    Returns:
        The label of the first matching pattern, or ``None`` when nothing
        matches or the input is not a string.
    """
    if not isinstance(log_message, str):
        return None
    for pattern, label in _REGEX_PATTERNS:
        if pattern.search(log_message):
            return label
    return None

In [None]:
# Smoke-test the classifier on one sample message whose "OUT" is upper-case,
# then print the label it returns.
result = classify_with_regex("User User494 logged OUT.")
print(result)

In [None]:
# Report the frame's (rows, columns) dimensions.
df.shape

In [None]:
# Run the regex classifier over every message and keep its label in a new column.
df['regex_label'] = df['log_message'].apply(classify_with_regex)

# Show the rows that no pattern matched (their label is missing).
unmatched_mask = df['regex_label'].isna()
df.loc[unmatched_mask]

In [None]:
# Show the rows whose `regex_label` is not null.
df[df.regex_label.notnull()]

In [None]:
# Take an independent copy of the rows whose regex label is missing.
no_label_mask = df['regex_label'].isna()
df_non_regex = df.loc[no_label_mask].copy()
df_non_regex

In [None]:
# Compute the label distribution once and reuse it, instead of calling
# value_counts() three times on the same column.
label_counts = df_non_regex['target_label'].value_counts()
print(label_counts)

# Labels that occur at most 5 times among the regex-unmatched rows.
rare_categories = label_counts[label_counts <= 5].index.tolist()
print(rare_categories)

In [None]:
# Keep only the rows whose source is not 'LegacyCRM'.
df_non_legacy = df_non_regex.loc[df_non_regex['source'] != 'LegacyCRM']


In [None]:
# List the distinct `source` values left after the filter.
df_non_legacy.source.unique()

In [None]:
# Encode the remaining log messages with the sentence-embedding model.
remaining_messages = df_non_legacy['log_message'].tolist()
filtered_embeddings = model.encode(remaining_messages)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# Features are the message embeddings; targets are the `target_label` values.
X = filtered_embeddings
y = df_non_legacy['target_label']

# Hold out 30% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a logistic-regression classifier and print its held-out metrics.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Persist the trained classifier to disk with joblib.
# The import previously read "joblibx`", which is a syntax error.
import joblib
joblib.dump(clf,'/content/sample_data/log_classification.joblib')
