In [2]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load dataset
df = pd.read_csv("data/labeled_logs.csv", on_bad_lines='skip', engine='python')# Show sample
print(df.head())

# Split features and target
X = df['log_message']
y = df['is_anomaly']  # 0 = normal, 1 = anomaly

# Vectorize text
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000)
X_vec = vectorizer.fit_transform(X)
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_vec, y)

# Now re-split and train on balanced data
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)


             timestamp                           log_message  is_anomaly  \
0  2025-04-06 08:00:00                    Starting batch job           0   
1  2025-04-06 08:01:15  Connecting to PostgreSQL database...           0   
2  2025-04-06 08:02:30     Fetched 1000 records successfully           0   
3  2025-04-06 08:03:45             Transforming cost data...           0   
4  2025-04-06 08:04:30             WARN: Memory usage at 75%           1   

        anomaly_type  
0             normal  
1             normal  
2             normal  
3             normal  
4  high_memory_usage  


In [2]:
# Train logistic regression model
model = LogisticRegression(class_weight='balanced')
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate performance
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        83
           1       1.00      1.00      1.00        60

    accuracy                           1.00       143
   macro avg       1.00      1.00      1.00       143
weighted avg       1.00      1.00      1.00       143



In [3]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        83
           1       1.00      1.00      1.00        60

    accuracy                           1.00       143
   macro avg       1.00      1.00      1.00       143
weighted avg       1.00      1.00      1.00       143

[[83  0]
 [ 0 60]]


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# Load dataset
df = pd.read_csv("data/labeled_logs.csv", on_bad_lines='skip', engine='python')

# Prepare features
X = df['log_message']
y = df['is_anomaly']

# Vectorize text
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000)
X_vec = vectorizer.fit_transform(X)

# Resample
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_vec, y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Predict & evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        83
           1       1.00      1.00      1.00        60

    accuracy                           1.00       143
   macro avg       1.00      1.00      1.00       143
weighted avg       1.00      1.00      1.00       143



In [5]:
import joblib

# Save model and vectorizer
joblib.dump(model, 'models/anomaly_model.joblib')
joblib.dump(vectorizer, 'models/vectorizer.joblib')

['models/vectorizer.joblib']

In [None]:
import joblib

# Load model and vectorizer
model = joblib.load('models/anomaly_model.joblib')
vectorizer = joblib.load('models/vectorizer.joblib')

def predict_anomaly(log_line):
    vec = vectorizer.transform([log_line])
    prediction = model.predict(vec)[0]
    label = "⚠️ Anomaly" if prediction == 1 else "✅ Normal"
    print(f"[Prediction] {label}: {log_line}")
    return prediction

# Example usage
while True:
    user_log = input("Enter a log line (or type 'exit'): ")
    if user_log.lower() == 'exit':
        break
    predict_anomaly(user_log)

[Prediction] ⚠️ Anomaly: [ERROR] Failed to connect to database: timeout after 5000ms
[Prediction] ✅ Normal: [INFO] Agent initialized successfully on node: uw2p-akp-b3


In [10]:
df.to_csv("data/labeled_logs.csv", index=False, quoting=1, quotechar='"', escapechar="\\")