In [1]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv("realtime_extracted_features.csv")
df.sample(10)

Unnamed: 0,qid,is_pickup,is_cleanup,is_qmgr,is_bounce,is_postsuper,queue_active,queue_unknown,queue_deferred,queue_bounced,...,contains_delivered,contains_held,numeric_pattern_count,user_is_unknown,user_frequency,process_postfix/bounce,process_postfix/cleanup,process_postfix/pickup,process_postfix/postsuper,process_postfix/qmgr
1315,277172650B3,1,0,0,0,0,0,1,0,0,...,0,0,16,1,7704,False,False,True,False,False
3585,C4CB52650F1,0,0,1,0,0,0,1,0,0,...,0,0,15,1,7704,False,False,False,False,True
4267,4C8FC265104,0,0,1,0,0,0,1,0,0,...,0,0,15,1,7704,False,False,False,False,True
6884,29E9B26514B,1,0,0,0,0,0,1,0,0,...,0,0,16,1,7704,False,False,True,False,False
4023,1FC8D2650FD,0,1,0,0,0,0,1,0,0,...,0,0,16,1,7704,False,True,False,False,False
5228,04E0326511D,0,1,0,0,0,0,1,0,0,...,0,0,16,1,7704,False,True,False,False,False
6592,EB4B4265141,0,1,0,0,0,0,1,0,0,...,0,0,16,1,7704,False,True,False,False,False
6856,23E9526514A,0,1,0,0,0,0,1,0,0,...,0,0,16,1,7704,False,True,False,False,False
546,A2B4D265098,0,0,1,0,0,1,0,0,0,...,0,0,17,0,81,False,False,False,False,True
3533,B354A2650EF,0,0,1,0,0,0,1,0,0,...,0,0,15,1,7704,False,False,False,False,True


In [3]:
df.isnull().sum()

qid                          0
is_pickup                    0
is_cleanup                   0
is_qmgr                      0
is_bounce                    0
is_postsuper                 0
queue_active                 0
queue_unknown                0
queue_deferred               0
queue_bounced                0
size                         0
nrcpt                        0
has_from_email               0
has_to_email                 0
has_message_id               0
has_removed                  0
has_error                    0
log_hour                     0
not_working_hour             0
is_weekend                   0
message_length               0
unique_words                 0
contains_connection          0
contains_status              0
contains_relay               0
contains_delivered           0
contains_held                0
numeric_pattern_count        0
user_is_unknown              0
user_frequency               0
process_postfix/bounce       0
process_postfix/cleanup      0
process_

In [26]:
df.dtypes

qid                          object
is_pickup                     int64
is_cleanup                    int64
is_qmgr                       int64
is_bounce                     int64
is_postsuper                  int64
queue_active                  int64
queue_unknown                 int64
queue_deferred                int64
queue_bounced                 int64
size                          int64
nrcpt                         int64
has_from_email                int64
has_to_email                  int64
has_message_id                int64
has_removed                   int64
has_error                     int64
log_hour                      int64
not_working_hour              int64
is_weekend                    int64
message_length                int64
unique_words                  int64
contains_connection           int64
contains_status               int64
contains_relay                int64
contains_delivered            int64
contains_held                 int64
numeric_pattern_count       

In [4]:
df.shape

(10197, 36)

In [5]:
scaler = StandardScaler()
X = df.drop(['qid'],axis=1)
X_scaled = scaler.fit_transform(X)

In [6]:
iso_model = IsolationForest(
    n_estimators=50,
    contamination=0.1,  # or even 0.1 for more aggressive detection
    max_samples=0.5,
    random_state=42
)

iso_model.fit(X_scaled)

In [7]:
# === Predict anomalies ===
# -1 = anomaly, 1 = normal
features_df = df
features_df['anomaly'] = iso_model.predict(X_scaled)

# === See results ===
print(features_df[['qid', 'anomaly']].sample(20))

              qid  anomaly
5796  5956F26512D        1
8280  2993F265168        1
6652  02FAF265143        1
7843  D2EED265163       -1
484   9643026509E        1
7125  5E5D5265150        1
2293  D91F92650CD        1
3716  D73332650F4        1
8636  645A9265178        1
4145  346A7265100        1
3777  E9A042650F5        1
4504  7325B265109        1
4094  2C4C32650FF        1
5443  2A5DF265123        1
5561  3BD94265125        1
9574  1752F265192        1
8205  13B0A26516D        1
7738  C9AB4265161        1
4652  8C24E26510D       -1
728   C41362650A4        1


In [38]:
df.columns

Index(['qid', 'is_pickup', 'is_cleanup', 'is_qmgr', 'is_bounce',
       'is_postsuper', 'queue_active', 'queue_unknown', 'queue_deferred',
       'queue_bounced', 'size', 'nrcpt', 'has_from_email', 'has_to_email',
       'has_message_id', 'has_removed', 'has_error', 'log_hour',
       'not_working_hour', 'is_weekend', 'message_length', 'unique_words',
       'contains_relay', 'contains_delivered', 'contains_held',
       'numeric_pattern_count', 'user_is_unknown', 'user_frequency',
       'process_postfix/bounce', 'process_postfix/cleanup',
       'process_postfix/pickup', 'process_postfix/postsuper',
       'process_postfix/qmgr', 'anomaly'],
      dtype='object')

In [8]:
import joblib
joblib.dump(iso_model, 'anomaly_model.pkl')
joblib.dump(scaler, 'anomaly_scaler.pkl')

['anomaly_scaler.pkl']

In [9]:
import numpy as np

# Example: get normal point
normal_point = X_scaled[0].copy()

# Make it anomalous: artificially inflate certain features
anomalous_point = normal_point.copy()
anomalous_point[ X.columns.get_loc('size') ] = normal_point[ X.columns.get_loc('size') ] + 10  # huge size
anomalous_point[ X.columns.get_loc('numeric_pattern_count') ] = 100  # crazy number

# Predict
pred = iso_model.predict([anomalous_point])
print(f"Prediction: {pred}")  # Should print [-1] for anomaly


Prediction: [-1]


In [11]:
# Analyze model performance
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Check anomaly distribution
anomaly_counts = features_df['anomaly'].value_counts()
print("Anomaly distribution:")
print(f"Normal (1): {anomaly_counts.get(1, 0)} ({anomaly_counts.get(1, 0)/len(features_df)*100:.1f}%)")
print(f"Anomaly (-1): {anomaly_counts.get(-1, 0)} ({anomaly_counts.get(-1, 0)/len(features_df)*100:.1f}%)")

# Check decision scores to understand threshold
scores = iso_model.decision_function(X_scaled)
print(f"\nDecision scores: min={scores.min():.3f}, max={scores.max():.3f}, mean={scores.mean():.3f}")
print(f"Threshold for anomaly detection: ~{np.percentile(scores, 10):.3f}")

# Look at some normal examples
print("\n=== NORMAL EXAMPLES ===")
normal_indices = features_df[features_df['anomaly'] == 1].index[:5]
for idx in normal_indices:
    print(f"QID: {features_df.loc[idx, 'qid']}, Score: {scores[idx]:.3f}")

print("\n=== ANOMALY EXAMPLES ===")
anomaly_indices = features_df[features_df['anomaly'] == -1].index[:5]
for idx in anomaly_indices:
    print(f"QID: {features_df.loc[idx, 'qid']}, Score: {scores[idx]:.3f}")

# Feature importance analysis
# Create labels for feature importance analysis (anomaly = 1, normal = 0)
y_labels = (features_df['anomaly'] == -1).astype(int)

# Train a Random Forest for feature importance
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_scaled, y_labels)

# Get feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 15 Most Important Features:")
print(feature_importance.head(15))

# Also check which features have the highest variance
feature_variance = pd.DataFrame({
    'feature': X.columns,
    'variance': X.var()
}).sort_values('variance', ascending=False)

print("\nTop 15 Features by Variance:")
print(feature_variance.head(15))

Anomaly distribution:
Normal (1): 9189 (90.1%)
Anomaly (-1): 1008 (9.9%)

Decision scores: min=-0.180, max=0.142, mean=0.084
Threshold for anomaly detection: ~0.000

=== NORMAL EXAMPLES ===
QID: 3E9D426508E, Score: 0.089
QID: 3E9D426508E, Score: 0.075
QID: 3E9D426508E, Score: 0.041
QID: 3E9D426508E, Score: 0.108
QID: 3E9D426508E, Score: 0.089

=== ANOMALY EXAMPLES ===
QID: 59059265092, Score: -0.104
QID: 59059265092, Score: -0.108
QID: 59059265092, Score: -0.109
QID: 59059265092, Score: -0.104
QID: 59059265092, Score: -0.104
Top 15 Most Important Features:
                      feature  importance
19             message_length    0.291558
27      numeric_pattern_count    0.249337
16                   log_hour    0.181650
9                        size    0.110461
29             user_frequency    0.055303
30     process_postfix/bounce    0.016553
4                is_postsuper    0.015900
33  process_postfix/postsuper    0.013938
3                   is_bounce    0.012886
28            use

In [12]:
# Select top 10 most important features (including is_weekend and not_working_hour)
top_features = feature_importance.head(10)['feature'].tolist()

# Ensure is_weekend and not_working_hour are included
required_features = ['is_weekend', 'not_working_hour']
for req_feat in required_features:
    if req_feat not in top_features:
        # Replace the least important feature
        top_features[-1] = req_feat

print("Selected Top 10 Features:")
for i, feat in enumerate(top_features, 1):
    importance = feature_importance[feature_importance['feature'] == feat]['importance'].iloc[0]
    print(f"{i:2d}. {feat:<25} (importance: {importance:.4f})")

# Create reduced feature set
X_reduced = X[top_features]
print(f"\nOriginal features: {X.shape[1]}")
print(f"Reduced features: {X_reduced.shape[1]}")
print(f"Feature reduction: {((X.shape[1] - X_reduced.shape[1]) / X.shape[1] * 100):.1f}%")

Selected Top 10 Features:
 1. message_length            (importance: 0.2916)
 2. numeric_pattern_count     (importance: 0.2493)
 3. log_hour                  (importance: 0.1817)
 4. size                      (importance: 0.1105)
 5. user_frequency            (importance: 0.0553)
 6. process_postfix/bounce    (importance: 0.0166)
 7. is_postsuper              (importance: 0.0159)
 8. process_postfix/postsuper (importance: 0.0139)
 9. is_bounce                 (importance: 0.0129)
10. not_working_hour          (importance: 0.0046)

Original features: 35
Reduced features: 10
Feature reduction: 71.4%


In [13]:
# Force include is_weekend by replacing the least important feature
top_features_final = [
    'message_length',
    'numeric_pattern_count', 
    'log_hour',
    'size',
    'user_frequency',
    'not_working_hour',
    'is_weekend',  # Force include
    'process_postfix/bounce',
    'is_postsuper',
    'is_bounce'
]

print("Final Selected Features:")
for i, feat in enumerate(top_features_final, 1):
    print(f"{i:2d}. {feat}")

# Create reduced dataset
X_reduced = X[top_features_final]

# Scale the reduced features
scaler_reduced = StandardScaler()
X_reduced_scaled = scaler_reduced.fit_transform(X_reduced)

# Train new model with reduced features
iso_model_reduced = IsolationForest(
    n_estimators=50,
    contamination=0.1,
    max_samples=0.5,
    random_state=42
)

iso_model_reduced.fit(X_reduced_scaled)

# Test predictions
pred_reduced = iso_model_reduced.predict(X_reduced_scaled)
anomaly_count_reduced = sum(pred_reduced == -1)

print(f"\nReduced model results:")
print(f"Total samples: {len(pred_reduced)}")
print(f"Anomalies detected: {anomaly_count_reduced}")
print(f"Anomaly rate: {anomaly_count_reduced/len(pred_reduced)*100:.1f}%")

Final Selected Features:
 1. message_length
 2. numeric_pattern_count
 3. log_hour
 4. size
 5. user_frequency
 6. not_working_hour
 7. is_weekend
 8. process_postfix/bounce
 9. is_postsuper
10. is_bounce

Reduced model results:
Total samples: 10197
Anomalies detected: 1017
Anomaly rate: 10.0%


In [14]:
# Save the reduced model and scaler
import joblib
joblib.dump(iso_model_reduced, 'anomaly_model_reduced.pkl')
joblib.dump(scaler_reduced, 'anomaly_scaler_reduced.pkl')

# Also save the feature list
import json
with open('selected_features.json', 'w') as f:
    json.dump(top_features_final, f)

print("Saved:")
print("- anomaly_model_reduced.pkl")
print("- anomaly_scaler_reduced.pkl") 
print("- selected_features.json")

Saved:
- anomaly_model_reduced.pkl
- anomaly_scaler_reduced.pkl
- selected_features.json


In [None]:

# Analyze the nature of logs to determine optimal sensitivity
import re
from datetime import datetime

print("=== LOG DATA ANALYSIS FOR SENSITIVITY TUNING ===")

# Load original log data for analysis
with open('filtered_logs.json', 'r') as f:
    logs = json.load(f)

log_df = pd.DataFrame(logs)

# Analyze log patterns
print(f"Total logs: {len(log_df)}")
print(f"Date range: {log_df['timestamp'].min()} to {log_df['timestamp'].max()}")

# Process distribution
process_dist = log_df['process'].value_counts()
print(f"\nProcess distribution:")
for proc, count in process_dist.head(10).items():
    print(f"  {proc}: {count} ({count/len(log_df)*100:.1f}%)")

# User distribution
user_dist = log_df['user'].value_counts()
print(f"\nUser distribution (top 10):")
for user, count in user_dist.head(10).items():
    print(f"  {user}: {count} ({count/len(log_df)*100:.1f}%)")

# Queue status distribution
queue_dist = log_df['queue_status'].value_counts()
print(f"\nQueue status distribution:")
for status, count in queue_dist.items():
    print(f"  {status}: {count} ({count/len(log_df)*100:.1f}%)")

# Analyze message patterns for real anomalies
print(f"\n=== POTENTIAL REAL ANOMALY INDICATORS ===")

# Look for error patterns
error_keywords = ['error', 'fail', 'timeout', 'reject', 'bounce', 'deferred']
error_logs = []
for log in logs:
    message = str(log.get('message', '')).lower()
    if any(keyword in message for keyword in error_keywords):
        error_logs.append(log)

print(f"Logs with error keywords: {len(error_logs)} ({len(error_logs)/len(logs)*100:.2f}%)")

# Analyze size distribution for outliers
sizes = []
for log in logs:
    message = str(log.get('message', ''))
    size_match = re.search(r'size=(\d+)', message)
    if size_match:
        sizes.append(int(size_match.group(1)))

if sizes:
    sizes_df = pd.Series(sizes)
    print(f"\nMessage size analysis:")
    print(f"  Mean size: {sizes_df.mean():.0f}")
    print(f"  Median size: {sizes_df.median():.0f}")
    print(f"  95th percentile: {sizes_df.quantile(0.95):.0f}")
    print(f"  99th percentile: {sizes_df.quantile(0.99):.0f}")
    print(f"  Max size: {sizes_df.max()}")
    
    # Large message threshold (99th percentile could be anomalous)
    large_msg_threshold = sizes_df.quantile(0.99)
    large_msgs = sum(s > large_msg_threshold for s in sizes)
    print(f"  Messages > 99th percentile: {large_msgs} ({large_msgs/len(sizes)*100:.2f}%)")

# Time analysis
hours = []
for log in logs:
    timestamp = log.get('timestamp', '')
    try:
        # Extract hour from timestamp like "Jun 14 05:30:01"
        time_match = re.search(r'(\d{2}):(\d{2}):(\d{2})', timestamp)
        if time_match:
            hour = int(time_match.group(1))
            hours.append(hour)
    except:
        pass

if hours:
    hours_df = pd.Series(hours)
    print(f"\nTime distribution analysis:")
    print(f"  Most common hours: {hours_df.value_counts().head(3).to_dict()}")
    
    # Unusual hours (late night/early morning)
    unusual_hours = sum(1 for h in hours if h < 6 or h > 22)
    print(f"  Logs in unusual hours (< 6 AM or > 10 PM): {unusual_hours} ({unusual_hours/len(hours)*100:.2f}%)")

# Based on analysis, recommend contamination rate
estimated_real_anomalies = len(error_logs)/len(logs) * 100
print(f"\n=== SENSITIVITY RECOMMENDATION ===")
print(f"Estimated real anomaly rate based on error patterns: {estimated_real_anomalies:.2f}%")

if estimated_real_anomalies < 1:
    recommended_contamination = 0.01  # 1%
    print("Recommendation: Use contamination=0.01 (1%) - Low anomaly environment")
elif estimated_real_anomalies < 3:
    recommended_contamination = 0.03  # 3%  
    print("Recommendation: Use contamination=0.03 (3%) - Moderate anomaly environment")
else:
    recommended_contamination = 0.05  # 5%
    print("Recommendation: Use contamination=0.05 (5%) - Higher anomaly environment")

print(f"Current model uses: 10% (very aggressive)")
print(f"Recommended: {recommended_contamination*100}% (more realistic)")

=== LOG DATA ANALYSIS FOR SENSITIVITY TUNING ===
Total logs: 10197
Date range: Jun 14 05:30:01 to Jun 14 10:01:01

Process distribution:
  postfix/qmgr: 5076 (49.8%)
  postfix/cleanup: 2538 (24.9%)
  postfix/pickup: 2493 (24.4%)
  postfix/postsuper: 45 (0.4%)
  postfix/bounce: 45 (0.4%)

User distribution (top 10):
  unknown: 7704 (75.6%)
  user15@tlsoc.cse.iitb.ac.in: 81 (0.8%)
  user8@tlsoc.cse.iitb.ac.in: 72 (0.7%)
  user36@tlsoc.cse.iitb.ac.in: 63 (0.6%)
  user11@tlsoc.cse.iitb.ac.in: 54 (0.5%)
  user94@tlsoc.cse.iitb.ac.in: 54 (0.5%)
  user46@tlsoc.cse.iitb.ac.in: 54 (0.5%)
  user44@tlsoc.cse.iitb.ac.in: 54 (0.5%)
  user42@tlsoc.cse.iitb.ac.in: 54 (0.5%)
  user75@tlsoc.cse.iitb.ac.in: 54 (0.5%)

Queue status distribution:
  unknown: 7659 (75.1%)
  queue active: 2538 (24.9%)

=== POTENTIAL REAL ANOMALY INDICATORS ===
Logs with error keywords: 45 (0.44%)

Message size analysis:
  Mean size: 401
  Median size: 351
  95th percentile: 352
  99th percentile: 2904
  Max size: 2904
  Mess

In [17]:
# Train optimized model with recommended sensitivity
print("\n=== TRAINING OPTIMIZED MODEL ===")

# Use 1% contamination based on analysis (error rate was 0.44%)
optimal_contamination = 0.01

# Train new optimized model
iso_model_optimized = IsolationForest(
    n_estimators=100,  # Increase for better stability
    contamination=optimal_contamination,
    max_samples=0.8,   # Use more samples for training
    random_state=42
)

iso_model_optimized.fit(X_reduced_scaled)

# Test predictions
pred_optimized = iso_model_optimized.predict(X_reduced_scaled)
anomaly_count_optimized = sum(pred_optimized == -1)

print(f"Optimized model results:")
print(f"Total samples: {len(pred_optimized)}")
print(f"Anomalies detected: {anomaly_count_optimized}")
print(f"Anomaly rate: {anomaly_count_optimized/len(pred_optimized)*100:.2f}%")

# Compare with original model
print(f"\nComparison:")
print(f"Original model (10% contamination): {anomaly_count_reduced} anomalies ({anomaly_count_reduced/len(pred_reduced)*100:.1f}%)")
print(f"Optimized model (1% contamination): {anomaly_count_optimized} anomalies ({anomaly_count_optimized/len(pred_optimized)*100:.2f}%)")

# Check if detected anomalies align with actual error logs
detected_anomaly_indices = np.where(pred_optimized == -1)[0]
print(f"\nSample of detected anomalies (QIDs):")
for i, idx in enumerate(detected_anomaly_indices[:10]):
    qid = df.iloc[idx]['qid']
    print(f"  {i+1}. {qid}")

# Save optimized model
joblib.dump(iso_model_optimized, 'anomaly_model_optimized.pkl')
print(f"\nSaved optimized model to 'anomaly_model_optimized.pkl'")
print(f"Contamination rate: {optimal_contamination*100:.1f}%")
print(f"Features: {len(top_features_final)}")


=== TRAINING OPTIMIZED MODEL ===
Optimized model results:
Total samples: 10197
Anomalies detected: 99
Anomaly rate: 0.97%

Comparison:
Original model (10% contamination): 1017 anomalies (10.0%)
Optimized model (1% contamination): 99 anomalies (0.97%)

Sample of detected anomalies (QIDs):
  1. 0A1FC2650AF
  2. 0BBAC2650B0
  3. 0A1FC2650AF
  4. 0BBAC2650B0
  5. 0A1FC2650AF
  6. 0BBAC2650B0
  7. 0A1FC2650AF
  8. 0BBAC2650B0
  9. 0A1FC2650AF
  10. 0BBAC2650B0

Saved optimized model to 'anomaly_model_optimized.pkl'
Contamination rate: 1.0%
Features: 10
