In [23]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

In [24]:
df = pd.read_csv("realtime_extracted_features.csv")
df.sample(10)

Unnamed: 0,qid,is_pickup,is_cleanup,is_qmgr,is_bounce,is_postsuper,queue_active,queue_unknown,queue_deferred,queue_bounced,...,contains_delivered,contains_held,numeric_pattern_count,user_is_unknown,user_frequency,process_postfix/bounce,process_postfix/cleanup,process_postfix/pickup,process_postfix/postsuper,process_postfix/qmgr
9100,B5E38265185,0,0,1,0,0,0,1,0,0,...,0,0,15,1,7704,False,False,False,False,True
3949,137342650FB,0,0,1,0,0,1,0,0,0,...,0,0,17,0,36,False,False,False,False,True
1368,30C4F2650B4,0,0,1,0,0,0,1,0,0,...,0,0,15,1,7704,False,False,False,False,True
9798,43F74265198,0,1,0,0,0,0,1,0,0,...,0,0,16,1,7704,False,True,False,False,False
1341,30C4F2650B4,0,1,0,0,0,0,1,0,0,...,0,0,16,1,7704,False,True,False,False,False
9741,3833E265196,0,0,1,0,0,1,0,0,0,...,0,0,17,0,18,False,False,False,False,True
4108,2C4C32650FF,0,1,0,0,0,0,1,0,0,...,0,0,16,1,7704,False,True,False,False,False
6470,D387326513D,0,0,1,0,0,0,1,0,0,...,0,0,15,1,7704,False,False,False,False,True
5212,04E0326511D,0,1,0,0,0,0,1,0,0,...,0,0,16,1,7704,False,True,False,False,False
1006,EEB962650AB,0,1,0,0,0,0,1,0,0,...,0,0,16,1,7704,False,True,False,False,False


In [25]:
df.isnull().sum()

qid                          0
is_pickup                    0
is_cleanup                   0
is_qmgr                      0
is_bounce                    0
is_postsuper                 0
queue_active                 0
queue_unknown                0
queue_deferred               0
queue_bounced                0
size                         0
nrcpt                        0
has_from_email               0
has_to_email                 0
has_message_id               0
has_removed                  0
has_error                    0
log_hour                     0
not_working_hour             0
is_weekend                   0
message_length               0
unique_words                 0
contains_connection          0
contains_status              0
contains_relay               0
contains_delivered           0
contains_held                0
numeric_pattern_count        0
user_is_unknown              0
user_frequency               0
process_postfix/bounce       0
process_postfix/cleanup      0
process_

In [26]:
df.dtypes

qid                          object
is_pickup                     int64
is_cleanup                    int64
is_qmgr                       int64
is_bounce                     int64
is_postsuper                  int64
queue_active                  int64
queue_unknown                 int64
queue_deferred                int64
queue_bounced                 int64
size                          int64
nrcpt                         int64
has_from_email                int64
has_to_email                  int64
has_message_id                int64
has_removed                   int64
has_error                     int64
log_hour                      int64
not_working_hour              int64
is_weekend                    int64
message_length                int64
unique_words                  int64
contains_connection           int64
contains_status               int64
contains_relay                int64
contains_delivered            int64
contains_held                 int64
numeric_pattern_count       

In [27]:
df.shape

(10197, 36)

In [29]:
scaler = StandardScaler()
X = df.drop(['qid'],axis=1)
X_scaled = scaler.fit_transform(X)

In [36]:
iso_model = IsolationForest(
    n_estimators=50,
    contamination=0.1,  # or even 0.1 for more aggressive detection
    max_samples=0.5,
    random_state=42
)

iso_model.fit(X_scaled)

In [37]:
# === Predict anomalies ===
# -1 = anomaly, 1 = normal
features_df = df
features_df['anomaly'] = iso_model.predict(X_scaled)

# === See results ===
print(features_df[['qid', 'anomaly']].sample(20))

              qid  anomaly
8574  5C01C265177        1
1355  30C4F2650B4        1
8342  2FF6F26516D        1
2951  59F5B2650E0        1
2858  4CE6B2650DE        1
6179  9E341265135        1
5133  EAC4A26511B        1
7369  83437265156       -1
4406  5E392265106        1
5954  7507F26512F        1
5868  6856C265126        1
8955  9BE14265181        1
5918  6EB5626512E        1
3012  699422650E2        1
377   83CF626509A        1
1466  431A32650B7        1
3347  A0E3F2650EC        1
2938  59F5B2650E0        1
3843  0198D2650F8        1
6407  CD84126513C        1


In [38]:
df.columns

Index(['qid', 'is_pickup', 'is_cleanup', 'is_qmgr', 'is_bounce',
       'is_postsuper', 'queue_active', 'queue_unknown', 'queue_deferred',
       'queue_bounced', 'size', 'nrcpt', 'has_from_email', 'has_to_email',
       'has_message_id', 'has_removed', 'has_error', 'log_hour',
       'not_working_hour', 'is_weekend', 'message_length', 'unique_words',
       'contains_relay', 'contains_delivered', 'contains_held',
       'numeric_pattern_count', 'user_is_unknown', 'user_frequency',
       'process_postfix/bounce', 'process_postfix/cleanup',
       'process_postfix/pickup', 'process_postfix/postsuper',
       'process_postfix/qmgr', 'anomaly'],
      dtype='object')

In [35]:
import joblib
joblib.dump(iso_model, 'anomaly_model.pkl')
joblib.dump(scaler, 'anomaly_scaler.pkl')

['anomaly_scaler.pkl']

In [39]:
import numpy as np

# Example: get normal point
normal_point = X_scaled[0].copy()

# Make it anomalous: artificially inflate certain features
anomalous_point = normal_point.copy()
anomalous_point[ X.columns.get_loc('size') ] = normal_point[ X.columns.get_loc('size') ] + 10  # huge size
anomalous_point[ X.columns.get_loc('numeric_pattern_count') ] = 100  # crazy number

# Predict
pred = iso_model.predict([anomalous_point])
print(f"Prediction: {pred}")  # Should print [-1] for anomaly


Prediction: [-1]
