In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
df = pd.read_csv("phase1_aadhaar_pressure_metrics_FINAL.csv")
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['pincode', 'date']).reset_index(drop=True)
df.head(50)

Unnamed: 0,date,state,district,pincode,total_enrollments,total_demo_updates,total_bio_updates,atpi,dvi,bsi,population_pressure,aadhaar_load_score,load_status
0,2025-09-02,100000,100000,100000,3.0,0.0,0.0,0.0,0.0,0.0,1.386294,0.033463,Normal
1,2025-09-03,100000,100000,100000,1.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.016732,Normal
2,2025-09-08,100000,100000,100000,1.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.016732,Normal
3,2025-09-09,100000,100000,100000,1.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.016732,Normal
4,2025-09-11,100000,100000,100000,2.0,0.0,0.0,0.0,0.0,0.0,1.098612,0.026519,Normal
5,2025-09-12,100000,100000,100000,2.0,0.0,0.0,0.0,0.0,0.0,1.098612,0.026519,Normal
6,2025-09-19,100000,100000,100000,1.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.016732,Normal
7,2025-09-20,100000,100000,100000,1.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.016732,Normal
8,2025-10-24,100000,100000,100000,1.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.016732,Normal
9,2025-11-15,100000,100000,100000,3.0,0.0,0.0,0.0,0.0,0.0,1.386294,0.033463,Normal


In [5]:
df['load_delta'] = (
    df.groupby('pincode')['aadhaar_load_score']
      .diff()
      .fillna(0)
)

In [6]:
df['load_rolling_mean'] = (
    df.groupby('pincode')['aadhaar_load_score']
      .rolling(7, min_periods=1)
      .mean()
      .reset_index(level=0, drop=True)
)

df['load_rolling_std'] = (
    df.groupby('pincode')['aadhaar_load_score']
      .rolling(7, min_periods=1)
      .std()
      .fillna(0)
      .reset_index(level=0, drop=True)
)


In [7]:
df['z_score'] = (
    (df['aadhaar_load_score'] - df['load_rolling_mean']) /
    (df['load_rolling_std'] + 1e-6)
)

df['spike_anomaly'] = df['z_score'].abs() > 3


In [19]:
pop_median = df['population_pressure'].median()

df['silent_failure'] = (
    (df['aadhaar_load_score'] < df['load_rolling_mean'] * 0.3) &
    (df['population_pressure'] > df['population_pressure'].quantile(0.6)) &
    (df['load_rolling_mean'] > df['aadhaar_load_score'].quantile(0.5))
)



In [24]:
features = df[
    [
        'aadhaar_load_score',
        'load_delta',
        'atpi',
        'dvi',
        'bsi',
        'population_pressure'
    ]
].fillna(0)
iso = IsolationForest(
    n_estimators=300,
    contamination=0.07,
    random_state=42
)

df['iforest_flag'] = iso.fit_predict(features)
df['iforest_anomaly'] = df['iforest_flag'] == -1




In [25]:
df['anomaly_type'] = 'Normal'

df.loc[df['silent_failure'], 'anomaly_type'] = 'Silent Service Failure'
df.loc[df['spike_anomaly'], 'anomaly_type'] = 'Sudden Overload Spike'
df.loc[df['iforest_anomaly'], 'anomaly_type'] = 'Structural Anomaly'


In [26]:
df['anomaly_severity'] = 'None'

df.loc[df['z_score'].abs() > 4, 'anomaly_severity'] = 'Critical'
df.loc[df['z_score'].abs().between(2.5, 4), 'anomaly_severity'] = 'High'
df.loc[df['silent_failure'], 'anomaly_severity'] = 'Medium'
df.loc[df['iforest_anomaly'], 'anomaly_severity'] = 'Medium'


In [27]:
phase2_cols = [
    'date', 'state', 'district', 'pincode',
    'aadhaar_load_score', 'load_status',
    'z_score', 'load_delta',
    'anomaly_type', 'anomaly_severity'
]

phase2_df = df[phase2_cols]
phase2_df.head(50)


Unnamed: 0,date,state,district,pincode,aadhaar_load_score,load_status,z_score,load_delta,anomaly_type,anomaly_severity
0,2025-09-02,100000,100000,100000,0.033463,Normal,0.0,0.0,Normal,
1,2025-09-03,100000,100000,100000,0.016732,Normal,-0.707047,-0.016732,Normal,
2,2025-09-08,100000,100000,100000,0.016732,Normal,-0.577291,0.0,Normal,
3,2025-09-09,100000,100000,100000,0.016732,Normal,-0.49994,0.0,Normal,
4,2025-09-11,100000,100000,100000,0.026519,Normal,0.584766,0.009787,Normal,
5,2025-09-12,100000,100000,100000,0.026519,Normal,0.526386,0.0,Normal,
6,2025-09-19,100000,100000,100000,0.016732,Normal,-0.754799,-0.009787,Normal,
7,2025-09-20,100000,100000,100000,0.016732,Normal,-0.585417,0.0,Normal,
8,2025-10-24,100000,100000,100000,0.016732,Normal,-0.585417,0.0,Normal,
9,2025-11-15,100000,100000,100000,0.033463,Normal,1.680121,0.016732,Normal,


In [30]:
phase2_df.to_csv(
    "phase2_aadhaar_anomaly_detection_final.csv",
    index=False
)

print("✅ Phase 2 anomaly detection CSV exported successfully")


✅ Phase 2 anomaly detection CSV exported successfully


In [28]:
phase2_df['anomaly_severity'].value_counts(normalize=True)

anomaly_severity
None      0.925989
Medium    0.074011
Name: proportion, dtype: float64

In [29]:
phase2_df['anomaly_type'].value_counts(normalize=True)

anomaly_type
Normal                    0.925989
Structural Anomaly        0.069995
Silent Service Failure    0.004016
Name: proportion, dtype: float64

In [2]:
phase2_df = phase2_df ['state'].nunique()

NameError: name 'phase2_df' is not defined