In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
df = pd.read_csv("phase2_aadhaar_anomaly_detection_final.csv")
df['date'] = pd.to_datetime(df['date'])
df['anomaly_severity'] = df['anomaly_severity'].fillna('None')

df.head(25)


Unnamed: 0,date,state,district,pincode,aadhaar_load_score,load_status,z_score,load_delta,anomaly_type,anomaly_severity
0,2025-09-02,100000,100000,100000,0.033463,Normal,0.0,0.0,Normal,
1,2025-09-03,100000,100000,100000,0.016732,Normal,-0.707047,-0.016732,Normal,
2,2025-09-08,100000,100000,100000,0.016732,Normal,-0.577291,0.0,Normal,
3,2025-09-09,100000,100000,100000,0.016732,Normal,-0.49994,0.0,Normal,
4,2025-09-11,100000,100000,100000,0.026519,Normal,0.584766,0.009787,Normal,
5,2025-09-12,100000,100000,100000,0.026519,Normal,0.526386,0.0,Normal,
6,2025-09-19,100000,100000,100000,0.016732,Normal,-0.754799,-0.009787,Normal,
7,2025-09-20,100000,100000,100000,0.016732,Normal,-0.585417,0.0,Normal,
8,2025-10-24,100000,100000,100000,0.016732,Normal,-0.585417,0.0,Normal,
9,2025-11-15,100000,100000,100000,0.033463,Normal,1.680121,0.016732,Normal,


In [2]:
df['state'].nunique()

NameError: name 'df' is not defined

In [3]:
df = df.sort_values(['pincode', 'date']).reset_index(drop=True)
df['t'] = df.groupby('pincode').cumcount()
pin_counts = df.groupby('pincode').size()
valid_pins = pin_counts[pin_counts >= 10].index

df_valid = df[df['pincode'].isin(valid_pins)]




In [4]:
def fast_forecast(pin_df, horizon=14):
    """
    Fast rolling trend-based forecast for Aadhaar load score
    """
    vals = pin_df['aadhaar_load_score'].values

    # Use last 7 observations (or fewer if not available)
    recent = vals[-7:]

    if len(recent) < 3:
        return None

    level = recent.mean()
    trend = np.diff(recent).mean()

    # Generate forecast
    forecast = [max(level + (i + 1) * trend, 0) for i in range(horizon)]

    return forecast


In [8]:
forecast_rows = []

for pin, g in df_valid.groupby('pincode'):
    g = g.sort_values('date')

    if len(g) < 5:
        continue

    fc = fast_forecast(g, horizon=14)

    if fc is None:
        continue

    for i, val in enumerate(fc):
        forecast_rows.append({
            'pincode': pin,
            'forecast_day': i + 1,
            'predicted_load_score': float(val)
        })

forecast_df = pd.DataFrame(forecast_rows)
forecast_df.head(42)


Unnamed: 0,pincode,forecast_day,predicted_load_score
0,100000,1,0.040736
1,100000,2,0.040736
2,100000,3,0.040736
3,100000,4,0.040736
4,100000,5,0.040736
5,100000,6,0.040736
6,100000,7,0.040736
7,100000,8,0.040736
8,100000,9,0.040736
9,100000,10,0.040736


In [12]:
forecast_df['predicted_load_status'] = np.select(
    [
        forecast_df['predicted_load_score'] <= p40,
        (forecast_df['predicted_load_score'] > p40) &
        (forecast_df['predicted_load_score'] <= p75),
        forecast_df['predicted_load_score'] > p75
    ],
    [
        'Normal',
        'Emerging Stress',
        'Critical Overload'
    ],
    default='Normal'   # ðŸ‘ˆ IMPORTANT
)
forecast_df['predicted_load_status'].value_counts()

predicted_load_status
Normal               225422
Emerging Stress       34389
Critical Overload     13637
Name: count, dtype: int64

In [14]:
# Detect first future day when load becomes non-normal
stress_days = (
    forecast_df
        .loc[forecast_df['predicted_load_status'] != 'Normal']
        .groupby('pincode', as_index=False)['forecast_day']
        .min()
        .rename(columns={'forecast_day': 'stress_starts_in_days'})
)

stress_days.head(20)


Unnamed: 0,pincode,stress_starts_in_days
0,110002,1
1,110005,1
2,110006,1
3,110007,1
4,110008,1
5,110009,1
6,110010,1
7,110011,3
8,110012,1
9,110013,1


In [17]:
# Keep all anomalies
df_anom = df[df['anomaly_flag'] == True]

# Sample normal points (10x anomalies)
df_norm = df[df['anomaly_flag'] == False].sample(
    n=min(len(df_anom) * 10, len(df[df['anomaly_flag'] == False])),
    random_state=42
)

df_train = pd.concat([df_anom, df_norm]).sample(frac=1, random_state=42)


In [21]:
from sklearn.ensemble import RandomForestClassifier

X_train = df_train[
    [
        'aadhaar_load_score',
        'load_delta',
        'z_score'
    ]
].fillna(0)

y_train = df_train['anomaly_flag']

rf = RandomForestClassifier(
    n_estimators=80,      # â†“ from 200
    max_depth=4,          # â†“ from 5
    min_samples_leaf=50,  # regularization
    n_jobs=-1,            # parallelize
    random_state=42
)

rf.fit(X_train, y_train)

print("âœ… Fast Random Forest anomaly model trained")


âœ… Fast Random Forest anomaly model trained


In [24]:
rf_input = pd.DataFrame({
    'aadhaar_load_score': forecast_df['predicted_load_score'],
    'load_delta': np.zeros(len(forecast_df)),
    'z_score': np.zeros(len(forecast_df))
})

forecast_df['anomaly_risk'] = rf.predict_proba(rf_input)[:, 1]
forecast_df[['pincode', 'forecast_day', 'anomaly_risk']].head(15)

Unnamed: 0,pincode,forecast_day,anomaly_risk
0,100000,1,0.084261
1,100000,2,0.084261
2,100000,3,0.084261
3,100000,4,0.084261
4,100000,5,0.084261
5,100000,6,0.084261
6,100000,7,0.084261
7,100000,8,0.084261
8,100000,9,0.084261
9,100000,10,0.084261


In [25]:
forecast_df['early_warning'] = (
    (forecast_df['predicted_load_status'] != 'Normal') |
    (forecast_df['anomaly_risk'] > 0.6)
)


In [30]:
phase3_final_df = forecast_df[
    [
        'pincode',
        'forecast_day',
        'predicted_load_score',
        'predicted_load_status',
        'anomaly_risk',
        'early_warning'
    ]
]

phase3_final_df.head(20)


Unnamed: 0,pincode,forecast_day,predicted_load_score,predicted_load_status,anomaly_risk,early_warning
0,100000,1,0.040736,Normal,0.084261,False
1,100000,2,0.040736,Normal,0.084261,False
2,100000,3,0.040736,Normal,0.084261,False
3,100000,4,0.040736,Normal,0.084261,False
4,100000,5,0.040736,Normal,0.084261,False
5,100000,6,0.040736,Normal,0.084261,False
6,100000,7,0.040736,Normal,0.084261,False
7,100000,8,0.040736,Normal,0.084261,False
8,100000,9,0.040736,Normal,0.084261,False
9,100000,10,0.040736,Normal,0.084261,False


In [31]:
phase3_final_df.describe

<bound method NDFrame.describe of         pincode  forecast_day  predicted_load_score predicted_load_status  \
0        100000             1              0.040736                Normal   
1        100000             2              0.040736                Normal   
2        100000             3              0.040736                Normal   
3        100000             4              0.040736                Normal   
4        100000             5              0.040736                Normal   
...         ...           ...                   ...                   ...   
273443   855456            10              0.027327                Normal   
273444   855456            11              0.023597                Normal   
273445   855456            12              0.019867                Normal   
273446   855456            13              0.016136                Normal   
273447   855456            14              0.012406                Normal   

0           0.084261          False  
1  

In [34]:
phase3_final_df['early_warning'].value_counts()



False    225422
True      48026
Name: count, dtype: int64

In [36]:
early_warning_df = phase3_final_df[phase3_final_df['early_warning'] == True]
early_warning_df.head(10)


Unnamed: 0,pincode,forecast_day,predicted_load_score,predicted_load_status,anomaly_risk,early_warning
28,110002,1,0.179521,Critical Overload,0.084261,True
29,110002,2,0.152912,Emerging Stress,0.084261,True
30,110002,3,0.126304,Emerging Stress,0.084261,True
70,110005,1,0.154187,Emerging Stress,0.084261,True
71,110005,2,0.137683,Emerging Stress,0.084261,True
72,110005,3,0.12118,Emerging Stress,0.084261,True
84,110006,1,0.203313,Critical Overload,0.096066,True
85,110006,2,0.178661,Critical Overload,0.084261,True
86,110006,3,0.154009,Emerging Stress,0.084261,True
87,110006,4,0.129356,Emerging Stress,0.084261,True


In [37]:
phase3_final_df.to_csv(
    "phase3_aadhaar_early_warning_final.csv",
    index=False
)

print("âœ… Phase 3 early-warning dataset exported successfully")


