In [None]:
# Import libraries
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

from anomaly_detector import AnomalyDetector
from visualization import AadhaarVisualizer

# Settings
pd.set_option('display.max_columns', None)   # never truncate columns when a frame is displayed
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)     # default size for figures created without figsize

# The status marker had been corrupted by a text-encoding mix-up; restored to a check mark.
print("✓ Libraries imported")

## 1. Load Processed Data

In [None]:
# Read the three processed parquet datasets and create the helper objects
# used by every later cell.
print("Loading processed datasets...")

enrolment_df = pd.read_parquet('../outputs/enrolment_processed.parquet')
demographic_df = pd.read_parquet('../outputs/demographic_processed.parquet')
biometric_df = pd.read_parquet('../outputs/biometric_processed.parquet')

# Status markers below had been corrupted by a text-encoding mix-up; restored to check marks.
print(f"✓ Enrolment: {len(enrolment_df):,} records")
print(f"✓ Demographic: {len(demographic_df):,} records")
print(f"✓ Biometric: {len(biometric_df):,} records")

# Initialize detector
detector = AnomalyDetector()
visualizer = AadhaarVisualizer(output_dir='../outputs/figures')

print("✓ Anomaly detector initialized")

## 2. STATISTICAL OUTLIER DETECTION

### 2.1 IQR Method - Enrolment Volume

In [None]:
# Collapse the enrolment records into one total per date
daily_enrol = (
    enrolment_df
    .groupby('date')['total_enrolments']
    .sum()
    .reset_index()
)

# Run the detector's IQR rule over the daily totals
iqr_outliers = detector.detect_outliers_iqr(
    daily_enrol['total_enrolments'].values, threshold=1.5
)

print(f"=== IQR OUTLIER DETECTION ===\n")
print(f"Total days: {len(daily_enrol)}")
print(f"Outliers detected: {iqr_outliers['n_outliers']}")
print(f"Outlier percentage: {iqr_outliers['outlier_percentage']:.2f}%")
print(f"\nStatistics:")
print(f"  Q1: {iqr_outliers['q1']:,.0f}")
print(f"  Q3: {iqr_outliers['q3']:,.0f}")
print(f"  IQR: {iqr_outliers['iqr']:,.0f}")
print(f"  Lower bound: {iqr_outliers['lower_bound']:,.0f}")
print(f"  Upper bound: {iqr_outliers['upper_bound']:,.0f}")

# Store the flag next to the totals, then pull out the flagged days
daily_enrol['is_outlier_iqr'] = iqr_outliers['is_outlier']
outlier_days = daily_enrol[daily_enrol['is_outlier_iqr']]

print(f"\nOutlier dates:")
for day, total in zip(outlier_days['date'], outlier_days['total_enrolments']):
    print(f"  {day.strftime('%Y-%m-%d')}: {total:,.0f} enrolments")

# Figure: daily series, flagged days, and the two IQR fences
fig, ax = plt.subplots(figsize=(14, 6))

ax.plot(
    daily_enrol['date'],
    daily_enrol['total_enrolments'],
    color='#3A86FF',
    linewidth=2,
    label='Daily Enrolments',
)

# Flagged days are drawn only when there is at least one
if not outlier_days.empty:
    ax.scatter(
        outlier_days['date'],
        outlier_days['total_enrolments'],
        color='#E63946',
        s=100,
        marker='o',
        label='Outliers (IQR)',
        zorder=3,
    )

# Upper fence first, then lower, so the legend order is unchanged
for bound_key, bound_label in (('upper_bound', 'Upper Bound'),
                               ('lower_bound', 'Lower Bound')):
    ax.axhline(y=iqr_outliers[bound_key], color='#FB5607',
               linestyle='--', linewidth=2, label=bound_label)

ax.set_xlabel('Date', fontweight='bold', fontsize=12)
ax.set_ylabel('Total Enrolments', fontweight='bold', fontsize=12)
ax.set_title(
    'IQR Outlier Detection: Daily Enrolment Volume',
    fontweight='bold', fontsize=14, pad=20,
)
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../outputs/figures/23_iqr_outliers.png', dpi=300, bbox_inches='tight')
plt.show()

### 2.2 Z-Score Method

In [None]:
# Z-score detection
# Single source for the cut-off so the detector call and the printed
# messages cannot disagree.
Z_THRESHOLD = 3.0

zscore_outliers = detector.detect_outliers_zscore(
    daily_enrol['total_enrolments'].values,
    threshold=Z_THRESHOLD
)

print(f"=== Z-SCORE OUTLIER DETECTION ===\n")
# The "±" sign had been corrupted by a text-encoding mix-up; restored.
print(f"Threshold: ±{Z_THRESHOLD:g} standard deviations")
print(f"Outliers detected: {zscore_outliers['n_outliers']}")
print(f"Outlier percentage: {zscore_outliers['outlier_percentage']:.2f}%")
print(f"\nStatistics:")
print(f"  Mean: {zscore_outliers['mean']:,.0f}")
print(f"  Std Dev: {zscore_outliers['std']:,.0f}")

# Store the flag next to the daily totals and pull out the flagged days
daily_enrol['is_outlier_zscore'] = zscore_outliers['is_outlier']
outlier_days_z = daily_enrol[daily_enrol['is_outlier_zscore']]

# The rule is announced above as "±", so the header states the magnitude;
# the per-day z printed below keeps its sign.
# NOTE(review): assumes the detector flags both tails — confirm.
print(f"\nOutlier dates (|Z| > {Z_THRESHOLD:g}):")
for idx, row in outlier_days_z.iterrows():
    # Recomputed from the mean/std the detector returned
    z = (row['total_enrolments'] - zscore_outliers['mean']) / zscore_outliers['std']
    print(f"  {row['date'].strftime('%Y-%m-%d')}: {row['total_enrolments']:,.0f} (Z={z:.2f})")

### 2.3 Modified Z-Score (Robust)

In [None]:
# Modified Z-score (uses median instead of mean)
# Single source for the cut-off so the detector call and the printed
# message cannot disagree.
MOD_Z_THRESHOLD = 3.5

mod_zscore_outliers = detector.detect_outliers_modified_zscore(
    daily_enrol['total_enrolments'].values,
    threshold=MOD_Z_THRESHOLD
)

print(f"=== MODIFIED Z-SCORE OUTLIER DETECTION ===\n")
# The "±" sign had been corrupted by a text-encoding mix-up; restored.
print(f"Threshold: ±{MOD_Z_THRESHOLD:g} (robust method using median)")
print(f"Outliers detected: {mod_zscore_outliers['n_outliers']}")
print(f"Outlier percentage: {mod_zscore_outliers['outlier_percentage']:.2f}%")
print(f"\nStatistics:")
print(f"  Median: {mod_zscore_outliers['median']:,.0f}")
print(f"  MAD: {mod_zscore_outliers['mad']:,.0f}")

daily_enrol['is_outlier_mod_z'] = mod_zscore_outliers['is_outlier']

# Compare all three methods (IQR and Z-score results come from the earlier cells)
comparison = pd.DataFrame({
    'Method': ['IQR', 'Z-Score', 'Modified Z-Score'],
    'Outliers': [iqr_outliers['n_outliers'], 
                zscore_outliers['n_outliers'], 
                mod_zscore_outliers['n_outliers']],
    'Percentage': [iqr_outliers['outlier_percentage'], 
                   zscore_outliers['outlier_percentage'], 
                   mod_zscore_outliers['outlier_percentage']]
})

print("\n=== METHOD COMPARISON ===\n")
print(comparison.to_string(index=False))

# Visualization: one bar per method, annotated with its outlier count
fig, ax = plt.subplots(figsize=(10, 6))

methods = comparison['Method']
counts = comparison['Outliers']
colors = ['#3A86FF', '#FB5607', '#8338EC']

bars = ax.bar(methods, counts, color=colors, alpha=0.7, edgecolor='black', linewidth=2)

# Write the count centred just above each bar
for bar, count in zip(bars, counts):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{int(count)}',
            ha='center', va='bottom', fontweight='bold', fontsize=14)

ax.set_ylabel('Number of Outliers', fontweight='bold', fontsize=12)
ax.set_title('Outlier Detection Method Comparison', 
             fontweight='bold', fontsize=14, pad=20)
ax.grid(True, axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('../outputs/figures/24_outlier_method_comparison.png', 
            dpi=300, bbox_inches='tight')
plt.show()

## 3. TEMPORAL ANOMALY DETECTION

### 3.1 Weekly Aggregation Anomalies

In [None]:
# Create time series (daily totals indexed by date)
ts_data = daily_enrol.set_index('date')['total_enrolments']

# One threshold feeds both the detector and the shaded band drawn below, so
# the band cannot drift away from the rule that flagged the points.
# NOTE(review): the band assumes the detector's rule is
# rolling mean ± threshold * rolling std — confirm against AnomalyDetector.
TEMPORAL_THRESHOLD = 2.0

# Detect temporal anomalies
temporal_anomalies = detector.detect_temporal_anomalies(
    ts_data,
    window=7,
    threshold=TEMPORAL_THRESHOLD
)

print(f"=== TEMPORAL ANOMALY DETECTION ===\n")
print(f"Window size: 7 days (weekly)")
print(f"Threshold: {TEMPORAL_THRESHOLD} standard deviations")
print(f"Anomalies detected: {temporal_anomalies['n_anomalies']}")
print(f"Anomaly percentage: {temporal_anomalies['anomaly_percentage']:.2f}%")

# Show anomaly dates
anomaly_dates = ts_data[temporal_anomalies['is_anomaly']]
print(f"\nTemporal anomalies:")
for date, value in anomaly_dates.items():
    print(f"  {date.strftime('%Y-%m-%d')}: {value:,.0f} enrolments")

# Visualization
fig, ax = plt.subplots(figsize=(14, 6))

ax.plot(ts_data.index, ts_data.values, 
        color='#3A86FF', linewidth=2, label='Daily Enrolments')

# Plot rolling mean and std bands (both series are returned by the detector)
rolling_mean = temporal_anomalies['rolling_mean']
rolling_std = temporal_anomalies['rolling_std']

ax.plot(ts_data.index, rolling_mean, 
        color='#FB5607', linewidth=2, linestyle='--', label='7-Day Moving Average')

# Upper/lower bounds, derived from the same threshold passed to the detector
upper_bound = rolling_mean + (TEMPORAL_THRESHOLD * rolling_std)
lower_bound = rolling_mean - (TEMPORAL_THRESHOLD * rolling_std)
# The "±" sign in the label had been corrupted by a text-encoding mix-up; restored.
ax.fill_between(ts_data.index, lower_bound, upper_bound, 
                alpha=0.2, color='#FB5607',
                label=f'±{TEMPORAL_THRESHOLD:g} Std Dev Band')

# Mark anomalies
if len(anomaly_dates) > 0:
    ax.scatter(anomaly_dates.index, anomaly_dates.values,
              color='#E63946', s=100, marker='X', label='Temporal Anomalies', zorder=3)

ax.set_xlabel('Date', fontweight='bold', fontsize=12)
ax.set_ylabel('Total Enrolments', fontweight='bold', fontsize=12)
ax.set_title('Temporal Anomaly Detection: 7-Day Rolling Window', 
             fontweight='bold', fontsize=14, pad=20)
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../outputs/figures/25_temporal_anomalies.png', 
            dpi=300, bbox_inches='tight')
plt.show()

### 3.2 Changepoint Detection

In [None]:
# Detect changepoints
changepoints = detector.detect_changepoints(
    ts_data.values,
    penalty=10
)

# 'changepoint_indices' is read only when at least one changepoint is
# reported, as before. Indices that do not address an observation are
# dropped: some changepoint libraries report len(series) as a closing
# breakpoint, which would raise IndexError in the positional lookups below.
# NOTE(review): confirm what AnomalyDetector.detect_changepoints returns.
n_points = len(ts_data)
cp_indices = []
if changepoints['n_changepoints'] > 0:
    cp_indices = [int(i) for i in changepoints['changepoint_indices']
                  if 0 <= int(i) < n_points]

print(f"=== CHANGEPOINT DETECTION ===\n")
print(f"Penalty parameter: 10")
print(f"Changepoints detected: {changepoints['n_changepoints']}")

if cp_indices:
    print(f"\nChangepoint indices:")
    for cp_idx in cp_indices:
        cp_date = ts_data.index[cp_idx]
        cp_value = ts_data.iloc[cp_idx]
        print(f"  Index {cp_idx}: {cp_date.strftime('%Y-%m-%d')} (value: {cp_value:,.0f})")

# Visualization
fig, ax = plt.subplots(figsize=(14, 6))

ax.plot(ts_data.index, ts_data.values, 
        color='#3A86FF', linewidth=2, label='Daily Enrolments')

# Mark changepoints. Only the first line carries a legend label; the rest use
# matplotlib's '_nolegend_' label so the legend shows a single entry.
for n, cp_idx in enumerate(cp_indices):
    ax.axvline(x=ts_data.index[cp_idx], color='#E63946', 
               linestyle='--', linewidth=2, alpha=0.7,
               label='Changepoints' if n == 0 else '_nolegend_')

ax.set_xlabel('Date', fontweight='bold', fontsize=12)
ax.set_ylabel('Total Enrolments', fontweight='bold', fontsize=12)
ax.set_title('Changepoint Detection: Structural Breaks in Time Series', 
             fontweight='bold', fontsize=14, pad=20)
ax.grid(True, alpha=0.3)

# Always draw the legend so the data line stays labelled even when no
# changepoint was found (previously the legend was skipped in that case).
ax.legend()

plt.tight_layout()
plt.savefig('../outputs/figures/26_changepoint_detection.png', 
            dpi=300, bbox_inches='tight')
plt.show()

## 4. MACHINE LEARNING-BASED ANOMALY DETECTION

### 4.1 Isolation Forest - Daily Enrolments

In [None]:
# Single-column feature frame: the daily enrolment total
features_df = daily_enrol[['total_enrolments']].copy()

# Fit / score with the detector's Isolation Forest wrapper
iso_forest_results = detector.detect_isolation_forest(
    features_df.values,
    contamination=0.05,  # Expect 5% anomalies
    random_state=42,
)

print(f"=== ISOLATION FOREST ANOMALY DETECTION ===\n")
print(f"Contamination parameter: 0.05 (5%)")
print(f"Anomalies detected: {iso_forest_results['n_anomalies']}")
print(f"Anomaly percentage: {iso_forest_results['anomaly_percentage']:.2f}%")

# Keep the flag and the score alongside the daily totals
daily_enrol['is_anomaly_ml'] = iso_forest_results['is_anomaly']
daily_enrol['anomaly_score'] = iso_forest_results['anomaly_scores']

ml_anomalies = daily_enrol[daily_enrol['is_anomaly_ml']]

# List the ten flagged days with the lowest scores
print(f"\nML-detected anomalies (sorted by score):")
ml_sorted = ml_anomalies.sort_values('anomaly_score')
top_flagged = ml_sorted.head(10)
for day, total, score in zip(top_flagged['date'],
                             top_flagged['total_enrolments'],
                             top_flagged['anomaly_score']):
    print(f"  {day.strftime('%Y-%m-%d')}: {total:,.0f} "
          f"(score: {score:.3f})")

# Two stacked panels: the series with flagged days, then the per-day scores
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 10))

# Panel 1: time series with anomalies
ax1.plot(
    daily_enrol['date'],
    daily_enrol['total_enrolments'],
    color='#3A86FF',
    linewidth=2,
    label='Daily Enrolments',
)

if not ml_anomalies.empty:
    ax1.scatter(
        ml_anomalies['date'],
        ml_anomalies['total_enrolments'],
        color='#E63946',
        s=100,
        marker='D',
        label='ML Anomalies',
        zorder=3,
    )

ax1.set_xlabel('Date', fontweight='bold', fontsize=12)
ax1.set_ylabel('Total Enrolments', fontweight='bold', fontsize=12)
ax1.set_title('Isolation Forest Anomaly Detection',
              fontweight='bold', fontsize=14, pad=20)
ax1.legend()
ax1.grid(True, alpha=0.3)

# Panel 2: anomaly scores, red for flagged days and blue otherwise
colors = [
    '#E63946' if flagged else '#3A86FF'
    for flagged in daily_enrol['is_anomaly_ml']
]
ax2.scatter(daily_enrol['date'], daily_enrol['anomaly_score'],
            c=colors, alpha=0.6, s=30)
ax2.axhline(y=0, color='black', linestyle='--', linewidth=1)
ax2.set_xlabel('Date', fontweight='bold', fontsize=12)
ax2.set_ylabel('Anomaly Score', fontweight='bold', fontsize=12)
ax2.set_title('Isolation Forest Anomaly Scores (Negative = Anomaly)',
              fontweight='bold', fontsize=14, pad=20)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../outputs/figures/27_isolation_forest_anomalies.png',
            dpi=300, bbox_inches='tight')
plt.show()

### 4.2 Multi-Feature Isolation Forest

In [None]:
# Build a date x state table of daily enrolment totals (missing cells -> 0)
state_daily = (
    enrolment_df
    .groupby(['date', 'state'])['total_enrolments']
    .sum()
    .reset_index()
)
state_pivot = (
    state_daily
    .pivot(index='date', columns='state', values='total_enrolments')
    .fillna(0)
)

# The ten states with the largest overall totals become the feature columns
top_10_states = (
    enrolment_df
    .groupby('state')['total_enrolments']
    .sum()
    .nlargest(10)
    .index
)
feature_matrix = state_pivot[top_10_states].values

print(f"=== MULTI-FEATURE ANOMALY DETECTION ===\n")
print(f"Feature dimensions: {feature_matrix.shape}")
print(f"Features: Top 10 states' daily enrolments")

# Score every date using all ten columns at once
multi_iso_results = detector.detect_isolation_forest(
    feature_matrix, contamination=0.05, random_state=42
)

print(f"\nAnomalies detected: {multi_iso_results['n_anomalies']}")
print(f"Anomaly percentage: {multi_iso_results['anomaly_percentage']:.2f}%")

# Line the results back up with the pivot's dates
multi_df = pd.DataFrame({
    'date': state_pivot.index,
    'is_anomaly': multi_iso_results['is_anomaly'],
    'anomaly_score': multi_iso_results['anomaly_scores'],
})

multi_anomalies = multi_df[multi_df['is_anomaly']]

print(f"\nMulti-feature anomaly dates:")
for day, score in zip(multi_anomalies['date'], multi_anomalies['anomaly_score']):
    print(f"  {day.strftime('%Y-%m-%d')}: score = {score:.3f}")

## 5. GEOGRAPHIC ANOMALY PATTERNS

### 5.1 State-Level Anomalies

In [None]:
# Calculate state totals
state_totals = enrolment_df.groupby('state')['total_enrolments'].sum()

# Detect state-level outliers
state_outliers = detector.detect_outliers_iqr(state_totals.values, threshold=1.5)

print(f"=== GEOGRAPHIC ANOMALIES (State Level) ===\n")
print(f"Total states: {len(state_totals)}")
print(f"Anomalous states: {state_outliers['n_outliers']}")
print(f"Outlier percentage: {state_outliers['outlier_percentage']:.2f}%")

# One row per state, in the same order as state_totals, so the detector's
# flags line up with the totals they were computed from.
state_df = pd.DataFrame({
    'state': state_totals.index,
    'total_enrolments': state_totals.values,
    'is_outlier': state_outliers['is_outlier']
})

outlier_states = state_df[state_df['is_outlier']].sort_values('total_enrolments', ascending=False)

print(f"\nOutlier states (exceptionally high or low):")
for idx, row in outlier_states.iterrows():
    print(f"  {row['state']:30s}: {row['total_enrolments']:10,.0f}")

# Visualization
fig, ax = plt.subplots(figsize=(14, 8))

# Sort first, then derive the colours from the SORTED frame. The bars are
# drawn in sorted order, so colours taken from the unsorted frame would paint
# the outlier colour onto the wrong states.
sorted_states = state_df.sort_values('total_enrolments', ascending=True)
colors = ['#E63946' if outlier else '#3A86FF' for outlier in sorted_states['is_outlier']]

bars = ax.barh(range(len(sorted_states)), sorted_states['total_enrolments'], 
              color=colors, alpha=0.7)
ax.set_yticks(range(len(sorted_states)))
ax.set_yticklabels(sorted_states['state'], fontsize=8)
ax.set_xlabel('Total Enrolments', fontweight='bold', fontsize=12)
ax.set_ylabel('State', fontweight='bold', fontsize=12)
ax.set_title('State-Level Anomalies: Enrolment Distribution', 
             fontweight='bold', fontsize=14, pad=20)
ax.grid(True, axis='x', alpha=0.3)

# Add legend (colour patches, since the bars themselves carry no labels)
from matplotlib.patches import Patch
legend_elements = [Patch(facecolor='#3A86FF', label='Normal'),
                   Patch(facecolor='#E63946', label='Outlier')]
ax.legend(handles=legend_elements)

plt.tight_layout()
plt.savefig('../outputs/figures/28_geographic_anomalies.png', 
            dpi=300, bbox_inches='tight')
plt.show()

### 5.2 District-Level Anomalies

In [None]:
# District totals, keyed by (state, district). District names are not
# guaranteed to be unique across states; grouping on the name alone would add
# together unrelated districts that happen to share a name and could make the
# merged total look like an outlier.
# NOTE(review): rows with a missing state are now excluded by groupby's
# default NaN handling — confirm 'state' is always populated.
district_totals = enrolment_df.groupby(['state', 'district'])['total_enrolments'].sum()

# Detect district-level outliers
district_outliers = detector.detect_outliers_iqr(district_totals.values, threshold=1.5)

print(f"=== GEOGRAPHIC ANOMALIES (District Level) ===\n")
print(f"Total districts: {len(district_totals)}")
print(f"Anomalous districts: {district_outliers['n_outliers']}")
print(f"Outlier percentage: {district_outliers['outlier_percentage']:.2f}%")

# reset_index keeps the row order of district_totals, so the detector's flags
# (computed from district_totals.values) align row for row.
district_df = district_totals.reset_index()
district_df['is_outlier'] = district_outliers['is_outlier']

outlier_districts = district_df[district_df['is_outlier']].sort_values(
    'total_enrolments', ascending=False)

print(f"\nTop 20 outlier districts:")
for idx, row in outlier_districts.head(20).iterrows():
    # Show the state next to the name so same-named districts are distinguishable
    label = f"{row['district']} ({row['state']})"
    print(f"  {label:45s}: {row['total_enrolments']:10,.0f}")

## 6. ANOMALY SUMMARY & INSIGHTS

In [None]:
# Final recap of the counts produced by the earlier cells.
# The bullets, section icons and check mark had been corrupted by a
# text-encoding mix-up; they are restored to the intended characters here.
print("="*80)
print("ANOMALY DETECTION - KEY FINDINGS")
print("="*80)

print("\n📊 STATISTICAL OUTLIERS (Daily Enrolments):")
print(f"  • IQR method: {iqr_outliers['n_outliers']} outliers ({iqr_outliers['outlier_percentage']:.2f}%)")
print(f"  • Z-score method: {zscore_outliers['n_outliers']} outliers ({zscore_outliers['outlier_percentage']:.2f}%)")
print(f"  • Modified Z-score: {mod_zscore_outliers['n_outliers']} outliers ({mod_zscore_outliers['outlier_percentage']:.2f}%)")

print("\n🕒 TEMPORAL ANOMALIES:")
print(f"  • Rolling window anomalies: {temporal_anomalies['n_anomalies']}")
print(f"  • Changepoints detected: {changepoints['n_changepoints']}")

print("\n🤖 MACHINE LEARNING DETECTION:")
print(f"  • Isolation Forest (univariate): {iso_forest_results['n_anomalies']} anomalies")
print(f"  • Isolation Forest (multi-feature): {multi_iso_results['n_anomalies']} anomalies")

print("\n🗺️  GEOGRAPHIC ANOMALIES:")
print(f"  • State-level outliers: {state_outliers['n_outliers']} states")
print(f"  • District-level outliers: {district_outliers['n_outliers']} districts")

print("\n💡 KEY INSIGHTS:")
print("  • Statistical methods provide baseline outlier detection")
print("  • ML-based methods capture complex, multi-dimensional patterns")
print("  • Temporal anomalies reveal unusual activity periods")
print("  • Geographic outliers highlight states/districts with exceptional volumes")
print("  • Changepoints indicate structural shifts in enrolment patterns")

print("\n✓ Anomaly detection completed successfully")
print("="*80)