In [None]:
# Import libraries
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

from temporal_analysis import TemporalAnalyzer
from spatial_analysis import SpatialAnalyzer
from visualization import AadhaarVisualizer

# Notebook-wide display and plotting defaults.
pd.set_option('display.max_columns', None)  # show every column when a frame is displayed
sns.set_style('whitegrid')                  # seaborn style used by the matplotlib figures below
plt.rcParams['figure.figsize'] = (14, 6)    # default size; cells override it with figsize=

print("✓ Libraries imported")

## 1. Load Processed Data

In [None]:
# Read the three processed parquet datasets from ../outputs.
print("Loading processed datasets...")

enrolment_df = pd.read_parquet('../outputs/enrolment_processed.parquet')
demographic_df = pd.read_parquet('../outputs/demographic_processed.parquet')
biometric_df = pd.read_parquet('../outputs/biometric_processed.parquet')

# Report row counts so an empty or truncated input is visible straight away.
for dataset_label, frame in (
    ('Enrolment', enrolment_df),
    ('Demographic', demographic_df),
    ('Biometric', biometric_df),
):
    print(f"✓ {dataset_label}: {len(frame):,} records")

# Project-local helpers (importable because ../src was appended to sys.path);
# the cells below call `temporal` and `spatial` for all derived metrics.
temporal = TemporalAnalyzer()
spatial = SpatialAnalyzer()
visualizer = AadhaarVisualizer(output_dir='../outputs/figures')

print("✓ Analyzers initialized")

## 2. TEMPORAL ANALYSIS

### 2.1 Time Series Decomposition - Enrolment

In [None]:
# Daily enrolment totals indexed by date. Built as a single chain (no inplace
# mutation) so re-running the cell always starts from enrolment_df.
ts_data = (
    enrolment_df
    .groupby('date')['total_enrolments']
    .sum()
    .to_frame()
    .sort_index()
)

# Resample to weekly totals for the decomposition.
# NOTE(review): resample() needs a datetime-like index — assumes `date` is
# already a datetime column in the parquet file; confirm.
ts_weekly = ts_data.resample('W').sum()

print(f"Time series shape: {ts_weekly.shape}")
print(f"Date range: {ts_weekly.index.min()} to {ts_weekly.index.max()}")
print("\nFirst 5 weeks:")
print(ts_weekly.head())

# Seasonal decomposition via the project helper. The result is indexed by
# 'observed', 'trend', 'seasonal', 'residual', 'trend_strength' and
# 'seasonal_strength' below.
decomposition = temporal.seasonal_decompose(
    ts_weekly['total_enrolments'],
    period=4  # 4 weeks = monthly seasonality
)

# One panel per component, stacked vertically.
fig, axes = plt.subplots(4, 1, figsize=(14, 10))
fig.suptitle('Seasonal Decomposition: Enrolment Time Series (Weekly)',
             fontweight='bold', fontsize=16)

# (result key, y-axis label, line colour) for each panel, top to bottom.
component_panels = (
    ('observed', 'Observed', '#3A86FF'),
    ('trend', 'Trend', '#FB5607'),
    ('seasonal', 'Seasonal', '#8338EC'),
    ('residual', 'Residual', '#E63946'),
)
for panel, (component_key, y_label, line_colour) in zip(axes, component_panels):
    panel.plot(decomposition[component_key], color=line_colour, linewidth=2)
    panel.set_ylabel(y_label, fontweight='bold')
    panel.grid(True, alpha=0.3)

# Only the bottom panel carries the x-axis label.
axes[-1].set_xlabel('Date', fontweight='bold')

plt.tight_layout()
plt.savefig('../outputs/figures/15_temporal_decomposition_enrolment.png',
            dpi=300, bbox_inches='tight')
plt.show()

print(f"\n✓ Trend strength: {decomposition['trend_strength']:.3f}")
print(f"✓ Seasonal strength: {decomposition['seasonal_strength']:.3f}")

### 2.2 Stationarity Testing

In [None]:
# Stationarity test on the weekly enrolment series via the project helper.
# The result is read below through the keys 'adf_statistic', 'p_value',
# 'is_stationary' and 'critical_values' (a mapping of level -> threshold).
# NOTE(review): presumably an Augmented Dickey-Fuller test, judging by the key
# names — confirm in temporal_analysis.
stationarity_result = temporal.test_stationarity(ts_weekly['total_enrolments'])

print("=== STATIONARITY TEST RESULTS ===\n")
print(f"ADF Statistic: {stationarity_result['adf_statistic']:.4f}")
print(f"P-value: {stationarity_result['p_value']:.4f}")
print(f"Is Stationary: {stationarity_result['is_stationary']}")
print("\nCritical Values:")
for level, threshold in stationarity_result['critical_values'].items():
    print(f"  {level}: {threshold:.4f}")

# Plain-language verdict for readers skimming the output.
if stationarity_result['is_stationary']:
    print("\n✓ Series is STATIONARY - suitable for time series modeling")
else:
    print("\n⚠ Series is NON-STATIONARY - differencing recommended for ARIMA modeling")

### 2.3 Peak Period Identification

In [None]:
# Locate local maxima / minima of the weekly series with the project helper.
# `peaks` and `troughs` are used below as positional indexers into ts_weekly.
peaks, troughs = temporal.identify_peaks_troughs(
    ts_weekly['total_enrolments'],
    prominence=0.2
)

print("=== PEAK ANALYSIS ===\n")
print(f"Number of peaks detected: {len(peaks)}")
print(f"Number of troughs detected: {len(troughs)}")

# Weekly line with the detected extrema overlaid as markers.
fig, ax = plt.subplots(figsize=(14, 6))

ax.plot(ts_weekly.index, ts_weekly['total_enrolments'],
        color='#3A86FF', linewidth=2, label='Enrolments')

# (positions, colour, marker, legend label); a marker set is skipped when the
# helper found nothing of that kind.
extrema_styles = (
    (peaks, '#06D6A0', '^', 'Peaks'),
    (troughs, '#E63946', 'v', 'Troughs'),
)
for positions, marker_colour, marker_shape, legend_label in extrema_styles:
    if len(positions) == 0:
        continue
    ax.scatter(ts_weekly.index[positions],
               ts_weekly['total_enrolments'].iloc[positions],
               color=marker_colour, s=100, marker=marker_shape,
               label=legend_label, zorder=3)

ax.set_xlabel('Date', fontweight='bold', fontsize=12)
ax.set_ylabel('Total Enrolments', fontweight='bold', fontsize=12)
ax.set_title('Peak & Trough Detection: Weekly Enrolment Pattern',
             fontweight='bold', fontsize=14, pad=20)
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../outputs/figures/16_peak_trough_detection.png',
            dpi=300, bbox_inches='tight')
plt.show()

# The five largest peak weeks, highest first.
if len(peaks) > 0:
    peak_dates = ts_weekly.iloc[peaks].nlargest(5, 'total_enrolments')
    print("\nTop 5 Peak Periods:")
    for date, weekly_total in peak_dates['total_enrolments'].items():
        print(f"  {date.strftime('%Y-%m-%d')}: {weekly_total:,.0f} enrolments")

### 2.4 Moving Average Analysis

In [None]:
# Smooth the weekly series with 4-, 8- and 12-week moving averages
# (computed by the project helper).
weekly_totals = ts_weekly['total_enrolments']
ma_4week, ma_8week, ma_12week = (
    temporal.calculate_moving_average(weekly_totals, window=span)
    for span in (4, 8, 12)
)

# Faint raw series underneath, the smoothed curves on top.
fig, ax = plt.subplots(figsize=(14, 6))

ax.plot(ts_weekly.index, weekly_totals,
        color='#3A86FF', linewidth=1, alpha=0.5, label='Actual')
for smoothed, line_colour, legend_label in (
    (ma_4week, '#FB5607', '4-Week MA'),
    (ma_8week, '#8338EC', '8-Week MA'),
    (ma_12week, '#06D6A0', '12-Week MA'),
):
    ax.plot(ts_weekly.index, smoothed, color=line_colour, linewidth=2, label=legend_label)

ax.set_xlabel('Date', fontweight='bold', fontsize=12)
ax.set_ylabel('Total Enrolments', fontweight='bold', fontsize=12)
ax.set_title('Moving Average Smoothing: Enrolment Trends',
             fontweight='bold', fontsize=14, pad=20)
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../outputs/figures/17_moving_average_trends.png',
            dpi=300, bbox_inches='tight')
plt.show()

### 2.5 Year-over-Year Growth Analysis

In [None]:
# Monthly enrolment totals, one row per (year, month).
monthly_ts = (
    enrolment_df
    .groupby(['year', 'month'])['total_enrolments']
    .sum()
    .reset_index()
)
# First-of-month timestamp for each row (stored on monthly_ts; not read again in this cell).
monthly_ts['year_month'] = pd.to_datetime(
    monthly_ts['year'].astype(str) + '-'
    + monthly_ts['month'].astype(str).str.zfill(2) + '-01'
)

# Months as rows, years as columns, so the same month lines up across years.
yoy_pivot = monthly_ts.pivot(index='month', columns='year', values='total_enrolments')

print("=== YEAR-OVER-YEAR COMPARISON ===\n")
print(yoy_pivot)

# Growth is only defined when at least two years are present.
if yoy_pivot.shape[1] > 1:
    # Snapshot the year columns first: the loop adds growth columns to the same frame.
    years = sorted(yoy_pivot.columns)
    growth_cols = []
    for prev_year, curr_year in zip(years, years[1:]):
        growth_col = f'Growth_{prev_year}_to_{curr_year}_%'
        # A zero baseline would produce +/-inf; treat it as undefined (NaN) instead.
        baseline = yoy_pivot[prev_year].replace(0, np.nan)
        yoy_pivot[growth_col] = (yoy_pivot[curr_year] - baseline) / baseline * 100
        growth_cols.append(growth_col)

    print("\n=== GROWTH RATES ===\n")
    print(yoy_pivot[growth_cols])

## 3. SPATIAL ANALYSIS

### 3.1 Geographic Concentration Metrics

In [None]:
# Total enrolments per state, largest first.
state_totals = (
    enrolment_df
    .groupby('state')['total_enrolments']
    .sum()
    .sort_values(ascending=False)
)
national_total = state_totals.sum()

print("=== GEOGRAPHIC CONCENTRATION ===\n")
print(f"Total states covered: {len(state_totals)}")
print(f"Total enrolments: {national_total:,}\n")

# Concentration metrics are computed by the project helper on the raw per-state totals.
per_state_values = state_totals.values
gini = spatial.calculate_gini_coefficient(per_state_values)
herfindahl = spatial.calculate_herfindahl_index(per_state_values)

print(f"Gini Coefficient: {gini:.4f}")
print("  (0 = perfect equality, 1 = perfect inequality)")
print(f"\nHerfindahl Index: {herfindahl:.4f}")
print("  (Lower = more dispersed, Higher = more concentrated)")

# Share of all enrolments held by the ten largest states.
top10_share = state_totals.head(10).sum() / national_total * 100
print(f"\nTop 10 states account for: {top10_share:.1f}% of all enrolments")

# Ranked listing of the twenty largest states with their share of the total.
print("\n=== TOP 20 STATES BY ENROLMENT ===\n")
for rank, (state, count) in enumerate(state_totals.head(20).items(), start=1):
    pct = count / national_total * 100
    print(f"{rank:2d}. {state:30s}: {count:10,.0f} ({pct:5.2f}%)")

### 3.2 Per Capita Enrolment Rates

In [None]:
# Per-capita rates come from the project helper; its result is consumed below
# as a mapping of state -> rate.
# NOTE(review): the population figures are internal to SpatialAnalyzer — states
# it does not know about presumably drop out of the mapping; confirm.
per_capita_enrol = spatial.calculate_per_capita_rate(
    state_totals.to_dict(),
    metric_name='enrolments_per_100k'
)

# One row per state, highest rate first. Columns are declared explicitly so the
# frame still has them (and sort_values does not raise) when the mapping is empty.
per_capita_df = pd.DataFrame(
    [
        {'state': state,
         'total_enrolments': state_totals.get(state, 0),
         'per_capita_rate': rate}
        for state, rate in per_capita_enrol.items()
    ],
    columns=['state', 'total_enrolments', 'per_capita_rate'],
).sort_values('per_capita_rate', ascending=False)

print("=== PER CAPITA ENROLMENT RATES (per 100,000 population) ===\n")
print("Top 15 states:\n")
for _, row in per_capita_df.head(15).iterrows():
    print(f"{row['state']:30s}: {row['per_capita_rate']:8.2f} per 100k "
          f"({row['total_enrolments']:,.0f} total)")

# Horizontal bar chart of the fifteen highest rates.
fig, ax = plt.subplots(figsize=(14, 8))

top15_pc = per_capita_df.head(15)
colors = plt.cm.viridis(np.linspace(0, 1, len(top15_pc)))

bars = ax.barh(range(len(top15_pc)), top15_pc['per_capita_rate'], color=colors)
ax.set_yticks(range(len(top15_pc)))
ax.set_yticklabels(top15_pc['state'])
# barh places position 0 at the bottom; flip the axis so rank 1 is at the top.
ax.invert_yaxis()
ax.set_xlabel('Enrolments per 100,000 Population', fontweight='bold', fontsize=12)
ax.set_ylabel('State', fontweight='bold', fontsize=12)
ax.set_title('Top 15 States: Per Capita Enrolment Rates',
             fontweight='bold', fontsize=14, pad=20)
ax.grid(True, axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig('../outputs/figures/18_per_capita_enrolment_rates.png',
            dpi=300, bbox_inches='tight')
plt.show()

### 3.3 Regional Disparity Analysis

In [None]:
# Summary statistics of the per-state totals, computed by the project helper.
disparity_metrics = spatial.calculate_regional_disparity(state_totals.to_dict())

print("=== REGIONAL DISPARITY METRICS ===\n")
print(f"Coefficient of Variation: {disparity_metrics['coefficient_of_variation']:.4f}")
print("  (Higher = more disparity between states)")
print(f"\nGini Coefficient: {disparity_metrics['gini_coefficient']:.4f}")
print(f"Herfindahl Index: {disparity_metrics['herfindahl_index']:.4f}")
print(f"\nMean enrolments: {disparity_metrics['mean']:,.0f}")
for caption, metric_key in (
    ("Std deviation", 'std'),
    ("Min enrolments", 'min'),
    ("Max enrolments", 'max'),
    ("Range", 'range'),
):
    print(f"{caption}: {disparity_metrics[metric_key]:,.0f}")

# Two views of the same distribution: box plot (left) and histogram (right).
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
box_ax, hist_ax = axes

per_state_totals = state_totals.values
mean_enrolments = disparity_metrics['mean']
median_enrolments = np.median(per_state_totals)

# Left panel: spread and outliers across all states.
box_ax.boxplot([per_state_totals], vert=True, patch_artist=True,
               boxprops=dict(facecolor='#3A86FF', alpha=0.7))
box_ax.set_ylabel('Total Enrolments', fontweight='bold', fontsize=12)
box_ax.set_title('Enrolment Distribution Across States',
                 fontweight='bold', fontsize=14, pad=20)
box_ax.grid(True, alpha=0.3)
box_ax.set_xticklabels(['All States'])

# Right panel: histogram with mean and median marked as dashed lines.
hist_ax.hist(per_state_totals, bins=30, color='#8338EC', alpha=0.7, edgecolor='black')
hist_ax.axvline(mean_enrolments, color='#FB5607',
                linestyle='--', linewidth=2, label=f"Mean: {mean_enrolments:,.0f}")
hist_ax.axvline(median_enrolments, color='#06D6A0',
                linestyle='--', linewidth=2, label=f"Median: {median_enrolments:,.0f}")
hist_ax.set_xlabel('Total Enrolments', fontweight='bold', fontsize=12)
hist_ax.set_ylabel('Number of States', fontweight='bold', fontsize=12)
hist_ax.set_title('Distribution of Enrolments Across States',
                  fontweight='bold', fontsize=14, pad=20)
hist_ax.legend()
hist_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('../outputs/figures/19_regional_disparity.png',
            dpi=300, bbox_inches='tight')
plt.show()

### 3.4 Interactive Choropleth Map

In [None]:
# Map input: the per-capita table plus a `state_name` column used as the
# location key and hover label.
map_data = per_capita_df.assign(state_name=per_capita_df['state'])

# NOTE(review): locationmode='ISO-3' matches three-letter ISO country codes.
# If `state` holds sub-national names (presumably Indian states, given the
# project — confirm), no location will match and the map renders without
# coloured regions. A proper fix likely needs a state-boundary GeoJSON passed
# via `geojson=` and `featureidkey=`; that file is not visible from this
# notebook, so the call is left unchanged here.
fig = px.choropleth(
    map_data,
    locations='state_name',
    locationmode='ISO-3',  # Note: This requires proper state codes
    color='per_capita_rate',
    hover_name='state_name',
    hover_data={
        'total_enrolments': ':,.0f',
        'per_capita_rate': ':.2f'
    },
    color_continuous_scale='Viridis',
    title='Geographic Distribution: Enrolment Per Capita Rates by State',
    labels={'per_capita_rate': 'Per 100k Population'}
)

fig.update_geos(
    showcountries=True,
    showcoastlines=True,
    projection_type='natural earth'
)

fig.update_layout(
    height=600,
    font=dict(size=12),
    title_font=dict(size=16, family='Arial Black')
)

# Save a standalone HTML copy next to the static figures, then render inline.
fig.write_html('../outputs/figures/20_choropleth_map.html')
print("✓ Interactive map saved to: outputs/figures/20_choropleth_map.html")
fig.show()

### 3.5 District-Level Analysis (Top Districts)

In [None]:
# Total enrolments per (state, district), largest first. Grouping on both keys
# keeps same-named districts in different states apart.
district_totals = (
    enrolment_df
    .groupby(['state', 'district'])['total_enrolments']
    .sum()
    .sort_values(ascending=False)
)

print("=== TOP 20 DISTRICTS BY ENROLMENT ===\n")
for rank, ((state, district), count) in enumerate(district_totals.head(20).items(), 1):
    print(f"{rank:2d}. {district:25s} ({state:20s}): {count:10,.0f}")

# Horizontal bar chart of the fifteen largest districts, labelled "District (State)".
top15_districts = district_totals.head(15)
district_labels = [f"{dist} ({state})" for (state, dist) in top15_districts.index]

fig, ax = plt.subplots(figsize=(14, 8))
colors = plt.cm.plasma(np.linspace(0, 1, len(top15_districts)))

bars = ax.barh(range(len(top15_districts)), top15_districts.values, color=colors)
ax.set_yticks(range(len(top15_districts)))
ax.set_yticklabels(district_labels)
# barh places position 0 at the bottom; flip the axis so rank 1 is at the top.
ax.invert_yaxis()
ax.set_xlabel('Total Enrolments', fontweight='bold', fontsize=12)
ax.set_ylabel('District (State)', fontweight='bold', fontsize=12)
ax.set_title('Top 15 Districts by Enrolment Volume',
             fontweight='bold', fontsize=14, pad=20)
ax.grid(True, axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig('../outputs/figures/21_top_districts.png',
            dpi=300, bbox_inches='tight')
plt.show()

## 4. DEMOGRAPHIC & BIOMETRIC UPDATE PATTERNS

### 4.1 Update Type Distribution

In [None]:
# Column-wise totals for each demographic update field that is present in the data.
demo_fields = ['name_update', 'address_update', 'dob_update', 'gender_update',
               'mobile_update', 'email_update']

demo_field_totals = {
    field: demographic_df[field].sum()
    for field in demo_fields
    if field in demographic_df.columns
}

print("=== DEMOGRAPHIC UPDATE BREAKDOWN ===\n")
demo_ranked = sorted(demo_field_totals.items(), key=lambda item: item[1], reverse=True)
for field, count in demo_ranked:
    # Percentage is relative to the number of rows in demographic_df.
    pct = count / len(demographic_df) * 100
    print(f"{field:20s}: {count:10,.0f} ({pct:5.2f}%)")

# Same breakdown for the biometric update fields.
bio_fields = ['fingerprint_update', 'iris_update', 'photo_update']

bio_field_totals = {
    field: biometric_df[field].sum()
    for field in bio_fields
    if field in biometric_df.columns
}

print("\n=== BIOMETRIC UPDATE BREAKDOWN ===\n")
bio_ranked = sorted(bio_field_totals.items(), key=lambda item: item[1], reverse=True)
for field, count in bio_ranked:
    # Percentage is relative to the number of rows in biometric_df.
    pct = count / len(biometric_df) * 100
    print(f"{field:20s}: {count:10,.0f} ({pct:5.2f}%)")

# Side-by-side horizontal bars: demographic (left) and biometric (right).
# A panel is left empty when none of its fields exist in the data.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

if demo_field_totals:
    demo_df = (
        pd.DataFrame(list(demo_field_totals.items()), columns=['Field', 'Count'])
        .sort_values('Count', ascending=True)
    )

    ax1.barh(demo_df['Field'], demo_df['Count'], color='#3A86FF', alpha=0.8)
    ax1.set_xlabel('Number of Updates', fontweight='bold', fontsize=12)
    ax1.set_title('Demographic Update Types', fontweight='bold', fontsize=14, pad=20)
    ax1.grid(True, axis='x', alpha=0.3)

if bio_field_totals:
    bio_df = (
        pd.DataFrame(list(bio_field_totals.items()), columns=['Field', 'Count'])
        .sort_values('Count', ascending=True)
    )

    ax2.barh(bio_df['Field'], bio_df['Count'], color='#8338EC', alpha=0.8)
    ax2.set_xlabel('Number of Updates', fontweight='bold', fontsize=12)
    ax2.set_title('Biometric Update Types', fontweight='bold', fontsize=14, pad=20)
    ax2.grid(True, axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig('../outputs/figures/22_update_type_distribution.png',
            dpi=300, bbox_inches='tight')
plt.show()

## 5. KEY FINDINGS SUMMARY

In [None]:
# Recap of the headline numbers. This cell only prints; it relies on variables
# produced by earlier cells (decomposition, peaks, troughs, stationarity_result,
# state_totals, gini, herfindahl, top10_share, disparity_metrics,
# demo_field_totals, bio_field_totals), so run the notebook top to bottom first.
banner = "=" * 80

print(banner)
print("TEMPORAL & SPATIAL ANALYSIS - KEY FINDINGS")
print(banner)

print("\n📊 TEMPORAL PATTERNS:")
print(f"  • Trend strength: {decomposition['trend_strength']:.3f}")
print(f"  • Seasonal strength: {decomposition['seasonal_strength']:.3f}")
print(f"  • Peaks detected: {len(peaks)}")
print(f"  • Troughs detected: {len(troughs)}")
print(f"  • Stationarity: {'Yes' if stationarity_result['is_stationary'] else 'No'}")

print("\n🗺️  SPATIAL PATTERNS:")
print(f"  • States covered: {len(state_totals)}")
print(f"  • Gini coefficient: {gini:.4f}")
print(f"  • Herfindahl index: {herfindahl:.4f}")
print(f"  • Top 10 concentration: {top10_share:.1f}%")
print(f"  • Coefficient of variation: {disparity_metrics['coefficient_of_variation']:.4f}")

print("\n🔄 UPDATE PATTERNS:")
# Each line is skipped when none of that group's fields were found in the data.
if demo_field_totals:
    top_demo = max(demo_field_totals.items(), key=lambda item: item[1])
    print(f"  • Top demographic field: {top_demo[0]} ({top_demo[1]:,.0f})")
if bio_field_totals:
    top_bio = max(bio_field_totals.items(), key=lambda item: item[1])
    print(f"  • Top biometric field: {top_bio[0]} ({top_bio[1]:,.0f})")

print("\n✓ Temporal & Spatial analysis completed successfully")
print(banner)