In [2]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from google.colab import files
import io

In [3]:
import pandas as pd
import os

# --- EDIT YOUR FILE PATHS HERE ---
# CSV inputs for the notebook: both constants are passed to check_overlap and
# run_biometric_compliance_gap further down.
# NOTE(review): the /content/drive prefix looks like a mounted Google Drive in
# Colab; no mount call is visible in this notebook — confirm the drive is
# mounted before running this cell.
ENROLMENT_FILE = '/content/drive/MyDrive/UDAI/Clean_DataSet/structured_Unified_DataSet/Aadhar_enrolment_Str.csv'
BIOMETRIC_FILE = '/content/drive/MyDrive/UDAI/Clean_DataSet/structured_Unified_DataSet/Aadhar_biometric_Str.csv'

def check_overlap(enrol_path, bio_path):
    """Audit how many district names the enrolment and biometric CSVs share.

    Column headers of both files are stripped and lower-cased, then the
    ``district`` values are stripped and lower-cased before the two sets of
    names are compared. Missing and blank district values are ignored so they
    are not counted as a district. The counts are printed to stdout.

    Args:
        enrol_path: Path to the enrolment CSV. Must contain a ``district``
            column (header matched after strip/lower-case).
        bio_path: Path to the biometric CSV, with the same requirement.

    Returns:
        None. When either path does not exist, an error message is printed
        and the function returns without reading anything.

    Raises:
        KeyError: If a file has no ``district`` column.
    """
    if not (os.path.exists(enrol_path) and os.path.exists(bio_path)):
        print("Error: One or both files not found. Check your paths.")
        return

    def _district_names(csv_path):
        # Load one CSV and return its set of normalised district names.
        frame = pd.read_csv(csv_path)
        frame.columns = frame.columns.str.strip().str.lower()
        # dropna() first: an empty CSV field is read as NaN, and a NaN left in
        # the set would be counted as an extra "district".
        names = frame['district'].dropna().astype(str).str.strip().str.lower()
        # Whitespace-only values collapse to '' after strip(); skip those too.
        return set(names[names != ''])

    enrol_dists = _district_names(enrol_path)
    bio_dists = _district_names(bio_path)
    common = enrol_dists & bio_dists

    print("--- GEOGRAPHIC OVERLAP AUDIT ---")
    print(f"Districts in Enrolment: {len(enrol_dists)}")
    print(f"Districts in Biometric: {len(bio_dists)}")
    print(f"Common Districts Match: {len(common)}")

    if common:
        print(f"\nSUCCESS: Found {len(common)} matches. You can proceed to Gap Analysis.")
    else:
        print("\nWARNING: Zero matches found. Check for spelling differences (e.g. 'Gurgaon' vs 'Gurugram').")

# Kick off the district-overlap audit on the two configured CSV paths
check_overlap(enrol_path=ENROLMENT_FILE, bio_path=BIOMETRIC_FILE)

--- GEOGRAPHIC OVERLAP AUDIT ---
Districts in Enrolment: 916
Districts in Biometric: 900
Common Districts Match: 886

SUCCESS: Found 886 matches. You can proceed to Gap Analysis.


In [4]:
def run_biometric_compliance_gap(enrol_file, bio_file):
    """Compare age 5-17 biometric updates with new enrolments per district.

    Both CSVs are loaded, ``age_5_17`` (enrolment file) and ``bio_age_5_17``
    (biometric file) are summed per ``(state, district)``, and the two
    aggregates are inner-joined. For every joined district the function
    computes ``maintenance_updates / (new_enrolments + 0.1)``, labels the
    result with a risk category, shows a bar chart, and returns the table.

    Args:
        enrol_file: Path to the enrolment CSV. After header normalisation it
            must contain ``state``, ``district`` and ``age_5_17``.
        bio_file: Path to the biometric CSV. After header normalisation it
            must contain ``state``, ``district`` and ``bio_age_5_17``.

    Returns:
        pandas.DataFrame sorted by ascending ``compliance_ratio`` with the
        columns ``state``, ``district``, ``new_enrolments``,
        ``maintenance_updates``, ``compliance_ratio`` and ``gap_status``.
        Districts present in only one file are dropped by the inner join.
    """
    # Added to the denominator so a district with zero enrolments does not
    # divide by zero.
    smoothing = 0.1
    # Category boundaries applied by assess_risk below.
    low_ratio = 0.5
    high_ratio = 2.0
    join_keys = ['state', 'district']

    # 1. LOAD AND NORMALIZE
    # Only the column headers are normalised here; the state/district values
    # themselves are joined as they appear in the files.
    # NOTE(review): check_overlap compares stripped/lower-cased district names,
    # so its match count can differ from the rows this join keeps — confirm
    # the input values are already normalised.
    df_e = pd.read_csv(enrol_file)
    df_b = pd.read_csv(bio_file)

    df_e.columns = df_e.columns.str.strip().str.lower()
    df_b.columns = df_b.columns.str.strip().str.lower()

    # 2. AGGREGATE COHORTS (Age 5-17)
    # New enrolments per district
    e_grp = (
        df_e.groupby(join_keys)['age_5_17']
        .sum()
        .reset_index()
        .rename(columns={'age_5_17': 'new_enrolments'})
    )

    # Biometric updates per district
    b_grp = (
        df_b.groupby(join_keys)['bio_age_5_17']
        .sum()
        .reset_index()
        .rename(columns={'bio_age_5_17': 'maintenance_updates'})
    )

    # 3. GAP CALCULATION (Inner Join)
    gap_df = pd.merge(e_grp, b_grp, on=join_keys, how='inner')

    # Policy Metric: Maintenance-to-Enrolment Ratio
    # This tells us: "For every 1 child we add, how many existing children are we maintaining?"
    gap_df['compliance_ratio'] = gap_df['maintenance_updates'] / (gap_df['new_enrolments'] + smoothing)

    # 4. CATEGORIZE GAP SEVERITY
    # Ratio < 0.5: District is focused on growth but ignoring existing child stock (Risk: Deactivation)
    # Ratio > 2.0: District is heavily burdened with maintenance (Risk: Long Queues for new users)
    def assess_risk(ratio):
        if ratio < low_ratio:
            return 'High Risk: Maintenance Gap'
        if ratio > high_ratio:
            return 'Operational Strain: Backlog Heavy'
        return 'Healthy Balanced Service'

    gap_df['gap_status'] = gap_df['compliance_ratio'].apply(assess_risk)

    # 5. DATA VISUALIZATION
    # Sorting by most vulnerable (lowest ratio)
    gap_df = gap_df.sort_values(by='compliance_ratio')

    # District names are not unique across states. Plotting the bare name
    # would put same-named districts on one x position, where the bars are
    # stacked together, so the chart uses a "district (state)" label instead.
    # The label lives on a plotting copy only; the returned frame is unchanged.
    plot_df = gap_df.assign(
        district_label=gap_df['district'].astype(str) + ' (' + gap_df['state'].astype(str) + ')'
    )

    fig = px.bar(
        plot_df,
        x='district_label',
        y='compliance_ratio',
        color='gap_status',
        hover_data=['state', 'new_enrolments', 'maintenance_updates'],
        title='<b>Gap Analysis #1: Biometric Maintenance vs. New Enrolment (Age 5-17)</b>',
        labels={
            'compliance_ratio': 'Updates per New Enrolment',
            'district_label': 'District',
        },
        color_discrete_map={
            'High Risk: Maintenance Gap': '#d62728',  # Red
            'Operational Strain: Backlog Heavy': '#ff7f0e',  # Orange
            'Healthy Balanced Service': '#2ca02c'   # Green
        }
    )

    # Colouring splits the bars into one trace per category, which would
    # otherwise order the x axis trace by trace. Pin the axis to the sorted
    # order so the lowest ratios really appear first.
    fig.update_xaxes(
        categoryorder='array',
        categoryarray=plot_df['district_label'].tolist(),
    )

    # Add target benchmark line
    fig.add_hline(y=1.0, line_dash="dot", annotation_text="Ideal Policy Balance (1:1)", annotation_position="top left")

    fig.update_layout(
        template='plotly_white',
        xaxis_tickangle=-45,
        height=600
    )

    fig.show()

    # 6. RETURN SUMMARY DATA FOR REPORTING
    return gap_df

# Run the district-level gap analysis on the configured CSVs
gap_report = run_biometric_compliance_gap(enrol_file=ENROLMENT_FILE, bio_file=BIOMETRIC_FILE)

# Short text summary: the frame comes back sorted by ascending ratio, so the
# first five rows are the five lowest ratios.
summary_columns = ['state', 'district', 'compliance_ratio']
print("\n--- TOP 5 DISTRICTS WITH HIGHEST MAINTENANCE GAP ---")
print(gap_report.loc[:, summary_columns].head(5))


--- TOP 5 DISTRICTS WITH HIGHEST MAINTENANCE GAP ---
                 state       district  compliance_ratio
880        West Bengal  east midnapur               0.0
295  Jammu and Kashmir       bandipur               0.0
564             Odisha         anugal               0.0
633          Rajasthan        balotra               0.0
675             Sikkim         namchi               0.0


In [5]:
def create_statewise_gap_heatmap(gap_df):
    """
    Roll the district-level gap table up to states and chart the result.

    Enrolment and update counts are summed per state and the ratio is
    recomputed from those totals as updates / (enrolments + 0.1). States are
    drawn as horizontal bars, ordered by that ratio and coloured on a
    red-yellow-green scale, with a dashed reference line at 1.0. The figure
    is shown; nothing is returned.
    """

    # 1. State totals of the two count columns
    count_columns = ['new_enrolments', 'maintenance_updates']
    state_totals = gap_df.groupby('state')[count_columns].sum().reset_index()

    # 2. Ratio from the state totals (not an average of district ratios)
    smoothed_enrolments = state_totals['new_enrolments'] + 0.1
    state_totals['state_compliance_ratio'] = state_totals['maintenance_updates'] / smoothed_enrolments

    # 3. Horizontal bars, lowest ratio first in the sorted frame
    # RdYlGn runs red -> yellow -> green, so low ratios are red and high
    # ratios are green.
    ranked_states = state_totals.sort_values('state_compliance_ratio')
    axis_labels = {
        'state_compliance_ratio': 'Maintenance Ratio (Updates per Enrolment)',
        'state': 'State',
    }
    fig = px.bar(
        ranked_states,
        x='state_compliance_ratio',
        y='state',
        orientation='h',
        color='state_compliance_ratio',
        color_continuous_scale='RdYlGn',
        title='<b>National Policy Heat Map: State-wise Biometric Compliance</b>',
        labels=axis_labels,
        template='plotly_white',
    )

    # Reference line at a ratio of 1.0
    fig.add_vline(x=1.0, line_dash="dash", line_color="black", annotation_text="Target Balance")

    fig.update_layout(coloraxis_colorbar=dict(title="Compliance Level"), height=700)

    fig.show()

# --- EXECUTION ---
# gap_report is the DataFrame returned by run_biometric_compliance_gap above
create_statewise_gap_heatmap(gap_df=gap_report)

In [6]:
# --- RUN THIS TO GET YOUR SPECIFIC FINDINGS ---
# Number of lowest-ratio districts to list. The heading below is derived from
# the rows actually printed so the label and the list cannot disagree.
TOP_N = 10

print("--- DATA FOR GAP ANALYSIS #1 (BIOMETRIC) ---")
# gap_report is the frame returned by run_biometric_compliance_gap
top_mbu_gap = gap_report.sort_values('compliance_ratio').head(TOP_N)
print(f"Top {len(top_mbu_gap)} High-Risk Districts (MBU Gap):")
for _, row in top_mbu_gap.iterrows():
    print(f"- {row['district'].title()} ({row['state'].title()}): Ratio {row['compliance_ratio']:.2f}")

--- DATA FOR GAP ANALYSIS #1 (BIOMETRIC) ---
Top 3 High-Risk Districts (MBU Gap):
- East Midnapur (West Bengal): Ratio 0.00
- Bandipur (Jammu And Kashmir): Ratio 0.00
- Balotra (Rajasthan): Ratio 0.00
- Salumbar (Rajasthan): Ratio 0.00
- Tiruvarur (Tamil Nadu): Ratio 0.00
- Namchi (Sikkim): Ratio 0.00
- Anugal (Odisha): Ratio 0.00
- Pashchim Champaran (Bihar): Ratio 0.00
- Banas Kantha (Gujarat): Ratio 0.01
- Eastern West Khasi Hills (Meghalaya): Ratio 0.01


In [8]:
# Final export for Gap 1: keep the two key columns and the ratio, with the
# state/district names trimmed and lower-cased.
gap1_export = gap_report.loc[:, ['state', 'district', 'compliance_ratio']].copy()
for key_column in ('state', 'district'):
    gap1_export[key_column] = gap1_export[key_column].str.strip().str.lower()


gap1_export.to_csv('/content/drive/MyDrive/UDAI/Collab /Results/gap1_results.csv', index=False)
print("Gap 1 Exported successfully as gap1_results.csv")

Gap 1 Exported successfully as gap1_results.csv


# 🔍 Data-Driven Insights: Gap Analysis #1 (Biometric Compliance)

Based on the **2025 Aadhaar lifecycle data**, we have identified a critical imbalance between "Identity Creation" and "Identity Maintenance." The following insights highlight where the digital safety net is at its thinnest.

---

### 🚨 1. The "Zero-Maintenance" Hotspots
Our analysis identified several districts with a **Compliance Ratio of 0.00**.

* **The Evidence:** Districts such as **East Midnapur (West Bengal)**, **Bandipur (J&K)**, **Balotra (Rajasthan)**, and **Pashchim Champaran (Bihar)** show active new enrolments but **zero** mandatory biometric updates for the 5-17 age group.
* **The Verdict:** In these zones, the infrastructure is 100% focused on growth. Existing children are being "digitally abandoned," as their mandatory age-5/15 updates are completely stalled.

### ⚠️ 2. Regional Clusters of "Digital Decay"
The data reveals systemic compliance failures in specific state corridors, particularly in **Gujarat** and **Bihar**.

* **The Evidence:**
    * **Gujarat:** A high-risk cluster including **Banas Kantha (0.01)**, **Sabar Kantha (0.05)**, and **Panch Mahals (0.05)**.
    * **Bihar:** Significant lags in **Bhabua (0.23)**, **Monghyr (0.25)**, and **Sheikpura (0.28)**.
* **The Verdict:** These ratios (below 0.3) indicate that for every 10 children newly enrolled, fewer than 3 biometric updates were recorded for existing children. This creates a "Time Bomb" of authentication failures for school and welfare services.

### ✅ 3. Balanced vs. Maintenance-Heavy Zones
A few districts in **Uttar Pradesh** and the **North East** serve as benchmarks for a healthy system.

* **The Evidence:** **Barabanki (0.86)** and **Raebareli (0.97)** are approaching the healthy 1:1 equilibrium. **Mahrajganj (1.21)** has actually crossed into a "Maintenance-Heavy" phase, clearing a backlog of old records.
* **The Verdict:** These districts prove that with proper resource allocation, a balanced identity lifecycle is achievable.

---

## 🛠️ Data-Backed Policy Action Plan

| Risk Category | Key Districts | Recommended Policy Action |
| :--- | :--- | :--- |
| **🚨 CRITICAL (0.00)** | East Midnapur, Bandipur, Pashchim Champaran | **Immediate Freeze** on new drives; 100% shift to Biometric Update Camps. |
| **⚠️ HIGH RISK (<0.3)** | Banas Kantha, Sabar Kantha, Bhabua | **Mandatory SMS Campaign** targeting parents of children aged 5-17. |
| **🟡 STABLE (<1.0)** | Barabanki, West Medinipur, Noklak | **Routine Monitoring;** Ensure hardware availability for existing demand. |
| **🟢 MAINTENANCE (>1.0)** | Mahrajganj, West Karbi Anglong | **System Benchmark:** Study these for "Best Practice" rollout. |

---
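**Note:** *Compliance Ratio = (Biometric Updates Age 5-17) / (New Enrolments Age 5-17 + 0.1). The code adds 0.1 to the denominator, which avoids division by zero when a district has no new enrolments. A ratio of 1.0 represents a perfectly balanced ecosystem.*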