UIDAI Aadhaar Enrolment Data Analysis

This notebook is part of the UIDAI Data Hackathon 2026.

Objective:

Analyse Aadhaar enrolment trends across age groups

Identify operational and policy-relevant insights at state, district, and pincode levels

Dataset source: UIDAI Aadhaar Enrolment dataset (Government of India)

In [1]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import numpy as np
base_dir = Path.cwd().parents[0]
raw_dir = base_dir / "data" / "raw" #keep the orignal csv files in the {base_dir}/data/raw
clean_dir = base_dir / "data" / "cleaned" #orignal might be having some duplicated data or typing errors we will fix them in new csvs and put new csvs in (base_dir}/data/cleaned
csv_files = [
    "api_data_aadhar_enrolment_0_500000.csv",
    "api_data_aadhar_enrolment_500000_1000000.csv",
    "api_data_aadhar_enrolment_1000000_1006029.csv"
]
df1 = pd.read_csv(raw_dir / csv_files[0])
df2 = pd.read_csv(raw_dir / csv_files[1])
df3 = pd.read_csv(raw_dir / csv_files[2])


In [2]:
age_cols = ["age_0_5", "age_5_17", "age_18_greater"]

In [3]:
def initial_clean(df):
    # Convert date column
    df["date"] = pd.to_datetime(df["date"], dayfirst=True)
    
    # Ensure numeric age columns
    df[age_cols] = df[age_cols].apply(pd.to_numeric, errors="coerce").fillna(0).astype(int)
    
    # Helper columns
    df["year"] = df["date"].dt.year
    df["month"] = df["date"].dt.to_period("M")
    
    # Total enrolments
    df["total_enrolments"] = df[age_cols].sum(axis=1)
    
    # Convert numeric-only state/district to NaN
    df["state"] = df["state"].where(~df["state"].astype(str).str.isdigit(), np.nan)
    df["district"] = df["district"].where(~df["district"].astype(str).str.isdigit(), np.nan)
    
    return df

df1 = initial_clean(df1)
df2 = initial_clean(df2)
df3 = initial_clean(df3)


In [4]:
df1["total_enrolments"] = df1[age_cols].sum(axis=1)
df2["total_enrolments"] = df2[age_cols].sum(axis=1)
df3["total_enrolments"] = df3[age_cols].sum(axis=1)
df_original = pd.concat([df1, df2, df3], ignore_index=True)
df_original

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,year,month,total_enrolments
0,2025-03-02,Meghalaya,East Khasi Hills,793121,11,61,37,2025,2025-03,109
1,2025-03-09,Karnataka,Bengaluru Urban,560043,14,33,39,2025,2025-03,86
2,2025-03-09,Uttar Pradesh,Kanpur Nagar,208001,29,82,12,2025,2025-03,123
3,2025-03-09,Uttar Pradesh,Aligarh,202133,62,29,15,2025,2025-03,106
4,2025-03-09,Karnataka,Bengaluru Urban,560016,14,16,21,2025,2025-03,51
...,...,...,...,...,...,...,...,...,...,...
1006024,2025-12-31,West Bengal,West Midnapore,721149,2,0,0,2025,2025-12,2
1006025,2025-12-31,West Bengal,West Midnapore,721150,2,2,0,2025,2025-12,4
1006026,2025-12-31,West Bengal,West Midnapore,721305,0,1,0,2025,2025-12,1
1006027,2025-12-31,West Bengal,West Midnapore,721504,1,0,0,2025,2025-12,1


### Did some analysis before data cleaning 

In [5]:
print(df_original.info())
print("\n\n")
print(df_original.describe())
print("\n\n")
print(df_original.columns)
print("\n\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1006029 entries, 0 to 1006028
Data columns (total 10 columns):
 #   Column            Non-Null Count    Dtype         
---  ------            --------------    -----         
 0   date              1006029 non-null  datetime64[ns]
 1   state             1006007 non-null  object        
 2   district          1006007 non-null  object        
 3   pincode           1006029 non-null  int64         
 4   age_0_5           1006029 non-null  int64         
 5   age_5_17          1006029 non-null  int64         
 6   age_18_greater    1006029 non-null  int64         
 7   year              1006029 non-null  int32         
 8   month             1006029 non-null  period[M]     
 9   total_enrolments  1006029 non-null  int64         
dtypes: datetime64[ns](1), int32(1), int64(5), object(2), period[M](1)
memory usage: 72.9+ MB
None



                                date       pincode       age_0_5  \
count                        1006029  1.00602

In [6]:
print(df_original.isnull().sum(),"\n\n")
print((df_original == "").sum(),"\n\n")
print(df_original.dtypes,"\n\n")
print((df_original['date'] == pd.to_datetime(df_original['date'], errors='coerce')),"\n\n")
print(df_original['date'].isnull().sum(),"\n\n")
print((df_original[['age_0_5','age_5_17','age_18_greater']] < 0).sum(),"\n\n")
print((df_original['total_enrolments'] == 0).sum(),"\n\n")
print(df_original.duplicated().sum(),"\n\n")

date                 0
state               22
district            22
pincode              0
age_0_5              0
age_5_17             0
age_18_greater       0
year                 0
month                0
total_enrolments     0
dtype: int64 


date                0
state               0
district            0
pincode             0
age_0_5             0
age_5_17            0
age_18_greater      0
year                0
month               0
total_enrolments    0
dtype: int64 


date                datetime64[ns]
state                       object
district                    object
pincode                      int64
age_0_5                      int64
age_5_17                     int64
age_18_greater               int64
year                         int32
month                    period[M]
total_enrolments             int64
dtype: object 


0          True
1          True
2          True
3          True
4          True
           ... 
1006024    True
1006025    True
1006026    True
1006027

In [8]:
sorted(df_original['state'].dropna().unique())

['Andaman & Nicobar Islands',
 'Andaman and Nicobar Islands',
 'Andhra Pradesh',
 'Arunachal Pradesh',
 'Assam',
 'Bihar',
 'Chandigarh',
 'Chhattisgarh',
 'Dadra & Nagar Haveli',
 'Dadra and Nagar Haveli',
 'Dadra and Nagar Haveli and Daman and Diu',
 'Daman & Diu',
 'Daman and Diu',
 'Delhi',
 'Goa',
 'Gujarat',
 'Haryana',
 'Himachal Pradesh',
 'Jammu & Kashmir',
 'Jammu And Kashmir',
 'Jammu and Kashmir',
 'Jharkhand',
 'Karnataka',
 'Kerala',
 'Ladakh',
 'Lakshadweep',
 'Madhya Pradesh',
 'Maharashtra',
 'Manipur',
 'Meghalaya',
 'Mizoram',
 'Nagaland',
 'ODISHA',
 'Odisha',
 'Orissa',
 'Pondicherry',
 'Puducherry',
 'Punjab',
 'Rajasthan',
 'Sikkim',
 'Tamil Nadu',
 'Telangana',
 'The Dadra And Nagar Haveli And Daman And Diu',
 'Tripura',
 'Uttar Pradesh',
 'Uttarakhand',
 'WEST BENGAL',
 'WESTBENGAL',
 'West  Bengal',
 'West Bangal',
 'West Bengal',
 'West bengal',
 'Westbengal',
 'andhra pradesh']

### In the csv files given we had numeric states, some states had spelling mistakes

## Below is data cleaning, required only once

### State name corrections

In [9]:
state_mapping = {
    # Dadra and Nagar Haveli and Daman and Diu 
    'Dadra & Nagar Haveli': 'Dadra and Nagar Haveli and Daman and Diu',
    'Dadra and Nagar Haveli': 'Dadra and Nagar Haveli and Daman and Diu',
    'Dadra and Nagar Haveli and Daman and Diu': 'Dadra and Nagar Haveli and Daman and Diu',
    'The Dadra And Nagar Haveli And Daman And Diu': 'Dadra and Nagar Haveli and Daman and Diu',
    'Daman & Diu': 'Dadra and Nagar Haveli and Daman and Diu',
    'Daman and Diu': 'Dadra and Nagar Haveli and Daman and Diu',

    
    # Andaman & Nicobar Islands
    'Andaman & Nicobar Islands': 'Andaman and Nicobar Islands',
    'Andaman and Nicobar Islands': 'Andaman and Nicobar Islands',
    
    # Jammu & Kashmir
    'Jammu & Kashmir': 'Jammu and Kashmir',
    'Jammu And Kashmir': 'Jammu and Kashmir',
    'Jammu and Kashmir': 'Jammu and Kashmir',
    
    # Odisha
    'ODISHA': 'Odisha',
    'Odisha': 'Odisha',
    'Orissa': 'Odisha',
    
    # Puducherry
    'Pondicherry': 'Puducherry',
    'Puducherry': 'Puducherry',
    
    # West Bengal
    'WEST BENGAL': 'West Bengal',
    'WESTBENGAL': 'West Bengal',
    'West  Bengal': 'West Bengal',
    'West Bangal': 'West Bengal',
    'West Bengal': 'West Bengal',
    'West bengal': 'West Bengal',
    'Westbengal': 'West Bengal',
    
    # Andhra Pradesh
    'andhra pradesh': 'Andhra Pradesh',
}

for df in [df1, df2, df3]:
    df.loc[:, "state"] = df["state"].replace(state_mapping)

# Correcting the district names

In [10]:
districts_by_state = (
    df_original
    .dropna(subset=["state", "district"])
    .groupby("state")["district"]
    .unique()
)
index = 0
for state, districts in districts_by_state.items():
    index += 1
    '''
    user_input = input("Press 'n' to stop, press Enter or type anything to continue: ")
    if user_input.lower() == 'n':
        print("Terminated by user.")
        break
'''
    # sort using lowercase + no-space key, but keep original display
    sorted_districts = sorted(
           districts,
           key=lambda x: x.replace(" ", "").lower()
    )

    print(f"\nState{index}. {state}\n")
    for d in sorted_districts:
        print(d)
    print("\n\n")


State1. Andaman & Nicobar Islands

Andamans
Nicobars
South Andaman




State2. Andaman and Nicobar Islands

Nicobar
North And Middle Andaman
South Andaman




State3. Andhra Pradesh

Adilabad
Alluri Sitharama Raju
Anakapalli
Anantapur
Ananthapur
Ananthapuramu
Annamayya
Bapatla
Chittoor
Cuddapah
Dr. B. R. Ambedkar Konaseema
East Godavari
Eluru
Guntur
Hyderabad
K.V.Rangareddy
K.v. Rangareddy
Kakinada
Karimnagar
Karim Nagar
Khammam
Krishna
Kurnool
Mahabub Nagar
Mahabubnagar
Mahbubnagar
Medak
N. T. R
Nalgonda
Nandyal
Nellore
Nizamabad
Palnadu
Parvathipuram Manyam
Prakasam
Rangareddi
Spsr Nellore
Srikakulam
Sri Potti Sriramulu Nellore
Sri Sathya Sai
Tirupati
Visakhapatanam
Visakhapatnam
Vizianagaram
Warangal
West Godavari
Y. S. R




State4. Arunachal Pradesh

Anjaw
Changlang
Dibang Valley
East Kameng
East Siang
Kamle
Kra Daadi
Kurung Kumey
Leparada
Lohit
Longding
Lower Dibang Valley
Lower Siang
Lower Subansiri
Namsai
Pakke Kessang
Papum Pare
Shi-yomi
Siang
Tawang
Tirap
Upper Siang
Upper S

In [11]:
district_replace_map = {
    "Andaman and Nicobar Islands": {
        "Nicobars": "Nicobar",
        "Andamans": None
    },

    "Andhra Pradesh": {
        "chittoor": "Chittoor",
        "Anantapur": "Ananthapuramu",
        "Ananthapur": "Ananthapuramu",
        "Cuddapah": "Y. S. R",
        "K.V.Rangareddy": "Rangareddi",
        "K.v. Rangareddy": "Rangareddi",
        "rangareddi": "Rangareddi",
        "Spsr Nellore": "Sri Potti Sriramulu Nellore",
        "Visakhapatanam": "Visakhapatnam",
        "Karim Nagar": "Karimnagar",
        "Mahabub Nagar": "Mahabubnagar",
        "Mahbubnagar": "Mahabubnagar",
        "N. T. R": "NTR",
        "Hyderabad": None
    },

    "Arunachal Pradesh": {
        "Shi-yomi": "Shi Yomi"
    },

    "Assam": {
        "North Cachar Hills": "Dima Hasao",
        "Sibsagar": "Sivasagar",
        "Tamulpur District": "Tamulpur"
    },

    "Bihar": {
        "Aurangabad(bh)": "Aurangabad",
        "Aurangabad(BH)": "Aurangabad",
        "Bhabua": "Kaimur",
        "Monghyr": "Munger",
        "Purba Champaran": "East Champaran",
        "Purbi Champaran": "East Champaran",
        "Pashchim Champaran": "West Champaran",
        "Samstipur": "Samastipur",
        "Sheikpura": "Sheikhpura",
        "Purnea": "Purnia"
    },

    "Chandigarh": {
        "Rupnagar": None
    },

    "Chhattisgarh": {
        "Gaurela-pendra-marwahi": "Gaurela Pendra Marwahi",
        "Janjgir-champa": "Janjgir Champa",
        "Janjgir - Champa": "Janjgir Champa",
        "Mohla-Manpur-Ambagarh Chouki": "Mohalla-Manpur-Ambagarh Chowki",
        "Manendragarh–Chirmiri–Bharatpur": "Manendragarh Chirmiri Bharatpur",
        "Uttar Bastar Kanker": "Kanker",
        "Dakshin Bastar Dantewada": "Dantewada"
    },

    "Dadra and Nagar Haveli and Daman and Diu": {
        "Dadra & Nagar Haveli": "Dadra and Nagar Haveli",
        "Dadra And Nagar Haveli": "Dadra and Nagar Haveli"
    },

    "Delhi": {
        "North East": "North East Delhi",
        "North East   *": "North East Delhi"
    },

    "Gujarat": {
        "Ahmadabad": "Ahmedabad",
        "Banas Kantha": "Banaskantha",
        "Chhotaudepur": "Chhota Udaipur",
        "Dohad": "Dahod",
        "Panch Mahals": "Panchmahals",
        "Sabar Kantha": "Sabarkantha",
        "Surendra Nagar": "Surendranagar",
        "The Dangs": "Dang"
    },

    "Haryana": {
        "Jhajjar *": "Jhajjar",
        "Yamuna Nagar": "Yamunanagar",
        "Mewat": "Nuh"
    },

    "Himachal Pradesh": {
        "Lahul & Spiti": "Lahaul and Spiti",
        "Lahul and Spiti": "Lahaul and Spiti"
    },

    "Jammu and Kashmir": {
        "Bandipore": "Bandipora",
        "Baramula": "Baramulla",
        "Punch": "Poonch",
        "punch": "Poonch",
        "Rajauri": "Rajouri",
        "Shupiyan": "Shopian",
        "Leh (ladakh)": "Leh"
    },

    "Jharkhand": {
        "Bokaro *": "Bokaro",
        "Garhwa *": "Garhwa",
        "East Singhbum": "East Singhbhum",
        "Hazaribag": "Hazaribagh",
        "Kodarma": "Koderma",
        "Pakaur": "Pakur",
        "Palamau": "Palamu",
        "Pashchimi Singhbhum": "West Singhbhum",
        "Purbi Singhbhum": "East Singhbhum",
        "Seraikela-kharsawan": "Seraikela-Kharsawan"
    },

    "Karnataka": {
        "Bagalkot *": "Bagalkot",
        "Gadag *": "Gadag",
        "Udupi *": "Udupi",
        "Chamarajanagar *": "Chamarajanagar",
        "Chamrajnagar": "Chamarajanagar",
        "Chamrajanagar": "Chamarajanagar",
        "Belgaum": "Belagavi",
        "Bellary": "Ballari",
        "Gulbarga": "Kalaburagi",
        "Bijapur": "Vijayapura",
        "Bijapur(KAR)": "Vijayapura",
        "Hasan": "Hassan",
        "Davangere": "Davanagere",
        "Shimoga": "Shivamogga",
        "Tumkur": "Tumakuru",
        "Mysore": "Mysuru",
        "yadgir": "Yadgir"
    },

    "Kerala": {
        "Kasargod": "Kasaragod"
    },

    "Ladakh": {},

    "Lakshadweep": {},

    "Madhya Pradesh": {
        "Ashok Nagar": "Ashoknagar",
        "Harda *": "Harda",
        "East Nimar": "Khandwa",
        "West Nimar": "Khargone",
        "Hoshangabad": "Narmadapuram",
        "Narsimhapur": "Narsinghpur"
    },
    
    "Maharashtra": {
        "Ahmadnagar": "Ahmednagar",
        "Ahmed Nagar": "Ahmednagar",
        "Bid": "Beed",
        "Gondiya": "Gondia",
        "Gondiya *": "Gondia",
        "Hingoli *": "Hingoli",
        "Nandurbar *": "Nandurbar",
        "Washim *": "Washim",
        "Raigarh": None,
        "Raigarh(MH)": None,
        "Mumbai( Sub Urban )": "Mumbai Suburban",
        "Chatrapati Sambhaji Nagar": "Chhatrapati Sambhajinagar",
        "Osmanabad": "Dharashiv"
    },
    
    "Manipur": {},
    
    "Meghalaya": {
        "Kamrup": None,
        "Jaintia Hills": None
    },
    
    "Mizoram": {
        "Mammit": "Mamit"
    },
    
    "Nagaland": {},
    
    "Odisha": {
        "ANGUL": "Angul",
        "Anugal": "Anugul",
        "ANUGUL": "Anugul",
        "Baleshwar": "Baleswar",
        "Baudh": "Boudh",
        "Jagatsinghapur": "Jagatsinghpur",
        "JAJPUR": "Jajpur",
        "jajpur": "Jajpur",
        "Kendrapara *": "Kendrapara",
        "Khorda": "Khordha",
        "Nabarangpur": "Nabarangapur",
        "NUAPADA": "Nuapada",
        "Sonapur": "Subarnapur",
        "Sundergarh": "Sundargarh"
    },
    
    "Puducherry": {
        "Pondicherry": "Puducherry"
    },
    
    "Punjab": {
        "Firozpur": "Ferozepur",
        "Muktsar": "Sri Muktsar Sahib",
        "Nawanshahr": "Shaheed Bhagat Singh Nagar",
        "S.A.S Nagar": "SAS Nagar (Mohali)",
        "S.A.S Nagar(Mohali)": "SAS Nagar (Mohali)"
    },
    
    "Rajasthan": {
        "Chittaurgarh": "Chittorgarh",
        "Deeg ": "Deeg",
        "Dhaulpur": "Dholpur",
        "Jalore": "Jalor",
        "Jhunjhunun": "Jhunjhunu"
    },
    
    "Sikkim": {
        "East": "East Sikkim",
        "North": "North Sikkim",
        "South": "South Sikkim",
        "West": "West Sikkim"
    },
    
    "Tamil Nadu": {
        "Kancheepuram": "Kanchipuram",
        "Namakkal   *": "Namakkal",
        "The Nilgiris": "Nilgiris",
        "Tiruvallur": "Thiruvallur",
        "Tuticorin": "Thoothukkudi",
        "Viluppuram": "Villupuram"
    },
    
    "Telangana": {
        "Jangoan": "Jangaon",
        "K.v. Rangareddy": "Ranga Reddy",
        "Medchal-malkajgiri": "Medchal Malkajgiri",
        "Medchal?malkajgiri": "Medchal Malkajgiri",
        "Medchal−malkajgiri": "Medchal Malkajgiri",
        "Rangareddy": "Ranga Reddy",
        "Warangal (urban)": "Warangal Urban",
        "Yadadri.": "Yadadri Bhuvanagiri"
    },
    
    "Tripura": {
        "Dhalai  *": "Dhalai"
    },

    "Uttar Pradesh": {
        "Bagpat": "Baghpat",
        "Bara Banki": "Barabanki",
        "Budaun": "Badaun",
        "Bulandshahr": "Bulandshahar",
        "Faizabad": "Ayodhya",
        "Jyotiba Phule Nagar": "Amroha",
        "Kushi Nagar": "Kushinagar",
        "Kushinagar *": "Kushinagar",
        "Mahrajganj": "Maharajganj",
        "Raebareli": "Rae Bareli",
        "Sant Ravidas Nagar Bhadohi": "Sant Ravidas Nagar",
        "Shrawasti": "Shravasti",
        "Siddharth Nagar": "Siddharthnagar"
    },
    
    "Uttarakhand": {
        "Hardwar": "Haridwar",
        "Garhwal": None
    },
    
    "West Bengal": {
        "24 Paraganas North": "North 24 Parganas",
        "24 Paraganas South": "South 24 Parganas",
        "Barddhaman": "Bardhaman",
        "Burdwan": "Bardhaman",
        "Coochbehar": "Cooch Behar",
        "Darjiling": "Darjeeling",
        "East Midnapore": "Purba Medinipur",
        "East Midnapur": "Purba Medinipur",
        "Haora": "Howrah",
        "Hawrah": "Howrah",
        "Hooghiy": "Hooghly",
        "HOOGHLY": "Hooghly",
        "hooghly": "Hooghly",
        "HOWRAH": "Howrah",
        "Hugli": "Hooghly",
        "Koch Bihar": "Cooch Behar",
        "KOLKATA": "Kolkata",
        "MALDA": "Malda",
        "Maldah": "Malda",
        "Medinipur": None,
        "Medinipur West": "Paschim Medinipur",
        "North Twenty Four Parganas": "North 24 Parganas",
        "Puruliya": "Purulia",
        "South 24 Pargana": "South 24 Parganas",
        "South 24 parganas": "South 24 Parganas",
        "South Twenty Four Parganas": "South 24 Parganas",
        "West Medinipur": "Paschim Medinipur",
        "West Midnapore": "Paschim Medinipur"
    }
}


def replace_districts_statewise(df, state_col="state", district_col="district"):
    for state, repl in district_replace_map.items():
        mask = df[state_col] == state
        df.loc[mask, district_col] = df.loc[mask, district_col].replace(repl)
    return df





In [12]:
df1 = replace_districts_statewise(df1)
df2 = replace_districts_statewise(df2)
df3 = replace_districts_statewise(df3)

In [13]:
def aggregate_df(df):
    return df.groupby(
        ["date", "state", "district", "pincode"], 
        dropna=False, 
        as_index=False
    ).sum(numeric_only=True)

df1 = aggregate_df(df1)
df2 = aggregate_df(df2)
df3 = aggregate_df(df3)

In [14]:
df_original = pd.concat([df1, df2, df3], ignore_index=True)
len(df_original["state"].unique())

37

## Puting the data in new csv files 

In [15]:
#run this only once on your system
clean_dir.mkdir(parents=True, exist_ok=True)

df1.to_csv(clean_dir/csv_files[0], index=False)
df2.to_csv(clean_dir/csv_files[1], index=False)
df3.to_csv(clean_dir/csv_files[2], index=False)

## Loading the cleaned data

In [16]:
df_clean = pd.concat(
    (pd.read_csv(clean_dir / f) for f in csv_files),
    ignore_index=True
)
df_clean

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,year,total_enrolments
0,2025-03-02,Meghalaya,East Khasi Hills,793121,11,61,37,2025,109
1,2025-03-09,Bihar,Bhagalpur,812005,13,40,18,2025,71
2,2025-03-09,Bihar,East Champaran,845304,18,72,12,2025,102
3,2025-03-09,Bihar,East Champaran,845418,30,48,10,2025,88
4,2025-03-09,Bihar,Madhubani,847108,18,120,22,2025,160
...,...,...,...,...,...,...,...,...,...
946623,2025-12-31,West Bengal,Uttar Dinajpur,733156,8,9,0,2025,17
946624,2025-12-31,West Bengal,Uttar Dinajpur,733201,33,26,1,2025,60
946625,2025-12-31,West Bengal,Uttar Dinajpur,733207,22,23,0,2025,45
946626,2025-12-31,West Bengal,Uttar Dinajpur,733212,2,1,0,2025,3


In [None]:
monthly_national = (
    df_clean.groupby("month", as_index=False)["total_enrolments"]
      .sum()
      .sort_values("month")
)

monthly_national

In [None]:
plt.figure(figsize=(10,5))
plt.plot(
    monthly_national["month"].astype(str),
    monthly_national["total_enrolments"],
    marker='o'
)
plt.xticks(rotation=45)
plt.xlabel("Month")
plt.ylabel("Total Aadhaar Enrolments")
plt.title("Monthly Aadhaar Enrolments in 2025 (National)")

# Force exact numbers on y-axis
plt.gca().yaxis.set_major_formatter(mtick.StrMethodFormatter('{x:,.0f}'))

plt.tight_layout()
plt.show()

In [None]:
# Group by state and sum age-group counts
state_age = (
    df_clean.groupby("state")[["age_0_5", "age_5_17", "age_18_greater"]]
      .sum()
      .sort_values("age_18_greater", ascending=False)  # optional: show states with highest late enrolments first
)

state_age

In [None]:
state_age_ratios = state_age.div(state_age.sum(axis=1), axis=0)
state_age_ratios

In [None]:
state_age.plot(
    kind="bar",
    figsize=(12,6)
)
plt.ylabel("Total Enrolments")
plt.title("Aadhaar Enrolments by Age Group per State (2025)")
plt.xticks(rotation=60)
plt.gca().yaxis.set_major_formatter(mtick.StrMethodFormatter('{x:,.0f}'))
plt.tight_layout()
plt.show()
#not looking good at all, also make the names short

In [None]:
# shorten state names (clean and readable)
state_age_short = state_age.copy()
state_age_short.index = state_age_short.index.str.replace(" ", "\n")

state_age_short.plot(
    kind="bar",
    figsize=(38, 6),
    width=0.8
)

plt.ylabel("Total Enrolments")
plt.title("Aadhaar Enrolments by Age Group per State")
plt.xticks(rotation=60)
plt.gca().yaxis.set_major_formatter(mtick.StrMethodFormatter('{x:,.0f}'))
plt.legend(title="Age Group")
plt.gca().yaxis.set_major_locator(mtick.MaxNLocator(nbins=15))
plt.tight_layout()
plt.show()
#have to shorten the names
#arrange in order of highest enrollment(total) state first

In [None]:
state_age_pct = state_age.div(state_age.sum(axis=1), axis=0) * 100

state_age_pct_short = state_age_pct.copy()
state_age_pct_short.index = state_age_pct_short.index.str.replace(" ", "\n")

state_age_pct_short.plot(
    kind="bar",
    figsize=(38, 6),
    width=0.8
)

plt.ylabel("Percentage of Total Enrolments (%)")
plt.title("Age Group Distribution of Aadhaar Enrolments by State")
plt.xticks(rotation=60)
plt.gca().yaxis.set_major_formatter(mtick.StrMethodFormatter('{x:.0f}%'))
plt.gca().yaxis.set_major_locator(mtick.MaxNLocator(nbins=15))
plt.legend(title="Age Group")
plt.tight_layout()
plt.show()
#have to shorten the names
#arrange in order of highest enrollment(total) state first

In [None]:
state_total = state_age.sum(axis=1)

state_total_short = state_total.copy()
state_total_short.index = state_total_short.index.str.replace(" ", "\n")

state_total_short.plot(
    kind="bar",
    figsize=(38, 6),
    width=0.8
)

plt.ylabel("Total Enrolments")
plt.title("Total Aadhaar Enrolments by State")
plt.xticks(rotation=60)
plt.gca().yaxis.set_major_formatter(mtick.StrMethodFormatter('{x:,.0f}'))
plt.gca().yaxis.set_major_locator(mtick.MaxNLocator(nbins=15))
plt.tight_layout()
plt.show()
#arrange in order of highest enrollment(total) state first

In [None]:
monthly_age = (
    df_clean.groupby("month")[age_cols]
      .sum()
)

monthly_age.index = monthly_age.index.astype(str)

monthly_age.plot(
    kind="bar",
    figsize=(24, 6),
    width=0.8
)

plt.ylabel("Total Enrolments")
plt.title("Monthly Aadhaar Enrolments by Age Group (National)")
plt.xticks(rotation=45)
plt.gca().yaxis.set_major_formatter(mtick.StrMethodFormatter('{x:,.0f}'))
plt.gca().yaxis.set_major_locator(mtick.MaxNLocator(nbins=15))
plt.legend(title="Age Group")
plt.tight_layout()
plt.show()
#arrange in order of highest enrollment(total) state first

In [None]:
# top 10 states by total enrolments
top_states = (
    df_clean.groupby("state")["total_enrolments"]
      .sum()
      .sort_values(ascending=False)
      .head(10)
      .index
)

top_state_age = (
    df_clean[df_clean["state"].isin(top_states)]
    .groupby("state")[age_cols]
    .sum()
)

# shorten state names
top_state_age_short = top_state_age.copy()
top_state_age_short.index = top_state_age_short.index.str.replace(" ", "\n")

top_state_age_short.plot(
    kind="bar",
    figsize=(18, 6),
    width=0.8
)

plt.ylabel("Total Enrolments")
plt.title("Top 10 States by Aadhaar Enrolments (Age Group-wise)")
plt.xticks(rotation=45)
plt.gca().yaxis.set_major_formatter(mtick.StrMethodFormatter('{x:,.0f}'))
plt.gca().yaxis.set_major_locator(mtick.MaxNLocator(nbins=15))
plt.legend(title="Age Group")
plt.tight_layout()
plt.show()
#arrange in order of highest enrollment(total) state first

In [None]:
_states = sorted(df_clean['state'].dropna().unique().tolist())
_districts = sorted(df_clean['district'].dropna().unique().tolist())
_pincodes = sorted(df_clean['pincode'].dropna().unique().astype(str).tolist())

def find_state(name):
    # case-insensitive find; returns canonical name or None
    name_lower = name.strip().lower()
    for s in _states:
        if s.lower() == name_lower:
            return s
    # try partial match
    for s in _states:
        if name_lower in s.lower():
            return s
    return None
    
def find_district(name):
    name_lower = name.strip().lower()
    for d in _districts:
        if d.lower() == name_lower:
            return d
    for d in _districts:
        if name_lower in d.lower():
            return d
    return None
    
def find_pincode(code):
    c = str(code).strip()
    return c if c in _pincodes else None

In [None]:
def age_wise_analysis():
    state_name = input("Enter State name (case-insensitive): ").strip()
    district_name = input("Enter District name (leave blank for state-level): ").strip()

    # =========================
    # STATE LEVEL
    # =========================
    if district_name == "":
        state_df = df_clean[df_clean["state"].str.lower() == state_name.lower()]

        if state_df.empty:
            print("No data found for this state.")
            return

        age_df = state_df[[
            "age_0_5",
            "age_5_17",
            "age_18_greater"
        ]].sum().reset_index()

        age_df.columns = ["Age Group", "Total Enrolments"]

        age_df["Age Group"] = age_df["Age Group"].replace({
            "age_0_5": "0–5",
            "age_5_17": "5–17",
            "age_18_greater": "18+"
        })

        age_df.plot(
            x="Age Group",
            y="Total Enrolments",
            kind="bar",
            figsize=(8, 5),
            width=0.6,
            legend=False
        )

        plt.ylabel("Total Enrolments")
        plt.title(f"Aadhaar Enrolments by Age Group — {state_name}")
        plt.gca().yaxis.set_major_formatter(mtick.StrMethodFormatter('{x:,.0f}'))
        plt.gca().yaxis.set_major_locator(mtick.MaxNLocator(nbins=12))
        plt.tight_layout()
        plt.show()

    # =========================
    # DISTRICT LEVEL
    # =========================
    else:
        district_df = df_clean[
            (df_clean["state"].str.lower() == state_name.lower()) &
            (df_clean["district"].str.lower() == district_name.lower())
        ]

        if district_df.empty:
            print("No data found for this state/district.")
            return

        age_df = district_df[[
            "age_0_5",
            "age_5_17",
            "age_18_greater"
        ]].sum().reset_index()

        age_df.columns = ["Age Group", "Total Enrolments"]

        age_df["Age Group"] = age_df["Age Group"].replace({
            "age_0_5": "0–5",
            "age_5_17": "5–17",
            "age_18_greater": "18+"
        })

        age_df.plot(
            x="Age Group",
            y="Total Enrolments",
            kind="bar",
            figsize=(8, 5),
            width=0.6,
            legend=False
        )

        plt.ylabel("Total Enrolments")
        plt.title(f"Age-wise Aadhaar Enrolments — {district_name}, {state_name}")
        plt.gca().yaxis.set_major_formatter(mtick.StrMethodFormatter('{x:,.0f}'))
        plt.gca().yaxis.set_major_locator(mtick.MaxNLocator(nbins=12))
        plt.tight_layout()
        plt.show()
        #arrange in order of highest enrollment(total) state first

In [None]:
def enrolment_trend_analysis():
    raw = input("Enter District name OR Pincode: ").strip()

    # =========================
    # PINCODE LEVEL
    # =========================
    if raw.isdigit():
        pin_df = df[df["pincode"].astype(str) == raw]

        if pin_df.empty:
            print("No data found for this pincode.")
            return

        total = pin_df["total_enrolments"].sum()

        plt.figure(figsize=(6, 4))
        plt.bar(["Total Enrolments"], [total], width=0.5)

        plt.ylabel("Total Enrolments")
        plt.title(f"Aadhaar Enrolments — Pincode {raw}")
        plt.gca().yaxis.set_major_formatter(mtick.StrMethodFormatter('{x:,.0f}'))
        plt.gca().yaxis.set_major_locator(mtick.MaxNLocator(nbins=10))
        plt.tight_layout()
        plt.show()

    # =========================
    # DISTRICT LEVEL
    # =========================
    else:
        district_df = df_clean[df_clean["district"].str.lower() == raw.lower()]

        if district_df.empty:
            print("No data found for this district.")
            return

        plot_df = (
            district_df.groupby("month")["total_enrolments"]
            .sum()
            .reset_index()
            .sort_values("month")
        )

        plot_df_plot = plot_df.set_index(plot_df["month"].astype(str))
        plot_df_plot["total_enrolments"].plot(
            kind="bar",
            figsize=(12, 4),
            width=0.7
        )

        plt.ylabel("Total Enrolments")
        plt.title(f"Monthly Enrolments — {raw}")
        plt.gca().yaxis.set_major_formatter(mtick.StrMethodFormatter('{x:,.0f}'))
        plt.gca().yaxis.set_major_locator(mtick.MaxNLocator(nbins=8))
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

In [None]:
age_wise_analysis()

In [None]:
age_wise_analysis()

In [None]:
enrolment_trend_analysis()

In [None]:
enrolment_trend_analysis()

In [None]:
enrolment_trend_analysis()