# UIDAI Data Hackathon 2026  
## Step-by-Step System-Level Analysis


In [1]:
import pandas as pd
import os
import glob

## STEP 1: Data Loading, Integration, and Validation


In [2]:
def load_and_merge_csvs(folder_path):
    """
    Read every ``*.csv`` file directly inside ``folder_path`` and stack them
    into one DataFrame with a fresh 0..n-1 index.

    Files are read in sorted path order so the row order of the result does
    not depend on the order in which the filesystem lists the folder.

    Raises:
        ValueError: if the folder contains no CSV files.
    """
    files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))
    print(f"Loading {len(files)} files from {folder_path}")

    # pd.concat([]) also raises ValueError, but with a message that does not
    # mention the folder; fail early with the path instead.
    if not files:
        raise ValueError(f"No CSV files found in {folder_path!r}")

    return pd.concat(
        [pd.read_csv(path) for path in files],
        ignore_index=True,
    )


In [3]:
# One folder of CSV files per dataset; each folder is stacked into a single frame.
biometric_df = load_and_merge_csvs("datasets/api_data_aadhar_biometric")
demographic_df = load_and_merge_csvs("datasets/api_data_aadhar_demographic")
enrolment_df = load_and_merge_csvs("datasets/api_data_aadhar_enrolment")


Loading 4 files from datasets/api_data_aadhar_biometric
Loading 5 files from datasets/api_data_aadhar_demographic
Loading 3 files from datasets/api_data_aadhar_enrolment


In [4]:
# Report (rows, columns) of each merged frame.
for label, frame in (
    ("Biometric:", biometric_df),
    ("Demographic:", demographic_df),
    ("Enrolment:", enrolment_df),
):
    print(label, frame.shape)

Biometric: (1861108, 6)
Demographic: (2071700, 6)
Enrolment: (1006029, 7)


In [5]:
# Parse the raw "date" column in place; values that cannot be parsed become
# NaT because of errors="coerce".
# NOTE(review): the recorded output shows a warning raised from this line for
# each frame, presumably about date-format inference. Confirm the source
# format and pass it explicitly via `format=`.
for frame in (biometric_df, demographic_df, enrolment_df):
    frame["date"] = pd.to_datetime(frame["date"], errors="coerce")


  df["date"] = pd.to_datetime(df["date"], errors="coerce")
  df["date"] = pd.to_datetime(df["date"], errors="coerce")
  df["date"] = pd.to_datetime(df["date"], errors="coerce")


In [6]:
# Number of NaT dates per frame, in the order biometric, demographic, enrolment.
biometric_df["date"].isna().sum(), demographic_df["date"].isna().sum(), enrolment_df["date"].isna().sum()


(np.int64(0), np.int64(0), np.int64(0))

In [7]:
def check_negatives(df, cols):
    """Print, for each column named in ``cols``, how many of its values are below zero."""
    for name in cols:
        negative_count = df[name].lt(0).sum()
        print(name, negative_count)

# Run the negative-value check on the count columns of each dataset.
for frame, count_cols in (
    (enrolment_df, ["age_0_5", "age_5_17", "age_18_greater"]),
    (demographic_df, ["demo_age_5_17", "demo_age_17_"]),
    (biometric_df, ["bio_age_5_17", "bio_age_17_"]),
):
    check_negatives(frame, count_cols)


age_0_5 0
age_5_17 0
age_18_greater 0
demo_age_5_17 0
demo_age_17_ 0
bio_age_5_17 0
bio_age_17_ 0


In [8]:
def geo_nulls(df):
    """Return the number of missing values in the state, district and pincode columns."""
    geo_columns = ["state", "district", "pincode"]
    missing = df[geo_columns].isna()
    return missing.sum()

# Missing-geography counts, printed in the order enrolment, demographic, biometric.
for frame in (enrolment_df, demographic_df, biometric_df):
    print(geo_nulls(frame))


state       0
district    0
pincode     0
dtype: int64
state       0
district    0
pincode     0
dtype: int64
state       0
district    0
pincode     0
dtype: int64


In [9]:
# Add a monthly period column derived from the parsed date; it is the time
# key for the monthly aggregation in Step 2.
for frame in (biometric_df, demographic_df, enrolment_df):
    frame["year_month"] = frame["date"].dt.to_period("M")


In [10]:
# Create the output folder before writing; exist_ok=True makes this cell safe to re-run.
os.makedirs("outputs", exist_ok=True)


In [11]:
# Write each validated frame to the outputs folder without the row index.
for frame, target in (
    (biometric_df, "outputs/biometric_clean.csv"),
    (demographic_df, "outputs/demographic_clean.csv"),
    (enrolment_df, "outputs/enrolment_clean.csv"),
):
    frame.to_csv(target, index=False)


## STEP 2: Aggregation and Indicator Construction


In [13]:
# Display settings only: show up to 50 columns and render floats with four
# decimal places. These do not change the stored values.
pd.set_option("display.max_columns", 50)
pd.set_option("display.float_format", "{:.4f}".format)


In [14]:
# Grouping keys used by aggregate_monthly and by the merge that builds
# combined_df: geography is state + district, time is the monthly period.
GEO_COLS = ["state", "district"]
TIME_COL = "year_month"


In [15]:
def aggregate_monthly(df, sum_cols):
    """
    Sum ``sum_cols`` within every state/district/month group of ``df``.

    The grouping keys come from the notebook-level ``GEO_COLS`` and
    ``TIME_COL`` and are returned as ordinary columns (``as_index=False``).
    """
    group_keys = GEO_COLS + [TIME_COL]
    grouped = df.groupby(group_keys, as_index=False)
    return grouped[sum_cols].sum()


In [16]:
# District-month enrolment counts for each age band.
enrolment_dist_monthly = aggregate_monthly(enrolment_df, sum_cols=["age_0_5", "age_5_17", "age_18_greater"])

In [17]:
# Total enrolment is the sum of the three age bands.
enrolment_dist_monthly["total_enrolment"] = (
    enrolment_dist_monthly["age_0_5"]
    .add(enrolment_dist_monthly["age_5_17"])
    .add(enrolment_dist_monthly["age_18_greater"])
)


In [18]:
# District-month demographic-update counts for each age band.
demo_dist_monthly = aggregate_monthly(demographic_df, sum_cols=["demo_age_5_17", "demo_age_17_"])


In [19]:
# Total demographic updates is the sum of both age bands.
demo_dist_monthly["total_demo_updates"] = demo_dist_monthly["demo_age_5_17"].add(
    demo_dist_monthly["demo_age_17_"]
)


In [20]:
# District-month biometric-update counts for each age band.
bio_dist_monthly = aggregate_monthly(biometric_df, sum_cols=["bio_age_5_17", "bio_age_17_"])


In [21]:
# Total biometric updates is the sum of both age bands.
bio_dist_monthly["total_bio_updates"] = bio_dist_monthly["bio_age_5_17"].add(
    bio_dist_monthly["bio_age_17_"]
)


In [22]:
# Enrolment is the left side of both joins, so only district-months that have
# enrolment rows are kept. Update columns are NaN where no matching
# demographic or biometric row exists.
merge_keys = GEO_COLS + [TIME_COL]
combined_df = enrolment_dist_monthly.merge(demo_dist_monthly, on=merge_keys, how="left")
combined_df = combined_df.merge(bio_dist_monthly, on=merge_keys, how="left")


In [23]:
# A district-month with no matching update rows is treated as zero updates.
combined_df = combined_df.fillna(0)


In [25]:
# Denominators for the ratio indicators: a zero count is replaced by 1 so the
# division is always defined. Consequence: where the real denominator is 0,
# the ratio equals the raw numerator count rather than a rate.
combined_df["safe_total_enrolment"] = combined_df["total_enrolment"].replace(0, 1)
combined_df["safe_adult_population"] = combined_df["age_18_greater"].replace(0, 1)


In [26]:
# All updates (demographic + biometric) per enrolment in the same district-month.
combined_df["update_pressure_index"] = (
    combined_df["total_demo_updates"]
    .add(combined_df["total_bio_updates"])
    .div(combined_df["safe_total_enrolment"])
)


In [27]:
# Demographic updates per enrolment in the same district-month.
combined_df["identity_volatility_ratio"] = combined_df["total_demo_updates"].div(
    combined_df["safe_total_enrolment"]
)


In [28]:
# Biometric updates in the 17+ band per adult (18+) enrolment in the same district-month.
combined_df["biometric_alignment_score"] = combined_df["bio_age_17_"].div(
    combined_df["safe_adult_population"]
)


In [29]:
# Log-normalized versions to reduce skew
import numpy as np

# log(1 + x) keeps zero ratios at zero while compressing the long right tail.
combined_df["log_update_pressure"] = combined_df["update_pressure_index"].pipe(np.log1p)
combined_df["log_identity_volatility"] = combined_df["identity_volatility_ratio"].pipe(np.log1p)


In [30]:
# Order rows by geography, then chronologically within each district.
combined_df = combined_df.sort_values(by=["state", "district", "year_month"])


In [31]:
# Persist the Step 2 indicator table; the folder is created on first run.
step2_dir = "outputs/step2"
os.makedirs(step2_dir, exist_ok=True)
combined_df.to_csv("outputs/step2/combined_district_monthly_indicators.csv", index=False)


## Fixing: State and District Name Standardization

In [34]:
import pandas as pd
import numpy as np
import os

# Reload the Step 2 export from disk so the name clean-up below works on the saved table.
df = pd.read_csv("outputs/step2/combined_district_monthly_indicators.csv")

df.head()


Unnamed: 0,state,district,year_month,age_0_5,age_5_17,age_18_greater,total_enrolment,demo_age_5_17,demo_age_17_,total_demo_updates,bio_age_5_17,bio_age_17_,total_bio_updates,safe_total_enrolment,safe_adult_population,update_pressure_index,identity_volatility_ratio,biometric_alignment_score,log_update_pressure,log_identity_volatility
0,100000,100000,2025-09,0,0,12,12,0.0,0.0,0.0,0.0,0.0,0.0,12,12,0.0,0.0,0.0,0.0,0.0
1,100000,100000,2025-10,0,1,0,1,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0.0,0.0,0.0,0.0,0.0
2,100000,100000,2025-11,0,0,11,11,0.0,0.0,0.0,0.0,0.0,0.0,11,11,0.0,0.0,0.0,0.0,0.0
3,100000,100000,2025-12,0,0,194,194,0.0,2.0,2.0,0.0,0.0,0.0,194,194,0.0103,0.0103,0.0,0.0103,0.0103
4,Andaman & Nicobar Islands,Andamans,2025-09,23,4,0,27,3.0,159.0,162.0,76.0,241.0,317.0,27,1,17.7407,6.0,241.0,2.9307,1.9459


In [35]:
# Cast to text first (some state values are numeric codes), then trim and title-case.
state_text = df["state"].astype(str)
df["state"] = state_text.str.strip().str.title()


In [40]:

def normalize_state(text):
    """
    Build a lookup key from a state name.

    The key is lower-cased, has "&" spelled out as "and", has dots removed,
    and has every run of whitespace collapsed to a single space (leading and
    trailing whitespace is dropped). Missing values are returned unchanged;
    other non-string values are converted with ``str`` first.
    """
    if pd.isna(text):
        return text
    cleaned = (
        str(text).lower()
        # Pad "and" so "a&b" does not become "aandb"; the extra spaces are
        # collapsed by the split/join below.
        .replace("&", " and ")
        .replace(".", "")
    )
    return " ".join(cleaned.split())

# Lookup key for each row's state, used to match against state_mapping.
df["state_norm"] = df["state"].map(normalize_state)


In [41]:
# Normalized state key (see normalize_state) -> canonical state/UT name.
# Keys must already be lower-case with "&" written as "and".
state_mapping = {
    # Odisha
    "odisha": "Odisha",
    "orissa": "Odisha",

    # West Bengal
    "west bengal": "West Bengal",
    "west bangal": "West Bengal",
    "westbengal": "West Bengal",
    "west  bengal": "West Bengal",  # double space, as seen in the raw data

    # Jammu & Kashmir
    "jammu and kashmir": "Jammu & Kashmir",

    # Andaman & Nicobar
    "andaman and nicobar islands": "Andaman & Nicobar Islands",

    # Puducherry
    "puducherry": "Puducherry",
    "pondicherry": "Puducherry",

    # Dadra & Nagar Haveli and Daman & Diu (merged UT); the two former UTs
    # are folded into the merged name.
    "dadra and nagar haveli and daman and diu":
        "Dadra & Nagar Haveli and Daman & Diu",
    "the dadra and nagar haveli and daman and diu":
        "Dadra & Nagar Haveli and Daman & Diu",
    "dadra and nagar haveli":
        "Dadra & Nagar Haveli and Daman & Diu",
    "daman and diu":
        "Dadra & Nagar Haveli and Daman & Diu",

    # Andhra Pradesh (safety)
    "andhra pradesh": "Andhra Pradesh",

    # Telangana
    "telangana": "Telangana",
}


In [42]:
# Use the canonical name where the key is mapped; otherwise keep the existing state value.
mapped_state = df["state_norm"].map(state_mapping)
df["state_clean"] = mapped_state.fillna(df["state"])


In [43]:
# Drop rows whose state is the numeric code "100000" rather than a state name.
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not write into a slice of the original
# (SettingWithCopyWarning).
df = df.loc[~df["state_clean"].isin(["100000"])].copy()


In [44]:
# Replace state with its cleaned value and drop the two helper columns.
df = (
    df
    .assign(state=df["state_clean"])
    .drop(columns=["state_norm", "state_clean"])
)


In [46]:
def normalize_district(text):
    """
    Build a lookup key from a district name.

    The key is lower-cased, has "&" spelled out as "and", has dots removed,
    has hyphens turned into spaces, and has every run of whitespace collapsed
    to a single space (leading and trailing whitespace is dropped). Missing
    values are returned unchanged; other non-string values are converted with
    ``str`` first.
    """
    if pd.isna(text):
        return text
    cleaned = (
        str(text).lower()
        # Pad "and" so "a&b" does not become "aandb".
        .replace("&", " and ")
        .replace(".", "")
        .replace("-", " ")
    )
    # "a - b" would otherwise keep three spaces after the hyphen replacement.
    return " ".join(cleaned.split())

# Lookup key for each row's district, used to match against district_mapping.
df["district_norm"] = df["district"].map(normalize_district)


In [49]:
# Inspect the 40 most frequent normalized district keys. Each row of df is one
# district-month, so a key with an unusually high count suggests several raw
# spellings or states sharing it.
df["district_norm"].value_counts().head(40)


district_norm
hooghly                   27
dadra and nagar haveli    14
bijapur                   13
anugul                    12
balrampur                 12
nadia                     12
jajpur                    12
daman                     12
hyderabad                 12
kargil                    11
aurangabad                11
nuapada                   11
diu                       11
kamrup                    10
dibrugarh                  9
dhemaji                    9
ri bhoi                    9
lakhimpur                  9
aligarh                    9
agra                       9
barpeta                    9
morbi                      9
muzaffarpur                9
madhubani                  9
kachchh                    9
patan                      9
patna                      9
west jaintia hills         9
baksa                      9
thane                      9
east khasi hills           9
marigaon                   9
kokrajhar                  9
gwalior                    9


In [50]:
# Canonical district name -> spelling variants, written in the same form that
# normalize_district produces (lower-case, dots removed).
_district_variants = {
    "Anantapur": ("anantapur", "ananthapur", "ananthapuramu"),
    "Kadapa": ("cuddapah", "ysr kadapa", "kadapa"),
    "Chittoor": ("chittoor",),
    "Rangareddy": ("rangareddi", "rangareddy", "kv rangareddy", "k v rangareddy"),
    # Safety net for a state name misfiled in the district column.
    "West Bengal": ("west bengal",),
}

# Flatten to the variant -> canonical lookup consumed by Series.map.
district_mapping = {
    variant: canonical
    for canonical, variants in _district_variants.items()
    for variant in variants
}


In [51]:
# Use the canonical name where the key is mapped. Unmapped districts fall back
# to a tidied spelling (whitespace collapsed, title-cased) instead of the raw
# value, so variants that differ only in case or spacing land in the same
# group. This mirrors how the state column is tidied before its mapping.
district_fallback = (
    df["district"]
    .str.split()
    .str.join(" ")
    .str.title()
)

df["district_clean"] = (
    df["district_norm"]
    .map(district_mapping)
    .fillna(district_fallback)
)


In [52]:
# The normalized key was only needed for the mapping lookup; district_clean is kept.
df = df.drop(columns=["district_norm"])


In [53]:
# Raw count columns that are re-summed after the name clean-up. Ratio and log
# columns are not listed here; they are recomputed from these sums afterwards.
AGG_COLS = [
    "age_0_5", "age_5_17", "age_18_greater",
    "total_enrolment",
    "total_demo_updates",
    "total_bio_updates"
]


In [54]:
# Rows that now share the same cleaned state/district/month are summed into one.
clean_keys = ["state", "district_clean", "year_month"]
df_final = df.groupby(clean_keys, as_index=False)[AGG_COLS].sum()


In [55]:
df_final = df_final.rename(columns={"district_clean": "district"})

# Step 2 defines biometric_alignment_score as 17+ biometric updates
# (bio_age_17_) over adult enrolments. AGG_COLS does not carry bio_age_17_, so
# it is summed here from the cleaned frame on the same keys that produced
# df_final, keeping the final indicator on the Step 2 definition.
adult_bio_updates = (
    df
    .groupby(["state", "district_clean", "year_month"], as_index=False)["bio_age_17_"]
    .sum()
    .rename(columns={"district_clean": "district"})
)
df_final = (
    df_final
    .drop(columns=["bio_age_17_"], errors="ignore")  # keeps the cell re-runnable
    .merge(adult_bio_updates, on=["state", "district", "year_month"], how="left")
)

# Zero denominators are replaced by 1 so every ratio is defined; where that
# happens the ratio equals the raw numerator count.
df_final["safe_total_enrolment"] = df_final["total_enrolment"].replace(0, 1)
df_final["safe_adult_population"] = df_final["age_18_greater"].replace(0, 1)

df_final["update_pressure_index"] = (
    (df_final["total_demo_updates"] + df_final["total_bio_updates"]) /
    df_final["safe_total_enrolment"]
)

df_final["identity_volatility_ratio"] = (
    df_final["total_demo_updates"] /
    df_final["safe_total_enrolment"]
)

df_final["biometric_alignment_score"] = (
    df_final["bio_age_17_"] /
    df_final["safe_adult_population"]
)

df_final["log_update_pressure"] = np.log1p(df_final["update_pressure_index"])
df_final["log_identity_volatility"] = np.log1p(df_final["identity_volatility_ratio"])


In [56]:
# District-months with fewer enrolments than this are flagged, because ratios
# over very small denominators are unstable.
MIN_ENROLMENT = 20

df_final["low_volume_flag"] = df_final["total_enrolment"].lt(MIN_ENROLMENT)


In [57]:
# Order rows by geography, then chronologically, and persist the final table.
df_final = df_final.sort_values(by=["state", "district", "year_month"])

final_dir = "outputs/step2_final"
os.makedirs(final_dir, exist_ok=True)

df_final.to_csv("outputs/step2_final/combined_district_monthly_indicators_final.csv", index=False)
