In [20]:
import pandas as pd
import numpy as np
import sys
import os

# Put the notebook's parent directory on the import path so that modules living
# one level up can be imported (presumably the `reference` package used further
# down — confirm against the repo layout).
# The membership check keeps the cell idempotent: re-running it no longer piles
# up duplicate entries in sys.path.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)


In [3]:
# Directory holding the cleaned CSV extracts. The default is the original
# location; set the AADHAAR_DATA_DIR environment variable to run the notebook
# against a different copy of the data without editing this cell.
DATA_DIR = os.environ.get("AADHAAR_DATA_DIR", r"D:\Apna College\Jupyter\Aadharcard\data")

# One frame per cleaned source file.
biometric_df = pd.read_csv(os.path.join(DATA_DIR, "clean_biometric.csv"))
demographic_df = pd.read_csv(os.path.join(DATA_DIR, "clean_demographic.csv"))
enrolment_df = pd.read_csv(os.path.join(DATA_DIR, "clean_enrolment.csv"))

# Quick sanity check on what was loaded (rows, columns).
print("Loaded shapes:")
print("Biometric:", biometric_df.shape)
print("Demographic:", demographic_df.shape)
print("Enrolment:", enrolment_df.shape)

Loaded shapes:
Biometric: (1766212, 6)
Demographic: (1598099, 6)
Enrolment: (983072, 7)


In [4]:
# Enrolment is the base table; demographic and biometric columns are attached
# to it with left joins on the shared location/date keys, so every enrolment
# row is kept even when the other sources have no matching record.
join_keys = ["date", "state", "district", "pincode"]

df = enrolment_df
for right_frame in (demographic_df, biometric_df):
    df = df.merge(right_frame, on=join_keys, how="left")

print("Merged shape:", df.shape)

Merged shape: (983072, 11)


In [5]:
# Replace missing values with 0 in every numeric column (non-numeric columns
# are left untouched). Presumably the gaps come from enrolment rows that found
# no partner in the left joins — TODO confirm no other numeric column has NaN.
num_cols = df.select_dtypes(include="number").columns
df[num_cols] = df[num_cols].fillna(value=0)

In [21]:
from rapidfuzz import process, fuzz
from reference.canonical_states import CANONICAL_STATES
from reference.canonical_districts import CANONICAL_DISTRICTS

def normalize_name(x):
    """Return a cleaned, title-cased version of a place name.

    Missing values (anything `pd.isna` reports as missing) are returned as-is.
    Any other input is converted to `str`, trimmed and lower-cased, then:
    "&" becomes "and", "_" and "-" become spaces, "." is dropped, runs of
    whitespace collapse to a single space, and the result is title-cased.
    """
    if pd.isna(x):
        return x

    # One-pass character substitution; none of the replacement texts contain a
    # character that is itself substituted, so the order of rules is irrelevant.
    substitutions = str.maketrans({"&": "and", "_": " ", "-": " ", ".": None})
    cleaned = str(x).strip().lower().translate(substitutions)
    return " ".join(cleaned.split()).title()

# Basic normalization: clean the spelling/casing of both geography columns
# before any fuzzy matching is attempted.
for geo_col in ("state", "district"):
    df[geo_col] = df[geo_col].apply(normalize_name)

def fuzzy_canonical(name, choices, threshold=85):
    """Snap `name` to its closest entry in `choices`, or return it unchanged.

    Parameters
    ----------
    name : str or missing value
        The name to canonicalize. Missing values are returned as-is.
    choices : collection of str
        Candidate canonical names handed to `process.extractOne`.
    threshold : number, default 85
        Minimum `fuzz.ratio` score the best candidate must reach to be used.

    Returns
    -------
    The best-matching entry of `choices` when its score is at least
    `threshold`; otherwise `name` itself (also when `choices` is empty).
    """
    if pd.isna(name):
        return name

    # Nothing to compare against: keep the input instead of failing later.
    if hasattr(choices, "__len__") and len(choices) == 0:
        return name

    # `score_cutoff` lets the matcher discard candidates below the threshold
    # itself. extractOne returns None when no candidate qualifies (or when
    # there were no candidates), so the result must not be unpacked blindly.
    best = process.extractOne(
        name, choices, scorer=fuzz.ratio, score_cutoff=threshold
    )
    if best is None:
        return name
    return best[0]

# Automatic canonicalization
# Each distinct name is fuzzy-matched once and the answer is broadcast back to
# the rows, instead of running the matcher on every row. The result is the same
# because `fuzzy_canonical` depends only on the name and the candidate list;
# the saving assumes names repeat heavily across rows.
def _canonicalize_column(values, choices):
    """Return `values` with each non-missing entry passed through `fuzzy_canonical`."""
    lookup = {
        name: fuzzy_canonical(name, choices)
        for name in values.dropna().unique()
    }
    # Missing entries are not in `lookup`; `where` puts the original value back
    # so they come through untouched.
    return values.map(lookup).where(values.notna(), values)


df["state"] = _canonicalize_column(df["state"], CANONICAL_STATES)
df["district"] = _canonicalize_column(df["district"], CANONICAL_DISTRICTS)

In [22]:
# Total enrolments per row: sum of the three age-band counts.
df["total_enrolment"] = (
    df["age_0_5"].add(df["age_5_17"]).add(df["age_18_greater"])
)

# Population shares
# The denominator is total + 1, which avoids division by zero on empty rows;
# as a consequence the three shares add up to slightly less than 1.
smoothed_total = df["total_enrolment"] + 1
for share_col, count_col in (
    ("child_share", "age_0_5"),
    ("youth_share", "age_5_17"),
    ("adult_share", "age_18_greater"),
):
    df[share_col] = df[count_col] / smoothed_total

# Biometric vs Demographic pressure ratios (same + 1 guard on the denominator)
for ratio_col, bio_col, demo_col in (
    ("bio_ratio_5_17", "bio_age_5_17", "demo_age_5_17"),
    ("bio_ratio_17", "bio_age_17", "demo_age_17"),
):
    df[ratio_col] = df[bio_col] / (df[demo_col] + 1)

# Log-scaled load (helps ML): log(1 + total_enrolment)
df["log_load"] = np.log1p(df["total_enrolment"])

In [23]:
# Data-driven cut-offs used by the risk labelling below:
#   high_load - 75th percentile of total enrolment
#   mid_bio   - 60th percentile of the bio_ratio_17 column
#   high_bio  - 85th percentile of the bio_ratio_17 column
enrolment_totals = df["total_enrolment"]
bio_ratios = df["bio_ratio_17"]

high_load = enrolment_totals.quantile(0.75)
mid_bio = bio_ratios.quantile(0.60)
high_bio = bio_ratios.quantile(0.85)

def make_risk(row):
    """Classify one row as "High", "Medium" or "Low" risk.

    Reads `row["bio_ratio_17"]` and `row["total_enrolment"]` and compares them
    with the module-level thresholds `high_bio`, `high_load` and `mid_bio`:

    * "High"   - ratio >= high_bio and enrolment >= high_load
    * "Medium" - otherwise, ratio >= mid_bio
    * "Low"    - everything else
    """
    bio_ratio = row["bio_ratio_17"]
    if bio_ratio >= high_bio and row["total_enrolment"] >= high_load:
        return "High"
    return "Medium" if bio_ratio >= mid_bio else "Low"

# Label every row in one vectorized pass instead of calling `make_risk` once
# per row. The rules are the same as in `make_risk`, and np.select takes the
# first condition that holds, so "High" wins over "Medium" exactly as the
# if/elif chain does; rows matching neither condition get "Low".
bio_ratio = df["bio_ratio_17"]
is_high = (bio_ratio >= high_bio) & (df["total_enrolment"] >= high_load)
is_medium = bio_ratio >= mid_bio
df["risk_label"] = np.select([is_high, is_medium], ["High", "Medium"], default="Low")

print("Risk distribution:")
print(df["risk_label"].value_counts())

Risk distribution:
risk_label
Low       589691
Medium    344135
High       49246
Name: count, dtype: int64


In [24]:
# Columns exported for modelling, in output order:
# identifying keys, engineered features, then the target label.
key_cols = ["state", "district", "pincode", "date"]
feature_cols = [
    "total_enrolment",
    "child_share", "youth_share", "adult_share",
    "bio_ratio_5_17", "bio_ratio_17",
    "log_load",
]
final_cols = key_cols + feature_cols + ["risk_label"]

final_df = df.loc[:, final_cols]

In [25]:
# Write the modelling table to disk without the index column, then report
# where it went and how big it is.
output_path = "../data/final_ml_dataset.csv"
final_df.to_csv(output_path, index=False)

print("\nSaved ML dataset:", output_path, sep="\n")
print("Shape:", final_df.shape)

# Last expression of the cell: show the first rows as a preview.
final_df.head()


Saved ML dataset:
../data/final_ml_dataset.csv
Shape: (983072, 12)


Unnamed: 0,state,district,pincode,date,total_enrolment,child_share,youth_share,adult_share,bio_ratio_5_17,bio_ratio_17,log_load,risk_label
0,Meghalaya,East Khasi Hills,793121,02-03-2025,109,0.1,0.554545,0.336364,0.0,0.0,4.70048,Low
1,Karnataka,Bengaluru Urban,560043,09-03-2025,86,0.16092,0.37931,0.448276,0.0,0.0,4.465908,Low
2,Uttar Pradesh,Kanpur Nagar,208001,09-03-2025,123,0.233871,0.66129,0.096774,0.0,0.0,4.820282,Low
3,Uttar Pradesh,Aligarh,202133,09-03-2025,106,0.579439,0.271028,0.140187,0.0,0.0,4.672829,Low
4,Karnataka,Bengaluru Urban,560016,09-03-2025,51,0.269231,0.307692,0.403846,0.0,0.0,3.951244,Low
