In [1]:
import pandas as pd

bio = pd.read_csv("data/combined_biometric_data.xls")
demo = pd.read_csv("data/combined_demographic_data.xls")
enroll = pd.read_csv("data/combined_Enrollment_data.xls")

print(bio.head())
print(demo.head())
print(enroll.head())

print(bio.shape, demo.shape, enroll.shape)


         date              state      district  pincode  bio_age_5_17  \
0  01-03-2025            Haryana  Mahendragarh   123029           280   
1  01-03-2025              Bihar     Madhepura   852121           144   
2  01-03-2025  Jammu and Kashmir         Punch   185101           643   
3  01-03-2025              Bihar       Bhojpur   802158           256   
4  01-03-2025         Tamil Nadu       Madurai   625514           271   

   bio_age_17_  
0          577  
1          369  
2         1091  
3          980  
4          815  
         date           state    district  pincode  demo_age_5_17  \
0  01-03-2025   Uttar Pradesh   Gorakhpur   273213             49   
1  01-03-2025  Andhra Pradesh    Chittoor   517132             22   
2  01-03-2025         Gujarat      Rajkot   360006             65   
3  01-03-2025  Andhra Pradesh  Srikakulam   532484             24   
4  01-03-2025       Rajasthan     Udaipur   313801             45   

   demo_age_17_  
0           529  
1       

In [3]:
def clean_keys(df):
    df = df.copy()

    df["date"] = pd.to_datetime(df["date"], errors="coerce", dayfirst=True)

    df["state"] = df["state"].astype(str).str.strip().str.title()
    df["district"] = df["district"].astype(str).str.strip().str.title()

    df["pincode"] = (
        df["pincode"].astype(str)
        .str.replace(r"\.0$", "", regex=True)
        .str.strip()
        .str.zfill(6)
    )
    return df

enroll = clean_keys(enroll)
demo  = clean_keys(demo)
bio   = clean_keys(bio)
print(enroll.head())
print(demo.head())
print(bio.head())

        date          state          district pincode  age_0_5  age_5_17  \
0 2025-03-02      Meghalaya  East Khasi Hills  793121       11        61   
1 2025-03-09      Karnataka   Bengaluru Urban  560043       14        33   
2 2025-03-09  Uttar Pradesh      Kanpur Nagar  208001       29        82   
3 2025-03-09  Uttar Pradesh           Aligarh  202133       62        29   
4 2025-03-09      Karnataka   Bengaluru Urban  560016       14        16   

   age_18_greater  
0              37  
1              39  
2              12  
3              15  
4              21  
        date           state    district pincode  demo_age_5_17  demo_age_17_
0 2025-03-01   Uttar Pradesh   Gorakhpur  273213             49           529
1 2025-03-01  Andhra Pradesh    Chittoor  517132             22           375
2 2025-03-01         Gujarat      Rajkot  360006             65           765
3 2025-03-01  Andhra Pradesh  Srikakulam  532484             24           314
4 2025-03-01       Rajasthan     

In [4]:
key_cols = ["date","state","district","pincode"]


print("Duplicates in enrol BEFORE:", enroll.duplicated(subset=key_cols).sum())
print("Duplicates in demo BEFORE:", demo.duplicated(subset=key_cols).sum())
print("Duplicates in bio BEFORE:", bio.duplicated(subset=key_cols).sum())

before = enroll.shape[0]
enroll_nodup = enroll.drop_duplicates(subset=key_cols)
after = enroll_nodup.shape[0]
print("Enroll Rows before:", before)
print("Enroll Rows after :", after)
print("Enroll Duplicates removed:", before - after)

before = demo.shape[0]
demo_nodup = demo.drop_duplicates(subset=key_cols)
after = demo_nodup.shape[0]
print("Demo Rows before:", before)
print("Demo Rows after :", after)
print("Demo Duplicates removed:", before - after)

before = bio.shape[0]
bio_nodup = bio.drop_duplicates(subset=key_cols)
after = bio_nodup.shape[0]
print("Bio Rows before:", before)
print("Bio Rows after :", after)
print("Bio Duplicates removed:", before - after)

dups = enroll[enroll.duplicated(subset=key_cols, keep=False)]
print(dups.head(20))
dups1 = demo[demo.duplicated(subset=key_cols, keep=False)]
print(dups1.head(20))
dups2 = bio[bio.duplicated(subset=key_cols, keep=False)]
print(dups2.head(20))
#only enroll has duplicates so we are just checking dangerous duplicates for enroll
dup_groups = enroll.groupby(key_cols).nunique()
dup_groups = dup_groups[dup_groups.max(axis=1) > 1]
print("Keys with conflicting duplicate values:", dup_groups.shape[0])
print(dup_groups.head())


Duplicates in enrol BEFORE: 23739
Duplicates in demo BEFORE: 475181
Duplicates in bio BEFORE: 96856
Enroll Rows before: 1006029
Enroll Rows after : 982290
Enroll Duplicates removed: 23739
Demo Rows before: 2071700
Demo Rows after : 1596519
Demo Duplicates removed: 475181
Bio Rows before: 1861108
Bio Rows after : 1764252
Bio Duplicates removed: 96856
            date        state             district pincode  age_0_5  age_5_17  \
4151  2025-09-01    Jharkhand  Seraikela-Kharsawan  833219        1         1   
4153  2025-09-01    Jharkhand  Seraikela-Kharsawan  832403        3         0   
4154  2025-09-01    Jharkhand  Seraikela-Kharsawan  833219        1         0   
4377  2025-09-01    Karnataka               Yadgir  585214        3         0   
4379  2025-09-01    Karnataka               Yadgir  585220        4         0   
4384  2025-09-01    Karnataka               Yadgir  585355        2         0   
6373  2025-09-01  West Bengal              Hooghly  712305        6         2   


In [21]:
print("Enroll Missing values BEFORE:\n", enroll.isna().sum())
print("Demo Missing values BEFORE:\n", demo.isna().sum())
print("Bio Missing values BEFORE:\n", bio.isna().sum())
enroll.fillna(0, inplace=True)
print("Enroll Missing values AFTER:\n", enroll.isna().sum())
demo.fillna(0, inplace=True)
print("Demo Missing values AFTER:\n", demo.isna().sum())
bio.fillna(0, inplace=True)
print("Bio Missing values AFTER:\n", bio.isna().sum())

Enroll Missing values BEFORE:
 date              0
state             0
district          0
pincode           0
age_0_5           0
age_5_17          0
age_18_greater    0
dtype: int64
Demo Missing values BEFORE:
 date             0
state            0
district         0
pincode          0
demo_age_5_17    0
demo_age_17_     0
dtype: int64
Bio Missing values BEFORE:
 date            0
state           0
district        0
pincode         0
bio_age_5_17    0
bio_age_17_     0
dtype: int64
Enroll Missing values AFTER:
 date              0
state             0
district          0
pincode           0
age_0_5           0
age_5_17          0
age_18_greater    0
dtype: int64
Demo Missing values AFTER:
 date             0
state            0
district         0
pincode          0
demo_age_5_17    0
demo_age_17_     0
dtype: int64
Bios Missing values AFTER:
 date            0
state           0
district        0
pincode         0
bio_age_5_17    0
bio_age_17_     0
dtype: int64


In [5]:
master = enroll.merge(demo, on=key_cols, how="outer")
master = master.merge(bio, on=key_cols, how="outer")

master.fillna(0, inplace=True)

print("Final merged dataset:", master.shape)
print(master.head())


Final merged dataset: (2952421, 11)
        date                        state  district pincode  age_0_5  \
0 2025-03-01    Andaman & Nicobar Islands  Andamans  744101      0.0   
1 2025-03-01  Andaman And Nicobar Islands   Nicobar  744301      0.0   
2 2025-03-01  Andaman And Nicobar Islands   Nicobar  744301      0.0   
3 2025-03-01  Andaman And Nicobar Islands   Nicobar  744302      0.0   
4 2025-03-01  Andaman And Nicobar Islands   Nicobar  744303      0.0   

   age_5_17  age_18_greater  demo_age_5_17  demo_age_17_  bio_age_5_17  \
0       0.0             0.0            0.0           0.0          16.0   
1       0.0             0.0           16.0         180.0         101.0   
2       0.0             0.0           16.0         180.0         101.0   
3       0.0             0.0            0.0           0.0          15.0   
4       0.0             0.0            0.0           0.0          46.0   

   bio_age_17_  
0        193.0  
1         48.0  
2         48.0  
3         12.0  
4

In [6]:
#was checking if there is  overlapping values 
key_cols = ["date","state","district","pincode"]

enroll_keys = enroll[key_cols].drop_duplicates()
bio_keys   = bio[key_cols].drop_duplicates()

common = enroll_keys.merge(bio_keys, on=key_cols, how="inner")

print("Unique enrolment keys:", len(enroll_keys))
print("Unique biometric keys:", len(bio_keys))
print("Common keys (match):", len(common))


Unique enrolment keys: 982290
Unique biometric keys: 1764252
Common keys (match): 725700


In [8]:
master = enroll.merge(demo, on=key_cols, how="outer")
master = master.merge(bio, on=key_cols, how="outer")

master.fillna(0, inplace=True)

print("Master shape:", master.shape)
master.head(20)


Master shape: (2952421, 11)


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_
0,2025-03-01,Andaman & Nicobar Islands,Andamans,744101,0.0,0.0,0.0,0.0,0.0,16.0,193.0
1,2025-03-01,Andaman And Nicobar Islands,Nicobar,744301,0.0,0.0,0.0,16.0,180.0,101.0,48.0
2,2025-03-01,Andaman And Nicobar Islands,Nicobar,744301,0.0,0.0,0.0,16.0,180.0,101.0,48.0
3,2025-03-01,Andaman And Nicobar Islands,Nicobar,744302,0.0,0.0,0.0,0.0,0.0,15.0,12.0
4,2025-03-01,Andaman And Nicobar Islands,Nicobar,744303,0.0,0.0,0.0,0.0,0.0,46.0,27.0
5,2025-03-01,Andaman And Nicobar Islands,Nicobar,744304,0.0,0.0,0.0,0.0,0.0,16.0,14.0
6,2025-03-01,Andaman And Nicobar Islands,North And Middle Andaman,744201,0.0,0.0,0.0,0.0,0.0,41.0,40.0
7,2025-03-01,Andaman And Nicobar Islands,North And Middle Andaman,744202,0.0,0.0,0.0,10.0,201.0,167.0,131.0
8,2025-03-01,Andaman And Nicobar Islands,North And Middle Andaman,744202,0.0,0.0,0.0,10.0,201.0,167.0,131.0
9,2025-03-01,Andaman And Nicobar Islands,North And Middle Andaman,744204,0.0,0.0,0.0,0.0,0.0,103.0,76.0


In [11]:
enroll[(enroll["date"]=="2025-03-01") & (enroll["pincode"]=="744101")] # i did this cuz as u can see few rows are completely empty (i tried for only enroll)
#that's cuz certain records are in some dataset and it is not present another its not really duplicate values these records are imp cuz That means those specific PINs on that date had biometric updates, but no enrolments (and maybe no demographic updates).

#That is totally realistic in real life:
#✔ people update biometrics
#❌ not everyone enrolls new Aadhaar daily in that PIN
#so yea sometimes they update their aadhar they need not necessarily go for enroll ment 

Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater


In [None]:
#mismatch heppens cuz our three datasets deals with different stuffs :
''' Enrolment dataset → new Aadhaar registrations

Demographic updates → address/mobile/name changes

Biometric updates → fingerprints/iris updates'''


## Creating dataset flags for the model to understand 

In [None]:
'''In your merged master dataset, a column can become 0 because:

Case A: Truly no activity happened

Example: no enrolments happened in that PIN on that day → 0 is real

Case B: That dataset didn’t have a record for that PIN+date

Example: biometric dataset has the row but enrolment dataset doesn’t → after merge it becomes NaN → you filled it as 0

So both look identical (0) in the table.

⚠️ Without flags, the model can’t distinguish between:

“no enrolment happened”
vs

“enrolment data wasn’t present for that row”

Even if both are acceptable as 0, the presence information itself is useful. It can also help the model to learn properly '''

In [21]:
master["has_enroll"] = (master["age_0_5"] + master["age_5_17"] + master["age_18_greater"]) > 0
master["has_demo"]  = (master["demo_age_5_17"] + master["demo_age_17_"]) > 0
master["has_bio"]   = (master["bio_age_5_17"] + master["bio_age_17_"]) > 0


In [22]:
print("Enroll")
print(master["has_enroll"].value_counts()) #am just checking whether the flags are updated 
print("Demo")
print(master["has_demo"].value_counts())
print("Bio")
print(master["has_bio"].value_counts())



Enroll
has_enroll
False    1696652
True     1255769
Name: count, dtype: int64
Demo
has_demo
True     2181321
False     771100
Name: count, dtype: int64
Bio
has_bio
True     2258797
False     693624
Name: count, dtype: int64


In [23]:
master.loc[master["has_enroll"] == True, ["age_0_5","age_5_17","age_18_greater","has_enroll"]].head(10)#again am just crosschecking whether flags are updated  



Unnamed: 0,age_0_5,age_5_17,age_18_greater,has_enroll
27113,11.0,61.0,37.0,True
27114,13.0,40.0,18.0,True
27115,18.0,120.0,22.0,True
27116,18.0,72.0,12.0,True
27117,30.0,48.0,10.0,True
27118,35.0,94.0,16.0,True
27119,49.0,186.0,34.0,True
27120,23.0,24.0,42.0,True
27121,20.0,49.0,12.0,True
27122,29.0,11.0,15.0,True


In [26]:
print(master.loc[master["has_enroll"] == False, 
           ["age_0_5","age_5_17","age_18_greater","has_enroll"]].head(10))
'''i did this cuz there were no false values in first 30 rows so i skeptical so i checked whether there were any false values in the following 
codes i will be checking for bio, demo'''


   age_0_5  age_5_17  age_18_greater  has_enroll
0      0.0       0.0             0.0       False
1      0.0       0.0             0.0       False
2      0.0       0.0             0.0       False
3      0.0       0.0             0.0       False
4      0.0       0.0             0.0       False
5      0.0       0.0             0.0       False
6      0.0       0.0             0.0       False
7      0.0       0.0             0.0       False
8      0.0       0.0             0.0       False
9      0.0       0.0             0.0       False


'i did this cuz there were no false values in first 30 rows so i skeptical so i checked whether there were any false values in the following \ncodes i will be checking for bio, demo'

In [28]:
master.loc[master["has_demo"] == True, ["age_0_5","age_5_17","age_18_greater","has_demo"]].head(10)

Unnamed: 0,age_0_5,age_5_17,age_18_greater,has_demo
1,0.0,0.0,0.0,True
2,0.0,0.0,0.0,True
7,0.0,0.0,0.0,True
8,0.0,0.0,0.0,True
13,0.0,0.0,0.0,True
15,0.0,0.0,0.0,True
17,0.0,0.0,0.0,True
18,0.0,0.0,0.0,True
24,0.0,0.0,0.0,True
26,0.0,0.0,0.0,True


In [29]:
print(master.loc[master["has_demo"] == False, 
           ["age_0_5","age_5_17","age_18_greater","has_demo"]].head(10))

    age_0_5  age_5_17  age_18_greater  has_demo
0       0.0       0.0             0.0     False
3       0.0       0.0             0.0     False
4       0.0       0.0             0.0     False
5       0.0       0.0             0.0     False
6       0.0       0.0             0.0     False
9       0.0       0.0             0.0     False
10      0.0       0.0             0.0     False
11      0.0       0.0             0.0     False
12      0.0       0.0             0.0     False
14      0.0       0.0             0.0     False


In [30]:
master.loc[master["has_bio"] == True, ["age_0_5","age_5_17","age_18_greater","has_bio"]].head(10)

Unnamed: 0,age_0_5,age_5_17,age_18_greater,has_bio
0,0.0,0.0,0.0,True
1,0.0,0.0,0.0,True
2,0.0,0.0,0.0,True
3,0.0,0.0,0.0,True
4,0.0,0.0,0.0,True
5,0.0,0.0,0.0,True
6,0.0,0.0,0.0,True
7,0.0,0.0,0.0,True
8,0.0,0.0,0.0,True
9,0.0,0.0,0.0,True


In [32]:
print(master.loc[master["has_bio"] == False, 
           ["age_0_5","age_5_17","age_18_greater","has_bio"]].head(10))

     age_0_5  age_5_17  age_18_greater  has_bio
27       0.0       0.0             0.0    False
28       0.0       0.0             0.0    False
91       0.0       0.0             0.0    False
92       0.0       0.0             0.0    False
93       0.0       0.0             0.0    False
94       0.0       0.0             0.0    False
99       0.0       0.0             0.0    False
102      0.0       0.0             0.0    False
106      0.0       0.0             0.0    False
107      0.0       0.0             0.0    False


In [34]:
master["total_updates"] = (
    master["demo_age_5_17"] + master["demo_age_17_"] +
    master["bio_age_5_17"] + master["bio_age_17_"]
)
print(master.head())#Then you forecast total_updates or load_score.This way, even if enrolment is 0, the model still learns from updates.

        date                        state  district pincode  age_0_5  \
0 2025-03-01    Andaman & Nicobar Islands  Andamans  744101      0.0   
1 2025-03-01  Andaman And Nicobar Islands   Nicobar  744301      0.0   
2 2025-03-01  Andaman And Nicobar Islands   Nicobar  744301      0.0   
3 2025-03-01  Andaman And Nicobar Islands   Nicobar  744302      0.0   
4 2025-03-01  Andaman And Nicobar Islands   Nicobar  744303      0.0   

   age_5_17  age_18_greater  demo_age_5_17  demo_age_17_  bio_age_5_17  \
0       0.0             0.0            0.0           0.0          16.0   
1       0.0             0.0           16.0         180.0         101.0   
2       0.0             0.0           16.0         180.0         101.0   
3       0.0             0.0            0.0           0.0          15.0   
4       0.0             0.0            0.0           0.0          46.0   

   bio_age_17_  has_enrol  has_demo  has_bio  has_enroll  total_updates  
0        193.0      False     False     True    

In [36]:
activity_cols = [
    "age_0_5","age_5_17","age_18_greater",
    "demo_age_5_17","demo_age_17_",
    "bio_age_5_17","bio_age_17_"
]

master = master[master[activity_cols].sum(axis=1) > 0] 
master.head()# removing rows in which all values are zero cleaning the noise.


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater,demo_age_5_17,demo_age_17_,bio_age_5_17,bio_age_17_,has_enrol,has_demo,has_bio,has_enroll,total_updates
0,2025-03-01,Andaman & Nicobar Islands,Andamans,744101,0.0,0.0,0.0,0.0,0.0,16.0,193.0,False,False,True,False,209.0
1,2025-03-01,Andaman And Nicobar Islands,Nicobar,744301,0.0,0.0,0.0,16.0,180.0,101.0,48.0,False,True,True,False,345.0
2,2025-03-01,Andaman And Nicobar Islands,Nicobar,744301,0.0,0.0,0.0,16.0,180.0,101.0,48.0,False,True,True,False,345.0
3,2025-03-01,Andaman And Nicobar Islands,Nicobar,744302,0.0,0.0,0.0,0.0,0.0,15.0,12.0,False,False,True,False,27.0
4,2025-03-01,Andaman And Nicobar Islands,Nicobar,744303,0.0,0.0,0.0,0.0,0.0,46.0,27.0,False,False,True,False,73.0


In [37]:
master.shape

(2951501, 16)

In [38]:
master.to_csv("aadhaar_master_merged.csv", index=False, encoding="utf-8-sig")
print("Saved successfully!")


Saved successfully!
