In [1]:
import pandas as pd 
import os

In [2]:
DEMOGRAPHIC_RAW_PATH="../raw_dataset/api_data_aadhar_demographic"

In [3]:
demographic_files=[os.path.join(DEMOGRAPHIC_RAW_PATH,file)
                   for file in os.listdir(DEMOGRAPHIC_RAW_PATH)
                   if file.endswith(".csv")
]

In [4]:
demographic_raw_df=pd.concat([pd.read_csv(file) for file in demographic_files],ignore_index=True)
demographic_raw_df.shape

(2071700, 6)

In [5]:
demographic_raw_df.head()

Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,01-03-2025,Uttar Pradesh,Gorakhpur,273213,49,529
1,01-03-2025,Andhra Pradesh,Chittoor,517132,22,375
2,01-03-2025,Gujarat,Rajkot,360006,65,765
3,01-03-2025,Andhra Pradesh,Srikakulam,532484,24,314
4,01-03-2025,Rajasthan,Udaipur,313801,45,785


In [6]:
demographic_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2071700 entries, 0 to 2071699
Data columns (total 6 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   date           object
 1   state          object
 2   district       object
 3   pincode        int64 
 4   demo_age_5_17  int64 
 5   demo_age_17_   int64 
dtypes: int64(3), object(3)
memory usage: 94.8+ MB


In [7]:
demographic_raw_df.columns

Index(['date', 'state', 'district', 'pincode', 'demo_age_5_17',
       'demo_age_17_'],
      dtype='object')

In [8]:
data_space_issues=(demographic_raw_df["date"].str.startswith(" ")|
                   demographic_raw_df["date"].str.startswith(" ")).sum()  

data_space_issues

np.int64(0)

In [9]:
demographic_raw_df["date"].unique()[:10]

array(['01-03-2025', '01-04-2025', '01-05-2025', '01-06-2025',
       '01-07-2025', '01-09-2025', '02-09-2025', '03-09-2025',
       '04-09-2025', '06-09-2025'], dtype=object)

In [10]:
invalid_dayfirst=pd.to_datetime(demographic_raw_df["date"],dayfirst=True,errors="coerce").isna().sum()

In [11]:
invalid_monthfirst=pd.to_datetime(demographic_raw_df["date"],dayfirst=False,errors="coerce").isna().sum()

In [12]:
invalid_dayfirst,invalid_monthfirst

(np.int64(0), np.int64(1187968))

In [13]:
demographic_clean_df=demographic_raw_df.copy()
demographic_clean_df["date"]=pd.to_datetime(demographic_clean_df["date"],dayfirst=True)

In [14]:
demographic_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2071700 entries, 0 to 2071699
Data columns (total 6 columns):
 #   Column         Dtype         
---  ------         -----         
 0   date           datetime64[ns]
 1   state          object        
 2   district       object        
 3   pincode        int64         
 4   demo_age_5_17  int64         
 5   demo_age_17_   int64         
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 94.8+ MB


In [15]:
demographic_clean_df["date"].head()

0   2025-03-01
1   2025-03-01
2   2025-03-01
3   2025-03-01
4   2025-03-01
Name: date, dtype: datetime64[ns]

In [16]:
demographic_clean_df["state"].nunique()

65

In [17]:
demographic_clean_df["state"].unique()

array(['Uttar Pradesh', 'Andhra Pradesh', 'Gujarat', 'Rajasthan',
       'Karnataka', 'West Bengal', 'Telangana', 'Odisha', 'Maharashtra',
       'Kerala', 'Bihar', 'Tamil Nadu', 'Madhya Pradesh', 'Assam',
       'Tripura', 'Arunachal Pradesh', 'Punjab', 'Jharkhand', 'Delhi',
       'Chandigarh', 'Chhattisgarh', 'Jammu and Kashmir', 'Mizoram',
       'Nagaland', 'Himachal Pradesh', 'Goa', 'Haryana', 'Meghalaya',
       'Uttarakhand', 'Manipur', 'Daman and Diu', 'Puducherry', 'Sikkim',
       'Ladakh', 'Dadra and Nagar Haveli and Daman and Diu',
       'Dadra and Nagar Haveli', 'Orissa', 'Pondicherry',
       'Andaman & Nicobar Islands', 'Andaman and Nicobar Islands',
       'west Bengal', 'Daman & Diu', 'West  Bengal', 'odisha',
       'Jammu & Kashmir', 'Lakshadweep', 'Dadra & Nagar Haveli',
       'Westbengal', 'andhra pradesh', 'WEST BENGAL', 'West Bangal',
       'West bengal', 'ODISHA', 'WESTBENGAL', 'Chhatisgarh',
       'West Bengli', 'Darbhanga', 'Puttenahalli', 'Uttaranchal',


In [18]:
state_space_issues=(demographic_clean_df["state"].str.startswith(" ")|demographic_clean_df["state"].str.endswith(" ")).sum()

In [19]:
state_space_issues

np.int64(0)

In [20]:
demographic_clean_df["state_clean"]=(demographic_clean_df["state"].str.lower().str.strip().str.replace("&","and",regex=False).str.replace(r"\s+"," ",regex=True))

In [21]:
canonical_states=sorted(["andhra pradesh","arunachal pradesh","assam","bihar","chhattisgarh","goa","gujarat","haryana","himachal pradesh","jharkhand","karnataka","kerala","madhya pradesh","maharashtra","manipur","meghalaya","mizoram","nagaland","odisha","punjab","rajasthan","sikkim","tamil nadu","telangana","tripura","uttar pradesh","uttarakhand","west bengal","andaman and nicobar islands","chandigarh","dadra and nagar haveli and daman and diu","delhi","jammu and kashmir","ladakh","lakshadweep","puducherry"])

In [22]:
state_mapping={
  "orissa":"odisha",
  "pondicherry":"puducherry",
  "uttaranchal":"uttarakhand",

  "jammu and Kashmir":"jammu and kashmir",

  "daman and diu":"dadra and nagar haveli and daman and diu",
  "dadra and nagar haveli":"dadra and nagar haveli and daman and diu",

  "westbengal":"west bengal",
  "west bangal":"west bengal",
  "west bengli":"west bengal",
  "west bengali":"west bengal"
  


}

In [23]:
demographic_clean_df["state_final"]=demographic_clean_df["state_clean"].apply(
  lambda x:state_mapping[x]
  if x in state_mapping
  else x 
  if x in canonical_states
  else "Unknown/Invalid"

)

In [24]:
demographic_clean_df["state_final"].nunique()

37

In [25]:
demographic_clean_df["state_final"].value_counts()

state_final
andhra pradesh                              207740
tamil nadu                                  196857
west bengal                                 169070
uttar pradesh                               167889
maharashtra                                 162242
karnataka                                   153957
odisha                                      105935
kerala                                      105515
bihar                                        97621
gujarat                                      96399
rajasthan                                    89508
telangana                                    89086
madhya pradesh                               76364
assam                                        62834
punjab                                       49611
jharkhand                                    39653
chhattisgarh                                 35726
haryana                                      28554
himachal pradesh                             28037
uttarakhand        

In [26]:
demographic_clean_df[demographic_clean_df["state_clean"].str.contains("beng",na=False)][["state","state_clean","state_final"]].drop_duplicates()

Unnamed: 0,state,state_clean,state_final
9,West Bengal,west bengal,west bengal
8159,west Bengal,west bengal,west bengal
11158,West Bengal,west bengal,west bengal
28365,Westbengal,westbengal,west bengal
31248,WEST BENGAL,west bengal,west bengal
51262,West bengal,west bengal,west bengal
102667,WESTBENGAL,westbengal,west bengal
197588,West Bengli,west bengli,west bengal


In [27]:
demographic_clean_df["district"].dtype


dtype('O')

In [28]:
demographic_clean_df["district"].isna().sum()


np.int64(0)

In [29]:
demographic_clean_df["district"].nunique()

983

In [30]:
demographic_clean_df["district"].unique()[:30]

array(['Gorakhpur', 'Chittoor', 'Rajkot', 'Srikakulam', 'Udaipur',
       'Sikar', 'Tumakuru', 'Kurnool', 'Paschim Medinipur', 'Ghazipur',
       'Patan', 'Mulugu', 'Ganganagar', 'Nayagarh', 'Nashik',
       'Shivamogga', 'Thrissur', 'Hassan', 'Patna', 'Belgaum',
       'Kancheepuram', 'Jabalpur', 'Chennai', 'Tinsukia', 'Jamui', 'Gaya',
       'Bengaluru', 'Tiruppur', 'Jalgaon', 'Dhalai'], dtype=object)

In [31]:
(
  demographic_clean_df["district"].str.startswith(" ")|demographic_clean_df["district"].str.endswith(" ")
).sum()

np.int64(0)

In [32]:
demographic_clean_df["district_norm"]=(
  demographic_clean_df["district"].str.lower().str.strip().str.replace(r"\s+"," ",regex=True)
)

In [33]:
print("Original district unique count:",demographic_clean_df["district"].nunique())

Original district unique count: 983


In [34]:
print("Normalized district unique count:",demographic_clean_df["district_norm"].nunique())

Normalized district unique count: 960


In [35]:
(demographic_clean_df.groupby(["state_final","district_norm"])["district"].nunique().sort_values(ascending=False).head(10))

state_final     district_norm      
odisha          jajpur                 3
west bengal     east midnapore         3
                hooghly                3
                nadia                  3
andhra pradesh  rangareddi             2
jharkhand       seraikela-kharsawan    2
andhra pradesh  chittoor               2
karnataka       yadgir                 2
bihar           aurangabad(bh)         2
odisha          angul                  2
Name: district, dtype: int64

In [36]:
demographic_clean_df["pincode"].dtype

dtype('int64')

In [37]:
demographic_clean_df["pincode"].isna().sum()

np.int64(0)

In [38]:
demographic_clean_df["pincode"].nunique()

19742

In [39]:
demographic_clean_df["pincode"].astype(str).str.len().value_counts()

pincode
6    2071700
Name: count, dtype: int64

In [40]:
demographic_clean_df["pincode"].value_counts().head(10)

pincode
533464    488
500055    474
500018    424
491888    412
509105    401
713130    397
506164    397
507111    397
509130    393
743329    392
Name: count, dtype: int64

In [41]:
demographic_clean_df["pincode"]=demographic_clean_df["pincode"].astype(str)

In [42]:
demographic_clean_df["pincode"].dtype

dtype('O')

In [43]:
demographic_agg_df=(
  demographic_clean_df.groupby(["date","state_final","district_norm"],as_index=False).agg({"demo_age_5_17":"sum","demo_age_17_":"sum"})
)

In [44]:
demographic_agg_df["total_demographic_updates"]=(demographic_agg_df["demo_age_5_17"]+demographic_agg_df["demo_age_17_"])

In [45]:
print("Aggregated shape:",demographic_agg_df.shape)

Aggregated shape: (82536, 6)


In [46]:
demographic_agg_df.head()

Unnamed: 0,date,state_final,district_norm,demo_age_5_17,demo_age_17_,total_demographic_updates
0,2025-03-01,andaman and nicobar islands,nicobar,32,360,392
1,2025-03-01,andaman and nicobar islands,north and middle andaman,20,402,422
2,2025-03-01,andaman and nicobar islands,south andaman,74,450,524
3,2025-03-01,andhra pradesh,adilabad,390,3950,4340
4,2025-03-01,andhra pradesh,alluri sitharama raju,507,4448,4955


In [47]:
print("Original total updates:",demographic_clean_df[["demo_age_5_17","demo_age_17_"]].sum().sum())

Original total updates: 49295187


In [48]:
print("Aggegated total updates:",demographic_agg_df["total_demographic_updates"].sum())

Aggegated total updates: 49295187


In [49]:
(demographic_agg_df["total_demographic_updates"]>=0).all()

np.True_

In [50]:
district_lookup_demo=(
  demographic_clean_df.groupby(["state_final","district_norm"])["district"].agg(lambda x:x.mode().iloc[0]).reset_index()
)

In [51]:
demographic_agg_df=demographic_agg_df.merge(district_lookup_demo,on=["state_final","district_norm"],how="left")

In [52]:
demographic_agg_df.columns

Index(['date', 'state_final', 'district_norm', 'demo_age_5_17', 'demo_age_17_',
       'total_demographic_updates', 'district'],
      dtype='object')

In [53]:
print("Aggreagted shape:",demographic_agg_df.shape)

Aggreagted shape: (82536, 7)


In [54]:
print("Original total updates:",demographic_clean_df[["demo_age_5_17","demo_age_17_"]].sum().sum())

Original total updates: 49295187


In [55]:
print("Aggregated total updates:",demographic_agg_df["total_demographic_updates"].sum())

Aggregated total updates: 49295187


In [56]:
(demographic_agg_df["total_demographic_updates"]>=0).all()

np.True_

In [57]:
os.makedirs("../final_dataset",exist_ok=True)
demographic_agg_df.to_csv("../final_dataset/demographic_aggregated.csv",index=False)

In [58]:
os.listdir("../final_dataset")

['demographic_aggregated.csv', 'enrolment_aggregated.csv']