# UIDAI Data Analysis
## Data Cleaning and Preprocessing

This notebook cleans the three UIDAI datasets (demographic, biometric, and enrollment) by standardizing their column names:
- Converting all column names to lowercase
- Removing spaces from column names (replacing with underscores)

In [1]:
import pandas as pd
import numpy as np
import os

# Notebook-wide pandas display limits: never truncate columns, cap rows at 100
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

## 1. Load and Clean Demographic Data

In [2]:
# Read the merged demographic CSV (path is relative to the working directory)
print("Loading Merged_Aadhar_demographic.csv...")
df_demographic = pd.read_csv(filepath_or_buffer='Merged_Aadhar_demographic.csv')

# Report the raw dimensions and header names before any renaming is applied
print("\nOriginal shape:", df_demographic.shape)
print("\nOriginal columns:")
print(list(df_demographic.columns))

Loading Merged_Aadhar_demographic.csv...

Original shape: (2071700, 6)

Original columns:
['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']


In [3]:
# Normalise the header names: trim surrounding whitespace, lowercase, then
# turn interior spaces into underscores. Trimming first keeps a header such as
# " Date " from becoming "_date_". The chain is idempotent, so re-running this
# cell leaves already-clean names untouched.
df_demographic.columns = (
    df_demographic.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
)

print("\nCleaned columns:")
print(df_demographic.columns.tolist())
print("\nPreview:")
# Bare last expression so the notebook renders the first rows as a table
df_demographic.head()


Cleaned columns:
['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']

Preview:


Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,01-03-2025,Uttar Pradesh,Gorakhpur,273213,49,529
1,01-03-2025,Andhra Pradesh,Chittoor,517132,22,375
2,01-03-2025,Gujarat,Rajkot,360006,65,765
3,01-03-2025,Andhra Pradesh,Srikakulam,532484,24,314
4,01-03-2025,Rajasthan,Udaipur,313801,45,785


## 2. Load and Clean Biometric Data

In [4]:
# Read the merged biometric CSV (path is relative to the working directory)
print("Loading merged_aadhar_biometric.csv...")
df_biometric = pd.read_csv(filepath_or_buffer='merged_aadhar_biometric.csv')

# Report the raw dimensions and header names before any renaming is applied
print("\nOriginal shape:", df_biometric.shape)
print("\nOriginal columns:")
print(list(df_biometric.columns))

Loading merged_aadhar_biometric.csv...

Original shape: (1861108, 6)

Original columns:
['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_']


In [5]:
# Normalise the header names: trim surrounding whitespace, lowercase, then
# turn interior spaces into underscores. Trimming first keeps a header such as
# " Date " from becoming "_date_". The chain is idempotent, so re-running this
# cell leaves already-clean names untouched.
df_biometric.columns = (
    df_biometric.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
)

print("\nCleaned columns:")
print(df_biometric.columns.tolist())
print("\nPreview:")
# Bare last expression so the notebook renders the first rows as a table
df_biometric.head()


Cleaned columns:
['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_']

Preview:


Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,01-03-2025,Haryana,Mahendragarh,123029,280,577
1,01-03-2025,Bihar,Madhepura,852121,144,369
2,01-03-2025,Jammu and Kashmir,Punch,185101,643,1091
3,01-03-2025,Bihar,Bhojpur,802158,256,980
4,01-03-2025,Tamil Nadu,Madurai,625514,271,815


## 3. Load and Clean Enrollment Data

In [6]:
# Read the merged enrollment CSV (path is relative to the working directory)
print("Loading merged_enrollement.csv...")
df_enrollment = pd.read_csv(filepath_or_buffer='merged_enrollement.csv')

# Report the raw dimensions and header names before any renaming is applied
print("\nOriginal shape:", df_enrollment.shape)
print("\nOriginal columns:")
print(list(df_enrollment.columns))

Loading merged_enrollement.csv...

Original shape: (1006029, 7)

Original columns:
['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']


In [7]:
# Normalise the header names: trim surrounding whitespace, lowercase, then
# turn interior spaces into underscores. Trimming first keeps a header such as
# " Date " from becoming "_date_". The chain is idempotent, so re-running this
# cell leaves already-clean names untouched.
df_enrollment.columns = (
    df_enrollment.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
)

print("\nCleaned columns:")
print(df_enrollment.columns.tolist())
print("\nPreview:")
# Bare last expression so the notebook renders the first rows as a table
df_enrollment.head()


Cleaned columns:
['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']

Preview:


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,26-10-2025,Andhra Pradesh,Nalgonda,508004,0,1,0
1,26-10-2025,Andhra Pradesh,Nalgonda,508238,1,0,0
2,26-10-2025,Andhra Pradesh,Nalgonda,508278,1,0,0
3,26-10-2025,Andhra Pradesh,Nandyal,518432,0,1,0
4,26-10-2025,Andhra Pradesh,Nandyal,518543,1,0,0


## 4. Save Cleaned Data

In [8]:
# Write each frame to its cleaned_* CSV (row index omitted) and echo the
# dimensions of what was written, in demographic -> biometric -> enrollment order.
print("Saving cleaned datasets...")

for out_name, frame in (
    ('cleaned_demographic.csv', df_demographic),
    ('cleaned_biometric.csv', df_biometric),
    ('cleaned_enrollment.csv', df_enrollment),
):
    frame.to_csv(out_name, index=False)
    n_rows, n_cols = frame.shape
    print(f"✓ Saved {out_name} ({n_rows:,} rows, {n_cols} columns)")

print("\n✓ All data cleaned successfully!")

Saving cleaned datasets...
✓ Saved cleaned_demographic.csv (2,071,700 rows, 6 columns)
✓ Saved cleaned_biometric.csv (1,861,108 rows, 6 columns)
✓ Saved cleaned_enrollment.csv (1,006,029 rows, 7 columns)

✓ All data cleaned successfully!


## 5. Initial Data Exploration

In [9]:
# Structural overview of the demographic frame (.info() prints dtypes, entry
# count and memory usage), followed by numeric summary statistics.
# NOTE(review): describe() summarises every numeric column, so `pincode` is
# included; if pincode is an identifier rather than a quantity, its mean/std
# carry no meaning — confirm before interpreting.
# describe() is left as the bare last expression so the notebook renders it.
print("=== Demographic Data Summary ===")
df_demographic.info()
print("\nBasic statistics:")
df_demographic.describe()

=== Demographic Data Summary ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2071700 entries, 0 to 2071699
Data columns (total 6 columns):
 #   Column         Dtype 
---  ------         ----- 
 0   date           object
 1   state          object
 2   district       object
 3   pincode        int64 
 4   demo_age_5_17  int64 
 5   demo_age_17_   int64 
dtypes: int64(3), object(3)
memory usage: 94.8+ MB

Basic statistics:


Unnamed: 0,pincode,demo_age_5_17,demo_age_17_
count,2071700.0,2071700.0,2071700.0
mean,527831.8,2.347552,21.44701
std,197293.3,14.90355,125.2498
min,100000.0,0.0,0.0
25%,396469.0,0.0,2.0
50%,524322.0,1.0,6.0
75%,695507.0,2.0,15.0
max,855456.0,2690.0,16166.0


In [10]:
# Count null entries per column in each of the three frames
print("=== Missing Values ===")

for heading, frame in (
    ("\nDemographic:", df_demographic),
    ("\nBiometric:", df_biometric),
    ("\nEnrollment:", df_enrollment),
):
    print(heading)
    print(frame.isna().sum())

=== Missing Values ===

Demographic:
date             0
state            0
district         0
pincode          0
demo_age_5_17    0
demo_age_17_     0
dtype: int64

Biometric:
date            0
state           0
district        0
pincode         0
bio_age_5_17    0
bio_age_17_     0
dtype: int64

Enrollment:
date              0
state             0
district          0
pincode           0
age_0_5           0
age_5_17          0
age_18_greater    0
dtype: int64


## Next Steps

Now that the column names are standardized and the missing-value check found no nulls, you can:
- Perform exploratory data analysis (EDA)
- Create visualizations
- Analyze patterns and trends
- Merge datasets if needed
- Apply statistical analysis