In [1]:
# Import required libraries
import sys
import os
import warnings
warnings.filterwarnings('ignore')

# Add src to path
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Custom modules
from data_loader import AadhaarDataLoader
from preprocessing import AadhaarDataPreprocessor
from visualization import AadhaarVisualizer

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("✓ Libraries imported successfully")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

✓ Libraries imported successfully
Pandas version: 2.1.4
NumPy version: 1.26.2


## 1. Initialize Data Loader and Preprocessor

In [2]:
# Initialize
BASE_PATH = '/Users/satyamsharma/Satverse AI/UIDAI Data Hackathon 2026'
loader = AadhaarDataLoader(BASE_PATH)
preprocessor = AadhaarDataPreprocessor()
visualizer = AadhaarVisualizer(output_dir='../outputs/figures')

print("✓ Initialized data loader, preprocessor, and visualizer")

✓ Initialized data loader, preprocessor, and visualizer


## 2. Dataset Information

Get overview of available datasets before loading

In [3]:
# Get dataset information
info = loader.get_dataset_info()

print("="*80)
print("DATASET INFORMATION")
print("="*80)

for dataset_type, details in info.items():
    print(f"\n{dataset_type.upper()}")
    print(f"  Number of files: {details['num_files']}")
    print(f"  Total rows: {details['total_rows']:,}")
    print(f"  Files:")
    for file in details['files']:
        print(f"    - {file}")

total_rows = sum(d['total_rows'] for d in info.values())
print(f"\n{'='*80}")
print(f"TOTAL RECORDS ACROSS ALL DATASETS: {total_rows:,}")
print(f"{'='*80}")

DATASET INFORMATION

ENROLMENT
  Number of files: 3
  Total rows: 1,006,029
  Files:
    - api_data_aadhar_enrolment_1000000_1006029.csv
    - api_data_aadhar_enrolment_500000_1000000.csv
    - api_data_aadhar_enrolment_0_500000.csv

DEMOGRAPHIC
  Number of files: 5
  Total rows: 2,071,700
  Files:
    - api_data_aadhar_demographic_1500000_2000000.csv
    - api_data_aadhar_demographic_1000000_1500000.csv
    - api_data_aadhar_demographic_2000000_2071700.csv
    - api_data_aadhar_demographic_0_500000.csv
    - api_data_aadhar_demographic_500000_1000000.csv

BIOMETRIC
  Number of files: 4
  Total rows: 1,861,108
  Files:
    - api_data_aadhar_biometric_500000_1000000.csv
    - api_data_aadhar_biometric_1500000_1861108.csv
    - api_data_aadhar_biometric_0_500000.csv
    - api_data_aadhar_biometric_1000000_1500000.csv

TOTAL RECORDS ACROSS ALL DATASETS: 4,938,837


## 3. Load Datasets

Load all datasets. We'll start with a sample for faster processing, then can load full data.

In [4]:
# Load datasets - adjust sample_frac as needed
# sample_frac=0.1 means 10% of data (faster for initial exploration)
# sample_frac=1.0 means full data (use for final analysis)

SAMPLE_FRACTION = 0.2  # Start with 20% of data

print("Loading datasets...")
print(f"Sample fraction: {SAMPLE_FRACTION * 100}%\n")

datasets = loader.load_all_datasets(sample_frac=SAMPLE_FRACTION)

enrolment_df = datasets['enrolment']
demographic_df = datasets['demographic']
biometric_df = datasets['biometric']

print("\n✓ All datasets loaded successfully")

Loading datasets...
Sample fraction: 20.0%

Loading enrolment dataset from 3 files...


  0%|          | 0/3 [00:00<?, ?it/s]

 33%|███▎      | 1/3 [00:00<00:00,  7.65it/s]

100%|██████████| 3/3 [00:00<00:00, 12.76it/s]

100%|██████████| 3/3 [00:00<00:00, 11.94it/s]




Loaded 201,206 records for enrolment
Loading demographic dataset from 5 files...


  0%|          | 0/5 [00:00<?, ?it/s]

 20%|██        | 1/5 [00:00<00:00,  8.91it/s]

 40%|████      | 2/5 [00:00<00:00,  8.98it/s]

 60%|██████    | 3/5 [00:00<00:00,  9.01it/s]

100%|██████████| 5/5 [00:00<00:00, 11.81it/s]

100%|██████████| 5/5 [00:00<00:00, 10.80it/s]




Loaded 414,340 records for demographic
Loading biometric dataset from 4 files...


  0%|          | 0/4 [00:00<?, ?it/s]

 25%|██▌       | 1/4 [00:00<00:00,  8.99it/s]

 50%|█████     | 2/4 [00:00<00:00,  9.05it/s]

100%|██████████| 4/4 [00:00<00:00,  9.92it/s]

100%|██████████| 4/4 [00:00<00:00,  9.72it/s]

Loaded 372,222 records for biometric

✓ All datasets loaded successfully





## 4. Initial Data Exploration

In [5]:
# Enrolment dataset
print("="*80)
print("ENROLMENT DATASET")
print("="*80)
print(f"Shape: {enrolment_df.shape}")
print(f"\nColumns: {enrolment_df.columns.tolist()}")
print(f"\nFirst few rows:")
display(enrolment_df.head())
print(f"\nData types:")
print(enrolment_df.dtypes)
print(f"\nBasic statistics:")
display(enrolment_df.describe())

ENROLMENT DATASET
Shape: (201206, 7)

Columns: ['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']

First few rows:


Unnamed: 0,date,state,district,pincode,age_0_5,age_5_17,age_18_greater
0,08-09-2025,Telangana,Sangareddy,502273,2,0,0
1,15-09-2025,Uttarakhand,Chamoli,246486,0,1,0
2,10-09-2025,Tamil Nadu,Tiruvannamalai,606704,2,0,0
3,10-09-2025,Telangana,Nizamabad,503180,2,0,0
4,17-10-2025,Karnataka,Belgaum,591222,2,0,0



Data types:
date              object
state             object
district          object
pincode            int64
age_0_5            int64
age_5_17           int64
age_18_greater     int64
dtype: object

Basic statistics:


Unnamed: 0,pincode,age_0_5,age_5_17,age_18_greater
count,201206.0,201206.0,201206.0,201206.0
mean,518717.662664,3.561792,1.726887,0.180715
std,205831.597391,18.889037,14.780721,3.461224
min,100000.0,0.0,0.0,0.0
25%,364001.0,1.0,0.0,0.0
50%,517417.0,2.0,0.0,0.0
75%,700115.75,3.0,1.0,0.0
max,855456.0,2688.0,1376.0,430.0


In [6]:
# Demographic dataset
print("="*80)
print("DEMOGRAPHIC UPDATE DATASET")
print("="*80)
print(f"Shape: {demographic_df.shape}")
print(f"\nColumns: {demographic_df.columns.tolist()}")
print(f"\nFirst few rows:")
display(demographic_df.head())
print(f"\nData types:")
print(demographic_df.dtypes)
print(f"\nBasic statistics:")
display(demographic_df.describe())

DEMOGRAPHIC UPDATE DATASET
Shape: (414340, 6)

Columns: ['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']

First few rows:


Unnamed: 0,date,state,district,pincode,demo_age_5_17,demo_age_17_
0,16-10-2025,Kerala,Thiruvananthapuram,695561,2,10
1,07-11-2025,Madhya Pradesh,Tikamgarh,472001,4,43
2,18-11-2025,Goa,North Goa,403104,0,2
3,21-10-2025,Assam,Nagaon,782141,0,1
4,28-12-2025,Andhra Pradesh,Guntur,522264,1,1



Data types:
date             object
state            object
district         object
pincode           int64
demo_age_5_17     int64
demo_age_17_      int64
dtype: object

Basic statistics:


Unnamed: 0,pincode,demo_age_5_17,demo_age_17_
count,414340.0,414340.0,414340.0
mean,527838.310535,2.356591,21.636704
std,197325.420169,15.67938,132.894713
min,100000.0,0.0,0.0
25%,396466.0,0.0,2.0
50%,524320.0,1.0,6.0
75%,695522.0,2.0,15.0
max,855456.0,2690.0,15090.0


In [7]:
# Biometric dataset
print("="*80)
print("BIOMETRIC UPDATE DATASET")
print("="*80)
print(f"Shape: {biometric_df.shape}")
print(f"\nColumns: {biometric_df.columns.tolist()}")
print(f"\nFirst few rows:")
display(biometric_df.head())
print(f"\nData types:")
print(biometric_df.dtypes)
print(f"\nBasic statistics:")
display(biometric_df.describe())

BIOMETRIC UPDATE DATASET
Shape: (372222, 6)

Columns: ['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_']

First few rows:


Unnamed: 0,date,state,district,pincode,bio_age_5_17,bio_age_17_
0,01-07-2025,Tamil Nadu,Salem,636303,142,417
1,04-09-2025,Uttar Pradesh,Fatehpur,212657,6,5
2,02-09-2025,Tamil Nadu,Tirunelveli,627009,0,1
3,02-09-2025,Andaman and Nicobar Islands,South Andaman,744106,2,0
4,15-09-2025,West Bengal,Paschim Medinipur,721503,2,5



Data types:
date            object
state           object
district        object
pincode          int64
bio_age_5_17     int64
bio_age_17_      int64
dtype: object

Basic statistics:


Unnamed: 0,pincode,bio_age_5_17,bio_age_17_
count,372222.0,372222.0,372222.0
mean,521993.778997,18.429684,19.054204
std,197998.630616,84.308845,87.948099
min,110001.0,0.0,0.0
25%,391530.0,1.0,1.0
50%,522412.0,4.0,4.0
75%,686611.0,11.0,10.0
max,855456.0,7657.0,7201.0


## 5. Data Validation

Check data quality issues

In [8]:
# Validate enrolment data
print("VALIDATING ENROLMENT DATA")
print("="*80)
enrolment_validation = preprocessor.validate_data(enrolment_df, 'enrolment')

for key, value in enrolment_validation.items():
    print(f"{key}: {value}")

VALIDATING ENROLMENT DATA
total_rows: 201206
missing_values: {'date': 0, 'state': 0, 'district': 0, 'pincode': 0, 'age_0_5': 0, 'age_5_17': 0, 'age_18_greater': 0}
duplicate_rows: 949
date_range: (Timestamp('2025-03-09 00:00:00'), Timestamp('2025-12-31 00:00:00'))
unique_states: 51
unique_districts: 933
unique_pincodes: 18936
negative_values: {}
invalid_dates: 0
invalid_pincodes: 0


In [9]:
# Validate demographic data
print("\nVALIDATING DEMOGRAPHIC DATA")
print("="*80)
demographic_validation = preprocessor.validate_data(demographic_df, 'demographic')

for key, value in demographic_validation.items():
    print(f"{key}: {value}")


VALIDATING DEMOGRAPHIC DATA


total_rows: 414340
missing_values: {'date': 0, 'state': 0, 'district': 0, 'pincode': 0, 'demo_age_5_17': 0, 'demo_age_17_': 0}
duplicate_rows: 18925
date_range: (Timestamp('2025-03-01 00:00:00'), Timestamp('2025-12-29 00:00:00'))
unique_states: 59
unique_districts: 958
unique_pincodes: 19598
negative_values: {}
invalid_dates: 0
invalid_pincodes: 0


In [10]:
# Validate biometric data
print("\nVALIDATING BIOMETRIC DATA")
print("="*80)
biometric_validation = preprocessor.validate_data(biometric_df, 'biometric')

for key, value in biometric_validation.items():
    print(f"{key}: {value}")


VALIDATING BIOMETRIC DATA


total_rows: 372222
missing_values: {'date': 0, 'state': 0, 'district': 0, 'pincode': 0, 'bio_age_5_17': 0, 'bio_age_17_': 0}
duplicate_rows: 3902
date_range: (Timestamp('2025-03-01 00:00:00'), Timestamp('2025-12-29 00:00:00'))
unique_states: 55
unique_districts: 949
unique_pincodes: 19508
negative_values: {}
invalid_dates: 0
invalid_pincodes: 0


## 6. Data Cleaning

Clean and standardize all datasets

In [11]:
# Clean datasets
print("Cleaning datasets...")

enrolment_clean = preprocessor.clean_data(enrolment_df, 'enrolment')
demographic_clean = preprocessor.clean_data(demographic_df, 'demographic')
biometric_clean = preprocessor.clean_data(biometric_df, 'biometric')

print(f"\nEnrolment: {len(enrolment_df):,} → {len(enrolment_clean):,} rows")
print(f"Demographic: {len(demographic_df):,} → {len(demographic_clean):,} rows")
print(f"Biometric: {len(biometric_df):,} → {len(biometric_clean):,} rows")

print("\n✓ Data cleaning completed")

Cleaning datasets...



Enrolment: 201,206 → 200,255 rows
Demographic: 414,340 → 395,409 rows
Biometric: 372,222 → 368,316 rows

✓ Data cleaning completed


## 7. Feature Engineering

Add derived features for analysis

In [12]:
# Add derived features
print("Adding derived features...")

enrolment_enhanced = preprocessor.add_derived_features(enrolment_clean, 'enrolment')
demographic_enhanced = preprocessor.add_derived_features(demographic_clean, 'demographic')
biometric_enhanced = preprocessor.add_derived_features(biometric_clean, 'biometric')

print("\nEnrolment enhanced columns:")
print(enrolment_enhanced.columns.tolist())

print("\nDemographic enhanced columns:")
print(demographic_enhanced.columns.tolist())

print("\nBiometric enhanced columns:")
print(biometric_enhanced.columns.tolist())

print("\n✓ Feature engineering completed")

Adding derived features...



Enrolment enhanced columns:
['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater', 'year', 'month', 'quarter', 'day_of_week', 'week_of_year', 'month_name', 'total_enrolments', 'prop_age_0_5', 'prop_age_5_17', 'prop_age_18_greater', 'state_district']

Demographic enhanced columns:
['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_', 'year', 'month', 'quarter', 'day_of_week', 'week_of_year', 'month_name', 'total_demo_updates', 'prop_demo_youth', 'prop_demo_adult', 'state_district']

Biometric enhanced columns:
['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17_', 'year', 'month', 'quarter', 'day_of_week', 'week_of_year', 'month_name', 'total_bio_updates', 'prop_bio_youth', 'prop_bio_adult', 'state_district']

✓ Feature engineering completed


## 8. Geographic Analysis - Unique Values

In [13]:
# Analyze geographic coverage
print("GEOGRAPHIC COVERAGE ANALYSIS")
print("="*80)

print(f"\nUnique States: {enrolment_enhanced['state'].nunique()}")
print(f"Top 10 States by records:")
print(enrolment_enhanced['state'].value_counts().head(10))

print(f"\n\nUnique Districts: {enrolment_enhanced['district'].nunique()}")
print(f"Top 10 Districts by records:")
print(enrolment_enhanced['district'].value_counts().head(10))

print(f"\n\nUnique PIN Codes: {enrolment_enhanced['pincode'].nunique()}")
print(f"Sample PIN codes:")
print(enrolment_enhanced['pincode'].head(20).tolist())

GEOGRAPHIC COVERAGE ANALYSIS

Unique States: 48
Top 10 States by records:
state
Uttar Pradesh     21890
Tamil Nadu        18174
Maharashtra       15452
West Bengal       15393
Karnataka         14115
Andhra Pradesh    12928
Bihar             11947
Rajasthan         11014
Madhya Pradesh    10079
Gujarat            9210
Name: count, dtype: int64


Unique Districts: 922
Top 10 Districts by records:
district
Pune                 1294
North 24 Parganas    1291
Bengaluru            1071
Barddhaman           1066
Hyderabad            1014
Malappuram            973
Murshidabad           922
K.V. Rangareddy       909
Hooghly               909
Tirunelveli           899
Name: count, dtype: int64


Unique PIN Codes: 18936
Sample PIN codes:
['502273', '246486', '606704', '503180', '591222', '741167', '481441', '700037', '493195', '388540', '122505', '803213', '464776', '394365', '583135', '470118', '481224', '480115', '768121', '673508']


## 9. Temporal Coverage

In [14]:
# Temporal analysis
print("TEMPORAL COVERAGE ANALYSIS")
print("="*80)

for name, df in [('Enrolment', enrolment_enhanced), 
                  ('Demographic', demographic_enhanced), 
                  ('Biometric', biometric_enhanced)]:
    print(f"\n{name}:")
    print(f"  Date range: {df['date'].min()} to {df['date'].max()}")
    print(f"  Number of days: {(df['date'].max() - df['date'].min()).days}")
    print(f"  Records per day (avg): {len(df) / df['date'].nunique():.1f}")

TEMPORAL COVERAGE ANALYSIS

Enrolment:
  Date range: 2025-03-09 00:00:00 to 2025-12-31 00:00:00
  Number of days: 297
  Records per day (avg): 2275.6

Demographic:
  Date range: 2025-03-01 00:00:00 to 2025-12-29 00:00:00
  Number of days: 303
  Records per day (avg): 4162.2

Biometric:
  Date range: 2025-03-01 00:00:00 to 2025-12-29 00:00:00
  Number of days: 303
  Records per day (avg): 4138.4


## 10. Save Processed Data

Save cleaned and enhanced datasets for analysis

In [15]:
# Save processed data
output_dir = '../outputs'
os.makedirs(output_dir, exist_ok=True)

print("Saving processed datasets...")

enrolment_enhanced.to_parquet(f'{output_dir}/enrolment_processed.parquet', index=False)
demographic_enhanced.to_parquet(f'{output_dir}/demographic_processed.parquet', index=False)
biometric_enhanced.to_parquet(f'{output_dir}/biometric_processed.parquet', index=False)

print("✓ Processed datasets saved to outputs/")
print("  - enrolment_processed.parquet")
print("  - demographic_processed.parquet")
print("  - biometric_processed.parquet")

Saving processed datasets...


✓ Processed datasets saved to outputs/
  - enrolment_processed.parquet
  - demographic_processed.parquet
  - biometric_processed.parquet


## 11. Summary

**Data Loading Summary:**
- Loaded enrolment, demographic, and biometric update datasets
- Validated data quality and identified issues
- Cleaned data (removed duplicates, invalid dates, negative values)
- Added derived features (temporal features, totals, proportions)
- Saved processed data for analysis

**Next Steps:**
1. Exploratory Data Analysis (EDA)
2. Temporal and Spatial Analysis
3. Anomaly Detection
4. Predictive Modeling
5. Insights Generation