In [None]:
# Import required libraries
import sys
import os
import warnings

# NOTE(review): this silences every warning category for the whole kernel,
# including any raised while the data files are being read. Consider narrowing
# it to specific categories once the noisy sources are known.
warnings.filterwarnings('ignore')

# Add src to path so the project's local modules below can be imported.
# The entry is relative, so it resolves against the kernel's working directory.
# Guarded so that re-running this cell does not pile up duplicate entries.
SRC_DIR = '../src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Custom modules
from data_loader import AadhaarDataLoader
from preprocessing import AadhaarDataPreprocessor
from visualization import AadhaarVisualizer

# Display settings: show every column, cap printed rows at 100
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("✓ Libraries imported successfully")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

## 1. Initialize Data Loader, Preprocessor, and Visualizer

In [None]:
# Initialize
# The data directory can be overridden with the AADHAAR_DATA_DIR environment
# variable so the notebook runs on machines other than the original author's.
# When the variable is unset, the original location is used unchanged.
BASE_PATH = os.environ.get(
    'AADHAAR_DATA_DIR',
    '/Users/satyamsharma/Satverse AI/UIDAI Data Hackathon 2026',
)
loader = AadhaarDataLoader(BASE_PATH)
preprocessor = AadhaarDataPreprocessor()
visualizer = AadhaarVisualizer(output_dir='../outputs/figures')

print("✓ Initialized data loader, preprocessor, and visualizer")

## 2. Dataset Information

Get an overview of the available datasets before loading them.

In [None]:
# Ask the loader for its per-dataset summary and print it as a report.
# Each entry is read for its 'num_files', 'total_rows' and 'files' keys.
info = loader.get_dataset_info()

banner = "=" * 80
print(banner, "DATASET INFORMATION", banner, sep="\n")

for kind, summary in info.items():
    print(f"\n{kind.upper()}")
    print(f"  Number of files: {summary['num_files']}")
    print(f"  Total rows: {summary['total_rows']:,}")
    print("  Files:")
    for filename in summary['files']:
        print(f"    - {filename}")

# Grand total across every dataset type
total_rows = sum(summary['total_rows'] for summary in info.values())
print(f"\n{banner}")
print(f"TOTAL RECORDS ACROSS ALL DATASETS: {total_rows:,}")
print(banner)

## 3. Load Datasets

Load all datasets. We start with a sample for faster processing; the full data can be loaded later for the final analysis.

In [None]:
# Load datasets - adjust sample_frac as needed
# sample_frac=0.1 means 10% of data (faster for initial exploration)
# sample_frac=1.0 means full data (use for final analysis)

SAMPLE_FRACTION = 0.2  # Start with 20% of data

# Fail fast on a value outside (0, 1] instead of passing it on to the loader.
if not 0 < SAMPLE_FRACTION <= 1:
    raise ValueError(
        f"SAMPLE_FRACTION must be in the range (0, 1], got {SAMPLE_FRACTION!r}"
    )

print("Loading datasets...")
# The percent format spec avoids float artefacts such as 7.000000000000001%
# that plain multiplication by 100 produces for some fractions.
print(f"Sample fraction: {SAMPLE_FRACTION:.1%}\n")

datasets = loader.load_all_datasets(sample_frac=SAMPLE_FRACTION)

# Unpack the three frames used by the rest of the notebook
enrolment_df = datasets['enrolment']
demographic_df = datasets['demographic']
biometric_df = datasets['biometric']

print("\n✓ All datasets loaded successfully")

## 4. Initial Data Exploration

In [None]:
# Enrolment dataset: shape, columns, first rows, dtypes and summary statistics
rule = "=" * 80
print(rule, "ENROLMENT DATASET", rule, sep="\n")

enrolment_shape = enrolment_df.shape
enrolment_columns = enrolment_df.columns.tolist()
print(f"Shape: {enrolment_shape}")
print(f"\nColumns: {enrolment_columns}")

print("\nFirst few rows:")
display(enrolment_df.head())

print("\nData types:")
print(enrolment_df.dtypes)

print("\nBasic statistics:")
display(enrolment_df.describe())

In [None]:
# Demographic dataset: shape, columns, first rows, dtypes and summary statistics
rule = "=" * 80
print(rule, "DEMOGRAPHIC UPDATE DATASET", rule, sep="\n")

demographic_shape = demographic_df.shape
demographic_columns = demographic_df.columns.tolist()
print(f"Shape: {demographic_shape}")
print(f"\nColumns: {demographic_columns}")

print("\nFirst few rows:")
display(demographic_df.head())

print("\nData types:")
print(demographic_df.dtypes)

print("\nBasic statistics:")
display(demographic_df.describe())

In [None]:
# Biometric dataset: shape, columns, first rows, dtypes and summary statistics
rule = "=" * 80
print(rule, "BIOMETRIC UPDATE DATASET", rule, sep="\n")

biometric_shape = biometric_df.shape
biometric_columns = biometric_df.columns.tolist()
print(f"Shape: {biometric_shape}")
print(f"\nColumns: {biometric_columns}")

print("\nFirst few rows:")
display(biometric_df.head())

print("\nData types:")
print(biometric_df.dtypes)

print("\nBasic statistics:")
display(biometric_df.describe())

## 5. Data Validation

Check for data quality issues.

In [None]:
# Run the preprocessor's validation on the enrolment frame and list each result
print("VALIDATING ENROLMENT DATA")
print("=" * 80)
enrolment_validation = preprocessor.validate_data(enrolment_df, 'enrolment')

for check_name, outcome in enrolment_validation.items():
    print("{}: {}".format(check_name, outcome))

In [None]:
# Run the preprocessor's validation on the demographic frame and list each result
print("\nVALIDATING DEMOGRAPHIC DATA")
print("=" * 80)
demographic_validation = preprocessor.validate_data(demographic_df, 'demographic')

for check_name, outcome in demographic_validation.items():
    print("{}: {}".format(check_name, outcome))

In [None]:
# Run the preprocessor's validation on the biometric frame and list each result
print("\nVALIDATING BIOMETRIC DATA")
print("=" * 80)
biometric_validation = preprocessor.validate_data(biometric_df, 'biometric')

for check_name, outcome in biometric_validation.items():
    print("{}: {}".format(check_name, outcome))

## 6. Data Cleaning

Clean and standardize all datasets

In [None]:
# Pass each raw frame through the preprocessor's cleaning step, keeping the
# raw frames untouched under their original names.
print("Cleaning datasets...")

enrolment_clean = preprocessor.clean_data(enrolment_df, 'enrolment')
demographic_clean = preprocessor.clean_data(demographic_df, 'demographic')
biometric_clean = preprocessor.clean_data(biometric_df, 'biometric')

# Report row counts before and after cleaning for each dataset
print()
row_counts = (
    ("Enrolment", enrolment_df, enrolment_clean),
    ("Demographic", demographic_df, demographic_clean),
    ("Biometric", biometric_df, biometric_clean),
)
for label, raw_frame, clean_frame in row_counts:
    print(f"{label}: {len(raw_frame):,} → {len(clean_frame):,} rows")

print("\n✓ Data cleaning completed")

## 7. Feature Engineering

Add derived features for analysis

In [None]:
# Derive extra analysis columns for each cleaned frame via the preprocessor
print("Adding derived features...")

enrolment_enhanced = preprocessor.add_derived_features(enrolment_clean, 'enrolment')
demographic_enhanced = preprocessor.add_derived_features(demographic_clean, 'demographic')
biometric_enhanced = preprocessor.add_derived_features(biometric_clean, 'biometric')

# List the resulting column names for each enhanced frame
enhanced_frames = (
    ("Enrolment", enrolment_enhanced),
    ("Demographic", demographic_enhanced),
    ("Biometric", biometric_enhanced),
)
for label, frame in enhanced_frames:
    print(f"\n{label} enhanced columns:")
    print(frame.columns.tolist())

print("\n✓ Feature engineering completed")

## 8. Geographic Analysis - Unique Values

In [None]:
# Geographic coverage of the enhanced enrolment frame: distinct counts plus
# the most frequent states/districts and a peek at the PIN code column.
print("GEOGRAPHIC COVERAGE ANALYSIS")
print("=" * 80)

states = enrolment_enhanced['state']
print(f"\nUnique States: {states.nunique()}")
print("Top 10 States by records:")
print(states.value_counts().head(10))

districts = enrolment_enhanced['district']
print(f"\n\nUnique Districts: {districts.nunique()}")
print("Top 10 Districts by records:")
print(districts.value_counts().head(10))

pincodes = enrolment_enhanced['pincode']
print(f"\n\nUnique PIN Codes: {pincodes.nunique()}")
print("Sample PIN codes:")
# First 20 rows as-is; values may repeat
print(pincodes.head(20).tolist())

## 9. Temporal Coverage

In [None]:
# Temporal analysis
# For each enhanced frame, report the span of the 'date' column and the average
# number of records per distinct date.
print("TEMPORAL COVERAGE ANALYSIS")
print("="*80)

for label, frame in [('Enrolment', enrolment_enhanced),
                     ('Demographic', demographic_enhanced),
                     ('Biometric', biometric_enhanced)]:
    dates = frame['date']
    # Compute each aggregate once instead of re-scanning the column per print
    first_date, last_date = dates.min(), dates.max()
    distinct_dates = dates.nunique()

    print(f"\n{label}:")
    if distinct_dates == 0:
        # An empty frame or a column with no valid dates would otherwise
        # trigger a ZeroDivisionError in the per-day average below.
        print("  No valid dates available")
        continue

    print(f"  Date range: {first_date} to {last_date}")
    print(f"  Number of days: {(last_date - first_date).days}")
    print(f"  Records per day (avg): {len(frame) / distinct_dates:.1f}")

## 10. Save Processed Data

Save cleaned and enhanced datasets for analysis

In [None]:
# Write each enhanced frame to a parquet file under the outputs directory
output_dir = '../outputs'
os.makedirs(output_dir, exist_ok=True)

print("Saving processed datasets...")

# Target filename -> frame, in the order the files are written and reported
processed_frames = {
    'enrolment_processed.parquet': enrolment_enhanced,
    'demographic_processed.parquet': demographic_enhanced,
    'biometric_processed.parquet': biometric_enhanced,
}
for filename, frame in processed_frames.items():
    frame.to_parquet(f'{output_dir}/{filename}', index=False)

print("✓ Processed datasets saved to outputs/")
for filename in processed_frames:
    print(f"  - {filename}")

## 11. Summary

**Data Loading Summary:**
- Loaded enrolment, demographic, and biometric update datasets
- Validated data quality and identified issues
- Cleaned data (removed duplicates, invalid dates, negative values)
- Added derived features (temporal features, totals, proportions)
- Saved processed data for analysis

**Next Steps:**
1. Exploratory Data Analysis (EDA)
2. Temporal and Spatial Analysis
3. Anomaly Detection
4. Predictive Modeling
5. Insights Generation