# UIDAI Data Analysis
## Data Cleaning and Preprocessing

This notebook cleans the UIDAI datasets by:
- Converting all column names to lowercase
- Replacing spaces in column names with underscores

In [None]:
import pandas as pd
import numpy as np
import os

# Notebook-wide pandas display options: never elide columns when a frame is
# rendered, and show up to 100 rows before the output is truncated.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

## 1. Load and Clean Demographic Data

In [None]:
# Read the raw demographic CSV from the working directory, then report its
# shape and header names exactly as they appear before any renaming.
print("Loading Merged_Aadhar_demographic.csv...")
df_demographic = pd.read_csv("Merged_Aadhar_demographic.csv")

print(f"\nOriginal shape: {df_demographic.shape}")
# One call prints the heading and the column list on consecutive lines.
print("\nOriginal columns:", df_demographic.columns.tolist(), sep="\n")

In [None]:
# Normalise the header: trim surrounding whitespace, lowercase, then turn
# every remaining space into an underscore. Trimming first keeps a padded
# header such as " State " from becoming "_state_".
df_demographic.columns = (
    df_demographic.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
)

print("\nCleaned columns:")
print(df_demographic.columns.tolist())
print("\nPreview:")
# Left as the cell's final expression so the notebook renders the first rows.
df_demographic.head()

## 2. Load and Clean Biometric Data

In [None]:
# Read the raw biometric CSV from the working directory, then report its
# shape and header names exactly as they appear before any renaming.
print("Loading merged_aadhar_biometric.csv...")
df_biometric = pd.read_csv("merged_aadhar_biometric.csv")

print(f"\nOriginal shape: {df_biometric.shape}")
# One call prints the heading and the column list on consecutive lines.
print("\nOriginal columns:", df_biometric.columns.tolist(), sep="\n")

In [None]:
# Normalise the header: trim surrounding whitespace, lowercase, then turn
# every remaining space into an underscore. Trimming first keeps a padded
# header such as " State " from becoming "_state_".
df_biometric.columns = (
    df_biometric.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
)

print("\nCleaned columns:")
print(df_biometric.columns.tolist())
print("\nPreview:")
# Left as the cell's final expression so the notebook renders the first rows.
df_biometric.head()

## 3. Load and Clean Enrollment Data

In [None]:
# Read the raw enrollment CSV from the working directory, then report its
# shape and header names exactly as they appear before any renaming.
print("Loading merged_enrollement.csv...")
df_enrollment = pd.read_csv("merged_enrollement.csv")

print(f"\nOriginal shape: {df_enrollment.shape}")
# One call prints the heading and the column list on consecutive lines.
print("\nOriginal columns:", df_enrollment.columns.tolist(), sep="\n")

In [None]:
# Normalise the header: trim surrounding whitespace, lowercase, then turn
# every remaining space into an underscore. Trimming first keeps a padded
# header such as " State " from becoming "_state_".
df_enrollment.columns = (
    df_enrollment.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
)

print("\nCleaned columns:")
print(df_enrollment.columns.tolist())
print("\nPreview:")
# Left as the cell's final expression so the notebook renders the first rows.
df_enrollment.head()

## 4. Save Cleaned Data

In [None]:
# Write each cleaned frame to its own CSV in the working directory (without
# the index column) and confirm every file with its row and column counts.
print("Saving cleaned datasets...")

cleaned_outputs = (
    (df_demographic, 'cleaned_demographic.csv'),
    (df_biometric, 'cleaned_biometric.csv'),
    (df_enrollment, 'cleaned_enrollment.csv'),
)
for frame, filename in cleaned_outputs:
    frame.to_csv(filename, index=False)
    row_count, column_count = frame.shape
    print(f"✓ Saved {filename} ({row_count:,} rows, {column_count} columns)")

print("\n✓ All data cleaned successfully!")

## 5. Initial Data Exploration

In [None]:
# Overview of the demographic frame: info() prints its own report directly,
# and the describe() table is shown as the cell's output.
print("=== Demographic Data Summary ===")
df_demographic.info()

print("\nBasic statistics:")
demographic_stats = df_demographic.describe()
# Final expression, so the notebook renders the statistics table.
demographic_stats

In [None]:
# Per-column count of missing values for each of the three datasets.
print("=== Missing Values ===")

null_reports = (
    ("\nDemographic:", df_demographic),
    ("\nBiometric:", df_biometric),
    ("\nEnrollment:", df_enrollment),
)
for heading, frame in null_reports:
    print(heading)
    # isna() is the same check as isnull(); summing counts the missing cells per column.
    print(frame.isna().sum())

## Next Steps

Now that the data is cleaned, you can:
- Perform exploratory data analysis (EDA)
- Create visualizations
- Analyze patterns and trends
- Merge datasets if needed
- Apply statistical analysis