Schema validation of the raw datasets,
followed by concatenation into the base data layer


In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from typing import List, Tuple


In [2]:
# Resolve the project root: when the kernel's working directory is the
# "notebooks" folder, step up one level; otherwise use the working
# directory itself (adjust if needed).
if Path.cwd().name == "notebooks":
    PROJECT_ROOT = Path.cwd().parent
else:
    PROJECT_ROOT = Path.cwd()

DATA_ROOT = PROJECT_ROOT / "data"

# One raw-CSV folder per dataset. Note the "enrollment" key points at a
# folder spelled "enrolment" on disk.
RAW_PATHS = dict(
    biometric=DATA_ROOT / "aadhar biometric data",
    demographic=DATA_ROOT / "aadhar demographic data",
    enrollment=DATA_ROOT / "aadhar enrolment data",
)

# Output folder for intermediate files; created (with parents) when missing.
INTERMEDIATE_PATH = DATA_ROOT / "intermediate"
INTERMEDIATE_PATH.mkdir(parents=True, exist_ok=True)

RAW_PATHS, INTERMEDIATE_PATH


({'biometric': WindowsPath('c:/Users/BIT/OneDrive/Desktop/UIDAI Hackathon/data/aadhar biometric data'),
  'demographic': WindowsPath('c:/Users/BIT/OneDrive/Desktop/UIDAI Hackathon/data/aadhar demographic data'),
  'enrollment': WindowsPath('c:/Users/BIT/OneDrive/Desktop/UIDAI Hackathon/data/aadhar enrolment data')},
 WindowsPath('c:/Users/BIT/OneDrive/Desktop/UIDAI Hackathon/data/intermediate'))

In [3]:
def load_csvs_from_folder(folder_path: Path) -> List[pd.DataFrame]:
    """Read every ``*.csv`` file directly inside ``folder_path``.

    Files are processed in sorted path order. Each resulting DataFrame gets
    a ``source_file`` column holding the name of the file it was read from,
    so rows remain traceable after later concatenation.

    Args:
        folder_path: Directory to search (non-recursive) for CSV files.

    Returns:
        One DataFrame per CSV file, in sorted path order.

    Raises:
        FileNotFoundError: If the folder contains no ``*.csv`` files.
    """
    csv_paths = sorted(folder_path.glob("*.csv"))
    if len(csv_paths) == 0:
        raise FileNotFoundError(f"No CSV files found in {folder_path}")

    def _read_tagged(csv_path: Path) -> pd.DataFrame:
        # Load one file and stamp it with its origin for traceability.
        frame = pd.read_csv(csv_path)
        frame["source_file"] = csv_path.name
        return frame

    dfs = [_read_tagged(csv_path) for csv_path in csv_paths]

    print(f"Loaded {len(dfs)} files from {folder_path.name}")
    return dfs


In [4]:
def check_schema_consistency(dfs: List[pd.DataFrame]) -> Tuple[bool, List[set]]:
    """Check that every DataFrame exposes the same set of column names.

    The first frame's columns act as the reference schema. The comparison is
    set-based, so column order and duplicate column labels are ignored.

    Args:
        dfs: DataFrames to compare. May be empty.

    Returns:
        A ``(is_consistent, schemas)`` tuple where ``schemas`` holds the
        column-name set of each input frame, in input order. An empty input
        yields ``(True, [])`` because there is nothing to disagree.
    """
    schemas = [set(df.columns) for df in dfs]

    # Guard the empty case explicitly; indexing schemas[0] would otherwise
    # fail with an unhelpful IndexError.
    if not schemas:
        print("⚠️ No DataFrames provided; nothing to compare")
        return True, schemas

    base_schema = schemas[0]
    mismatches = [
        (position, schema)
        for position, schema in enumerate(schemas)
        if schema != base_schema
    ]

    if mismatches:
        print("❌ Schema mismatch detected")
        # Say which frame differs and how, relative to the first frame.
        # key=str keeps sorting safe when labels have mixed types.
        for position, schema in mismatches:
            missing = sorted(base_schema - schema, key=str)
            unexpected = sorted(schema - base_schema, key=str)
            print(f"   frame {position}: missing={missing}, unexpected={unexpected}")
        return False, schemas

    print("✅ All schemas match")
    return True, schemas


In [5]:
def safe_concatenate(dfs: List[pd.DataFrame]) -> pd.DataFrame:
    """Stack the given DataFrames row-wise into a single frame.

    The original indexes are discarded; the result carries a fresh
    0..n-1 index. No schema check happens here.

    Args:
        dfs: DataFrames to stack, in the order they should appear.

    Returns:
        The row-wise concatenation of ``dfs``.
    """
    stacked = pd.concat(dfs, ignore_index=True, axis="index")
    return stacked


In [6]:
def process_dataset(dataset_name: str, folder_path: Path) -> pd.DataFrame:
    """Load, validate and concatenate all CSV files of one dataset.

    Steps: read every CSV in ``folder_path``, verify that all files share
    the same column set, stack them row-wise, then print the final shape and
    display the ten columns with the highest percentage of missing values.

    Relies on ``display`` being available in the notebook namespace; it is
    not imported in this notebook's import cell.

    Args:
        dataset_name: Label used in the progress and error messages.
        folder_path: Directory containing the dataset's CSV files.

    Returns:
        The concatenated DataFrame for the dataset.

    Raises:
        ValueError: If the loaded files do not all share the same columns.
    """
    print(f"\n--- Processing {dataset_name.upper()} DATA ---")

    frames = load_csvs_from_folder(folder_path)

    # Only the verdict matters here; the per-file schemas are not used.
    schemas_ok, _ = check_schema_consistency(frames)
    if not schemas_ok:
        raise ValueError(f"Schema mismatch in {dataset_name} dataset")

    df_final = safe_concatenate(frames)
    print(f"Final shape: {df_final.shape}")

    print("Missing values (%):")
    missing_pct = df_final.isna().mean() * 100
    worst_first = missing_pct.sort_values(ascending=False)
    display(worst_first.head(10))

    return df_final


In [7]:
# Build one validated, concatenated base table per raw dataset folder.
biometric_df = process_dataset(
    dataset_name="biometric",
    folder_path=RAW_PATHS["biometric"],
)
demographic_df = process_dataset(
    dataset_name="demographic",
    folder_path=RAW_PATHS["demographic"],
)
enrollment_df = process_dataset(
    dataset_name="enrollment",
    folder_path=RAW_PATHS["enrollment"],
)



--- Processing BIOMETRIC DATA ---
Loaded 4 files from aadhar biometric data
✅ All schemas match
Final shape: (1861108, 7)
Missing values (%):


date            0.0
state           0.0
district        0.0
pincode         0.0
bio_age_5_17    0.0
bio_age_17_     0.0
source_file     0.0
dtype: float64


--- Processing DEMOGRAPHIC DATA ---
Loaded 4 files from aadhar demographic data
✅ All schemas match
Final shape: (1571700, 7)
Missing values (%):


date             0.0
state            0.0
district         0.0
pincode          0.0
demo_age_5_17    0.0
demo_age_17_     0.0
source_file      0.0
dtype: float64


--- Processing ENROLLMENT DATA ---
Loaded 3 files from aadhar enrolment data
✅ All schemas match
Final shape: (1006029, 8)
Missing values (%):


date              0.0
state             0.0
district          0.0
pincode           0.0
age_0_5           0.0
age_5_17          0.0
age_18_greater    0.0
source_file       0.0
dtype: float64

In [8]:
# Destination files for the three concatenated base tables.
biometric_path = INTERMEDIATE_PATH.joinpath("biometric_base.parquet")
demographic_path = INTERMEDIATE_PATH.joinpath("demographic_base.parquet")
enrollment_path = INTERMEDIATE_PATH.joinpath("enrollment_base.parquet")

# Persist each table as parquet, leaving the row index out of the file.
biometric_df.to_parquet(biometric_path, index=False)
demographic_df.to_parquet(demographic_path, index=False)
enrollment_df.to_parquet(enrollment_path, index=False)

# Echo the written locations as the cell output.
biometric_path, demographic_path, enrollment_path


(WindowsPath('c:/Users/BIT/OneDrive/Desktop/UIDAI Hackathon/data/intermediate/biometric_base.parquet'),
 WindowsPath('c:/Users/BIT/OneDrive/Desktop/UIDAI Hackathon/data/intermediate/demographic_base.parquet'),
 WindowsPath('c:/Users/BIT/OneDrive/Desktop/UIDAI Hackathon/data/intermediate/enrollment_base.parquet'))

In [9]:
# One record per dataset: its label, row count and column count.
summary = pd.DataFrame(
    [
        {
            "dataset": "biometric",
            "rows": biometric_df.shape[0],
            "columns": biometric_df.shape[1],
        },
        {
            "dataset": "demographic",
            "rows": demographic_df.shape[0],
            "columns": demographic_df.shape[1],
        },
        {
            "dataset": "enrollment",
            "rows": enrollment_df.shape[0],
            "columns": enrollment_df.shape[1],
        },
    ]
)

summary


Unnamed: 0,dataset,rows,columns
0,biometric,1861108,7
1,demographic,1571700,7
2,enrollment,1006029,8
