In [5]:
import glob
import os
import re

import pandas as pd

# Function to load and concatenate matching files from split CSVs
def load_dataset(pattern):
    # Look for files in ../data/raw/ matching the pattern
    search_path = os.path.join('../data/raw', pattern)
    files = glob.glob(search_path)
    print(f"Found {len(files)} files for pattern '{pattern}'")
    
    if not files:
        print(f"Warning: No files found for {pattern}")
        return pd.DataFrame()
    
    dfs = []
    for f in files:
        try:
            df = pd.read_csv(f)
            dfs.append(df)
            print(f"Loaded {os.path.basename(f)}: {len(df)} rows")
        except Exception as e:
            print(f"Error reading {f}: {e}")
            
    if not dfs:
        return pd.DataFrame()
        
    return pd.concat(dfs, ignore_index=True)

print("Loading datasets...")
enrolment = load_dataset('*enrolment*.csv')
demo_update = load_dataset('*demographic*.csv') 
bio_update = load_dataset('*biometric*.csv')
print("All datasets loaded.")

Loading datasets...
Found 3 files for pattern '*enrolment*.csv'
Loaded api_data_aadhar_enrolment_0_500000.csv: 500000 rows
Loaded api_data_aadhar_enrolment_1000000_1006029.csv: 6029 rows
Loaded api_data_aadhar_enrolment_500000_1000000.csv: 500000 rows
Found 5 files for pattern '*demographic*.csv'
Loaded api_data_aadhar_demographic_0_500000.csv: 500000 rows
Loaded api_data_aadhar_demographic_1000000_1500000.csv: 500000 rows
Loaded api_data_aadhar_demographic_1500000_2000000.csv: 500000 rows
Loaded api_data_aadhar_demographic_2000000_2071700.csv: 71700 rows
Loaded api_data_aadhar_demographic_500000_1000000.csv: 500000 rows
Found 4 files for pattern '*biometric*.csv'
Loaded api_data_aadhar_biometric_0_500000.csv: 500000 rows
Loaded api_data_aadhar_biometric_1000000_1500000.csv: 500000 rows
Loaded api_data_aadhar_biometric_1500000_1861108.csv: 361108 rows
Loaded api_data_aadhar_biometric_500000_1000000.csv: 500000 rows
All datasets loaded.


In [6]:
# Basic stats for each
def print_stats(name, df):
    if df.empty:
        print(f"\n=== {name} Dataset is EMPTY ===")
        return
        
    print(f"\n=== {name} Dataset ===")
    print(f"Rows: {len(df)}")
    if 'date' in df.columns:
        print(f"Date range: {df['date'].min()} to {df['date'].max()}")
    print(f"States: {df['state'].nunique() if 'state' in df.columns else 'N/A'}")
    print(f"Districts: {df['district'].nunique() if 'district' in df.columns else 'N/A'}")
    print(f"Age columns: {[c for c in df.columns if 'age' in c]}")

print_stats("Enrolment", enrolment)
print_stats("Demographic Update", demo_update)
print_stats("Biometric Update", bio_update)


=== Enrolment Dataset ===
Rows: 1006029
Date range: 01-04-2025 to 31-12-2025
States: 55
Districts: 985
Age columns: ['age_0_5', 'age_5_17', 'age_18_greater']

=== Demographic Update Dataset ===
Rows: 2071700
Date range: 01-03-2025 to 31-10-2025
States: 65
Districts: 983
Age columns: ['demo_age_5_17', 'demo_age_17_']

=== Biometric Update Dataset ===
Rows: 1861108
Date range: 01-03-2025 to 31-10-2025
States: 57
Districts: 974
Age columns: ['bio_age_5_17', 'bio_age_17_']


In [7]:
# Check for merging
if not enrolment.empty and not demo_update.empty and not bio_update.empty:
    print("\n=== Can we merge? ===")
    common_cols = set(enrolment.columns) & set(demo_update.columns) & set(bio_update.columns)
    print(f"Common columns: {common_cols}")
    # Should be: date, state, district, pincode


=== Can we merge? ===
Common columns: {'state', 'pincode', 'district', 'date'}


In [8]:
# Sample merge test
if not enrolment.empty:
    # Create lighter subsets for testing merge to avoid memory issues
    print("\nTesting merge on first 100 rows...")
    merged_sample = pd.merge(
        enrolment.head(100),
        demo_update.head(100),
        on=['date', 'state', 'district', 'pincode'],
        how='outer'
    )
    print(f"Merged shape: {merged_sample.shape}")


Testing merge on first 100 rows...
Merged shape: (200, 9)
