In [8]:
import pandas as pd
import glob
import os

In [11]:
# Root folder that holds the three raw dataset sub-folders. The default is the
# folder this notebook was originally run against; set the AADHAAR_DATA_DIR
# environment variable to point the notebook at another location without
# editing this cell.
DATA_ROOT = os.environ.get("AADHAAR_DATA_DIR", r"D:\Apna College\Jupyter\Aadharcard\data")

# Glob patterns matching every CSV part file of each dataset. os.path.join
# inserts the separator appropriate for the platform the kernel runs on.
BIO_PATH = os.path.join(DATA_ROOT, "api_data_aadhar_biometric", "*.csv")
DEMO_PATH = os.path.join(DATA_ROOT, "api_data_aadhar_demographic", "*.csv")
ENR_PATH = os.path.join(DATA_ROOT, "api_data_aadhar_enrolment", "*.csv")

In [12]:
# Collect the CSV part files of each dataset. glob.glob() returns matches in
# arbitrary order, so sort them: this keeps the row order of the concatenated
# frames reproducible from run to run.
bio_files  = sorted(glob.glob(BIO_PATH))
demo_files = sorted(glob.glob(DEMO_PATH))
enr_files  = sorted(glob.glob(ENR_PATH))

print(f"Biometric files: {len(bio_files)}")
print(f"Demographic files: {len(demo_files)}")
print(f"Enrolment files: {len(enr_files)}")

# Safety check: stop here if any pattern matched nothing, and report which
# pattern(s) so the wrong path can be spotted immediately.
unmatched = [
    pattern
    for pattern, files in (
        (BIO_PATH, bio_files),
        (DEMO_PATH, demo_files),
        (ENR_PATH, enr_files),
    )
    if not files
]
if unmatched:
    raise FileNotFoundError(
        "One or more data folders are empty or paths are wrong. "
        f"No CSV files matched: {unmatched}"
    )

Biometric files: 4
Demographic files: 5
Enrolment files: 3


In [13]:
def _concat_csvs(paths):
    """Read every CSV in ``paths`` and stack them into one frame with a fresh 0..n-1 index."""
    parts = [pd.read_csv(path) for path in paths]
    return pd.concat(parts, ignore_index=True)


# One combined frame per dataset, built from all of its part files.
biometric_df = _concat_csvs(bio_files)
demographic_df = _concat_csvs(demo_files)
enrolment_df = _concat_csvs(enr_files)

print("Loaded shapes:")
for label, frame in (
    ("Biometric:", biometric_df),
    ("Demographic:", demographic_df),
    ("Enrolment:", enrolment_df),
):
    print(label, frame.shape)

Loaded shapes:
Biometric: (1861108, 6)
Demographic: (2071700, 6)
Enrolment: (1006029, 7)


In [14]:
def clean_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with normalised column labels.

    Each label is converted to ``str``, stripped of surrounding whitespace,
    lower-cased, has every run of non-word characters replaced by a single
    underscore, and finally has leading/trailing underscores removed.
    The input frame is left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose column labels should be normalised.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` carrying the normalised labels.
    """
    labels = df.columns.astype(str)
    labels = labels.str.strip().str.lower()
    # Collapse anything that is not a letter, digit or underscore into "_".
    labels = labels.str.replace(r"[^\w]+", "_", regex=True)
    labels = labels.str.strip("_")

    cleaned = df.copy()
    cleaned.columns = labels
    return cleaned

# Normalise the column labels of each dataset, rebinding each name to the cleaned copy.
biometric_df = biometric_df.pipe(clean_columns)
demographic_df = demographic_df.pipe(clean_columns)
enrolment_df = enrolment_df.pipe(clean_columns)

In [15]:
# Remove rows that are entirely empty and rows that are exact duplicates from
# each dataset. The frames are modified in place, so the three *_df names keep
# pointing at the same (now smaller) objects.
datasets = {
    "biometric": biometric_df,
    "demographic": demographic_df,
    "enrolment": enrolment_df,
}
for name, frame in datasets.items():
    shape_before = frame.shape
    frame.dropna(how="all", inplace=True)
    frame.drop_duplicates(inplace=True)
    print(f"{name}: {shape_before} -> {frame.shape}")

biometric: (1861108, 6) -> (1766212, 6)
demographic: (2071700, 6) -> (1598099, 6)
enrolment: (1006029, 7) -> (983072, 7)


In [16]:
# Show the cleaned column labels of each dataset.
for heading, frame in (
    ("\nBiometric columns:\n", biometric_df),
    ("\nDemographic columns:\n", demographic_df),
    ("\nEnrolment columns:\n", enrolment_df),
):
    print(heading, frame.columns.tolist())


Biometric columns:
 ['date', 'state', 'district', 'pincode', 'bio_age_5_17', 'bio_age_17']

Demographic columns:
 ['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17']

Enrolment columns:
 ['date', 'state', 'district', 'pincode', 'age_0_5', 'age_5_17', 'age_18_greater']


In [17]:
# Write each cleaned dataset to ../data (created if missing) without the row
# index, then list the files that were written.
output_targets = [
    (biometric_df, "../data/clean_biometric.csv"),
    (demographic_df, "../data/clean_demographic.csv"),
    (enrolment_df, "../data/clean_enrolment.csv"),
]

os.makedirs("../data", exist_ok=True)
for frame, csv_path in output_targets:
    frame.to_csv(csv_path, index=False)

print("\nClean files saved:")
for _, csv_path in output_targets:
    print(csv_path)


Clean files saved:
../data/clean_biometric.csv
../data/clean_demographic.csv
../data/clean_enrolment.csv
