**Demographic Data Preprocessing and Maintaining the Consistency of State Names throughout the Dataset**

In [None]:
import pandas as pd

# Chunked demographic CSV exports to combine; the filename suffixes appear to
# be consecutive row-offset ranges, and the files are listed in that order.
demo_files = [
    'api_data_aadhar_demographic_0_500000.csv',
    'api_data_aadhar_demographic_500000_1000000.csv',
    'api_data_aadhar_demographic_1000000_1500000.csv',
    'api_data_aadhar_demographic_1500000_2000000.csv',
    'api_data_aadhar_demographic_2000000_2071700.csv'
]

# Parse each chunk, then stack them in list order. ignore_index=True gives the
# combined frame a fresh 0..n-1 index instead of repeating each chunk's own.
dfs = list(map(pd.read_csv, demo_files))
merged_demo_df = pd.concat(dfs, ignore_index=True)

# Write the combined frame to a single CSV, without the index column.
merged_demo_df.to_csv('merged_aadhar_demographic.csv', index=False)

# Report how many files went in and how many rows came out.
print(f"Successfully merged {len(demo_files)} files.")
print(f"Total rows: {len(merged_demo_df)}")

Successfully merged 5 files.
Total rows: 2071700


In [None]:
import pandas as pd

# 1-2. Load the merged file and normalise the `state` column in one pass:
# cast to str first so the .str methods apply whatever dtype read_csv
# inferred, trim surrounding whitespace, then Title Case the text so that
# differently-cased spellings of the same name compare equal.
df = (
    pd.read_csv('merged_aadhar_demographic.csv')
    .assign(state=lambda frame: frame['state'].astype(str).str.strip().str.title())
)

# 3. The Master Mapping Dictionary
# Redirects misspelt, legacy, and city-level entries to canonical state names.
#
# Keys are written in the form produced by `.str.strip().str.title()`, because
# that is what the `state` column holds when this mapping is applied.
# Values must be Title Case as well ("And", not "and"): `final_valid_list`,
# which the later `isin` filter checks against, spells every name that way.
# A lowercase-"and" target such as 'Jammu and Kashmir' never matches the list,
# so the remapped rows would be silently dropped instead of being merged into
# the canonical state.
mapping = {
    # Preferred Naming / Spelling fixes
    'Andaman & Nicobar Islands': 'Andaman And Nicobar Islands',
    'Andman & Nicobar': 'Andaman And Nicobar Islands',
    'Andaman And Nicobar': 'Andaman And Nicobar Islands',
    'Chhatisgarh': 'Chhattisgarh',
    'Orissa': 'Odisha',
    'Odissa': 'Odisha',
    'West Bangal': 'West Bengal',
    'West Bengli': 'West Bengal',
    'Westbengal': 'West Bengal',
    'West  Bengal': 'West Bengal',  # two spaces between the words
    'Uttaranchal': 'Uttarakhand',
    'Jammu & Kashmir': 'Jammu And Kashmir',

    # Merging the Union Territory (Single Territory Fix)
    'Dadra & Nagar Haveli': 'Dadra And Nagar Haveli And Daman And Diu',
    'Daman & Diu': 'Dadra And Nagar Haveli And Daman And Diu',
    'Dadra And Nagar Haveli': 'Dadra And Nagar Haveli And Daman And Diu',
    'Daman And Diu': 'Dadra And Nagar Haveli And Daman And Diu',
    'Dnh And Dd': 'Dadra And Nagar Haveli And Daman And Diu',

    # City/District to State Re-routing (Preserving the 20L+ rows)
    'Nagpur': 'Maharashtra',
    'Jaipur': 'Rajasthan',
    'Darbhanga': 'Bihar',
    'Madanapalle': 'Andhra Pradesh',
    'Puttenahalli': 'Karnataka',
    'Raja Annamalai Puram': 'Tamil Nadu',
    'Balanagar': 'Telangana',
    'Gorakhpur': 'Uttar Pradesh',
    'Rajkot': 'Gujarat',
    'Tumakuru': 'Karnataka'
}

# Swap every state value that appears as a key in `mapping` for its target;
# values with no entry pass through unchanged.
df['state'] = df['state'].replace(to_replace=mapping)

# 4. Allow-list of the 36 accepted state / union-territory labels.
# Names are spelled in Title Case ("And") to match the title-cased `state`
# column; anything not listed here (e.g. stray values like '100000') is
# treated as noise by the filter below.
final_valid_list = [
    'Andaman And Nicobar Islands',
    'Andhra Pradesh',
    'Arunachal Pradesh',
    'Assam',
    'Bihar',
    'Chandigarh',
    'Chhattisgarh',
    'Dadra And Nagar Haveli And Daman And Diu',
    'Delhi',
    'Goa',
    'Gujarat',
    'Haryana',
    'Himachal Pradesh',
    'Jammu And Kashmir',
    'Jharkhand',
    'Karnataka',
    'Kerala',
    'Ladakh',
    'Lakshadweep',
    'Madhya Pradesh',
    'Maharashtra',
    'Manipur',
    'Meghalaya',
    'Mizoram',
    'Nagaland',
    'Odisha',
    'Puducherry',
    'Punjab',
    'Rajasthan',
    'Sikkim',
    'Tamil Nadu',
    'Telangana',
    'Tripura',
    'Uttar Pradesh',
    'Uttarakhand',
    'West Bengal',
]

# Keep only rows whose state is on the allow-list. City/district rows survive
# when the mapping above has rewritten them to a listed state name.
df = df.loc[df['state'].isin(final_valid_list)]

# 5. Sanity check: print each distinct state label on its own line, sorted
# alphabetically, so the final categories can be checked by eye.
print("--- FINAL VERIFIED STATE LIST (VERTICAL VIEW) ---")
surviving_states = sorted(df['state'].unique())
for state_name in surviving_states:
    print(state_name)

# Headline numbers: rows kept (with thousands separators) and the number of
# distinct state labels remaining.
print(f"\nFinal Row Count: {len(df):,}")
print(f"Total Unique Categories: {df['state'].nunique()}")

# Persist the cleaned frame; index=False keeps the row index out of the CSV.
df.to_csv('cleaned_demographic_ready.csv', index=False)

--- FINAL VERIFIED STATE LIST (VERTICAL VIEW) ---
Andaman And Nicobar Islands
Andhra Pradesh
Arunachal Pradesh
Assam
Bihar
Chandigarh
Chhattisgarh
Dadra And Nagar Haveli And Daman And Diu
Delhi
Goa
Gujarat
Haryana
Himachal Pradesh
Jammu And Kashmir
Jharkhand
Karnataka
Kerala
Ladakh
Lakshadweep
Madhya Pradesh
Maharashtra
Manipur
Meghalaya
Mizoram
Nagaland
Odisha
Puducherry
Punjab
Rajasthan
Sikkim
Tamil Nadu
Telangana
Tripura
Uttar Pradesh
Uttarakhand
West Bengal

Final Row Count: 2,067,537
Total Unique Categories: 36
