This notebook contains:


1.   New Version of lat long sampler which ensures all points sampled are strictly within boundary and follow the population density distribution
2.   Code for assigning lat longs to households generated by the IPU method
3.   Code for assigning wards to households so that the number of people in each ward is roughly similar to the actual values
4.   Code for assigning ages to people in buckets
5.   Code for regression of age on height and weight


This notebook has been cleaned so that the variable names do not depend on any particular city. The next time a population is needed, only the input files being read have to be adjusted; nothing else needs to change to generate the output from the notebook.

In [None]:
# Mount Google Drive (Colab only). The household/person CSVs read further down
# and the final output CSV all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip3 install geopandas



In [None]:
!wget https://data.worldpop.org/GIS/Population_Density/Global_2000_2020_1km/2020/IND/ind_pd_2020_1km_ASCII_XYZ.zip
!unzip ind_pd_2020_1km_ASCII_XYZ.zip

--2021-05-22 06:21:04--  https://data.worldpop.org/GIS/Population_Density/Global_2000_2020_1km/2020/IND/ind_pd_2020_1km_ASCII_XYZ.zip
Resolving data.worldpop.org (data.worldpop.org)... ::ffff:152.78.118.157, 152.78.118.157
Connecting to data.worldpop.org (data.worldpop.org)|::ffff:152.78.118.157|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 53063505 (51M) [application/zip]
Saving to: ‘ind_pd_2020_1km_ASCII_XYZ.zip’


2021-05-22 06:21:38 (1.52 MB/s) - ‘ind_pd_2020_1km_ASCII_XYZ.zip’ saved [53063505/53063505]

Archive:  ind_pd_2020_1km_ASCII_XYZ.zip
  inflating: ind_pd_2020_1km_ASCII_XYZ.csv  


In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
import random
from shapely.geometry import Point, MultiPoint
from shapely.ops import unary_union

from tqdm import tqdm
tqdm.pandas()

  from pandas import Panel


In [None]:
# Load the X/Y/Z density grid and switch to descriptive column names.
columns_rename = {"X":"longitude", "Y":"latitude", "Z":"population_density"}
population_density_data = pd.read_csv("ind_pd_2020_1km_ASCII_XYZ.csv").rename(columns=columns_rename)

# Coordinates are kept at 6 decimal places.
for coordinate_column in ("longitude", "latitude"):
    population_density_data[coordinate_column] = population_density_data[coordinate_column].round(6)

# One shapely Point per grid row, used later for polygon containment tests.
population_density_data['point_object'] = population_density_data.progress_apply(
    lambda row: Point(row['longitude'], row['latitude']),
    axis=1,
)

100%|██████████| 4010402/4010402 [01:45<00:00, 38046.10it/s]


In [None]:
population_density_data

Unnamed: 0,longitude,latitude,population_density,point_object
0,77.827916,35.503750,1.465800,POINT (77.827916 35.50375)
1,77.836250,35.503750,1.427846,POINT (77.83625000000001 35.50375)
2,77.844583,35.503750,0.473976,POINT (77.844583 35.50375)
3,77.819583,35.495417,1.207597,POINT (77.81958299999999 35.495417)
4,77.827916,35.495417,0.479271,POINT (77.827916 35.495417)
...,...,...,...,...
4010397,93.827916,6.762083,20.232878,POINT (93.827916 6.762083)
4010398,93.836250,6.762083,17.460630,POINT (93.83625000000001 6.762083)
4010399,93.819583,6.753750,7.082357,POINT (93.81958299999999 6.75375)
4010400,93.827916,6.753750,14.493382,POINT (93.827916 6.75375)


In [None]:
def add_point(latitude, longitude, n_neighbours=4):
    """Append a synthetic grid point to the global ``population_density_data``.

    The new point's population density is the mean density of the
    ``n_neighbours`` existing grid points closest to it. Closeness is the
    squared Euclidean distance computed directly on the latitude/longitude
    values (degrees), not a geodesic distance.

    Parameters
    ----------
    latitude, longitude : float
        Coordinates of the point to add.
    n_neighbours : int, default 4
        Number of nearest existing points whose density is averaged.

    Returns
    -------
    None
        ``population_density_data`` is modified in place (one row is added).
    """
    squared_distances = (
        (population_density_data['latitude'] - latitude) ** 2
        + (population_density_data['longitude'] - longitude) ** 2
    )
    # nsmallest selects the k nearest rows without sorting the whole
    # (multi-million row) distance series.
    nearest_index = squared_distances.nsmallest(n_neighbours).index
    mean_population_density = population_density_data.loc[nearest_index, 'population_density'].mean()

    # The row count is used as the new label; this is a fresh label as long as
    # the frame still carries its default 0..n-1 index.
    new_row_index = len(population_density_data)

    population_density_data.at[new_row_index, 'longitude'] = longitude
    population_density_data.at[new_row_index, 'latitude'] = latitude
    population_density_data.at[new_row_index, 'population_density'] = mean_population_density
    population_density_data.at[new_row_index, 'point_object'] = Point(longitude, latitude)

In [None]:
def get_lat_long_samples(n, polygon, jitter=0.015, oversample=10):
    """Sample ``n`` (longitude, latitude) pairs strictly inside ``polygon``.

    Grid points of the global ``population_density_data`` that lie inside the
    polygon are drawn with probability proportional to their population
    density, each draw is perturbed by uniform noise, and perturbed points
    that fall outside the polygon are discarded.

    Parameters
    ----------
    n : int
        Number of coordinate pairs to return.
    polygon : shapely geometry
        Boundary the returned points must lie within (``polygon.contains``).
    jitter : float, default 0.015
        Half-width, in degrees, of the uniform noise added independently to
        latitude and longitude.
    oversample : int, default 10
        ``n * oversample`` candidates are drawn before rejection so that
        enough of them survive the containment test.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, 2)`` with columns ``[longitude, latitude]``.

    Raises
    ------
    Exception
        If no grid point lies inside ``polygon``.
    ValueError
        If every jittered candidate falls outside ``polygon``.
    """
    if n <= 0:
        # Nothing requested: skip the (expensive) containment tests entirely.
        return np.empty((0, 2))

    # A point inside the polygon is necessarily inside its bounding box, so a
    # cheap vectorised box filter removes almost all of the national grid
    # before the per-point shapely containment test is run.
    min_lon, min_lat, max_lon, max_lat = polygon.bounds
    in_bounding_box = (
        population_density_data['longitude'].between(min_lon, max_lon)
        & population_density_data['latitude'].between(min_lat, max_lat)
    )
    candidates = population_density_data[in_bounding_box]
    if len(candidates) == 0:
        raise Exception("No points within the given polygon")

    inside_polygon = candidates['point_object'].progress_apply(polygon.contains).astype(bool)
    subset = candidates[inside_polygon]

    if len(subset) == 0:
        raise Exception("No points within the given polygon")

    # Density-weighted draw (with replacement) of grid points inside the polygon.
    sample = subset.sample(weights='population_density', n=(n * oversample), replace=True).copy()
    sample.reset_index(drop=True, inplace=True)

    # Spread the draws around their grid point so households do not stack on
    # the grid nodes.
    sample['latitude'] = sample['latitude'] + np.random.uniform(-jitter, jitter, size=sample.shape[0])
    sample['longitude'] = sample['longitude'] + np.random.uniform(-jitter, jitter, size=sample.shape[0])

    points = sample.progress_apply(lambda x: Point(x['longitude'], x['latitude']), axis=1)
    contained = points.progress_apply(polygon.contains).astype(bool)

    accepted = sample.loc[contained, ['longitude', 'latitude']]
    if len(accepted) == 0:
        raise ValueError("No jittered sample fell within the given polygon; reduce jitter or increase oversample")

    return accepted.sample(n, replace=True).values

In [None]:
# Ward boundary polygons, ordered alphabetically by ward name with a fresh
# 0..n-1 index.
admin_units = (
    gpd.read_file("https://raw.githubusercontent.com/datameet/Municipal_Spatial_Data/master/Mumbai/BMC_Wards.geojson")
    .sort_values(by='name')
    .reset_index(drop=True)
)
admin_units

Unnamed: 0,gid,name,geometry
0,1,A,"MULTIPOLYGON (((72.84025 18.94881, 72.84030 18..."
1,2,B,"MULTIPOLYGON (((72.84456 18.96342, 72.84461 18..."
2,3,C,"MULTIPOLYGON (((72.83198 18.96174, 72.83197 18..."
3,4,D,"MULTIPOLYGON (((72.81873 18.96901, 72.81878 18..."
4,5,E,"MULTIPOLYGON (((72.84677 18.98183, 72.84658 18..."
5,8,F/N,"MULTIPOLYGON (((72.87091 19.05119, 72.87103 19..."
6,6,F/S,"MULTIPOLYGON (((72.85625 19.01059, 72.85630 19..."
7,9,G/N,"MULTIPOLYGON (((72.86699 19.05237, 72.86738 19..."
8,7,G/S,"MULTIPOLYGON (((72.82689 19.01942, 72.82691 19..."
9,18,H/E,"MULTIPOLYGON (((72.85932 19.08400, 72.85932 19..."


In [None]:
# Add one extra density point at the centroid of every admin unit.
# NOTE(review): presumably this guarantees small wards have a grid point to
# sample from — confirm the intent.
for _, admin_unit_row in admin_units.iterrows():
    centroid = admin_unit_row['geometry'].centroid
    add_point(latitude=centroid.y, longitude=centroid.x)

In [None]:
ward_wise_pop = """Ward A	18.9337657	72.8364969	185014
Ward B	18.9614551	72.8333212	127290
Ward C	18.946123	72.8249009	166161
Ward D	18.9626147	72.813162	346866
Ward E	18.9718778	72.8313269	393286
Ward F North	19.0294197	72.8546058	529034
Ward F South	19.0058779	72.8396881	360972
Ward G North	19.023175	72.8434324	599039
Ward G South	19.0083734	72.8304087	377749
Ward H East	19.0851064	72.8445455	557239
Ward H West	19.0561063	72.8352939	307581
Ward K East	19.1200923	72.8523868	823885
Ward K West	19.1195001	72.844486	748688
Ward L	19.0704672	72.8790936	902225
Ward M East	19.0564771	72.9215464	807720
Ward M West	19.0611012	72.8993043	411893
Ward N	19.0839316	72.9064422	622853
Ward P North	19.1877853	72.8423072	941366
Ward P South	19.1626595	72.8464575	463507
Ward R Central	19.2311189	72.8558279	562162
Ward R North	19.1200923	72.8523868	431368
Ward R South	19.2039634	72.8453958	691229
Ward S	19.1390295	72.9304517	743783
Ward T	19.1756249	72.950922	341463""".split("\n")
# NOTE(review): in the tab-separated table above, "Ward R North" carries exactly
# the same latitude/longitude as "Ward K East" (19.1200923, 72.8523868). This
# looks like a copy-paste slip — confirm the R/N coordinates against the source.
def _to_admin_unit_code(ward_label):
    """Turn a label such as 'Ward F North' into the short code 'F/N'."""
    # Keep everything after the first space (drops the leading "Ward").
    code = ward_label[ward_label.find(" ")+1:]
    for direction, abbreviation in (
        (" North", "/N"),
        (" South", "/S"),
        (" East", "/E"),
        (" West", "/W"),
        (" Central", "/C"),
    ):
        code = code.replace(direction, abbreviation)
    return code


admin_unit_wise_population = pd.DataFrame(
    [record.split("\t") for record in ward_wise_pop],
    columns=['Name', 'Latitude', 'Longitude', 'TOT_P'],
)
admin_unit_wise_population['Name'] = admin_unit_wise_population['Name'].map(_to_admin_unit_code)
# Alphabetical order by short code, with a fresh 0..n-1 index.
admin_unit_wise_population = admin_unit_wise_population.sort_values(by='Name').reset_index(drop=True)
# Population counts arrive as text; Latitude/Longitude are left as parsed.
admin_unit_wise_population['TOT_P'] = admin_unit_wise_population['TOT_P'].map(int)
admin_unit_wise_population

Unnamed: 0,Name,Latitude,Longitude,TOT_P
0,A,18.9337657,72.8364969,185014
1,B,18.9614551,72.8333212,127290
2,C,18.946123,72.8249009,166161
3,D,18.9626147,72.813162,346866
4,E,18.9718778,72.8313269,393286
5,F/N,19.0294197,72.8546058,529034
6,F/S,19.0058779,72.8396881,360972
7,G/N,19.023175,72.8434324,599039
8,G/S,19.0083734,72.8304087,377749
9,H/E,19.0851064,72.8445455,557239


In [None]:
# Each admin unit owns the slice (lower_limit, upper_limit] of the running
# population total; the slices are consecutive and start at 0.
population_running_total = admin_unit_wise_population['TOT_P'].cumsum()
admin_unit_wise_population['lower_limit'] = population_running_total - admin_unit_wise_population['TOT_P']
admin_unit_wise_population['upper_limit'] = population_running_total
admin_unit_wise_population

Unnamed: 0,Name,Latitude,Longitude,TOT_P,lower_limit,upper_limit
0,A,18.9337657,72.8364969,185014,0,185014
1,B,18.9614551,72.8333212,127290,185014,312304
2,C,18.946123,72.8249009,166161,312304,478465
3,D,18.9626147,72.813162,346866,478465,825331
4,E,18.9718778,72.8313269,393286,825331,1218617
5,F/N,19.0294197,72.8546058,529034,1218617,1747651
6,F/S,19.0058779,72.8396881,360972,1747651,2108623
7,G/N,19.023175,72.8434324,599039,2108623,2707662
8,G/S,19.0083734,72.8304087,377749,2707662,3085411
9,H/E,19.0851064,72.8445455,557239,3085411,3642650


In [None]:
# Total population over all admin units, rounded up to the next multiple of 10,000.
rounding_unit = 10000
population_sum = admin_unit_wise_population['TOT_P'].sum()
total_population = int(np.ceil(population_sum / rounding_unit) * rounding_unit)
total_population

12450000

In [None]:
# Read the household files for both districts and stack them.
city_households = pd.read_csv("/content/drive/MyDrive/syndata_ism/mumbai/base_population_data/mumbai_city_households2.csv")
suburban_households = pd.read_csv("/content/drive/MyDrive/syndata_ism/mumbai/base_population_data/mumbai_suburban_households2.csv")
synthetic_households = pd.concat([city_households, suburban_households], ignore_index=True)
# The first column is dropped (it does not appear in the table shown below).
synthetic_households = synthetic_households.drop(columns=synthetic_households.columns[0])
# Index by household_id while keeping it available as a regular column.
synthetic_households = synthetic_households.set_index('household_id', drop=False)
synthetic_households

Unnamed: 0_level_0,household_id,residence,geog,hhsize
household_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2433018,2433018,urban,Mumbai City,hhsize_4
2571328,2571328,urban,Mumbai City,hhsize_4
2222736,2222736,urban,Mumbai City,hhsize_6
2116355,2116355,urban,Mumbai City,hhsize_6
2235639,2235639,urban,Mumbai City,hhsize_710
...,...,...,...,...
1490518,1490518,urban,Mumbai Suburban,hhsize_710
1800564,1800564,urban,Mumbai Suburban,hhsize_6
1672493,1672493,urban,Mumbai Suburban,hhsize_710
392059,392059,urban,Mumbai Suburban,hhsize_710


In [None]:
# Read the person files for both districts and stack them under a fresh
# 0..n-1 index.
city_population = pd.read_csv("/content/drive/MyDrive/syndata_ism/mumbai/base_population_data/mumbai_city_persons.csv")
suburban_population = pd.read_csv("/content/drive/MyDrive/syndata_ism/mumbai/base_population_data/mumbai_suburban_persons.csv")
synthetic_population = pd.concat([city_population, suburban_population], ignore_index=True)
# The first column is dropped (it does not appear in the table shown below).
synthetic_population = synthetic_population.drop(columns=synthetic_population.columns[0])
synthetic_population

Unnamed: 0,mem_id,gender,age,literacy,religion,caste,residence,working,geog,household_id
0,1,male,40to44,literate,hindu,SC,urban,yes,Mumbai City,2097582
1,1,male,40to44,literate,hindu,SC,urban,yes,Mumbai City,2100136
2,1,male,40to44,literate,hindu,SC,urban,yes,Mumbai City,2103889
3,1,male,40to44,literate,hindu,SC,urban,yes,Mumbai City,2105417
4,1,male,40to44,literate,hindu,SC,urban,yes,Mumbai City,2105440
...,...,...,...,...,...,...,...,...,...,...
12488487,1,female,65to69,illiterate,hindu,other,urban,yes,Mumbai Suburban,2093191
12488488,1,female,65to69,illiterate,hindu,other,urban,yes,Mumbai Suburban,2093910
12488489,1,female,65to69,illiterate,hindu,other,urban,yes,Mumbai Suburban,2093979
12488490,1,female,65to69,illiterate,hindu,other,urban,yes,Mumbai Suburban,2094032


In [None]:
# Replace each age-bucket label ("40to44", ..., "80p") by a concrete integer age.
# Closed buckets are 5 years wide: age = bucket start + uniform offset in 0..4.
# The open-ended "80p" bucket is drawn uniformly from 80..95 inclusive. It must
# not also receive the 0..4 offset, otherwise ages up to 99 are produced and
# the distribution inside the bucket is no longer uniform.
age_labels = synthetic_population['age']
is_open_ended = (age_labels == "80p").to_numpy()

# Leading digits of the label are the lower bound of the bucket ("80p" -> 80).
bucket_start = age_labels.str.extract(r'^(\d+)', expand=False).astype(int)

within_bucket_offset = np.random.randint(0, 5, size=len(synthetic_population))
within_bucket_offset[is_open_ended] = np.random.randint(0, 16, size=int(is_open_ended.sum()))

synthetic_population['age'] = bucket_start + within_bucket_offset
synthetic_population

Unnamed: 0,mem_id,gender,age,literacy,religion,caste,residence,working,geog,household_id
0,1,male,41,literate,hindu,SC,urban,yes,Mumbai City,2097582
1,1,male,41,literate,hindu,SC,urban,yes,Mumbai City,2100136
2,1,male,42,literate,hindu,SC,urban,yes,Mumbai City,2103889
3,1,male,44,literate,hindu,SC,urban,yes,Mumbai City,2105417
4,1,male,40,literate,hindu,SC,urban,yes,Mumbai City,2105440
...,...,...,...,...,...,...,...,...,...,...
12488487,1,female,65,illiterate,hindu,other,urban,yes,Mumbai Suburban,2093191
12488488,1,female,67,illiterate,hindu,other,urban,yes,Mumbai Suburban,2093910
12488489,1,female,65,illiterate,hindu,other,urban,yes,Mumbai Suburban,2093979
12488490,1,female,65,illiterate,hindu,other,urban,yes,Mumbai Suburban,2094032


In [None]:
# Overwrite the categorical size label (e.g. "hhsize_4") with the number of
# person rows per household; the counts align on the household_id index.
synthetic_households['hhsize'] = synthetic_population['household_id'].value_counts(sort=False)

In [None]:
synthetic_households

Unnamed: 0_level_0,household_id,residence,geog,hhsize
household_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2433018,2433018,urban,Mumbai City,4
2571328,2571328,urban,Mumbai City,4
2222736,2222736,urban,Mumbai City,6
2116355,2116355,urban,Mumbai City,6
2235639,2235639,urban,Mumbai City,8
...,...,...,...,...
1490518,1490518,urban,Mumbai Suburban,10
1800564,1800564,urban,Mumbai Suburban,6
1672493,1672493,urban,Mumbai Suburban,8
392059,392059,urban,Mumbai Suburban,8


In [None]:
# Walk through the households in their current order and hand them to the admin
# units consecutively: a household belongs to the unit whose
# [lower_limit, upper_limit] range contains the running total of persons.
# Both bounds are inclusive, so a household landing exactly on a boundary is
# matched twice and keeps the later unit. Households beyond the last
# upper_limit match no unit and keep NaN in the three new columns.
# The running total does not change inside the loop, so it is computed once.
persons_running_total = synthetic_households['hhsize'].cumsum()

for _, admin_unit_row in admin_unit_wise_population.iterrows():
    subset_index = persons_running_total.between(admin_unit_row['lower_limit'], admin_unit_row['upper_limit'])
    synthetic_households.loc[subset_index, 'AdminUnitName'] = admin_unit_row['Name']
    synthetic_households.loc[subset_index, 'AdminUnitLatitude'] = admin_unit_row['Latitude']
    synthetic_households.loc[subset_index, 'AdminUnitLongitude'] = admin_unit_row['Longitude']

In [None]:
# Remove every household row that has a missing value in any column; this
# includes households that were not matched to an admin unit in the previous cell.
synthetic_households.dropna(inplace=True)
synthetic_households

Unnamed: 0_level_0,household_id,residence,geog,hhsize,AdminUnitName,AdminUnitLatitude,AdminUnitLongitude
household_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2433018,2433018,urban,Mumbai City,4,A,18.9337657,72.8364969
2571328,2571328,urban,Mumbai City,4,A,18.9337657,72.8364969
2222736,2222736,urban,Mumbai City,6,A,18.9337657,72.8364969
2116355,2116355,urban,Mumbai City,6,A,18.9337657,72.8364969
2235639,2235639,urban,Mumbai City,8,A,18.9337657,72.8364969
...,...,...,...,...,...,...,...
210335,210335,urban,Mumbai Suburban,8,T,19.1756249,72.950922
1903782,1903782,urban,Mumbai Suburban,8,T,19.1756249,72.950922
1956510,1956510,urban,Mumbai Suburban,8,T,19.1756249,72.950922
429874,429874,urban,Mumbai Suburban,18,T,19.1756249,72.950922


In [None]:
# Create empty household-coordinate columns; they are filled per admin unit
# by the sampling loop in the next cell.
synthetic_households[['H_Lat', 'H_Lon']] = None
synthetic_households

Unnamed: 0_level_0,household_id,residence,geog,hhsize,AdminUnitName,AdminUnitLatitude,AdminUnitLongitude,H_Lat,H_Lon
household_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2433018,2433018,urban,Mumbai City,4,A,18.9337657,72.8364969,,
2571328,2571328,urban,Mumbai City,4,A,18.9337657,72.8364969,,
2222736,2222736,urban,Mumbai City,6,A,18.9337657,72.8364969,,
2116355,2116355,urban,Mumbai City,6,A,18.9337657,72.8364969,,
2235639,2235639,urban,Mumbai City,8,A,18.9337657,72.8364969,,
...,...,...,...,...,...,...,...,...,...
210335,210335,urban,Mumbai Suburban,8,T,19.1756249,72.950922,,
1903782,1903782,urban,Mumbai Suburban,8,T,19.1756249,72.950922,,
1956510,1956510,urban,Mumbai Suburban,8,T,19.1756249,72.950922,,
429874,429874,urban,Mumbai Suburban,18,T,19.1756249,72.950922,,


In [None]:
# For every admin unit, draw one location per household inside the unit's
# boundary polygon and store it on the household rows.
for admin_unit_name in synthetic_households['AdminUnitName'].unique():
    print(admin_unit_name)
    admin_unit_houses_index = synthetic_households['AdminUnitName'] == admin_unit_name
    n_houses_in_admin_unit = int(admin_unit_houses_index.sum())
    # First boundary geometry whose name matches this admin unit.
    admin_unit_polygon = admin_units.loc[admin_units['name'] == admin_unit_name, 'geometry'].iloc[0]
    # The sampler returns (longitude, latitude) pairs, hence the column order below.
    sampled_lon_lat = get_lat_long_samples(n_houses_in_admin_unit, admin_unit_polygon)
    synthetic_households.loc[admin_unit_houses_index, ['H_Lon', 'H_Lat']] = sampled_lon_lat

  0%|          | 0/4010426 [00:00<?, ?it/s]

A


100%|██████████| 4010426/4010426 [00:18<00:00, 217576.11it/s]
100%|██████████| 224280/224280 [00:05<00:00, 40663.46it/s]
100%|██████████| 224280/224280 [00:29<00:00, 7644.16it/s]
  0%|          | 7484/4010426 [00:00<00:53, 74839.89it/s]

B


100%|██████████| 4010426/4010426 [00:16<00:00, 241029.34it/s]
100%|██████████| 153910/153910 [00:03<00:00, 41318.29it/s]
100%|██████████| 153910/153910 [00:03<00:00, 49501.07it/s]
  0%|          | 12106/4010426 [00:00<00:33, 121057.81it/s]

C


100%|██████████| 4010426/4010426 [00:16<00:00, 242269.39it/s]
100%|██████████| 201800/201800 [00:04<00:00, 41078.14it/s]
100%|██████████| 201800/201800 [00:03<00:00, 62225.87it/s]
  0%|          | 11210/4010426 [00:00<00:35, 112097.43it/s]

D


100%|██████████| 4010426/4010426 [00:16<00:00, 240958.70it/s]
100%|██████████| 421810/421810 [00:11<00:00, 36393.90it/s]
100%|██████████| 421810/421810 [00:26<00:00, 16164.68it/s]
  0%|          | 8547/4010426 [00:00<00:46, 85469.67it/s]

E


100%|██████████| 4010426/4010426 [00:16<00:00, 239758.69it/s]
100%|██████████| 477110/477110 [00:12<00:00, 37151.21it/s]
100%|██████████| 477110/477110 [00:24<00:00, 19258.50it/s]
  0%|          | 10429/4010426 [00:00<00:38, 104286.87it/s]

F/N


100%|██████████| 4010426/4010426 [00:16<00:00, 244508.88it/s]
100%|██████████| 644280/644280 [00:16<00:00, 39737.59it/s]
100%|██████████| 644280/644280 [00:44<00:00, 14430.19it/s]
  0%|          | 10741/4010426 [00:00<00:37, 107408.82it/s]

F/S


100%|██████████| 4010426/4010426 [00:16<00:00, 242528.20it/s]
100%|██████████| 438340/438340 [00:11<00:00, 38743.42it/s]
100%|██████████| 438340/438340 [00:21<00:00, 20613.94it/s]
  0%|          | 9537/4010426 [00:00<00:41, 95365.09it/s]

G/N


100%|██████████| 4010426/4010426 [00:16<00:00, 243212.73it/s]
100%|██████████| 725510/725510 [00:18<00:00, 38568.02it/s]
100%|██████████| 725510/725510 [00:23<00:00, 31536.23it/s]
  0%|          | 11340/4010426 [00:00<00:35, 113397.95it/s]

G/S


100%|██████████| 4010426/4010426 [00:16<00:00, 241883.32it/s]
100%|██████████| 457310/457310 [00:11<00:00, 39043.93it/s]
100%|██████████| 457310/457310 [00:18<00:00, 24268.30it/s]
  0%|          | 10302/4010426 [00:00<00:38, 103019.36it/s]

H/E


100%|██████████| 4010426/4010426 [00:16<00:00, 243236.50it/s]
100%|██████████| 669830/669830 [00:16<00:00, 40027.89it/s]
100%|██████████| 669830/669830 [00:27<00:00, 24697.25it/s]
  0%|          | 11071/4010426 [00:00<00:36, 110706.41it/s]

H/W


100%|██████████| 4010426/4010426 [00:16<00:00, 244845.39it/s]
100%|██████████| 368080/368080 [00:09<00:00, 39099.65it/s]
100%|██████████| 368080/368080 [00:16<00:00, 21809.15it/s]
  0%|          | 10444/4010426 [00:00<00:38, 104437.86it/s]

K/E


100%|██████████| 4010426/4010426 [00:16<00:00, 243616.59it/s]
100%|██████████| 985390/985390 [00:25<00:00, 38080.74it/s]
100%|██████████| 985390/985390 [01:17<00:00, 12706.12it/s]
  0%|          | 10264/4010426 [00:00<00:38, 102638.63it/s]

K/W


100%|██████████| 4010426/4010426 [00:16<00:00, 239278.58it/s]
100%|██████████| 897000/897000 [00:22<00:00, 40253.57it/s]
100%|██████████| 897000/897000 [00:55<00:00, 16194.59it/s]
  0%|          | 10847/4010426 [00:00<00:36, 108468.29it/s]

L


100%|██████████| 4010426/4010426 [00:16<00:00, 241510.77it/s]
100%|██████████| 1079400/1079400 [00:28<00:00, 38248.37it/s]
100%|██████████| 1079400/1079400 [01:17<00:00, 13904.17it/s]
  0%|          | 10911/4010426 [00:00<00:36, 109106.46it/s]

M/E


100%|██████████| 4010426/4010426 [00:16<00:00, 242531.58it/s]
100%|██████████| 966500/966500 [00:23<00:00, 40388.35it/s]
100%|██████████| 966500/966500 [00:49<00:00, 19628.32it/s]
  0%|          | 11036/4010426 [00:00<00:36, 110356.42it/s]

M/W


100%|██████████| 4010426/4010426 [00:16<00:00, 240585.39it/s]
100%|██████████| 493390/493390 [00:13<00:00, 37671.37it/s]
100%|██████████| 493390/493390 [00:36<00:00, 13588.25it/s]
  0%|          | 10333/4010426 [00:00<00:38, 103326.40it/s]

N


100%|██████████| 4010426/4010426 [00:16<00:00, 240017.22it/s]
100%|██████████| 744730/744730 [00:18<00:00, 39697.21it/s]
100%|██████████| 744730/744730 [00:48<00:00, 15298.08it/s]
  0%|          | 10938/4010426 [00:00<00:36, 109377.76it/s]

P/N


100%|██████████| 4010426/4010426 [00:16<00:00, 241938.90it/s]
100%|██████████| 1127600/1127600 [00:29<00:00, 38711.60it/s]
100%|██████████| 1127600/1127600 [03:00<00:00, 6257.61it/s]
  0%|          | 10731/4010426 [00:00<00:37, 107308.82it/s]

P/S


100%|██████████| 4010426/4010426 [00:16<00:00, 243799.12it/s]
100%|██████████| 554870/554870 [00:13<00:00, 39762.38it/s]
100%|██████████| 554870/554870 [00:37<00:00, 14798.18it/s]
  0%|          | 11546/4010426 [00:00<00:34, 115459.56it/s]

R/C


100%|██████████| 4010426/4010426 [00:16<00:00, 242825.51it/s]
100%|██████████| 672920/672920 [00:17<00:00, 38322.39it/s]
100%|██████████| 672920/672920 [00:57<00:00, 11744.68it/s]
  0%|          | 10876/4010426 [00:00<00:36, 108756.21it/s]

R/N


100%|██████████| 4010426/4010426 [00:16<00:00, 240065.56it/s]
100%|██████████| 515490/515490 [00:12<00:00, 41496.14it/s]
100%|██████████| 515490/515490 [00:25<00:00, 20386.51it/s]
  0%|          | 8846/4010426 [00:00<00:45, 88458.19it/s]

R/S


100%|██████████| 4010426/4010426 [00:16<00:00, 241820.96it/s]
100%|██████████| 826870/826870 [00:21<00:00, 37639.36it/s]
100%|██████████| 826870/826870 [01:05<00:00, 12692.51it/s]
  0%|          | 9804/4010426 [00:00<00:40, 98038.69it/s]

S


100%|██████████| 4010426/4010426 [00:16<00:00, 241455.89it/s]
100%|██████████| 888930/888930 [00:22<00:00, 39904.33it/s]
100%|██████████| 888930/888930 [01:11<00:00, 12509.93it/s]
  0%|          | 7758/4010426 [00:00<00:51, 77575.08it/s]

T


100%|██████████| 4010426/4010426 [00:16<00:00, 241962.60it/s]
100%|██████████| 409340/409340 [00:10<00:00, 39429.79it/s]
100%|██████████| 409340/409340 [00:37<00:00, 10785.59it/s]


In [None]:
# Give the index a name different from the 'household_id' column.
# NOTE(review): presumably needed so the later merge on 'household_id' is not
# ambiguous between the index level and the column — confirm.
synthetic_households.index = synthetic_households.index.rename('hh_index')
synthetic_households

Unnamed: 0_level_0,household_id,residence,geog,hhsize,AdminUnitName,AdminUnitLatitude,AdminUnitLongitude,H_Lat,H_Lon
hh_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2433018,2433018,urban,Mumbai City,4,A,18.9337657,72.8364969,18.9393,72.8285
2571328,2571328,urban,Mumbai City,4,A,18.9337657,72.8364969,18.9389,72.8425
2222736,2222736,urban,Mumbai City,6,A,18.9337657,72.8364969,18.919,72.8226
2116355,2116355,urban,Mumbai City,6,A,18.9337657,72.8364969,18.9218,72.8322
2235639,2235639,urban,Mumbai City,8,A,18.9337657,72.8364969,18.938,72.8309
...,...,...,...,...,...,...,...,...,...
210335,210335,urban,Mumbai Suburban,8,T,19.1756249,72.950922,19.1673,72.9668
1903782,1903782,urban,Mumbai Suburban,8,T,19.1756249,72.950922,19.1685,72.9422
1956510,1956510,urban,Mumbai Suburban,8,T,19.1756249,72.950922,19.1791,72.9388
429874,429874,urban,Mumbai Suburban,18,T,19.1756249,72.950922,19.1757,72.9598


In [None]:
# Attach each person's household location and admin unit. The join is an inner
# join on household_id, so persons whose household is no longer present in
# synthetic_households are left out of the result.
columns_to_join = ['household_id', 'H_Lat', 'H_Lon', 'AdminUnitName', 'AdminUnitLatitude', 'AdminUnitLongitude']
household_attributes = synthetic_households[columns_to_join]
merged_df = synthetic_population.merge(household_attributes, how='inner', on='household_id')
merged_df

Unnamed: 0,mem_id,gender,age,literacy,religion,caste,residence,working,geog,household_id,H_Lat,H_Lon,AdminUnitName,AdminUnitLatitude,AdminUnitLongitude
0,1,male,41,literate,hindu,SC,urban,yes,Mumbai City,2097582,18.9866,72.8527,F/S,19.0058779,72.8396881
1,1,male,44,literate,hindu,SC,urban,yes,Mumbai City,2097582,18.9866,72.8527,F/S,19.0058779,72.8396881
2,2,female,39,literate,hindu,SC,urban,no,Mumbai City,2097582,18.9866,72.8527,F/S,19.0058779,72.8396881
3,2,female,36,literate,hindu,SC,urban,no,Mumbai City,2097582,18.9866,72.8527,F/S,19.0058779,72.8396881
4,3,female,15,literate,hindu,SC,urban,no,Mumbai City,2097582,18.9866,72.8527,F/S,19.0058779,72.8396881
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12442367,1,female,65,illiterate,hindu,other,urban,yes,Mumbai Suburban,2093979,19.1995,72.816,P/N,19.1877853,72.8423072
12442368,1,female,65,illiterate,hindu,other,urban,yes,Mumbai Suburban,2094032,19.1426,72.8101,K/W,19.1195001,72.844486
12442369,1,female,65,illiterate,hindu,other,urban,yes,Mumbai Suburban,2094032,19.1426,72.8101,K/W,19.1195001,72.844486
12442370,1,female,69,illiterate,hindu,other,urban,yes,Mumbai Suburban,2094072,19.2074,72.8528,R/S,19.2039634,72.8453958


In [None]:
# Persist the person-level table (the row index is written as the first column).
merged_df.to_csv(path_or_buf="/content/drive/MyDrive/syndata_ism/mumbai/base_population_data/with_hlat_hlong.csv")