This notebook contains the code for


1.   Generating new workplaces, schools, public places
2.   Assigning latitude/longitude coordinates to these places so that they lie strictly within the bounds of the city and follow the population density distribution
3.   Assigning workplaces, schools, public places to individuals with an option for multicore process acceleration

This notebook has been cleaned so that the variable names do not depend on any particular city. The next time we need a population, we only need to change the input files being read; nothing else has to change to generate the output from the notebook.

In [None]:
# Mount Google Drive at /content/drive; a later cell reads the base population CSV from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip3 install geopandas

Collecting geopandas
[?25l  Downloading https://files.pythonhosted.org/packages/d7/bf/e9cefb69d39155d122b6ddca53893b61535fa6ffdad70bf5ef708977f53f/geopandas-0.9.0-py2.py3-none-any.whl (994kB)
[K     |████████████████████████████████| 1.0MB 32.3MB/s 
[?25hCollecting pyproj>=2.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/b1/72/d52e9ca81caef056062d71991b0e9b1d16af042245627c5d0e4916a36c4f/pyproj-3.0.1-cp37-cp37m-manylinux2010_x86_64.whl (6.5MB)
[K     |████████████████████████████████| 6.5MB 39.7MB/s 
Collecting fiona>=1.8
[?25l  Downloading https://files.pythonhosted.org/packages/ea/2a/404b22883298a3efe9c6ef8d67acbf2c38443fa366ee9cd4cd34e17626ea/Fiona-1.8.19-cp37-cp37m-manylinux1_x86_64.whl (15.3MB)
[K     |████████████████████████████████| 15.3MB 236kB/s 
Collecting munch
  Downloading https://files.pythonhosted.org/packages/cc/ab/85d8da5c9a45e072301beb37ad7f833cd344e04c817d97e0cc75681d248f/munch-2.5.0-py2.py3-none-any.whl
Collecting cligj>=0.5
  Downloading http

In [None]:
!wget https://data.worldpop.org/GIS/Population_Density/Global_2000_2020_1km/2020/IND/ind_pd_2020_1km_ASCII_XYZ.zip
!unzip ind_pd_2020_1km_ASCII_XYZ.zip

--2021-05-22 12:25:41--  https://data.worldpop.org/GIS/Population_Density/Global_2000_2020_1km/2020/IND/ind_pd_2020_1km_ASCII_XYZ.zip
Resolving data.worldpop.org (data.worldpop.org)... 152.78.118.157, ::ffff:152.78.118.157
Connecting to data.worldpop.org (data.worldpop.org)|152.78.118.157|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 53063505 (51M) [application/zip]
Saving to: ‘ind_pd_2020_1km_ASCII_XYZ.zip’


2021-05-22 12:26:15 (1.53 MB/s) - ‘ind_pd_2020_1km_ASCII_XYZ.zip’ saved [53063505/53063505]

Archive:  ind_pd_2020_1km_ASCII_XYZ.zip
  inflating: ind_pd_2020_1km_ASCII_XYZ.csv  


In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
import random
from shapely.geometry import Point, MultiPoint
from shapely.ops import unary_union

from tqdm import tqdm
tqdm.pandas()

  from pandas import Panel


In [None]:
# Load the population-density grid (columns X, Y, Z) and give the columns descriptive names.
columns_rename = {"X":"longitude", "Y":"latitude", "Z":"population_density"}
population_density_data = pd.read_csv("ind_pd_2020_1km_ASCII_XYZ.csv").rename(columns=columns_rename)

# Round the coordinates to 6 decimal places.
population_density_data['longitude'] = population_density_data['longitude'].round(6)
population_density_data['latitude'] = population_density_data['latitude'].round(6)

# Build one Point per grid row. Zipping the two coordinate columns produces the same
# Point(longitude, latitude) objects as a row-wise apply, without materialising a
# Series for each of the millions of rows; tqdm keeps the progress bar.
population_density_data['point_object'] = [
    Point(longitude, latitude)
    for longitude, latitude in tqdm(
        zip(population_density_data['longitude'], population_density_data['latitude']),
        total=len(population_density_data),
    )
]
population_density_data

100%|██████████| 4010402/4010402 [01:43<00:00, 38797.65it/s]


Unnamed: 0,longitude,latitude,population_density,point_object
0,77.827916,35.503750,1.465800,POINT (77.827916 35.50375)
1,77.836250,35.503750,1.427846,POINT (77.83625000000001 35.50375)
2,77.844583,35.503750,0.473976,POINT (77.844583 35.50375)
3,77.819583,35.495417,1.207597,POINT (77.81958299999999 35.495417)
4,77.827916,35.495417,0.479271,POINT (77.827916 35.495417)
...,...,...,...,...
4010397,93.827916,6.762083,20.232878,POINT (93.827916 6.762083)
4010398,93.836250,6.762083,17.460630,POINT (93.83625000000001 6.762083)
4010399,93.819583,6.753750,7.082357,POINT (93.81958299999999 6.75375)
4010400,93.827916,6.753750,14.493382,POINT (93.827916 6.75375)


In [None]:
def add_point(latitude, longitude):
    """Append a new grid point at (latitude, longitude) to ``population_density_data``.

    The population density of the new row is the mean density of the four existing
    rows with the smallest squared coordinate difference to the requested location.
    The module-level frame is modified in place and nothing is returned.
    """
    lat_offset = population_density_data['latitude'] - latitude
    lon_offset = population_density_data['longitude'] - longitude
    squared_distance = lat_offset ** 2 + lon_offset ** 2

    # Order the rows from nearest to farthest, then average the four nearest densities.
    nearest_first = population_density_data.loc[squared_distance.sort_values().index]
    density_estimate = nearest_first['population_density'].iloc[:4].mean()

    # The current row count is used as the label of the new row.
    # NOTE(review): this assumes the frame carries a default 0..n-1 index - confirm.
    row_label = len(population_density_data)
    new_values = (
        ('longitude', longitude),
        ('latitude', latitude),
        ('population_density', density_estimate),
        ('point_object', Point(longitude, latitude)),
    )
    for column, value in new_values:
        population_density_data.at[row_label, column] = value

def get_lat_long_samples(n, polygon):
    """Draw ``n`` population-weighted (longitude, latitude) pairs inside ``polygon``.

    Rows of the module-level ``population_density_data`` whose ``point_object`` lies
    inside ``polygon`` are sampled with probability proportional to
    ``population_density`` (10 * n candidates, with replacement). Each candidate is
    jittered by up to 0.015 degrees on both axes, candidates that left the polygon are
    discarded, and ``n`` pairs are drawn (with replacement) from the survivors.

    Parameters
    ----------
    n : int
        Number of coordinate pairs to return; must be non-negative.
    polygon : object
        Geometry exposing ``contains(point)``; not touched when ``n`` is 0.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, 2)``; column 0 is longitude, column 1 is latitude.

    Raises
    ------
    ValueError
        If ``n`` is negative, or if no jittered candidate stayed inside ``polygon``.
    Exception
        If no grid point of ``population_density_data`` lies inside ``polygon``.
    """
    if n < 0:
        raise ValueError("n must be non-negative")
    if n == 0:
        # Nothing to sample: return an empty result with the usual two columns rather
        # than pushing an empty frame through the row-wise steps below.
        return np.empty((0, 2), dtype=np.float64)

    subset = population_density_data[population_density_data['point_object'].progress_apply(polygon.contains)]

    if(len(subset)==0):
        raise Exception("No points within the given polygon")

    # Oversample by a factor of 10 so enough candidates survive the rejection step.
    sample = subset.sample(weights='population_density', n=(n*10), replace=True).copy()
    sample.reset_index(drop=True, inplace=True)

    # Jitter each candidate so places do not sit exactly on the density grid points.
    sample['latitude'] = sample['latitude'] + np.random.uniform(-0.015, 0.015, size=sample.shape[0])
    sample['longitude'] = sample['longitude'] + np.random.uniform(-0.015, 0.015, size=sample.shape[0])

    # Reject candidates that the jitter pushed outside the polygon.
    points = sample.progress_apply(lambda x : Point(x['longitude'], x['latitude']), axis=1)
    contained = points.progress_apply(polygon.contains)
    candidates = sample[contained][['longitude', 'latitude']]

    if candidates.empty:
        raise ValueError("No jittered sample points fell within the given polygon")

    return candidates.sample(n, replace=True).values

In [None]:
# Read the administrative-unit (ward) geometries, order them by name and renumber
# the rows from zero.
admin_units = (
    gpd.read_file("https://raw.githubusercontent.com/datameet/Municipal_Spatial_Data/master/Mumbai/BMC_Wards.geojson")
    .sort_values(by='name')
    .reset_index(drop=True)
)
admin_units

Unnamed: 0,gid,name,geometry
0,1,A,"MULTIPOLYGON (((72.84025 18.94881, 72.84030 18..."
1,2,B,"MULTIPOLYGON (((72.84456 18.96342, 72.84461 18..."
2,3,C,"MULTIPOLYGON (((72.83198 18.96174, 72.83197 18..."
3,4,D,"MULTIPOLYGON (((72.81873 18.96901, 72.81878 18..."
4,5,E,"MULTIPOLYGON (((72.84677 18.98183, 72.84658 18..."
5,8,F/N,"MULTIPOLYGON (((72.87091 19.05119, 72.87103 19..."
6,6,F/S,"MULTIPOLYGON (((72.85625 19.01059, 72.85630 19..."
7,9,G/N,"MULTIPOLYGON (((72.86699 19.05237, 72.86738 19..."
8,7,G/S,"MULTIPOLYGON (((72.82689 19.01942, 72.82691 19..."
9,18,H/E,"MULTIPOLYGON (((72.85932 19.08400, 72.85932 19..."


In [None]:
# Merge all administrative-unit geometries into one geometry; it is used below to
# filter the density grid and as the sampling region for generated places.
combined_boundary = unary_union(admin_units['geometry'])

In [None]:
# Boolean mask over the density grid: True where combined_boundary contains the grid point.
within_combined_boundary_indicies = population_density_data['point_object'].progress_apply(combined_boundary.contains)

100%|██████████| 4010402/4010402 [00:16<00:00, 238006.65it/s]


In [None]:
# Keep only the grid points inside the boundary and renumber the rows.
# NOTE: this overwrites the frame the mask was computed from, so the mask no longer
# matches it afterwards - re-run from the data-loading cell rather than this cell alone.
population_density_data = population_density_data[within_combined_boundary_indicies].reset_index(drop=True)

In [None]:
# Read the base synthetic population from the mounted Drive.
# NOTE(review): the path is hard-coded to a Mumbai folder - this is the input to change
# when generating a population for another city.
synthetic_population = pd.read_csv("/content/drive/MyDrive/syndata_ism/mumbai/base_population_data/with_hlat_hlong_job.csv")
synthetic_population

Unnamed: 0.1,Unnamed: 0,mem_id,gender,age,literacy,religion,caste,residence,working,geog,household_id,H_Lat,H_Lon,AdminUnitName,AdminUnitLatitude,AdminUnitLongitude,JobLabel,JobID,essential_worker,PublicTransport_Jobs,Adherence_to_Intervention
0,0,1,male,41,literate,hindu,SC,urban,yes,Mumbai City,2097582,18.986606,72.852748,F/S,19.005878,72.839688,Ag labour,63,0,1,0.9
1,1,1,male,44,literate,hindu,SC,urban,yes,Mumbai City,2097582,18.986606,72.852748,F/S,19.005878,72.839688,Ag labour,63,0,1,0.9
2,2,2,female,39,literate,hindu,SC,urban,no,Mumbai City,2097582,18.986606,72.852748,F/S,19.005878,72.839688,Loaders,97,1,1,0.0
3,3,2,female,36,literate,hindu,SC,urban,no,Mumbai City,2097582,18.986606,72.852748,F/S,19.005878,72.839688,Ag labour,63,0,1,0.0
4,4,3,female,15,literate,hindu,SC,urban,no,Mumbai City,2097582,18.986606,72.852748,F/S,19.005878,72.839688,Student,199,0,1,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12442367,12442367,1,female,65,illiterate,hindu,other,urban,yes,Mumbai Suburban,2093979,19.199453,72.816009,P/N,19.187785,72.842307,Construction,95,0,1,1.0
12442368,12442368,1,female,65,illiterate,hindu,other,urban,yes,Mumbai Suburban,2094032,19.142611,72.810093,K/W,19.119500,72.844486,Clerical supe,30,0,1,1.0
12442369,12442369,1,female,65,illiterate,hindu,other,urban,yes,Mumbai Suburban,2094032,19.142611,72.810093,K/W,19.119500,72.844486,Plantation lab,64,0,1,1.0
12442370,12442370,1,female,69,illiterate,hindu,other,urban,yes,Mumbai Suburban,2094072,19.207354,72.852750,R/S,19.203963,72.845396,Tailors,79,0,1,1.0


In [None]:
import numpy as np
import pandas as pd
from multiprocessing import Pool
import math
import logging

def parallelize_dataframe(df, func, n_cores=8):
    """Apply ``func`` to consecutive row chunks of ``df`` and concatenate the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split into ``n_cores`` nearly equal chunks, in row order.
    func : callable
        Receives one chunk and returns a pandas object. It must be picklable when
        ``n_cores`` is greater than 1 because it is sent to worker processes.
    n_cores : int
        Number of chunks and worker processes.

    Returns
    -------
    The ``pd.concat`` of the per-chunk results, in chunk order.
    """
    # Split row positions instead of the frame itself: np.array_split on a DataFrame
    # relies on DataFrame.swapaxes, which newer pandas deprecates, whereas positional
    # slicing yields the same chunks.
    position_chunks = np.array_split(np.arange(len(df)), n_cores)
    df_split = [df.iloc[positions] for positions in position_chunks]

    if n_cores == 1:
        # A single worker gains nothing from a process pool; run in-process.
        return pd.concat([func(chunk) for chunk in df_split])

    # The context manager shuts the workers down even when func raises in a worker.
    with Pool(n_cores) as pool:
        return pd.concat(pool.map(func, df_split))

class Places:
    """Generate workplaces, schools and public places for a city and assign them to people.

    Place coordinates come from the notebook-level ``get_lat_long_samples`` and
    ``combined_boundary``; the assignment methods run through the notebook-level
    ``parallelize_dataframe`` and use ``progress_apply`` (so ``tqdm.pandas()`` must
    have been called). Individuals are rows with ``H_Lat`` / ``H_Lon`` home
    coordinates; a place is drawn with probability proportional to the inverse of its
    distance from the home.
    """

    # Lower bound applied to distances before inverting them, so a place located
    # exactly at a home does not produce an infinite sampling weight.
    _MIN_DISTANCE = 1e-9

    def __init__(self, city_id, city_population, n_process = 1):
        """Store the integer city id, the population size and the worker count."""
        assert(type(city_id)==int)
        self.city_id = city_id
        self.city_population = city_population
        self.n_process = n_process

    @staticmethod
    def _proximity_weights(place_lats, place_longs, home_lat, home_long):
        """Return inverse-distance sampling weights of places relative to a home."""
        squared = (place_lats - home_lat) ** 2 + (place_longs - home_long) ** 2
        distance = np.sqrt(squared.astype(np.float64))
        return 1 / distance.clip(lower=Places._MIN_DISTANCE)

    def generate_workplaces(self, workplace_type_list):
        """Create ``self.workplaces`` from the given list of workplace type labels.

        A random number of workplaces (about 0.5% of the population) get types drawn
        from ``workplace_type_list``; one workplace per distinct type plus one
        'Teachers' workplace are always added so every type is represented.
        """
        # The normal draw can in principle be negative; never request a negative count.
        n_random_workplaces = max(0, int(self.city_population * np.random.normal(0.5,0.1)/100))
        random_workplace_types = np.random.choice(workplace_type_list, n_random_workplaces, replace=True)
        workplace_types = list(random_workplace_types)+list(set(workplace_type_list))+['Teachers']
        lat_lon_pairs = get_lat_long_samples(len(workplace_types), combined_boundary)
        # Building from a dict keeps numeric dtypes instead of an all-object frame.
        self.workplaces = pd.DataFrame({
            'workplace_name': [2*pow(10,12)+self.city_id*pow(10,9)+counter for counter in range(len(workplace_types))],
            'workplace_type': workplace_types,
            'workplace_lat': lat_lon_pairs.T[1],
            'workplace_long': lat_lon_pairs.T[0],
        })

    def generate_schools(self):
        """Create ``self.schools`` from the 'Teachers' workplaces (run after generate_workplaces)."""
        teachers_workplaces = self.workplaces[self.workplaces['workplace_type']=='Teachers']
        self.schools = teachers_workplaces[['workplace_name', 'workplace_lat', 'workplace_long']].copy()
        self.schools.columns = ['school_name', 'school_lat', 'school_long']
        # A scalar is broadcast to every row regardless of the (non-contiguous) index.
        self.schools['school_type'] = 'school'

    def generate_public_places(self):
        """Create ``self.public_places`` (about 0.05% of the population, at least one)."""
        # At least one public place is needed for assign_public_places to sample from.
        public_places_number = max(1, int(self.city_population * np.random.normal(0.5,0.1)/1000))
        lat_lon_pairs = get_lat_long_samples(public_places_number, combined_boundary)
        self.public_places = pd.DataFrame({
            'public_place_name': [3*pow(10,12)+self.city_id*pow(10,9)+counter for counter in range(public_places_number)],
            'public_place_type': np.random.choice(['park', 'mall', 'gym'], public_places_number, replace=True),
            'public_place_lat': lat_lon_pairs.T[1],
            'public_place_long': lat_lon_pairs.T[0],
        })

    def save_places(self):
        """Write the three generated place tables to CSV files named after the city id."""
        self.workplaces.to_csv(f"workplaces_{self.city_id}.csv")
        self.schools.to_csv(f"schools_{self.city_id}.csv")
        self.public_places.to_csv(f"public_places_{self.city_id}.csv")

    def assign_workplace_individual(self, individual):
        """Pick a workplace for one individual.

        Candidates are the workplaces whose type equals the individual's ``JobLabel``
        when ``WorksAtSameCategory`` is true, otherwise those of any other type. If
        sampling fails (for example no candidate exists), the failure is logged and a
        uniformly random workplace is returned instead.

        Returns a Series with ``workplace_name``, ``workplace_lat``, ``workplace_long``.
        """
        if(individual['WorksAtSameCategory']):
            possible_workplaces = self.workplaces[(self.workplaces['workplace_type']==individual['JobLabel'])]
        else:
            possible_workplaces = self.workplaces[(self.workplaces['workplace_type']!=individual['JobLabel'])]
        try:
            weights = self._proximity_weights(
                possible_workplaces['workplace_lat'], possible_workplaces['workplace_long'],
                individual['H_Lat'], individual['H_Lon'],
            )
            return possible_workplaces.sample(weights = weights).iloc[0][['workplace_name', 'workplace_lat', 'workplace_long']]
        except Exception as ex:
            logging.exception(f"{str(dict(individual))}_{str(possible_workplaces)}", exc_info=ex)
            return self.workplaces.sample().iloc[0][['workplace_name', 'workplace_lat', 'workplace_long']]

    def _assign_workplaces(self, population):
        """Row-wise workplace assignment for one chunk of the population."""
        return population.progress_apply(self.assign_workplace_individual, axis=1)

    def assign_workplaces(self, adult_population):
        """Add ``WorksAtSameCategory``, ``WorkPlaceID``, ``W_Lat``, ``W_Lon`` columns.

        About 5% of individuals are marked to work outside their own job category.
        The passed frame is modified in place and also returned.
        """
        to_different_category = 0.05
        same_category = np.random.random(size=len(adult_population))>to_different_category
        adult_population['WorksAtSameCategory'] = same_category
        adult_population[['WorkPlaceID', 'W_Lat', 'W_Lon']] = parallelize_dataframe(adult_population, self._assign_workplaces, self.n_process)
        return adult_population

    def assign_school_individual(self, individual):
        """Pick a school for one individual; returns ``school_name``, ``school_lat``, ``school_long``."""
        weights = self._proximity_weights(
            self.schools['school_lat'], self.schools['school_long'],
            individual['H_Lat'], individual['H_Lon'],
        )
        return self.schools.sample(weights=weights).iloc[0][['school_name', 'school_lat', 'school_long']]

    def _assign_schools(self, population):
        """Row-wise school assignment for one chunk of the population."""
        return population.progress_apply(self.assign_school_individual, axis=1)

    def assign_schools(self, children_population):
        """Add ``school_id``, ``school_lat``, ``school_long`` columns in place and return the frame."""
        children_population[['school_id', 'school_lat', 'school_long']] = parallelize_dataframe(children_population, self._assign_schools, self.n_process)
        return children_population

    def assign_public_place_individual(self, individual):
        """Pick a public place for one individual; returns its name, latitude and longitude."""
        weights = self._proximity_weights(
            self.public_places['public_place_lat'], self.public_places['public_place_long'],
            individual['H_Lat'], individual['H_Lon'],
        )
        return self.public_places.sample(weights=weights).iloc[0][['public_place_name', 'public_place_lat', 'public_place_long']]

    def _assign_public_places(self, population):
        """Row-wise public-place assignment for one chunk of the population."""
        return population.progress_apply(self.assign_public_place_individual, axis=1)

    def assign_public_places(self, population):
        """Add ``public_place_id``, ``public_place_lat``, ``public_place_long`` in place and return the frame."""
        population[['public_place_id', 'public_place_lat', 'public_place_long']] = parallelize_dataframe(population, self._assign_public_places, self.n_process)
        return population

In [None]:
# Work on a random subset of 10,000 individuals. reset_index() is called without
# drop=True, so the previous row labels are kept as a new leading 'index' column.
synthetic_population = synthetic_population.sample(n=10000).reset_index()

n_processes = 16 #Multiprocessing Adjustments

# City id 1; the population size handed to Places is the size of the sampled subset.
places_object = Places(1, len(synthetic_population), n_processes)

In [None]:
# Remove the two leading bookkeeping columns ('index' added by reset_index() and the
# 'Unnamed: 0' column read from the CSV) by name. Dropping by name with errors='ignore'
# makes this cell safe to re-run: a second run no longer removes real data columns.
synthetic_population = synthetic_population.drop(columns=['index', 'Unnamed: 0'], errors='ignore')
synthetic_population

Unnamed: 0,mem_id,gender,age,literacy,religion,caste,residence,working,geog,household_id,H_Lat,H_Lon,AdminUnitName,AdminUnitLatitude,AdminUnitLongitude,JobLabel,JobID,essential_worker,PublicTransport_Jobs,Adherence_to_Intervention
0,2,female,38,literate,hindu,other,urban,yes,Mumbai City,2541390,19.048830,72.869675,F/N,19.029420,72.854606,Potters,89,0,1,0.0
1,3,female,16,literate,muslim,other,urban,no,Mumbai Suburban,153487,19.199427,72.857291,R/S,19.203963,72.845396,Student,199,0,1,0.4
2,2,female,59,literate,hindu,other,urban,no,Mumbai Suburban,452836,19.162647,72.879208,P/S,19.162660,72.846457,Ag labour,63,0,1,0.9
3,2,female,33,literate,hindu,SC,urban,no,Mumbai Suburban,1724738,19.063315,72.874674,L,19.070467,72.879094,Ag labour,63,0,1,0.1
4,5,male,15,literate,christian,other,urban,no,Mumbai Suburban,570046,19.069061,72.868042,H/E,19.085106,72.844545,Student,199,0,1,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,3,female,2,illiterate,hindu,other,urban,no,Mumbai Suburban,437225,19.040627,72.923347,M/E,19.056477,72.921546,Student,199,0,1,1.0
9996,3,female,5,literate,hindu,other,urban,no,Mumbai Suburban,1455864,19.052359,72.907775,M/W,19.061101,72.899304,Student,199,0,1,1.0
9997,2,female,26,literate,muslim,other,urban,no,Mumbai Suburban,1716272,19.094562,72.870998,K/E,19.120092,72.852387,Ag labour,63,0,1,0.2
9998,1,male,33,literate,muslim,other,urban,yes,Mumbai City,2647741,18.986156,72.846892,F/S,19.005878,72.839688,Painters,93,0,1,0.1


In [None]:
# Workplace types are seeded from every individual's JobLabel.
places_object.generate_workplaces(list(synthetic_population['JobLabel']))
# Schools are derived from the 'Teachers' workplaces, so this must follow generate_workplaces.
places_object.generate_schools()
places_object.generate_public_places()

100%|██████████| 556/556 [00:00<00:00, 1267.26it/s]
100%|██████████| 1150/1150 [00:00<00:00, 36722.11it/s]
100%|██████████| 1150/1150 [00:00<00:00, 1280.76it/s]
100%|██████████| 556/556 [00:00<00:00, 1268.56it/s]
100%|██████████| 30/30 [00:00<00:00, 4810.90it/s]
100%|██████████| 30/30 [00:00<00:00, 1142.80it/s]


In [None]:
# The assign_* methods add columns to the frame they receive. Copy each age slice
# first so those columns are written to an independent frame rather than to a slice
# of synthetic_population (which triggers pandas' SettingWithCopyWarning).
adults = synthetic_population[synthetic_population['age']>18].copy()
adults = places_object.assign_workplaces(adults)

children = synthetic_population[synthetic_population['age']<19].copy()
children = places_object.assign_schools(children)

# Adults have no school columns and children no workplace columns; concat fills those with NaN.
total_population = pd.concat([adults,children], axis=0)
total_population = places_object.assign_public_places(total_population)

total_population#.to_csv("pune_123.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
100%|██████████| 1800/1800 [00:30<00:00, 59.25it/s]
100%|██████████| 1800/1800 [00:31<00:00, 56.45it/s] 
100%|██████████| 1800/1800 [00:31<00:00, 56.46it/s]
100%|██████████| 1800/1800 [00:31<00:00, 56.34it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
100%|██████████| 700/700 [00:09<00:00, 70.75it/s]
100%|██████████| 700/700 [00:10<00:00, 68.47it/s]
100%|██████████| 700/700 [00:10<00:00, 68.68it/s]
100%|██████████| 700/700 [00:10<00:00, 67.98it/s]
A value is trying to be set on a copy of a

Unnamed: 0.1,index,Unnamed: 0,mem_id,gender,age,literacy,religion,caste,residence,working,geog,household_id,H_Lat,H_Lon,AdminUnitName,AdminUnitLatitude,AdminUnitLongitude,JobLabel,JobID,essential_worker,PublicTransport_Jobs,Adherence_to_Intervention,WorksAtSameCategory,WorkPlaceID,W_Lat,W_Lon,school_id,school_lat,school_long,public_place_id,public_place_lat,public_place_long
0,111916,111916,2,female,38,literate,hindu,other,urban,yes,Mumbai City,2541390,19.048830,72.869675,F/N,19.029420,72.854606,Potters,89,0,1,0.0,True,2.001000e+12,18.948861,72.794831,,,,3001000000001,19.021563,72.841455
2,8629631,8629631,2,female,59,literate,hindu,other,urban,no,Mumbai Suburban,452836,19.162647,72.879208,P/S,19.162660,72.846457,Ag labour,63,0,1,0.9,True,2.001000e+12,19.139668,72.955835,,,,3001000000002,19.176185,72.808249
3,3199042,3199042,2,female,33,literate,hindu,SC,urban,no,Mumbai Suburban,1724738,19.063315,72.874674,L,19.070467,72.879094,Ag labour,63,0,1,0.1,True,2.001000e+12,19.139668,72.955835,,,,3001000000001,19.021563,72.841455
6,7628841,7628841,2,female,42,literate,hindu,other,urban,no,Mumbai Suburban,23388,19.169678,72.937440,T,19.175625,72.950922,Book-keepers,33,0,1,0.9,True,2.001000e+12,18.992717,72.842269,,,,3001000000002,19.176185,72.808249
7,5541794,5541794,2,female,47,literate,hindu,other,urban,no,Mumbai Suburban,1595466,19.258260,72.864760,R/N,19.120092,72.852387,Construction,95,0,1,0.9,True,2.001000e+12,19.137121,72.810252,,,,3001000000000,18.961757,72.814448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9976,11276792,11276792,3,male,18,literate,buddhist,other,urban,no,Mumbai Suburban,1303675,19.081860,72.900899,N,19.083932,72.906442,Ag labour,63,0,1,0.4,,,,,2.001000e+12,19.133085,72.839827,3001000000001,19.021563,72.841455
9987,3511197,3511197,4,female,11,literate,hindu,other,urban,no,Mumbai Suburban,1211860,19.191207,72.819567,P/N,19.187785,72.842307,Student,199,0,1,0.8,,,,,2.001000e+12,19.133085,72.839827,3001000000002,19.176185,72.808249
9989,6236989,6236989,3,male,14,literate,hindu,other,urban,no,Mumbai Suburban,1052965,19.050090,72.933624,M/E,19.056477,72.921546,Student,199,0,1,0.8,,,,,2.001000e+12,19.133085,72.839827,3001000000000,18.961757,72.814448
9995,8104481,8104481,3,female,2,illiterate,hindu,other,urban,no,Mumbai Suburban,437225,19.040627,72.923347,M/E,19.056477,72.921546,Student,199,0,1,1.0,,,,,2.001000e+12,19.133085,72.839827,3001000000002,19.176185,72.808249
