<a href="https://colab.research.google.com/github/sonleh96/wb-gpbp-ldt/blob/dev-ghinwa/HealthCareFacilityAccessibility.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Accessing Data on Google Buckets**

In [None]:
!pip install google-cloud-storage
!pip install gcsfs
import pandas as pd
import gcsfs
import geopandas as gpd
import json
from shapely.geometry import Polygon,MultiPolygon
import time
import itertools

In [2]:
from google.colab import auth
auth.authenticate_user()

In [3]:
from google.cloud import storage

# Create a client
client = storage.Client()

#Access the Google Bucket
bucket_name = 'wb-ldt'
bucket = client.get_bucket(bucket_name)

#Prints files in the google bucket
'''
blobs = bucket.list_blobs()

for blob in blobs:
    print(blob.name)
'''

'\nblobs = bucket.list_blobs()\n\nfor blob in blobs:\n    print(blob.name)\n'

In [4]:
#Name of Shape File
file_path = "shapefiles/gadm41_SRB_2.json"
gcs_file_path = 'gs://' + bucket_name + '/' + file_path

#Read Shape File --> The shape file gives a MultiPolygon Geometry Column
gdf = gpd.read_file(gcs_file_path)

#Adjust for GeoSpatial Data
center = gpd.GeoDataFrame(gdf[['GID_2', 'NAME_2']])

#Change the MultiPolygon Geometry Column to make it more useful
center['geometry'] = gdf.centroid
center = center.to_crs(gdf.crs)
center['lat'] = center.geometry.y
center['lon'] = center.geometry.x
#center


  center['geometry'] = gdf.centroid
  center['geometry'] = gdf.centroid


**Reading Meta Population Data**

In [None]:
!pip install rasterio

import rasterio
from shapely.geometry import Point
import numpy as np

In [6]:
#Name of Shape File
file_path = "RS/raw-data/geospatial/population_data_meta/Population Serbia 2019.csv"
pop_file_path = 'gs://' + bucket_name + '/' + file_path

In [7]:
pop_df = pd.read_csv(pop_file_path)

# Assuming your CSV has 'latitude' and 'longitude' columns, create a geometry column
pop_df['geometry'] = pop_df.apply(lambda row: Point(row['Lon'], row['Lat']), axis=1)

# Convert the DataFrame to a GeoDataFrame, specifying the coordinate reference system (CRS)
pop_gdf = gpd.GeoDataFrame(pop_df, geometry='geometry')

# Optionally, set the CRS (e.g., WGS84 which is commonly used for latitude and longitude)
pop_gdf.set_crs(epsg=4326, inplace=True)

Unnamed: 0,Lat,Lon,Population,geometry
0,44.215417,19.839028,0.496571,POINT (19.83903 44.21542)
1,44.917639,20.304028,6.588225,POINT (20.30403 44.91764)
2,44.900972,20.282361,6.588225,POINT (20.28236 44.90097)
3,44.903472,20.287083,6.588225,POINT (20.28708 44.90347)
4,44.857917,20.335417,6.588225,POINT (20.33542 44.85792)
...,...,...,...,...
3666472,44.612083,21.199028,2.827553,POINT (21.19903 44.61208)
3666473,44.609583,21.194306,2.827553,POINT (21.19431 44.60958)
3666474,44.623472,21.195972,2.827553,POINT (21.19597 44.62347)
3666475,44.619861,21.201250,2.827553,POINT (21.20125 44.61986)


In [8]:
pop_gdf['ID'] = pop_df.index
pop_gdf.head()

Unnamed: 0,Lat,Lon,Population,geometry,ID
0,44.215417,19.839028,0.496571,POINT (19.83903 44.21542),0
1,44.917639,20.304028,6.588225,POINT (20.30403 44.91764),1
2,44.900972,20.282361,6.588225,POINT (20.28236 44.90097),2
3,44.903472,20.287083,6.588225,POINT (20.28708 44.90347),3
4,44.857917,20.335417,6.588225,POINT (20.33542 44.85792),4


In [10]:
print('Total Population:',round(pop_gdf['Population'].sum()/1000000,2),'million')

Total Population: 6.64 million


In [11]:
pop_gdf = pop_gdf.rename(columns={'Lat': 'lat', 'Lon': 'lon'})
gdf = gdf[['GID_2', 'NAME_2', 'ENGTYPE_2', 'geometry']]

In [12]:
population_aoi = gpd.sjoin(pop_gdf, gdf, predicate='within',
                           how = 'inner')

In [None]:
#population_aoi.head()
print('Total Population:',round(population_aoi['Population'].sum()/1000000,2),'million')

Total Population: 6.64 million


**Extracting Hospital Data from OpenStreetMap using Overpass API**

In [14]:
import requests

overpass_url = "http://overpass-api.de/api/interpreter"
overpass_query = """
[out:json];
area["ISO3166-1"="RS"];
(node["amenity"="hospital"](area);
 way["amenity"="hospital"](area);
 rel["amenity"="hospital"](area);
);
out center;
"""
response = requests.get(overpass_url,
                        params={'data': overpass_query})
data = response.json()

df_hospitals = pd.DataFrame(data['elements'])

df_hospitals['name'] = df_hospitals['tags'].apply(lambda x:x['name'] if 'name' in list(x.keys()) else None)

df_hospitals = df_hospitals[['id','lat','lon','name']].drop_duplicates()

overpass_url = "http://overpass-api.de/api/interpreter"
overpass_query = """
[out:json];
area["ISO3166-1"="RS"];
(node["amenity"="clinic"](area);
 way["amenity"="clinic"](area);
 rel["amenity"="clinic"](area);
);
out center;
"""
response = requests.get(overpass_url,
                        params={'data': overpass_query})
data = response.json()

df_clinics = pd.DataFrame(data['elements'])
df_clinics['name'] = df_clinics['tags'].apply(lambda x:x['name'] if 'name' in list(x.keys()) else None)
df_clinics['amenity'] = df_clinics['tags'].apply(lambda x: x['healthcare'] if 'healthcare' in list(x.keys()) else None)

df_clinics = df_clinics[['id','lat','lon','name','amenity']].drop_duplicates()

df_health_osm = pd.concat([df_hospitals,df_clinics])
df_health_osm = gpd.GeoDataFrame(df_health_osm, geometry=gpd.points_from_xy(df_health_osm.lon, df_health_osm.lat))
df_health_osm = df_health_osm[['id','name','geometry']]

print('Number of hospitals and clinics extracted:',len(df_health_osm))

Number of hospitals and clinics extracted: 534


In [16]:
##### Relook at what is happening here, doesn't feel right ####
df_health_osm = df_health_osm.set_crs(gdf.crs)
selected_hosp = gpd.sjoin(df_health_osm, gdf, predicate='within')

In [17]:
len(selected_hosp)

234

In [32]:
#Save the HealthCare Facilities in Serbia CSV file
from io import BytesIO

# Convert DataFrame to CSV in memory
csv_buffer = BytesIO()
selected_hosp.to_csv(csv_buffer, index=False)

# Move the buffer's position to the beginning
csv_buffer.seek(0)

def upload_csv_to_gcs(bucket_name, destination_blob_name, file_buffer):
    """Uploads a file from a buffer to Google Cloud Storage."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    # Upload the file-like object (buffer)
    blob.upload_from_file(file_buffer, content_type='text/csv')
    print(f"File uploaded to {destination_blob_name}.")

# Define your GCS bucket name and destination path
destination_blob_name = 'RS/processed-data/healthcare-facilities-withpopulationaccess.csv'

# Upload the CSV from the buffer directly
upload_csv_to_gcs(bucket_name, destination_blob_name, csv_buffer)


File uploaded to RS/processed-data/healthcare-facilities-withpopulationaccess.csv.


**Accessibility Analysis - Using Open Route Service API**

In [19]:
def get_isochrone_osm (each_hosp,travel_time_secs):
  body = {"locations":[[each_hosp.x,each_hosp.y]],"range":[travel_time_secs],"range_type":'time'}
  headers = {
      'Accept': 'application/json, application/geo+json, application/gpx+xml, img/png; charset=utf-8',
      'Authorization': '5b3ce3597851110001cf624866056989d35f40c7bbb5d14df6969a1f',
      'Content-Type': 'application/json; charset=utf-8'
  }
  call = requests.post('https://api.openrouteservice.org/v2/isochrones/foot-walking', json=body, headers=headers)

  if(call.status_code==200):
    #print("here")
    geom = (json.loads(call.text)['features'][0]['geometry'])
    polygon_geom = Polygon(geom['coordinates'][0])
    attempts_per_minute =+1
    return polygon_geom

  else:
    #print("wtf is this")
    return None

In [None]:
# How do we want to define the Travel Time as "Accessible"?

#The API is limited to 20 Attempts per minute
#Each iteration, time.sleep(3)
cachement_area_osm = list()

for geom in selected_hosp['geometry']:
  accesible_area = get_isochrone_osm(geom, travel_time_secs=3600)
  cachement_area_osm.append(accesible_area)
  time.sleep(3)

selected_hosp['cachment_area_osm'] = cachement_area_osm

In [23]:
#Get Population Count of People with Access to Hospital
def get_pop_count(cachment,pop_data):
  if(cachment!=None):
    pop_access = pop_data[pop_data.within(cachment)]
    id_values = (pop_access['ID'].values)
    pop_with_access = (pop_access['Population'].sum().round())
    return id_values,pop_with_access
  else:
    return [None,None]

In [25]:
#!pip install tqdm
from tqdm import tqdm
tqdm.pandas()

selected_hosp['id_with_access'], selected_hosp['pop_with_access'] = zip(*selected_hosp['cachment_area_osm'].progress_apply(get_pop_count, pop_data=population_aoi))

100%|██████████| 234/234 [04:56<00:00,  1.27s/it]


**Aggregate Per Region to Identify % of Population with Access**

In [None]:
'''
# This is how we make sure we aren't double counting
# This is done for the entire poppulation, need to do it per Municipal Region

list_ids_access = list(selected_hosp_temp['id_with_access'].values)

#This function takes a single iterable as an argument and all the elements of the input iterable should also be iterable
#and it returns a flattened iterable containing all the elements of the input iterable
list_ids_access = list(itertools.chain.from_iterable(list_ids_access))

#Limits the df to the population id that have access
pop_with_access = population_aoi[population_aoi['ID'].isin(list_ids_access)]
pop_without_access = population_aoi[~population_aoi['ID'].isin(list_ids_access)]

original_access = round(pop_with_access['Population'].sum()*100/population_aoi['Population'].sum(),2)

#print('Population with Access:',round(pop_with_access['Population'].sum()*100/population_aoi['Population'].sum(),2),'%')
'''

In [76]:
accessibility_df = list()

In [77]:
year = '2024'
municipalities = population_aoi['GID_2'].unique()

for municipality in municipalities:
  selected_hosp_temp = selected_hosp[selected_hosp['GID_2'] == municipality]
  population_aoi_temp = population_aoi[population_aoi['GID_2'] == municipality]

  list_ids_access = list(selected_hosp_temp['id_with_access'].values)

  if not list_ids_access:
    pop_accessibility = 0

  else:

    # Assuming list_ids_access is a list of lists and may contain None
    list_ids_access = [x for x in list_ids_access if x is not None]
    list_ids_access = list(itertools.chain.from_iterable(list_ids_access))

    pop_with_access = population_aoi_temp[population_aoi_temp['ID'].isin(list_ids_access)]
    pop_without_access = population_aoi_temp[~population_aoi_temp['ID'].isin(list_ids_access)]

    pop_accessibility = round(pop_with_access['Population'].sum()*100/population_aoi_temp['Population'].sum(),2)

  accessibility_df.append([municipality, year, pop_accessibility])

  pop_accessibility = round(pop_with_access['Population'].sum()*100/population_aoi_temp['Population'].sum(),2)


In [81]:
# Define column names
columns = ['GID_2', 'Year', 'healthcare-accessibility']

# Create DataFrame
df = pd.DataFrame(accessibility_df, columns=columns)

          GID_2  Year  healthcare-accessibility
0     SRB.7.6_1  2024                      0.00
1     SRB.3.8_1  2024                     89.45
2    SRB.3.16_1  2024                     46.77
3    SRB.20.6_1  2024                      0.00
4    SRB.20.4_1  2024                      0.00
..          ...   ...                       ...
156   SRB.5.3_1  2024                     52.50
157  SRB.18.6_1  2024                      0.00
158  SRB.18.1_1  2024                      0.00
159  SRB.18.2_1  2024                     26.14
160   SRB.4.2_1  2024                      0.00

[161 rows x 3 columns]


In [89]:
df = df.merge(center[['GID_2', 'NAME_2']],
              how = 'left')

# Define new column order
new_column_order = ['GID_2', 'NAME_2', 'Year', 'healthcare-accessibility']

# Reorder columns
df = df[new_column_order]

In [91]:
#Save the HealthCare Facilities in Serbia CSV file
from io import BytesIO

# Convert DataFrame to CSV in memory
csv_buffer = BytesIO()
df.to_csv(csv_buffer, index=False)

# Move the buffer's position to the beginning
csv_buffer.seek(0)

def upload_csv_to_gcs(bucket_name, destination_blob_name, file_buffer):
    """Uploads a file from a buffer to Google Cloud Storage."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    # Upload the file-like object (buffer)
    blob.upload_from_file(file_buffer, content_type='text/csv')
    print(f"File uploaded to {destination_blob_name}.")

# Define your GCS bucket name and destination path
destination_blob_name = 'RS/processed-data/SRB_access_tohealthcarefacilities.csv'

# Upload the CSV from the buffer directly
upload_csv_to_gcs(bucket_name, destination_blob_name, csv_buffer)

File uploaded to RS/processed-data/SRB_access_tohealthcarefacilities.csv.
