<a href="https://colab.research.google.com/github/sonleh96/wb-gpbp-ldt/blob/dev-ghinwa/School_Accesibility.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Accessing Data on Google Buckets**

In [1]:
!pip install google-cloud-storage
!pip install gcsfs
import pandas as pd
import gcsfs
import geopandas as gpd
import json
from shapely.geometry import Polygon,MultiPolygon
import time
import itertools



In [2]:
from google.colab import auth
auth.authenticate_user()

In [3]:
from google.cloud import storage

# Create a client
client = storage.Client()

#Access the Google Bucket
bucket_name = 'wb-ldt'
bucket = client.get_bucket(bucket_name)

#Prints files in the google bucket
'''
blobs = bucket.list_blobs()

for blob in blobs:
    print(blob.name)
'''

'\nblobs = bucket.list_blobs()\n\nfor blob in blobs:\n    print(blob.name)\n'

In [4]:
#Name of Shape File
file_path = "shapefiles/gadm41_SRB_2.json"
gcs_file_path = 'gs://' + bucket_name + '/' + file_path

#Read Shape File --> The shape file gives a MultiPolygon Geometry Column
gdf = gpd.read_file(gcs_file_path)

#Adjust for GeoSpatial Data
center = gpd.GeoDataFrame(gdf[['GID_2', 'NAME_2']])

#Change the MultiPolygon Geometry Column to make it more useful
center['geometry'] = gdf.centroid
center = center.to_crs(gdf.crs)
center['lat'] = center.geometry.y
center['lon'] = center.geometry.x
#center


  center['geometry'] = gdf.centroid
  center['geometry'] = gdf.centroid


**Reading Meta Population Data**

In [5]:
!pip install rasterio

import rasterio
from shapely.geometry import Point
import numpy as np

Collecting rasterio
  Downloading rasterio-1.3.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting affine (from rasterio)
  Downloading affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Collecting snuggs>=1.4.1 (from rasterio)
  Downloading snuggs-1.4.7-py3-none-any.whl.metadata (3.4 kB)
Downloading rasterio-1.3.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.7/21.7 MB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading snuggs-1.4.7-py3-none-any.whl (5.4 kB)
Downloading affine-2.4.0-py3-none-any.whl (15 kB)
Installing collected packages: snuggs, affine, rasterio
Successfully installed affine-2.4.0 rasterio-1.3.11 snuggs-1.4.7


In [6]:
#Name of Shape File
file_path = "RS/raw-data/geospatial/population_data_meta/Population Serbia 2019.csv"
pop_file_path = 'gs://' + bucket_name + '/' + file_path

In [7]:
pop_df = pd.read_csv(pop_file_path)

# Assuming your CSV has 'latitude' and 'longitude' columns, create a geometry column
pop_df['geometry'] = pop_df.apply(lambda row: Point(row['Lon'], row['Lat']), axis=1)

# Convert the DataFrame to a GeoDataFrame, specifying the coordinate reference system (CRS)
pop_gdf = gpd.GeoDataFrame(pop_df, geometry='geometry')

# Optionally, set the CRS (e.g., WGS84 which is commonly used for latitude and longitude)
pop_gdf.set_crs(epsg=4326, inplace=True)

Unnamed: 0,Lat,Lon,Population,geometry
0,44.215417,19.839028,0.496571,POINT (19.83903 44.21542)
1,44.917639,20.304028,6.588225,POINT (20.30403 44.91764)
2,44.900972,20.282361,6.588225,POINT (20.28236 44.90097)
3,44.903472,20.287083,6.588225,POINT (20.28708 44.90347)
4,44.857917,20.335417,6.588225,POINT (20.33542 44.85792)
...,...,...,...,...
3666472,44.612083,21.199028,2.827553,POINT (21.19903 44.61208)
3666473,44.609583,21.194306,2.827553,POINT (21.19431 44.60958)
3666474,44.623472,21.195972,2.827553,POINT (21.19597 44.62347)
3666475,44.619861,21.201250,2.827553,POINT (21.20125 44.61986)


In [8]:
pop_gdf['ID'] = pop_df.index
pop_gdf.head()

Unnamed: 0,Lat,Lon,Population,geometry,ID
0,44.215417,19.839028,0.496571,POINT (19.83903 44.21542),0
1,44.917639,20.304028,6.588225,POINT (20.30403 44.91764),1
2,44.900972,20.282361,6.588225,POINT (20.28236 44.90097),2
3,44.903472,20.287083,6.588225,POINT (20.28708 44.90347),3
4,44.857917,20.335417,6.588225,POINT (20.33542 44.85792),4


In [9]:
print('Total Population:',round(pop_gdf['Population'].sum()/1000000,2),'million')

Total Population: 6.64 million


In [10]:
pop_gdf = pop_gdf.rename(columns={'Lat': 'lat', 'Lon': 'lon'})
gdf = gdf[['GID_2', 'NAME_2', 'ENGTYPE_2', 'geometry']]

In [11]:
population_aoi = gpd.sjoin(pop_gdf, gdf, predicate='within',
                           how = 'inner')

In [12]:
#population_aoi.head()
print('Total Population:',round(population_aoi['Population'].sum()/1000000,2),'million')

Total Population: 6.64 million


**Extracting Hospital Data from OpenStreetMap using Overpass API**

In [13]:
import requests
import pandas as pd
import geopandas as gpd

# Define the Overpass API URL
overpass_url = "http://overpass-api.de/api/interpreter"

# Query to extract all schools in Serbia
overpass_query = """
[out:json];
area["ISO3166-1"="RS"];
(node["amenity"="school"](area);
 way["amenity"="school"](area);
 rel["amenity"="school"](area);
);
out center;
"""

# Send request to Overpass API
response = requests.get(overpass_url, params={'data': overpass_query})
data = response.json()

# Create DataFrame from the Overpass API response
df_schools = pd.DataFrame(data['elements'])

# Extract the name of the school if available
df_schools['name'] = df_schools['tags'].apply(lambda x: x['name'] if 'name' in x.keys() else None)

# Create a GeoDataFrame with geometry points from lat/lon
df_schools = df_schools[['id', 'lat', 'lon', 'name']].drop_duplicates()
df_schools = gpd.GeoDataFrame(df_schools, geometry=gpd.points_from_xy(df_schools.lon, df_schools.lat))

# Keep only the columns of interest
df_schools = df_schools[['id', 'name', 'geometry']]

# Print the number of schools extracted
print('Number of schools extracted:', len(df_schools))

# Optional: display the first few rows
df_schools.head()

Number of schools extracted: 514


Unnamed: 0,id,name,geometry
0,417543508,Јован Јовановић Змај,POINT (22.32119 44.11268)
1,464243294,Десанка Максимовић,POINT (22.11784 42.90879)
2,472405230,,POINT (22.52527 44.29292)
3,477410829,Десанка Максимовић,POINT (22.12062 42.85896)
4,498744097,,POINT (20.42124 42.57631)


In [14]:
df_schools = df_schools.set_crs(gdf.crs)
schools_serbia = gpd.sjoin(df_schools, gdf, predicate='within')

In [15]:
len(schools_serbia)

295

In [24]:
#Save the HealthCare Facilities in Serbia CSV file
from io import BytesIO

# Convert DataFrame to CSV in memory
csv_buffer = BytesIO()
schools_serbia.to_csv(csv_buffer, index=False)

# Move the buffer's position to the beginning
csv_buffer.seek(0)

def upload_csv_to_gcs(bucket_name, destination_blob_name, file_buffer):
    """Uploads a file from a buffer to Google Cloud Storage."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    # Upload the file-like object (buffer)
    blob.upload_from_file(file_buffer, content_type='text/csv')
    print(f"File uploaded to {destination_blob_name}.")

# Define your GCS bucket name and destination path
destination_blob_name = 'RS/processed-nonfinal-data/schoolsinserbia_accesibilty.csv'

# Upload the CSV from the buffer directly
upload_csv_to_gcs(bucket_name, destination_blob_name, csv_buffer)


File uploaded to RS/processed-nonfinal-data/schoolsinserbia_accesibilty.csv.


**Accessibility Analysis - Using Open Route Service API**

In [18]:
def get_isochrone_osm (each_hosp,travel_time_secs):
  body = {"locations":[[each_hosp.x,each_hosp.y]],"range":[travel_time_secs],"range_type":'time'}
  headers = {
      'Accept': 'application/json, application/geo+json, application/gpx+xml, img/png; charset=utf-8',
      'Authorization': '5b3ce3597851110001cf624866056989d35f40c7bbb5d14df6969a1f',
      'Content-Type': 'application/json; charset=utf-8'
  }
  call = requests.post('https://api.openrouteservice.org/v2/isochrones/foot-walking', json=body, headers=headers)

  if(call.status_code==200):
    #print("here")
    geom = (json.loads(call.text)['features'][0]['geometry'])
    polygon_geom = Polygon(geom['coordinates'][0])
    attempts_per_minute =+1
    return polygon_geom

  else:
    #print("wtf is this")
    return None

In [20]:
# How do we want to define the Travel Time as "Accessible"?

#The API is limited to 20 Attempts per minute
#Each iteration, time.sleep(3)
cachement_area_osm = list()

for geom in schools_serbia['geometry']:
  accesible_area = get_isochrone_osm(geom, travel_time_secs=3600)
  cachement_area_osm.append(accesible_area)
  time.sleep(3)

schools_serbia['cachment_area_osm'] = cachement_area_osm

In [22]:
#Get Population Count of People with Access to Hospital
def get_pop_count(cachment,pop_data):
  if(cachment!=None):
    pop_access = pop_data[pop_data.within(cachment)]
    id_values = (pop_access['ID'].values)
    pop_with_access = (pop_access['Population'].sum().round())
    return id_values,pop_with_access
  else:
    return [None,None]

In [23]:
#!pip install tqdm
from tqdm import tqdm
tqdm.pandas()

schools_serbia['id_with_access'], schools_serbia['pop_with_access'] = zip(*schools_serbia['cachment_area_osm'].progress_apply(get_pop_count, pop_data=population_aoi))

100%|██████████| 295/295 [05:14<00:00,  1.07s/it]


**Aggregate Per Region to Identify % of Population with Access**

In [None]:
'''
# This is how we make sure we aren't double counting
# This is done for the entire poppulation, need to do it per Municipal Region

list_ids_access = list(selected_hosp_temp['id_with_access'].values)

#This function takes a single iterable as an argument and all the elements of the input iterable should also be iterable
#and it returns a flattened iterable containing all the elements of the input iterable
list_ids_access = list(itertools.chain.from_iterable(list_ids_access))

#Limits the df to the population id that have access
pop_with_access = population_aoi[population_aoi['ID'].isin(list_ids_access)]
pop_without_access = population_aoi[~population_aoi['ID'].isin(list_ids_access)]

original_access = round(pop_with_access['Population'].sum()*100/population_aoi['Population'].sum(),2)

#print('Population with Access:',round(pop_with_access['Population'].sum()*100/population_aoi['Population'].sum(),2),'%')
'''

In [25]:
accessibility_df = list()

In [26]:
year = '2024'
municipalities = population_aoi['GID_2'].unique()

for municipality in municipalities:
  selected_hosp_temp = schools_serbia[schools_serbia['GID_2'] == municipality]
  population_aoi_temp = population_aoi[population_aoi['GID_2'] == municipality]

  list_ids_access = list(selected_hosp_temp['id_with_access'].values)

  if not list_ids_access:
    pop_accessibility = 0

  else:

    # Assuming list_ids_access is a list of lists and may contain None
    list_ids_access = [x for x in list_ids_access if x is not None]
    list_ids_access = list(itertools.chain.from_iterable(list_ids_access))

    pop_with_access = population_aoi_temp[population_aoi_temp['ID'].isin(list_ids_access)]
    pop_without_access = population_aoi_temp[~population_aoi_temp['ID'].isin(list_ids_access)]

    pop_accessibility = round(pop_with_access['Population'].sum()*100/population_aoi_temp['Population'].sum(),2)

  accessibility_df.append([municipality, year, pop_accessibility])

  pop_accessibility = round(pop_with_access['Population'].sum()*100/population_aoi_temp['Population'].sum(),2)


In [27]:
# Define column names
columns = ['GID_2', 'Year', 'school-accessibility']

# Create DataFrame
df = pd.DataFrame(accessibility_df, columns=columns)

In [28]:
df = df.merge(center[['GID_2', 'NAME_2']],
              how = 'left')

# Define new column order
new_column_order = ['GID_2', 'NAME_2', 'Year', 'school-accessibility']

# Reorder columns
df = df[new_column_order]

In [29]:
#Save the HealthCare Facilities in Serbia CSV file
from io import BytesIO

# Convert DataFrame to CSV in memory
csv_buffer = BytesIO()
df.to_csv(csv_buffer, index=False)

# Move the buffer's position to the beginning
csv_buffer.seek(0)

def upload_csv_to_gcs(bucket_name, destination_blob_name, file_buffer):
    """Uploads a file from a buffer to Google Cloud Storage."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    # Upload the file-like object (buffer)
    blob.upload_from_file(file_buffer, content_type='text/csv')
    print(f"File uploaded to {destination_blob_name}.")

# Define your GCS bucket name and destination path
destination_blob_name = 'RS/processed-data/SRB_access_toschool.csv'

# Upload the CSV from the buffer directly
upload_csv_to_gcs(bucket_name, destination_blob_name, csv_buffer)

File uploaded to RS/processed-data/SRB_access_toschool.csv.


In [31]:
df['school-accessibility'].unique()

array([  0.94,  73.69,   0.  ,   2.35,  40.07,   3.06,   2.86,  71.46,
         6.93,  78.89,   6.19,  67.38,   5.07,  75.91,  69.25,   5.45,
        31.11,  40.38,  82.88,   7.5 ,   3.18,  67.21,  65.33,  70.55,
        41.07,  29.19,   2.3 ,  39.65,  40.61,   8.11,  51.59,  16.03,
         8.31,   8.8 ,  51.55,  36.54,   3.2 ,   2.43,  79.8 ,  64.77,
        18.11,  55.27,  10.51,  43.99,  66.7 ,  56.8 ,   5.61,  81.23,
         5.5 ,   2.65,  43.56,  15.96,   8.26,  53.66,   7.69,  22.93,
         2.68,  26.21,   2.77,  70.03,  29.31,  54.66,  99.57,  95.02,
       100.  ,  62.57,  74.49,  53.78,  46.19,  11.94,   4.32,  76.44,
         2.46,   2.89,  58.97,   3.77,   8.7 ,  55.12,  61.29,  79.85,
        53.75,   0.77,  53.47,  35.17,  43.52,   2.94,    nan,   5.04,
        12.62,   5.51,  58.98,  67.23,  44.97,  55.81,  66.61,  74.22,
        57.83,   6.24,  16.14,  19.68,   6.07,  89.07,  27.44])